Coverage for little_loops / fsm / evaluators.py: 12%
258 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-03-18 16:18 -0500
« prev ^ index » next coverage.py v7.12.0, created at 2026-03-18 16:18 -0500
"""FSM Evaluators for loop execution.

This module provides evaluators that interpret action output and produce
verdicts for state transitions.

Supported evaluator types:

Tier 1 (Deterministic - no API calls):
    exit_code: Map Unix exit codes to verdicts (0=success, 1=failure, 2+=error)
    output_numeric: Compare numeric output to target value
    output_json: Extract and compare JSON path values
    output_contains: Pattern matching on stdout
    convergence: Track progress toward a target value
    diff_stall: Detect stalled iterations via git diff comparison
    mcp_result: Map mcp-call exit codes and the MCP response envelope to verdicts

Tier 2 (LLM-based):
    llm_structured: Use LLM with structured output for natural language evaluation
"""
20from __future__ import annotations
22import hashlib
23import json
24import re
25import subprocess
26import time
27from collections.abc import Callable
28from dataclasses import dataclass
29from pathlib import Path
30from typing import Any
32from little_loops.fsm.interpolation import (
33 InterpolationContext,
34 InterpolationError,
35 interpolate,
36)
37from little_loops.fsm.schema import DEFAULT_LLM_MODEL, EvaluateConfig
@dataclass
class EvaluationResult:
    """Outcome produced by a single evaluator run.

    Attributes:
        verdict: Routing key used to pick the next state transition.
        details: Evaluator-specific metadata kept for debugging and logging.
    """

    verdict: str
    details: dict[str, Any]
53# Default schema for LLM structured evaluation
54DEFAULT_LLM_SCHEMA: dict[str, Any] = {
55 "type": "object",
56 "properties": {
57 "verdict": {
58 "type": "string",
59 "enum": ["yes", "no", "blocked", "partial"],
60 "description": (
61 "- yes: The condition/check evaluated to true\n"
62 "- no: The condition/check evaluated to false\n"
63 "- blocked: Cannot proceed without external help\n"
64 "- partial: Made progress but not complete"
65 ),
66 },
67 "confidence": {
68 "type": "number",
69 "minimum": 0,
70 "maximum": 1,
71 "description": "Confidence in this verdict (0-1)",
72 },
73 "reason": {
74 "type": "string",
75 "description": "Brief explanation",
76 },
77 },
78 "required": ["verdict", "confidence", "reason"],
79}
81DEFAULT_LLM_PROMPT = "Evaluate whether this action succeeded based on its output."
83_NUMERIC_OPERATORS: dict[str, Callable[[float, float], bool]] = {
84 "eq": lambda v, t: v == t,
85 "ne": lambda v, t: v != t,
86 "lt": lambda v, t: v < t,
87 "le": lambda v, t: v <= t,
88 "gt": lambda v, t: v > t,
89 "ge": lambda v, t: v >= t,
90}
def evaluate_exit_code(exit_code: int) -> EvaluationResult:
    """Translate a process exit code into a routing verdict.

    Args:
        exit_code: The process exit code

    Returns:
        EvaluationResult whose verdict is:
            - "yes" for exit code 0
            - "no" for exit code 1
            - "error" for every other exit code
        The exit code itself is echoed back in ``details``.
    """
    # Only 0 and 1 have dedicated verdicts; anything else is an error.
    outcome = {0: "yes", 1: "no"}.get(exit_code, "error")
    return EvaluationResult(verdict=outcome, details={"exit_code": exit_code})
def evaluate_output_numeric(
    output: str,
    operator: str,
    target: float,
) -> EvaluationResult:
    """Interpret stdout as a single number and compare it with a target.

    Args:
        output: The action stdout to parse as a number
        operator: Comparison operator (eq, ne, lt, le, gt, ge)
        target: Target value to compare against

    Returns:
        EvaluationResult whose verdict is:
            - "yes" when the comparison holds
            - "no" when it does not
            - "error" when the output is not a number or the operator is unknown
    """
    # The output is parsed before the operator is validated, so a parse failure
    # is reported even when the operator is also invalid.
    try:
        parsed = float(output.strip())
    except ValueError:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Cannot parse as number: {output[:100]}"},
        )

    compare = _NUMERIC_OPERATORS.get(operator)
    if compare is None:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Unknown operator: {operator}"},
        )

    outcome = "yes" if compare(parsed, target) else "no"
    return EvaluationResult(
        verdict=outcome,
        details={"value": parsed, "target": target, "operator": operator},
    )
154def _extract_json_path(data: Any, path: str) -> Any:
155 """Extract value from dict using jq-style path like '.summary.failed'.
157 Args:
158 data: The parsed JSON data (dict or list)
159 path: Dot-separated path, optionally starting with '.'
161 Returns:
162 The value at the specified path
164 Raises:
165 KeyError: If path not found in data
166 """
167 if path.startswith("."):
168 path = path[1:]
169 parts = path.split(".")
170 current = data
171 for part in parts:
172 if isinstance(current, dict) and part in current:
173 current = current[part]
174 elif isinstance(current, list) and part.isdigit():
175 idx = int(part)
176 if 0 <= idx < len(current):
177 current = current[idx]
178 else:
179 raise KeyError(path)
180 else:
181 raise KeyError(path)
182 return current
def _compare_values(
    value: int | float, operator: str, target: int | float, path: str
) -> EvaluationResult:
    """Apply a named numeric comparison to an extracted JSON value.

    Args:
        value: The extracted value to compare
        operator: Comparison operator
        target: Target value
        path: JSON path, recorded in the result details

    Returns:
        EvaluationResult with verdict "yes"/"no" for the comparison outcome,
        or "error" when the operator is not a known numeric operator
    """
    compare = _NUMERIC_OPERATORS.get(operator)
    if compare is None:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Unknown operator: {operator}"},
        )

    outcome = "yes" if compare(value, target) else "no"
    return EvaluationResult(
        verdict=outcome,
        details={"value": value, "path": path, "target": target, "operator": operator},
    )
def evaluate_output_json(
    output: str,
    path: str,
    operator: str,
    target: Any,
) -> EvaluationResult:
    """Decode stdout as JSON, pull out the value at ``path`` and compare it.

    Args:
        output: The action stdout containing JSON
        path: jq-style dot notation path (e.g., '.summary.failed')
        operator: Comparison operator (eq, ne, lt, le, gt, ge)
        target: Target value for comparison

    Returns:
        EvaluationResult whose verdict is:
            - "yes" when the comparison holds
            - "no" when it does not
            - "error" on invalid JSON, an unresolvable path, or an operator
              that is not supported for the value types involved
    """
    try:
        parsed = json.loads(output)
    except json.JSONDecodeError as e:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Invalid JSON: {e}"},
        )

    try:
        extracted = _extract_json_path(parsed, path)
    except KeyError:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Path not found: {path}"},
        )

    # When both sides are numbers the full operator set is available.
    numeric_types = (int, float)
    if isinstance(extracted, numeric_types) and isinstance(target, numeric_types):
        return _compare_values(extracted, operator, target, path)

    # Anything else can only be tested for (in)equality.
    if operator == "eq":
        condition_met = extracted == target
    elif operator == "ne":
        condition_met = extracted != target
    else:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Operator {operator} not supported for non-numeric values"},
        )

    return EvaluationResult(
        verdict="yes" if condition_met else "no",
        details={"value": extracted, "path": path, "target": target, "operator": operator},
    )
def evaluate_output_contains(
    output: str,
    pattern: str,
    negate: bool = False,
) -> EvaluationResult:
    """Report whether ``pattern`` occurs in ``output``.

    The pattern is first treated as a regular expression. If it does not
    compile, it is treated as a literal substring instead.

    Args:
        output: The action stdout to search
        pattern: Regex pattern or substring
        negate: If True, invert the match result

    Returns:
        EvaluationResult whose verdict is:
            - "yes" when found and negate is False, or not found and negate is True
            - "no" when not found and negate is False, or found and negate is True
    """
    try:
        found = bool(re.search(pattern, output))
    except re.error:
        # Not a valid regex: fall back to a plain substring test.
        found = pattern in output

    # The verdict is "yes" exactly when the match outcome differs from negate.
    outcome = "yes" if found != bool(negate) else "no"

    return EvaluationResult(
        verdict=outcome,
        details={"matched": found, "pattern": pattern, "negate": negate},
    )
def evaluate_convergence(
    current: float,
    previous: float | None,
    target: float,
    tolerance: float = 0,
    direction: str = "minimize",
) -> EvaluationResult:
    """Classify the current metric as on-target, progressing, or stalled.

    Args:
        current: Current metric value
        previous: Previous metric value (None if first iteration)
        target: Target value to reach
        tolerance: Acceptable distance from target
        direction: 'minimize' or 'maximize'; any value other than 'minimize'
            is handled as 'maximize'

    Returns:
        EvaluationResult whose verdict is:
            - "target" when current is within tolerance of target
            - "progress" when there is no previous value, or the value moved
              in the requested direction since the previous one
            - "stall" when the value is unchanged or moved the wrong way
    """
    # Reaching the target takes precedence over everything else.
    if abs(current - target) <= tolerance:
        return EvaluationResult(
            verdict="target",
            details={"current": current, "target": target, "delta": 0},
        )

    # With nothing to compare against, the first iteration counts as progress.
    if previous is None:
        return EvaluationResult(
            verdict="progress",
            details={
                "current": current,
                "previous": None,
                "target": target,
                "delta": None,
            },
        )

    change = current - previous
    # Minimizing wants the value to drop; maximizing wants it to rise.
    improved = change < 0 if direction == "minimize" else change > 0

    return EvaluationResult(
        verdict="progress" if improved else "stall",
        details={
            "current": current,
            "previous": previous,
            "target": target,
            "delta": change,
            "direction": direction,
        },
    )
def evaluate_diff_stall(
    scope: list[str] | None = None,
    max_stall: int = 1,
) -> EvaluationResult:
    """Detect stalled iterations by comparing ``git diff --stat`` between calls.

    The first call (no stored snapshot) stores the current diff and returns
    'yes'. Later calls compare the current diff with the stored snapshot:
    a different diff replaces the snapshot, resets the stall counter and
    returns 'yes'; an identical diff increments the counter and returns 'no'
    once the counter reaches max_stall ('yes' before that).

    The snapshot and counter live in two files under /tmp whose names are
    derived from the scope argument, so calls with different scopes keep
    independent stall counters.

    Args:
        scope: Optional list of paths to limit the git diff to. Defaults to
            the entire working tree.
        max_stall: Number of consecutive no-change iterations before stall
            verdict. Defaults to 1.

    Returns:
        EvaluationResult with verdict:
            - yes: diff changed since last iteration, or unchanged but still
              below the max_stall threshold
            - no: diff unchanged for max_stall iterations (stalled)
            - error: git command failed, timed out, or git is not installed
    """
    git_cmd = ["git", "diff", "--stat"]
    if scope:
        git_cmd.extend(["--", *scope])

    try:
        completed = subprocess.run(git_cmd, capture_output=True, text=True, timeout=30)
    except subprocess.TimeoutExpired:
        return EvaluationResult(verdict="error", details={"error": "git diff timed out"})
    except FileNotFoundError:
        return EvaluationResult(verdict="error", details={"error": "git not found in PATH"})

    if completed.returncode != 0:
        return EvaluationResult(
            verdict="error",
            details={"error": f"git diff failed: {completed.stderr[:200]}"},
        )

    snapshot = completed.stdout

    # The scope is sorted before hashing so its ordering does not change the key.
    key_source = "|".join(sorted(scope)) if scope else "_root_"
    digest = hashlib.md5(key_source.encode()).hexdigest()[:12]
    snapshot_path = Path(f"/tmp/ll-diff-stall-{digest}.txt")
    counter_path = Path(f"/tmp/ll-diff-stall-{digest}.count")

    # Load whatever state exists. A missing snapshot leaves both defaults in
    # place; a missing or unparseable counter keeps the snapshot with count 0.
    last_snapshot: str | None = None
    stalls = 0
    try:
        last_snapshot = snapshot_path.read_text()
        stalls = int(counter_path.read_text().strip())
    except (FileNotFoundError, ValueError):
        pass

    # First iteration and "diff changed" are handled identically: store the new
    # snapshot, reset the counter, report progress.
    if last_snapshot is None or snapshot != last_snapshot:
        snapshot_path.write_text(snapshot)
        counter_path.write_text("0")
        return EvaluationResult(
            verdict="yes",
            details={"stall_count": 0, "max_stall": max_stall, "diff_changed": True},
        )

    # Diff unchanged: count it, and only report a stall once the threshold is hit
    # so the loop keeps running until then.
    stalls += 1
    counter_path.write_text(str(stalls))
    outcome = "no" if stalls >= max_stall else "yes"
    return EvaluationResult(
        verdict=outcome,
        details={"stall_count": stalls, "max_stall": max_stall, "diff_changed": False},
    )
def evaluate_mcp_result(output: str, exit_code: int) -> EvaluationResult:
    """Evaluate an MCP tool call result from the mcp-call subprocess.

    Maps exit codes and MCP response envelope fields to routing verdicts.

    Exit code conventions (set by mcp-call):
        0   → parse isError from JSON envelope
        1   → tool_error (tool ran but isError: true)
        124 → timeout (transport-level timeout)
        127 → not_found (server or tool missing from .mcp.json)

    Exit codes 127 and 124 are decided without looking at the output. For
    every other exit code the envelope's ``isError`` field decides; when the
    field is absent (or the output is empty), any non-zero exit code counts
    as an error.

    Args:
        output: stdout from mcp-call (MCP response envelope JSON)
        exit_code: Exit code from mcp-call subprocess

    Returns:
        EvaluationResult with verdict:
            - success    → isError: false
            - tool_error → isError: true, or output that is not a JSON object
            - not_found  → server/tool not in .mcp.json (exit 127)
            - timeout    → transport-level timeout (exit 124)
    """
    if exit_code == 127:
        return EvaluationResult(
            verdict="not_found",
            details={"exit_code": exit_code, "error": "Server or tool not found in .mcp.json"},
        )

    if exit_code == 124:
        return EvaluationResult(
            verdict="timeout",
            details={"exit_code": exit_code, "error": "MCP tool call timed out"},
        )

    # Parse MCP envelope JSON from stdout; empty output is an empty envelope.
    payload = output.strip()
    try:
        envelope = json.loads(payload) if payload else {}
    except json.JSONDecodeError:
        return EvaluationResult(
            verdict="tool_error",
            details={
                "exit_code": exit_code,
                "error": f"Invalid JSON from mcp-call: {output[:200]}",
            },
        )

    # json.loads also accepts arrays, strings, numbers, booleans and null.
    # Those have no isError field to read, so report them as a tool error
    # instead of failing on the attribute lookup below.
    if not isinstance(envelope, dict):
        return EvaluationResult(
            verdict="tool_error",
            details={
                "exit_code": exit_code,
                "error": (
                    f"Unexpected MCP envelope type {type(envelope).__name__} "
                    f"from mcp-call: {output[:200]}"
                ),
            },
        )

    is_error = envelope.get("isError", exit_code != 0)

    return EvaluationResult(
        verdict="tool_error" if is_error else "success",
        details={"exit_code": exit_code, "envelope": envelope},
    )
def evaluate_llm_structured(
    output: str,
    prompt: str | None = None,
    schema: dict[str, Any] | None = None,
    min_confidence: float = 0.5,
    uncertain_suffix: bool = False,
    model: str = DEFAULT_LLM_MODEL,
    max_tokens: int = 256,
    timeout: int = 1800,
) -> EvaluationResult:
    """Evaluate action output using LLM with structured output via Claude CLI.

    This is the ONLY place in the FSM system that uses LLM structured output.
    Requires the ``claude`` CLI to be installed and authenticated.

    Every failure mode (timeout, missing CLI, non-zero exit, empty or
    malformed response) is reported as an ``error`` verdict with an
    explanatory ``details["error"]``; the function does not raise for them.

    Args:
        output: Action stdout to evaluate (only the last 4000 characters are sent)
        prompt: Custom evaluation prompt (defaults to basic success check)
        schema: Custom JSON schema for structured response
        min_confidence: Minimum confidence threshold (0-1)
        uncertain_suffix: If True, append _uncertain to low-confidence verdicts
        model: Model identifier (CLI aliases like "sonnet" or full names)
        max_tokens: Currently unused by this function; kept for signature
            compatibility
        timeout: Timeout in seconds for the CLI subprocess

    Returns:
        EvaluationResult with verdict from LLM and confidence/reason in details
    """
    effective_schema = schema or DEFAULT_LLM_SCHEMA
    effective_prompt = prompt or DEFAULT_LLM_PROMPT

    # Keep only the tail of the output to avoid context limits.
    truncated = output[-4000:]

    user_prompt = f"{effective_prompt}\n\n<action_output>\n{truncated}\n</action_output>"

    cmd = [
        "claude",
        "-p",
        user_prompt,
        "--output-format",
        "json",
        "--json-schema",
        json.dumps(effective_schema),
        "--model",
        model,
        "--dangerously-skip-permissions",
        "--no-session-persistence",
    ]

    t0 = time.monotonic()
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return EvaluationResult(
            verdict="error",
            details={"error": "LLM evaluation timeout", "timeout": True},
        )
    except FileNotFoundError:
        return EvaluationResult(
            verdict="error",
            details={
                "error": "claude CLI not found. Install from https://docs.anthropic.com/en/docs/claude-code",
                "missing_dependency": True,
            },
        )
    llm_latency_ms = int((time.monotonic() - t0) * 1000)

    if proc.returncode != 0:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Claude CLI error: {proc.stderr.strip()}", "api_error": True},
        )

    # Guard: empty stdout with exit 0 (API error not reflected in exit code)
    stdout = proc.stdout.strip()
    if not stdout:
        stderr_info = proc.stderr.strip()[:200] if proc.stderr else ""
        error_msg = "Claude CLI returned empty output"
        if stderr_info:
            error_msg += f" (stderr: {stderr_info})"
        return EvaluationResult(
            verdict="error",
            details={"error": error_msg, "empty_output": True},
        )

    def _parse_failure(reason: object) -> EvaluationResult:
        """Build the error result used for every unparseable response."""
        return EvaluationResult(
            verdict="error",
            details={
                "error": f"Failed to parse LLM response: {reason}",
                "raw_preview": proc.stdout[:300],
            },
        )

    # Parse the CLI JSON envelope and extract structured result.
    # With --json-schema the envelope is:
    #   success: {"type":"result","subtype":"success","structured_output":{...},...}
    #   failure: {"type":"result","subtype":"error_max_structured_output_retries",...}
    llm_result: Any
    try:
        try:
            envelope = json.loads(stdout)
        except json.JSONDecodeError:
            # Possibly JSONL: retry with the last line only. stdout is already
            # stripped, so its last line is never blank.
            envelope = json.loads(stdout.rsplit("\n", 1)[-1])

        # Valid JSON is not necessarily an object; every check below needs one.
        if not isinstance(envelope, dict):
            return _parse_failure(
                f"expected a JSON object envelope, got {type(envelope).__name__}"
            )

        # Check structured-output retry exhaustion (--json-schema failure mode)
        if envelope.get("subtype") == "error_max_structured_output_retries":
            return EvaluationResult(
                verdict="error",
                details={
                    "error": "Claude CLI could not produce valid structured output after retries",
                    "api_error": True,
                },
            )

        # Check legacy is_error flag (some CLI versions exit 0 but report error in envelope)
        if envelope.get("is_error", False):
            err_text = str(envelope.get("result", "") or "")[:200]
            return EvaluationResult(
                verdict="error",
                details={"error": f"Claude CLI reported error: {err_text}", "api_error": True},
            )

        # --json-schema mode returns validated dict in "structured_output"
        structured = envelope.get("structured_output")
        if isinstance(structured, dict):
            llm_result = structured
        else:
            raw_result = envelope.get("result", "")
            if isinstance(raw_result, dict):
                llm_result = raw_result
            elif raw_result:
                # "result" holds JSON text that still has to be decoded.
                llm_result = json.loads(raw_result)
            elif "verdict" in envelope:
                # The envelope itself is the result object.
                llm_result = envelope
            else:
                return EvaluationResult(
                    verdict="error",
                    details={
                        "error": "Empty result field in Claude CLI response",
                        "raw_preview": proc.stdout[:300],
                    },
                )
    except (json.JSONDecodeError, TypeError, ValueError) as e:
        return _parse_failure(e)

    # A "result" string may decode to a list, number, string, ... — only an
    # object carries verdict/confidence/reason.
    if not isinstance(llm_result, dict):
        return _parse_failure(
            f"expected a JSON object result, got {type(llm_result).__name__}"
        )

    # Build result with confidence handling
    verdict = str(llm_result.get("verdict", "error"))
    try:
        confidence = float(llm_result.get("confidence", 1.0))
    except (TypeError, ValueError):
        # A confidence that cannot be read as a number (e.g. null, or a label
        # from a custom schema) is handled like a missing one rather than
        # discarding an otherwise usable verdict.
        confidence = 1.0
    confident = confidence >= min_confidence

    # Optionally modify verdict for low confidence
    if uncertain_suffix and not confident:
        verdict = f"{verdict}_uncertain"

    return EvaluationResult(
        verdict=verdict,
        details={
            "confidence": confidence,
            "confident": confident,
            "reason": llm_result.get("reason", ""),
            "raw": llm_result,
            "llm_model": model,
            "llm_latency_ms": llm_latency_ms,
            "llm_prompt": user_prompt[:500],
            "llm_raw_output": proc.stdout[:500] if proc.stdout else "",
        },
    )
def evaluate(
    config: EvaluateConfig,
    output: str,
    exit_code: int,
    context: InterpolationContext,
) -> EvaluationResult:
    """Run the evaluator selected by ``config.type``.

    String-valued numeric settings (targets, tolerance, previous value) are
    passed through ``interpolate`` with ``context`` before being converted to
    floats.

    Args:
        config: Evaluator configuration with type and parameters
        output: Action stdout
        exit_code: Action exit code
        context: Runtime context for variable interpolation

    Returns:
        EvaluationResult from the appropriate evaluator

    Raises:
        ValueError: If the evaluator type is unknown, if output_numeric has a
            missing or non-numeric target, or if convergence has no target
    """
    kind = config.type

    if kind == "exit_code":
        return evaluate_exit_code(exit_code)

    if kind == "output_numeric":
        if config.target is None:
            raise ValueError("output_numeric evaluator requires 'target' to be set")
        if isinstance(config.target, str):
            try:
                # Without a context the raw string is used as-is.
                resolved = interpolate(config.target, context) if context else config.target
                threshold = float(resolved)
            except (InterpolationError, ValueError) as e:
                raise ValueError(
                    f"output_numeric target must be numeric, got: {config.target!r}"
                ) from e
        else:
            threshold = float(config.target)
        return evaluate_output_numeric(
            output=output,
            operator=config.operator or "eq",
            target=threshold,
        )

    if kind == "output_json":
        return evaluate_output_json(
            output=output,
            path=config.path or "",
            operator=config.operator or "eq",
            target=config.target,
        )

    if kind == "output_contains":
        return evaluate_output_contains(
            output=output,
            pattern=config.pattern or "",
            negate=config.negate,
        )

    if kind == "convergence":
        # Previous value: optional, and allowed to be unresolvable (first
        # iteration), in which case it stays None.
        prior: float | None = None
        if config.previous:
            try:
                prior = float(interpolate(config.previous, context))
            except (InterpolationError, ValueError):
                pass

        # Current value comes straight from the action output.
        try:
            measured = float(output.strip())
        except ValueError:
            return EvaluationResult(
                verdict="error",
                details={"error": f"Cannot parse output as number: {output[:100]}"},
            )

        # Target: may be an interpolated string like "${context.target}".
        goal: float
        if isinstance(config.target, str):
            try:
                goal = float(interpolate(config.target, context))
            except (InterpolationError, ValueError) as e:
                return EvaluationResult(
                    verdict="error",
                    details={"error": f"Cannot resolve target: {e}"},
                )
        elif config.target is None:
            raise ValueError("convergence evaluator requires 'target' to be set")
        else:
            goal = float(config.target)

        # Tolerance: may be an interpolated string; an unresolvable string
        # falls back to 0.0.
        slack: float = 0.0
        if isinstance(config.tolerance, str):
            try:
                slack = float(interpolate(config.tolerance, context))
            except (InterpolationError, ValueError):
                slack = 0.0
        elif config.tolerance is not None:
            slack = float(config.tolerance)

        return evaluate_convergence(
            current=measured,
            previous=prior,
            target=goal,
            tolerance=slack,
            direction=config.direction,
        )

    if kind == "diff_stall":
        return evaluate_diff_stall(
            scope=config.scope,
            max_stall=config.max_stall,
        )

    if kind == "llm_structured":
        llm_prompt = config.prompt
        if llm_prompt and context:
            try:
                llm_prompt = interpolate(llm_prompt, context)
            except InterpolationError:
                # Keep the raw prompt when interpolation cannot resolve it.
                pass
        return evaluate_llm_structured(
            output=output,
            prompt=llm_prompt,
            schema=config.schema,
            min_confidence=config.min_confidence,
            uncertain_suffix=config.uncertain_suffix,
        )

    if kind == "mcp_result":
        return evaluate_mcp_result(output=output, exit_code=exit_code)

    raise ValueError(f"Unknown evaluator type: {kind}")