Coverage for little_loops / fsm / evaluators.py: 12%

258 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-03-18 16:18 -0500

1"""FSM Evaluators for loop execution. 

2 

3This module provides evaluators that interpret action output and produce 

4verdicts for state transitions. 

5 

6Supported evaluator types: 

7 

8Tier 1 (Deterministic - no API calls): 

9 exit_code: Map Unix exit codes to verdicts (0=yes, 1=no, 2+=error) 

10 output_numeric: Compare numeric output to target value 

11 output_json: Extract and compare JSON path values 

12 output_contains: Pattern matching on stdout 

13 convergence: Track progress toward a target value 

14 diff_stall: Detect stalled iterations via git diff comparison 

15 

16Tier 2 (LLM-based): 

17 llm_structured: Use LLM with structured output for natural language evaluation 

18""" 

19 

20from __future__ import annotations 

21 

22import hashlib 

23import json 

24import re 

25import subprocess 

26import time 

27from collections.abc import Callable 

28from dataclasses import dataclass 

29from pathlib import Path 

30from typing import Any 

31 

32from little_loops.fsm.interpolation import ( 

33 InterpolationContext, 

34 InterpolationError, 

35 interpolate, 

36) 

37from little_loops.fsm.schema import DEFAULT_LLM_MODEL, EvaluateConfig 

38 

39 

@dataclass
class EvaluationResult:
    """Outcome produced by a single evaluator run.

    Attributes:
        verdict: Routing key used to pick the next state transition.
        details: Evaluator-specific metadata kept for debugging/logging.
    """

    verdict: str
    details: dict[str, Any]

51 

52 

# JSON schema used for the LLM structured evaluation when the caller
# does not supply one.
DEFAULT_LLM_SCHEMA: dict[str, Any] = {
    "type": "object",
    "properties": {
        "verdict": {
            "type": "string",
            "enum": ["yes", "no", "blocked", "partial"],
            # One bullet per allowed verdict, newline-separated.
            "description": "\n".join(
                [
                    "- yes: The condition/check evaluated to true",
                    "- no: The condition/check evaluated to false",
                    "- blocked: Cannot proceed without external help",
                    "- partial: Made progress but not complete",
                ]
            ),
        },
        "confidence": {
            "type": "number",
            "minimum": 0,
            "maximum": 1,
            "description": "Confidence in this verdict (0-1)",
        },
        "reason": {
            "type": "string",
            "description": "Brief explanation",
        },
    },
    "required": ["verdict", "confidence", "reason"],
}

# Prompt used for the LLM structured evaluation when none is configured.
DEFAULT_LLM_PROMPT = "Evaluate whether this action succeeded based on its output."

# Comparison operators shared by the numeric evaluators, keyed by name.
# Each callable receives (value, target) and returns whether the condition holds.
_NUMERIC_OPERATORS: dict[str, Callable[[float, float], bool]] = {
    "eq": lambda value, target: value == target,
    "ne": lambda value, target: value != target,
    "lt": lambda value, target: value < target,
    "le": lambda value, target: value <= target,
    "gt": lambda value, target: value > target,
    "ge": lambda value, target: value >= target,
}

91 

92 

def evaluate_exit_code(exit_code: int) -> EvaluationResult:
    """Translate a process exit code into a routing verdict.

    Args:
        exit_code: The process exit code

    Returns:
        EvaluationResult with verdict:
        - 0 -> yes
        - 1 -> no
        - anything else -> error
    """
    # Only 0 and 1 carry a yes/no meaning; every other code is an error.
    verdict = {0: "yes", 1: "no"}.get(exit_code, "error")
    return EvaluationResult(verdict=verdict, details={"exit_code": exit_code})

113 

114 

def evaluate_output_numeric(
    output: str,
    operator: str,
    target: float,
) -> EvaluationResult:
    """Interpret stdout as a number and compare it against a target.

    Args:
        output: The action stdout to parse as a number
        operator: Comparison operator (eq, ne, lt, le, gt, ge)
        target: Target value to compare against

    Returns:
        EvaluationResult with verdict:
        - Condition met -> yes
        - Condition not met -> no
        - Parse error or unknown operator -> error
    """
    # Parsing happens before operator validation, so an unparseable output
    # is reported even when the operator is also invalid.
    try:
        parsed = float(output.strip())
    except ValueError:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Cannot parse as number: {output[:100]}"},
        )

    compare = _NUMERIC_OPERATORS.get(operator)
    if compare is None:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Unknown operator: {operator}"},
        )

    verdict = "yes" if compare(parsed, target) else "no"
    return EvaluationResult(
        verdict=verdict,
        details={"value": parsed, "target": target, "operator": operator},
    )

152 

153 

154def _extract_json_path(data: Any, path: str) -> Any: 

155 """Extract value from dict using jq-style path like '.summary.failed'. 

156 

157 Args: 

158 data: The parsed JSON data (dict or list) 

159 path: Dot-separated path, optionally starting with '.' 

160 

161 Returns: 

162 The value at the specified path 

163 

164 Raises: 

165 KeyError: If path not found in data 

166 """ 

167 if path.startswith("."): 

168 path = path[1:] 

169 parts = path.split(".") 

170 current = data 

171 for part in parts: 

172 if isinstance(current, dict) and part in current: 

173 current = current[part] 

174 elif isinstance(current, list) and part.isdigit(): 

175 idx = int(part) 

176 if 0 <= idx < len(current): 

177 current = current[idx] 

178 else: 

179 raise KeyError(path) 

180 else: 

181 raise KeyError(path) 

182 return current 

183 

184 

def _compare_values(
    value: int | float, operator: str, target: int | float, path: str
) -> EvaluationResult:
    """Apply a named numeric comparison to a value extracted from JSON.

    Args:
        value: The extracted value to compare
        operator: Comparison operator
        target: Target value
        path: JSON path, echoed back in the result details

    Returns:
        EvaluationResult with verdict yes/no, or error for an unknown operator
    """
    compare = _NUMERIC_OPERATORS.get(operator)
    if compare is None:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Unknown operator: {operator}"},
        )

    outcome = "yes" if compare(value, target) else "no"
    return EvaluationResult(
        verdict=outcome,
        details={"value": value, "path": path, "target": target, "operator": operator},
    )

210 

211 

def evaluate_output_json(
    output: str,
    path: str,
    operator: str,
    target: Any,
) -> EvaluationResult:
    """Decode stdout as JSON, pull out the value at a path, and compare it.

    Args:
        output: The action stdout containing JSON
        path: jq-style dot notation path (e.g., '.summary.failed')
        operator: Comparison operator (eq, ne, lt, le, gt, ge)
        target: Target value for comparison

    Returns:
        EvaluationResult with verdict:
        - Condition met -> yes
        - Condition not met -> no
        - Parse/path error or unsupported operator -> error
    """
    try:
        document = json.loads(output)
    except json.JSONDecodeError as exc:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Invalid JSON: {exc}"},
        )

    try:
        extracted = _extract_json_path(document, path)
    except KeyError:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Path not found: {path}"},
        )

    # When both sides are int/float, all six operators are available.
    numeric_types = (int, float)
    if isinstance(extracted, numeric_types) and isinstance(target, numeric_types):
        return _compare_values(extracted, operator, target, path)

    # Anything else can only be tested for (in)equality.
    if operator == "eq":
        condition_met = extracted == target
    elif operator == "ne":
        condition_met = extracted != target
    else:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Operator {operator} not supported for non-numeric values"},
        )

    return EvaluationResult(
        verdict="yes" if condition_met else "no",
        details={"value": extracted, "path": path, "target": target, "operator": operator},
    )

267 

268 

def evaluate_output_contains(
    output: str,
    pattern: str,
    negate: bool = False,
) -> EvaluationResult:
    """Report whether a pattern occurs in the output.

    The pattern is first tried as a regular expression; if it does not
    compile, it is treated as a plain substring instead.

    Args:
        output: The action stdout to search
        pattern: Regex pattern or substring
        negate: If True, invert the match result

    Returns:
        EvaluationResult with verdict:
        - Found (negate=False) -> yes
        - Found (negate=True) -> no
        - Not found (negate=False) -> no
        - Not found (negate=True) -> yes
    """
    try:
        found = re.search(pattern, output) is not None
    except re.error:
        # Not a valid regex: fall back to a literal substring test.
        found = pattern in output

    # Pick the verdict pair once, then select by match outcome.
    on_match, on_miss = ("no", "yes") if negate else ("yes", "no")

    return EvaluationResult(
        verdict=on_match if found else on_miss,
        details={"matched": found, "pattern": pattern, "negate": negate},
    )

306 

307 

def evaluate_convergence(
    current: float,
    previous: float | None,
    target: float,
    tolerance: float = 0,
    direction: str = "minimize",
) -> EvaluationResult:
    """Classify the current metric value relative to its target and predecessor.

    Args:
        current: Current metric value
        previous: Previous metric value (None if first iteration)
        target: Target value to reach
        tolerance: Acceptable distance from target
        direction: 'minimize' or 'maximize'; any value other than
            'minimize' is handled as 'maximize'

    Returns:
        EvaluationResult with verdict:
        - Value within tolerance of target -> target
        - No previous value, or value moved in the wanted direction -> progress
        - Value unchanged or moved the wrong way -> stall
    """
    # Reaching the target takes precedence over everything else.
    if abs(current - target) <= tolerance:
        return EvaluationResult(
            verdict="target",
            details={"current": current, "target": target, "delta": 0},
        )

    # Nothing to compare against on the first iteration: count it as progress.
    if previous is None:
        return EvaluationResult(
            verdict="progress",
            details={
                "current": current,
                "previous": None,
                "target": target,
                "delta": None,
            },
        )

    delta = current - previous
    # Minimizing wants the value to drop; otherwise it has to rise.
    improved = delta < 0 if direction == "minimize" else delta > 0

    return EvaluationResult(
        verdict="progress" if improved else "stall",
        details={
            "current": current,
            "previous": previous,
            "target": target,
            "delta": delta,
            "direction": direction,
        },
    )

371 

372 

def evaluate_diff_stall(
    scope: list[str] | None = None,
    max_stall: int = 1,
) -> EvaluationResult:
    """Detect stalled iterations by comparing ``git diff --stat`` between runs.

    The first call (no stored snapshot) stores the current diff and returns
    'yes'. Later calls compare the current diff with the stored snapshot:
    a different diff replaces the snapshot, resets the stall counter and
    returns 'yes'; an identical diff increments the counter and returns 'no'
    once the counter reaches max_stall ('yes' until then).

    The snapshot and counter live in two files under /tmp whose names are
    derived from a hash of the sorted scope entries (or a fixed key when no
    scope is given).

    Args:
        scope: Optional list of paths to limit the git diff to. Defaults to
            the entire working tree.
        max_stall: Number of consecutive no-change iterations before stall
            verdict. Defaults to 1.

    Returns:
        EvaluationResult with verdict:
        - yes: diff changed since last iteration, or stall threshold not yet hit
        - no: diff unchanged for max_stall iterations (stalled)
        - error: git command failed, timed out, or git is not installed
    """
    git_cmd = ["git", "diff", "--stat"]
    if scope:
        git_cmd.extend(["--"] + scope)

    try:
        proc = subprocess.run(git_cmd, capture_output=True, text=True, timeout=30)
    except subprocess.TimeoutExpired:
        return EvaluationResult(verdict="error", details={"error": "git diff timed out"})
    except FileNotFoundError:
        return EvaluationResult(verdict="error", details={"error": "git not found in PATH"})

    if proc.returncode != 0:
        return EvaluationResult(
            verdict="error",
            details={"error": f"git diff failed: {proc.stderr[:200]}"},
        )

    current_diff = proc.stdout

    # The cache key depends only on the scope, so calls with different
    # scopes use different state files.
    scope_str = "|".join(sorted(scope)) if scope else "_root_"
    cache_key = hashlib.md5(scope_str.encode()).hexdigest()[:12]
    state_file = Path(f"/tmp/ll-diff-stall-{cache_key}.txt")
    count_file = Path(f"/tmp/ll-diff-stall-{cache_key}.count")

    # Load whatever state exists. The snapshot is read first, so a missing
    # or corrupt counter still leaves the snapshot usable with a count of 0.
    previous_diff: str | None = None
    stall_count = 0
    try:
        previous_diff = state_file.read_text()
        stall_count = int(count_file.read_text().strip())
    except (FileNotFoundError, ValueError):
        pass

    unchanged = previous_diff is not None and current_diff == previous_diff

    if not unchanged:
        # First iteration or real progress: store the snapshot, reset the counter.
        state_file.write_text(current_diff)
        count_file.write_text("0")
        return EvaluationResult(
            verdict="yes",
            details={"stall_count": 0, "max_stall": max_stall, "diff_changed": True},
        )

    stall_count += 1
    count_file.write_text(str(stall_count))
    # Below the threshold the loop is still allowed to continue ("yes").
    verdict = "no" if stall_count >= max_stall else "yes"
    return EvaluationResult(
        verdict=verdict,
        details={"stall_count": stall_count, "max_stall": max_stall, "diff_changed": False},
    )

464 

465 

def evaluate_mcp_result(output: str, exit_code: int) -> EvaluationResult:
    """Evaluate an MCP tool call result from the mcp-call subprocess.

    Maps exit codes and MCP response envelope fields to routing verdicts.

    Exit code conventions (set by mcp-call):
        0   → parse isError from JSON envelope
        1   → tool_error (tool ran but isError: true)
        124 → timeout (transport-level timeout)
        127 → not_found (server or tool missing from .mcp.json)

    For any exit code other than 124/127 the envelope's ``isError`` field
    decides the verdict; when the field is absent, a non-zero exit code is
    treated as an error. Empty stdout is treated as an empty envelope.

    Args:
        output: stdout from mcp-call (MCP response envelope JSON)
        exit_code: Exit code from mcp-call subprocess

    Returns:
        EvaluationResult with verdict:
        - success    → isError: false
        - tool_error → isError: true, or stdout is not a JSON object
        - not_found  → server/tool not in .mcp.json (exit 127)
        - timeout    → transport-level timeout (exit 124)
    """
    if exit_code == 127:
        return EvaluationResult(
            verdict="not_found",
            details={"exit_code": exit_code, "error": "Server or tool not found in .mcp.json"},
        )

    if exit_code == 124:
        return EvaluationResult(
            verdict="timeout",
            details={"exit_code": exit_code, "error": "MCP tool call timed out"},
        )

    # Parse MCP envelope JSON from stdout
    stripped = output.strip()
    try:
        envelope = json.loads(stripped) if stripped else {}
    except json.JSONDecodeError:
        return EvaluationResult(
            verdict="tool_error",
            details={
                "exit_code": exit_code,
                "error": f"Invalid JSON from mcp-call: {output[:200]}",
            },
        )

    # Valid JSON that is not an object (list, string, number, null) has no
    # isError field to inspect; report it instead of failing on .get().
    if not isinstance(envelope, dict):
        return EvaluationResult(
            verdict="tool_error",
            details={
                "exit_code": exit_code,
                "error": f"Unexpected non-object JSON from mcp-call: {output[:200]}",
            },
        )

    is_error = envelope.get("isError", exit_code != 0)

    return EvaluationResult(
        verdict="tool_error" if is_error else "success",
        details={"exit_code": exit_code, "envelope": envelope},
    )

524 

525 

def evaluate_llm_structured(
    output: str,
    prompt: str | None = None,
    schema: dict[str, Any] | None = None,
    min_confidence: float = 0.5,
    uncertain_suffix: bool = False,
    model: str = DEFAULT_LLM_MODEL,
    max_tokens: int = 256,
    timeout: int = 1800,
) -> EvaluationResult:
    """Evaluate action output using LLM with structured output via Claude CLI.

    This is the ONLY place in the FSM system that uses LLM structured output.
    Requires the ``claude`` CLI to be installed and authenticated.

    Only the last 4000 characters of ``output`` are sent to the model.

    Args:
        output: Action stdout to evaluate
        prompt: Custom evaluation prompt (defaults to basic success check)
        schema: Custom JSON schema for structured response
        min_confidence: Minimum confidence threshold (0-1)
        uncertain_suffix: If True, append _uncertain to low-confidence verdicts
        model: Model identifier (CLI aliases like "sonnet" or full names)
        max_tokens: Currently unused; it is not passed to the CLI and is
            kept only for signature compatibility
        timeout: Timeout in seconds

    Returns:
        EvaluationResult with verdict from LLM and confidence/reason in details.
        The verdict is "error" when the CLI is missing, times out, exits
        non-zero, prints nothing, or returns a response that cannot be
        interpreted as a JSON object with a numeric confidence.
    """
    effective_schema = schema or DEFAULT_LLM_SCHEMA
    effective_prompt = prompt or DEFAULT_LLM_PROMPT

    # Truncate output to avoid context limits (keep last 4000 chars)
    truncated = output[-4000:] if len(output) > 4000 else output

    user_prompt = f"{effective_prompt}\n\n<action_output>\n{truncated}\n</action_output>"

    # NOTE(review): --dangerously-skip-permissions is passed while the prompt
    # embeds arbitrary action output — confirm this is acceptable.
    cmd = [
        "claude",
        "-p",
        user_prompt,
        "--output-format",
        "json",
        "--json-schema",
        json.dumps(effective_schema),
        "--model",
        model,
        "--dangerously-skip-permissions",
        "--no-session-persistence",
    ]

    t0 = time.monotonic()
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return EvaluationResult(
            verdict="error",
            details={"error": "LLM evaluation timeout", "timeout": True},
        )
    except FileNotFoundError:
        return EvaluationResult(
            verdict="error",
            details={
                "error": "claude CLI not found. Install from https://docs.anthropic.com/en/docs/claude-code",
                "missing_dependency": True,
            },
        )
    llm_latency_ms = int((time.monotonic() - t0) * 1000)

    if proc.returncode != 0:
        return EvaluationResult(
            verdict="error",
            details={"error": f"Claude CLI error: {proc.stderr.strip()}", "api_error": True},
        )

    # Guard: empty stdout with exit 0 (API error not reflected in exit code)
    if not proc.stdout.strip():
        stderr_info = proc.stderr.strip()[:200] if proc.stderr else ""
        error_msg = "Claude CLI returned empty output"
        if stderr_info:
            error_msg += f" (stderr: {stderr_info})"
        return EvaluationResult(
            verdict="error",
            details={"error": error_msg, "empty_output": True},
        )

    # Parse the CLI JSON envelope and extract structured result.
    # With --json-schema the envelope is:
    #   success: {"type":"result","subtype":"success","structured_output":{...},...}
    #   failure: {"type":"result","subtype":"error_max_structured_output_retries",...}
    # If stdout is JSONL (multiple JSON objects), use the last non-empty line.
    try:
        stdout = proc.stdout.strip()
        try:
            envelope = json.loads(stdout)
        except json.JSONDecodeError:
            # Try JSONL: take the last non-empty line
            lines = [line for line in stdout.split("\n") if line.strip()]
            if not lines:
                raise
            envelope = json.loads(lines[-1])

        # Valid JSON that is not an object cannot be an envelope; route it
        # through the parse-failure handler below instead of failing on .get().
        if not isinstance(envelope, dict):
            raise TypeError(f"expected a JSON object envelope, got {type(envelope).__name__}")

        # Check structured-output retry exhaustion (--json-schema failure mode)
        if envelope.get("subtype") == "error_max_structured_output_retries":
            return EvaluationResult(
                verdict="error",
                details={
                    "error": "Claude CLI could not produce valid structured output after retries",
                    "api_error": True,
                },
            )

        # Check legacy is_error flag (some CLI versions exit 0 but report error in envelope)
        if envelope.get("is_error", False):
            err_text = str(envelope.get("result", "") or "")[:200]
            return EvaluationResult(
                verdict="error",
                details={"error": f"Claude CLI reported error: {err_text}", "api_error": True},
            )

        # --json-schema mode returns validated dict in "structured_output"
        if isinstance(envelope.get("structured_output"), dict):
            llm_result: dict[str, Any] = envelope["structured_output"]
        else:
            raw_result = envelope.get("result", "")
            if isinstance(raw_result, dict):
                llm_result = raw_result
            elif raw_result:
                llm_result = json.loads(raw_result)
                # A result string may decode to a list/string/number, which
                # has no verdict fields to read.
                if not isinstance(llm_result, dict):
                    raise TypeError(
                        f"expected a JSON object result, got {type(llm_result).__name__}"
                    )
            elif "verdict" in envelope:
                llm_result = envelope
            else:
                raw_preview = proc.stdout[:300]
                return EvaluationResult(
                    verdict="error",
                    details={
                        "error": "Empty result field in Claude CLI response",
                        "raw_preview": raw_preview,
                    },
                )
    except (json.JSONDecodeError, TypeError, ValueError) as e:
        raw_preview = proc.stdout[:300] if proc.stdout else "(empty)"
        return EvaluationResult(
            verdict="error",
            details={"error": f"Failed to parse LLM response: {e}", "raw_preview": raw_preview},
        )

    # Build result with confidence handling
    verdict = str(llm_result.get("verdict", "error"))
    raw_confidence = llm_result.get("confidence", 1.0)
    try:
        confidence = float(raw_confidence)
    except (TypeError, ValueError):
        # A custom schema (or a null field) can yield a non-numeric confidence;
        # surface that as an evaluation error rather than an uncaught exception.
        return EvaluationResult(
            verdict="error",
            details={
                "error": f"Invalid confidence in LLM response: {raw_confidence!r}",
                "raw": llm_result,
            },
        )
    confident = confidence >= min_confidence

    # Optionally modify verdict for low confidence
    if uncertain_suffix and not confident:
        verdict = f"{verdict}_uncertain"

    return EvaluationResult(
        verdict=verdict,
        details={
            "confidence": confidence,
            "confident": confident,
            "reason": llm_result.get("reason", ""),
            "raw": llm_result,
            "llm_model": model,
            "llm_latency_ms": llm_latency_ms,
            "llm_prompt": user_prompt[:500],
            "llm_raw_output": proc.stdout[:500] if proc.stdout else "",
        },
    )

695 

696 

def evaluate(
    config: EvaluateConfig,
    output: str,
    exit_code: int,
    context: InterpolationContext,
) -> EvaluationResult:
    """Route an evaluation request to the evaluator named by ``config.type``.

    String-valued ``target``, ``tolerance``, ``previous`` and ``prompt``
    settings are passed through ``interpolate`` with the given context
    before use.

    Args:
        config: Evaluator configuration with type and parameters
        output: Action stdout
        exit_code: Action exit code
        context: Runtime context for variable interpolation

    Returns:
        EvaluationResult from the appropriate evaluator

    Raises:
        ValueError: If evaluator type is unknown, if a required 'target' is
            missing, or if an output_numeric target is not numeric
    """
    kind = config.type

    if kind == "exit_code":
        return evaluate_exit_code(exit_code)

    if kind == "output_numeric":
        if config.target is None:
            raise ValueError("output_numeric evaluator requires 'target' to be set")
        if isinstance(config.target, str):
            # String targets may contain variables; without a context the raw
            # string is used as-is.
            try:
                resolved = interpolate(config.target, context) if context else config.target
                numeric_target = float(resolved)
            except (InterpolationError, ValueError) as e:
                raise ValueError(
                    f"output_numeric target must be numeric, got: {config.target!r}"
                ) from e
        else:
            numeric_target = float(config.target)
        return evaluate_output_numeric(
            output=output,
            operator=config.operator or "eq",
            target=numeric_target,
        )

    if kind == "output_json":
        return evaluate_output_json(
            output=output,
            path=config.path or "",
            operator=config.operator or "eq",
            target=config.target,
        )

    if kind == "output_contains":
        return evaluate_output_contains(
            output=output,
            pattern=config.pattern or "",
            negate=config.negate,
        )

    if kind == "convergence":
        # Previous value: an unresolved or non-numeric reference is treated
        # as "no previous value" (e.g. on the first iteration).
        prior: float | None = None
        if config.previous:
            try:
                prior = float(interpolate(config.previous, context))
            except (InterpolationError, ValueError):
                pass

        # Current value comes straight from the action output.
        try:
            observed = float(output.strip())
        except ValueError:
            return EvaluationResult(
                verdict="error",
                details={"error": f"Cannot parse output as number: {output[:100]}"},
            )

        # Target: may be an interpolated string like "${context.target}".
        goal: float
        if isinstance(config.target, str):
            try:
                goal = float(interpolate(config.target, context))
            except (InterpolationError, ValueError) as e:
                return EvaluationResult(
                    verdict="error",
                    details={"error": f"Cannot resolve target: {e}"},
                )
        elif config.target is None:
            raise ValueError("convergence evaluator requires 'target' to be set")
        else:
            goal = float(config.target)

        # Tolerance: an unresolvable string silently falls back to 0.0.
        slack: float = 0.0
        if config.tolerance is not None:
            if isinstance(config.tolerance, str):
                try:
                    slack = float(interpolate(config.tolerance, context))
                except (InterpolationError, ValueError):
                    slack = 0.0
            else:
                slack = float(config.tolerance)

        return evaluate_convergence(
            current=observed,
            previous=prior,
            target=goal,
            tolerance=slack,
            direction=config.direction,
        )

    if kind == "diff_stall":
        return evaluate_diff_stall(
            scope=config.scope,
            max_stall=config.max_stall,
        )

    if kind == "llm_structured":
        llm_prompt = config.prompt
        if llm_prompt and context:
            try:
                llm_prompt = interpolate(llm_prompt, context)
            except InterpolationError:
                pass  # Use raw prompt on resolution failure
        return evaluate_llm_structured(
            output=output,
            prompt=llm_prompt,
            schema=config.schema,
            min_confidence=config.min_confidence,
            uncertain_suffix=config.uncertain_suffix,
        )

    if kind == "mcp_result":
        return evaluate_mcp_result(output=output, exit_code=exit_code)

    raise ValueError(f"Unknown evaluator type: {kind}")