# Source code for dash_evals.runner.scorers.flutter_test

"""Flutter test runner scorer.

Reusable scorer that runs ``flutter test`` and scores based on pass/fail.
"""

import os
import re

from inspect_ai.scorer import Score, Scorer, Target, accuracy, scorer
from inspect_ai.solver import TaskState
from inspect_ai.util import sandbox


@scorer(metrics=[accuracy()])
def flutter_test_scorer(test_path: str = "test/", timeout: int = 180) -> Scorer:
    """Score based on Flutter test results.

    Runs ``flutter test`` on the specified path and scores:

    - 1.0 if all tests pass (exit code 0)
    - the pass rate (passed / total) when some tests fail
    - 0.0 when setup failed or no tests were found/executed

    Args:
        test_path: Path to test directory or file. Default "test/".
        timeout: Seconds allowed for the ``flutter test`` run. Default 180.

    Returns:
        A Scorer that evaluates code by running Flutter tests.
    """

    async def score(state: TaskState, target: Target) -> Score:
        sb = sandbox()

        # A prior setup step is expected to have recorded the checkout
        # location in metadata; without it there is nothing to test.
        workspace = state.metadata.get("workspace")
        if not workspace:
            return Score(
                value=0.0,
                explanation="No workspace found - setup may have failed",
            )

        # Scope to project_dir if set in metadata (for multi-project repos).
        cwd = workspace
        metadata_project_dir = state.metadata.get("project_dir")
        if metadata_project_dir:
            cwd = os.path.join(workspace, metadata_project_dir)

        # --no-pub: dependency resolution is assumed to have happened
        # during setup, so the scorer does not touch the network.
        result = await sb.exec(
            ["flutter", "test", test_path, "--no-pub"],
            cwd=cwd,
            timeout=timeout,
        )

        stdout = result.stdout or ""
        stderr = result.stderr or ""
        output = stdout + stderr

        # Parse pass/fail counts from the compact reporter's status line.
        test_info = _parse_test_output(output)
        total_tests = test_info["passed"] + test_info["failed"]

        if total_tests == 0:
            # Nothing ran: either no tests exist or compilation failed.
            # The raw output is kept in metadata for diagnosis.
            return Score(
                value=0.0,
                explanation="No tests found or executed",
                metadata={"test_output": output, "passed": 0, "failed": 0},
            )

        pass_rate = test_info["passed"] / total_tests

        if result.returncode == 0:
            return Score(
                value=1.0,
                explanation=f"All tests passed ({test_info['passed']} tests)",
                metadata={
                    "test_output": output,
                    "passed": test_info["passed"],
                    "failed": 0,
                    "pass_rate": 1.0,
                },
            )

        # Partial credit: score is the fraction of tests that passed.
        return Score(
            value=pass_rate,  # Return actual percentage
            explanation=(
                f"{test_info['passed']}/{total_tests} tests passed "
                f"({pass_rate:.0%}):\n{output[:1500]}"
            ),
            metadata={
                "test_output": output,
                "passed": test_info["passed"],
                "failed": test_info["failed"],
                "pass_rate": pass_rate,
            },
        )

    return score
def _parse_test_output(output: str) -> dict: """Parse flutter test output to extract pass/fail counts.""" import re # Normalize carriage returns to make regex work on all line endings output = output.replace("\r\n", "\n").replace("\r", "\n") # Look for patterns like "+3 -1" or "+5" # Format: "00:04 +3 -1: Some tests failed" (find the LAST occurrence) matches = re.findall(r"\+(\d+)(?:\s+-(\d+))?:", output) if matches: # Take the last match - this gives the final test counts last_match = matches[-1] passed = int(last_match[0]) failed = int(last_match[1]) if last_match[1] else 0 return {"passed": passed, "failed": failed} return {"passed": 0, "failed": 0}