Source code for dash_evals.runner.scorers.flutter_test
"""Flutter test runner scorer.
Reusable scorer that runs ``flutter test`` and scores based on pass/fail.
"""
from inspect_ai.scorer import Score, Scorer, Target, accuracy, scorer
from inspect_ai.solver import TaskState
from inspect_ai.util import sandbox
[docs]
@scorer(metrics=[accuracy()])
def flutter_test_scorer(test_path: str = "test/") -> Scorer:
    """
    Score based on Flutter test results.

    Runs ``flutter test`` on the specified path inside the sandbox and scores:

    - 1.0 if the command exits with code 0 and at least one test was counted
    - the fraction of passing tests if the command exits with a non-zero code
    - 0.0 if no workspace is recorded, no tests were counted, or the run
      timed out

    Args:
        test_path: Path to test directory or file. Default "test/".

    Returns:
        A Scorer that evaluates code by running Flutter tests.
    """

    async def score(state: TaskState, target: Target) -> Score:
        sb = sandbox()
        workspace = state.metadata.get("workspace")
        if not workspace:
            return Score(
                value=0.0,
                explanation="No workspace found - setup may have failed",
            )

        # Run flutter test.
        # Scope to project_dir if set in metadata (for multi-project repos).
        cwd = workspace
        metadata_project_dir = state.metadata.get("project_dir")
        if metadata_project_dir:
            import os

            cwd = os.path.join(workspace, metadata_project_dir)

        timeout_seconds = 180
        try:
            result = await sb.exec(
                ["flutter", "test", test_path, "--no-pub"],
                cwd=cwd,
                timeout=timeout_seconds,
            )
        except TimeoutError:
            # The sandbox exec API is documented to raise TimeoutError when
            # the timeout expires. A test suite that never finishes is a
            # failing result for the sample, not a scorer crash.
            return Score(
                value=0.0,
                explanation=f"flutter test timed out after {timeout_seconds} seconds",
                metadata={"test_output": "", "passed": 0, "failed": 0},
            )

        # Progress/summary lines may land on either stream, so parse both.
        stdout = result.stdout or ""
        stderr = result.stderr or ""
        output = stdout + stderr

        # Parse test results.
        test_info = _parse_test_output(output)
        total_tests = test_info["passed"] + test_info["failed"]
        if total_tests == 0:
            return Score(
                value=0.0,
                explanation="No tests found or executed",
                metadata={"test_output": output, "passed": 0, "failed": 0},
            )

        pass_rate = test_info["passed"] / total_tests
        if result.returncode == 0:
            return Score(
                value=1.0,
                explanation=f"All tests passed ({test_info['passed']} tests)",
                metadata={
                    "test_output": output,
                    "passed": test_info["passed"],
                    "failed": 0,
                    "pass_rate": 1.0,
                },
            )

        # Non-zero exit: award partial credit equal to the parsed pass rate.
        # Only the first 1500 characters of output go into the explanation;
        # the full text is kept in metadata.
        return Score(
            value=pass_rate,
            explanation=f"{test_info['passed']}/{total_tests} tests passed ({pass_rate:.0%}):\n{output[:1500]}",
            metadata={
                "test_output": output,
                "passed": test_info["passed"],
                "failed": test_info["failed"],
                "pass_rate": pass_rate,
            },
        )

    return score
def _parse_test_output(output: str) -> dict:
"""Parse flutter test output to extract pass/fail counts."""
import re
# Normalize carriage returns to make regex work on all line endings
output = output.replace("\r\n", "\n").replace("\r", "\n")
# Look for patterns like "+3 -1" or "+5"
# Format: "00:04 +3 -1: Some tests failed" (find the LAST occurrence)
matches = re.findall(r"\+(\d+)(?:\s+-(\d+))?:", output)
if matches:
# Take the last match - this gives the final test counts
last_match = matches[-1]
passed = int(last_match[0])
failed = int(last_match[1]) if last_match[1] else 0
return {"passed": passed, "failed": failed}
return {"passed": 0, "failed": 0}