Source code for dash_evals.runner.scorers.flutter_code
"""Scorer for Flutter code quality evaluation."""
from inspect_ai.scorer import Score, Scorer, Target, accuracy, scorer
from inspect_ai.solver import TaskState
from inspect_ai.util import sandbox
from .flutter_output_parser import parse_analyzer_output, parse_test_output
from .flutter_scoring import (
calculate_analyzer_score,
calculate_final_score,
calculate_test_score,
validate_code_structure,
)
[docs]
@scorer(metrics=[accuracy()])
def flutter_code_scorer() -> Scorer:
    """
    Custom scorer that evaluates Flutter code based on:

    1. Code analysis (``flutter analyze``)
    2. Test results (``flutter test``)
    3. Code structure validation

    The three component scores are combined by ``calculate_final_score``.
    The final score is a weighted combination of these factors:

    - Analyzer: 30%
    - Tests: 50%
    - Structure: 20%

    A score >= 0.7 is considered passing for the accuracy metric.

    Returns:
        A Scorer that evaluates Flutter code quality.
    """
    # Longest prefix of the model completion stored as the Score's answer.
    max_answer_chars = 200

    async def score(state: TaskState, target: Target) -> Score:
        """Score one sample by running the Flutter tooling in its sandbox.

        Reads ``setup_error``, ``workspace``, ``generated_code`` and
        ``required_widgets`` from ``state.metadata``. ``target`` is unused.

        Returns:
            A Score whose value is the combined score, or 0.0 when setup
            failed or no workspace was recorded.
        """
        # Check for setup errors first: nothing can be evaluated if the
        # workspace was never prepared.
        if setup_error := state.metadata.get("setup_error"):
            return Score(
                value=0.0,
                answer="",
                explanation=f"✗ Setup failed: {setup_error}",
                metadata={"setup_error": setup_error},
            )

        sb = sandbox()
        workspace = state.metadata.get("workspace")
        if not workspace:
            return Score(value=0.0, explanation="No workspace found - setup may have failed")

        explanation_parts = []

        # 1. Run flutter analyze
        # NOTE(review): analyzer output is only parsed when the command exits
        # successfully. If `flutter analyze` exits non-zero for any reported
        # issue (not just syntax errors), every such run scores 0.0 here -
        # confirm this is the intended behaviour.
        analyze_result = await sb.exec(["flutter", "analyze", "--no-pub"], cwd=workspace)
        if analyze_result.success:
            analyze_output = analyze_result.stdout + analyze_result.stderr
            analyzer_result = parse_analyzer_output(analyze_output)
            analyzer_score, analyzer_explanation = calculate_analyzer_score(analyzer_result)
            explanation_parts.append(analyzer_explanation)
        else:
            analyzer_score = 0.0
            explanation_parts.append("✗ Code analysis failed (syntax errors)")

        # 2. Run flutter test
        # Unlike the analyzer step, the output is parsed regardless of exit
        # status; the success flag is handed to the parser instead.
        test_result = await sb.exec(["flutter", "test", "--no-pub"], cwd=workspace)
        test_output = test_result.stdout + test_result.stderr
        test_result_parsed = parse_test_output(test_output, test_result.success)
        test_score, test_explanation = calculate_test_score(test_result_parsed)
        explanation_parts.append(test_explanation)

        # 3. Validate code structure
        code = state.metadata.get("generated_code", "")
        required_widgets = state.metadata.get("required_widgets", [])
        structure_score, structure_explanation = validate_code_structure(code, required_widgets)
        explanation_parts.append(structure_explanation)

        # Calculate final score
        final_score = calculate_final_score(analyzer_score, test_score, structure_score)

        # Only mark the answer with an ellipsis when text was actually cut,
        # so short completions are not reported as truncated.
        completion = state.output.completion
        if len(completion) > max_answer_chars:
            answer = completion[:max_answer_chars] + "..."
        else:
            answer = completion

        return Score(
            value=final_score,  # Return actual weighted score (0.0-1.0)
            answer=answer,
            explanation="\n".join(explanation_parts),
            metadata={
                "analyzer_score": analyzer_score,
                "test_score": test_score,
                "structure_score": structure_score,
                "final_score": final_score,
                # Both results are always set by this point, so no fallback
                # value is needed.
                "analyzer_output": analyze_result.stdout,
                "test_output": test_result.stdout,
            },
        )

    return score