# Source code for dash_evals.runner.scorers.flutter_code

"""Scorer for Flutter code quality evaluation."""

from inspect_ai.scorer import Score, Scorer, Target, accuracy, scorer
from inspect_ai.solver import TaskState
from inspect_ai.util import sandbox

from .flutter_output_parser import parse_analyzer_output, parse_test_output
from .flutter_scoring import (
    calculate_analyzer_score,
    calculate_final_score,
    calculate_test_score,
    validate_code_structure,
)


@scorer(metrics=[accuracy()])
def flutter_code_scorer() -> Scorer:
    """
    Custom scorer that evaluates Flutter code based on:

    1. Code analysis (flutter analyze)
    2. Test results (flutter test)
    3. Code structure validation

    The final score is a weighted combination of these factors:
    - Analyzer: 30%
    - Tests: 50%
    - Structure: 20%

    A score >= 0.7 is considered passing for the accuracy metric.

    Returns:
        A Scorer that evaluates Flutter code quality.
    """

    async def score(state: TaskState, target: Target) -> Score:
        # Check for setup errors first: if the environment never came up,
        # there is nothing meaningful to grade.
        if setup_error := state.metadata.get("setup_error"):
            return Score(
                value=0.0,
                answer="",
                explanation=f"✗ Setup failed: {setup_error}",
                metadata={"setup_error": setup_error},
            )

        sb = sandbox()
        workspace = state.metadata.get("workspace")
        if not workspace:
            return Score(value=0.0, explanation="No workspace found - setup may have failed")

        explanation_parts = []

        # 1. Run flutter analyze
        analyze_result = await sb.exec(["flutter", "analyze", "--no-pub"], cwd=workspace)
        if analyze_result.success:
            output = analyze_result.stdout + analyze_result.stderr
            analyzer_result = parse_analyzer_output(output)
            analyzer_score, analyzer_explanation = calculate_analyzer_score(analyzer_result)
            explanation_parts.append(analyzer_explanation)
        else:
            # Non-zero exit from the analyzer is treated as a hard failure.
            analyzer_score = 0.0
            explanation_parts.append("✗ Code analysis failed (syntax errors)")

        # 2. Run flutter test (parsed even on failure so partial passes count)
        test_result = await sb.exec(["flutter", "test", "--no-pub"], cwd=workspace)
        output = test_result.stdout + test_result.stderr
        test_result_parsed = parse_test_output(output, test_result.success)
        test_score, test_explanation = calculate_test_score(test_result_parsed)
        explanation_parts.append(test_explanation)

        # 3. Validate code structure against the required widget list
        code = state.metadata.get("generated_code", "")
        required_widgets = state.metadata.get("required_widgets", [])
        structure_score, structure_explanation = validate_code_structure(code, required_widgets)
        explanation_parts.append(structure_explanation)

        # Calculate final score (weighted combination; see docstring)
        final_score = calculate_final_score(analyzer_score, test_score, structure_score)

        # Truncate the answer for display, appending an ellipsis only when
        # the completion was actually longer than the 200-char preview.
        completion = state.output.completion
        answer = completion[:200] + ("..." if len(completion) > 200 else "")

        return Score(
            value=final_score,  # Return actual weighted score (0.0-1.0)
            answer=answer,
            explanation="\n".join(explanation_parts),
            metadata={
                "analyzer_score": analyzer_score,
                "test_score": test_score,
                "structure_score": structure_score,
                "final_score": final_score,
                # NOTE(review): ExecResult truthiness appears tied to exit
                # status, so these store "" when the command failed — confirm
                # that dropping failed-run output is intended.
                "analyzer_output": analyze_result.stdout if analyze_result else "",
                "test_output": test_result.stdout if test_result else "",
            },
        )

    return score