Source code for dash_evals.runner.scorers.code_quality
"""LLM-graded code quality scorer.
Reusable scorer that uses an LLM to evaluate subjective code quality aspects.
"""
import json
import re
from inspect_ai.model import get_model
from inspect_ai.scorer import Score, Scorer, Target, mean, scorer, stderr
from inspect_ai.solver import TaskState
# Default grading prompt, used by code_quality_scorer when no custom rubric is
# supplied. It asks the grader for three ratings on a 0-3 scale plus a short
# rationale, returned as a bare JSON object. The scorer reads the
# "minimality", "elegance", "robustness" and "reasoning" keys of that reply,
# so the JSON shape requested on the last line is part of the contract.
DEFAULT_RUBRIC = """
Evaluate this code fix on subjective quality (0-3 scale):
1. **Minimality**: Is the fix focused? Does it avoid unnecessary changes?
- 0: Bloated, touches unrelated code or adds unnecessary complexity
- 1: Some unnecessary changes but mostly focused
- 2: Focused with minor extras
- 3: Surgical, changes only what's needed
2. **Elegance**: Would a senior developer approve of this approach?
- 0: Hacky, works but ugly or non-idiomatic
- 1: Works but has style issues
- 2: Good but not exemplary
- 3: Clean, idiomatic, follows language conventions
3. **Robustness**: Does it handle edge cases appropriately?
- 0: Fragile, likely breaks on edge cases
- 1: Handles basic cases only
- 2: Handles most edge cases
- 3: Defensive, handles nulls/empty states/errors gracefully
Respond with ONLY a JSON object (no markdown):
{"minimality": N, "elegance": N, "robustness": N, "reasoning": "Brief explanation"}
"""
[docs]
@scorer(metrics=[mean(), stderr()])
def code_quality_scorer(rubric: str | None = None, model: str | None = None) -> Scorer:
    """Score code quality using LLM judgment.

    Uses a rubric to evaluate subjective aspects of code quality that
    static analysis can't capture: minimality, elegance, robustness.

    Args:
        rubric: Custom rubric prompt. If None (or empty), DEFAULT_RUBRIC is
            used. A custom rubric must still make the grader reply with a
            JSON object carrying "minimality", "elegance" and "robustness"
            ratings on a 0-3 scale (plus an optional "reasoning" string),
            because those are the keys this scorer reads.
        model: Model to use for grading. If None, ``get_model()`` is called
            without arguments.

    Returns:
        A Scorer that evaluates code quality on a 0-1 scale. The value is
        0.0 when the grader call fails or its reply cannot be parsed; the
        explanation and metadata then describe the failure.
    """
    grading_rubric = rubric or DEFAULT_RUBRIC
    criteria = ("minimality", "elegance", "robustness")
    max_points = 3  # Upper bound of each rubric criterion.

    async def score(state: TaskState, target: Target) -> Score:
        code = state.output.completion

        # The submission is always presented to the grader in a ```dart fence.
        prompt = f"{grading_rubric}\n\nCode to evaluate:\n```dart\n{code}\n```"

        grader = get_model(model) if model else get_model()

        # Grading is best-effort: any failure talking to the grader becomes a
        # zero score with the error recorded, rather than aborting the eval.
        try:
            result = await grader.generate(prompt)
            response_text = result.completion

            scores = _parse_json_response(response_text)
            # Valid JSON that is not an object (list, number, ...) is as
            # unusable as unparseable text, so report it the same way.
            if not isinstance(scores, dict):
                return Score(
                    value=0.0,
                    explanation=f"Failed to parse grader response: {response_text[:500]}",
                    metadata={"raw_response": response_text},
                )

            # Missing, null, non-numeric or out-of-range ratings must not be
            # able to push the result outside the documented 0-1 range.
            ratings = {
                name: _rubric_points(scores.get(name), max_points) for name in criteria
            }
            normalized_score = sum(ratings.values()) / float(max_points * len(criteria))

            reasoning = scores.get("reasoning")
            return Score(
                value=normalized_score,
                explanation=str(reasoning) if reasoning else "No reasoning provided",
                metadata={**ratings, "raw_response": response_text},
            )
        except Exception as e:
            return Score(
                value=0.0,
                explanation=f"Grading failed: {e!s}",
                metadata={"error": str(e)},
            )

    return score


def _rubric_points(raw: object, max_points: int = 3) -> int | float:
    """Coerce one grader-supplied rating to a number within [0, max_points].

    Numeric strings are converted; anything that is not a real number
    (None, booleans, NaN, lists, non-numeric text) counts as 0.
    """
    if isinstance(raw, str):
        try:
            raw = float(raw)
        except ValueError:
            return 0
    # bool is a subclass of int, but true/false is not a rating.
    if isinstance(raw, bool) or not isinstance(raw, (int, float)):
        return 0
    if raw != raw:  # NaN compares unequal to itself.
        return 0
    return min(max(raw, 0), max_points)
def _parse_json_response(text: str) -> dict | None:
"""Extract JSON from LLM response, handling markdown code blocks."""
# Try direct parse first
try:
return json.loads(text.strip())
except json.JSONDecodeError:
pass
# Try extracting from markdown code block
match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
if match:
try:
return json.loads(match.group(1))
except json.JSONDecodeError:
pass
# Try finding any JSON object in the text
match = re.search(r"\{[^{}]*\}", text)
if match:
try:
return json.loads(match.group(0))
except json.JSONDecodeError:
pass
return None