Source code for dash_evals.runner.scorers.code_quality

"""LLM-graded code quality scorer.

Reusable scorer that uses an LLM to evaluate subjective code quality aspects.
"""

import json
import re

from inspect_ai.model import get_model
from inspect_ai.scorer import Score, Scorer, Target, mean, scorer, stderr
from inspect_ai.solver import TaskState

DEFAULT_RUBRIC = """
Evaluate this code fix on subjective quality (0-3 scale):

1. **Minimality**: Is the fix focused? Does it avoid unnecessary changes?
   - 0: Bloated, touches unrelated code or adds unnecessary complexity
   - 1: Some unnecessary changes but mostly focused
   - 2: Focused with minor extras
   - 3: Surgical, changes only what's needed

2. **Elegance**: Would a senior developer approve of this approach?
   - 0: Hacky, works but ugly or non-idiomatic
   - 1: Works but has style issues
   - 2: Good but not exemplary
   - 3: Clean, idiomatic, follows language conventions

3. **Robustness**: Does it handle edge cases appropriately?
   - 0: Fragile, likely breaks on edge cases
   - 1: Handles basic cases only
   - 2: Handles most edge cases
   - 3: Defensive, handles nulls/empty states/errors gracefully

Respond with ONLY a JSON object (no markdown):
{"minimality": N, "elegance": N, "robustness": N, "reasoning": "Brief explanation"}
"""



[docs]
@scorer(metrics=[mean(), stderr()])
def code_quality_scorer(rubric: str | None = None, model: str | None = None) -> Scorer:
    """
    Score code quality using LLM judgment.

    Uses a rubric to evaluate subjective aspects of code quality that
    static analysis can't capture: minimality, elegance, robustness.

    Args:
        rubric: Custom rubric prompt. If None, uses default Dart/Flutter rubric.
        model: Model to use for grading. If None, uses the task's model.

    Returns:
        A Scorer that evaluates code quality on a 0-1 scale.
    """
    grading_rubric = rubric or DEFAULT_RUBRIC

    async def score(state: TaskState, target: Target) -> Score:
        code = state.output.completion

        # Build grading prompt
        prompt = f"{grading_rubric}\n\nCode to evaluate:\n```dart\n{code}\n```"

        # Get grader model
        grader = get_model(model) if model else get_model()

        try:
            result = await grader.generate(prompt)
            response_text = result.completion

            # Parse JSON from response
            scores = _parse_json_response(response_text)

            if scores is None:
                return Score(
                    value=0.0,
                    explanation=f"Failed to parse grader response: {response_text[:500]}",
                    metadata={"raw_response": response_text},
                )

            # Calculate normalized score (0-1)
            # Use `or 0` pattern to handle None values (not just missing keys)
            minimality = scores.get("minimality") or 0
            elegance = scores.get("elegance") or 0
            robustness = scores.get("robustness") or 0
            total = minimality + elegance + robustness
            normalized_score = total / 9.0  # Max possible is 9 (3 + 3 + 3)

            return Score(
                value=normalized_score,
                explanation=scores.get("reasoning", "No reasoning provided"),
                metadata={
                    "minimality": minimality,
                    "elegance": elegance,
                    "robustness": robustness,
                    "raw_response": response_text,
                },
            )

        except Exception as e:
            return Score(
                value=0.0,
                explanation=f"Grading failed: {e!s}",
                metadata={"error": str(e)},
            )

    return score



def _parse_json_response(text: str) -> dict | None:
    """Extract JSON from LLM response, handling markdown code blocks."""
    # Try direct parse first
    try:
        return json.loads(text.strip())
    except json.JSONDecodeError:
        pass

    # Try extracting from markdown code block
    match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
    if match:
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError:
            pass

    # Try finding any JSON object in the text
    match = re.search(r"\{[^{}]*\}", text)
    if match:
        try:
            return json.loads(match.group(0))
        except json.JSONDecodeError:
            pass

    return None