# Source code for dash_evals.runner.scorers.skill_usage

"""Scorer for verifying skill usage during evaluations."""

from inspect_ai.scorer import Score, Scorer, Target, accuracy, scorer
from inspect_ai.solver import TaskState

# The skill tool name used by Inspect AI's built-in skill() function.
# skill_usage_scorer compares each tool call's function name against this
# value to decide whether the skill tool was invoked.
SKILL_TOOL_NAME = "skill"



# [docs]
@scorer(metrics=[accuracy()])
def skill_usage_scorer() -> Scorer:
    """Build a scorer that verifies the agent invoked the skill tool.

    The returned scorer walks the conversation in ``state.messages`` and
    collects the name of every tool call it finds. A sample is graded as
    correct when at least one of those calls targets ``SKILL_TOOL_NAME``,
    i.e. the model consulted the skill tool instead of answering from its
    training data alone.

    Returns:
        A Scorer whose value is "C" when the skill tool was called at least
        once and "I" otherwise. The score's metadata records whether the
        tool was used, how many skill calls were made, and the names of all
        tools that were called.

    Example::

        from dash_evals.runner.scorers import skill_usage_scorer

        Task(
            dataset=my_dataset,
            solver=react(tools=[skill_tool, bash(timeout=120)]),
            scorer=[
                model_graded_fact(),    # Check answer correctness
                skill_usage_scorer(),   # Check skill tool was used
            ],
        )
    """

    async def score(state: TaskState, target: Target) -> Score:
        # Flatten the transcript into the names of every tool call, in
        # conversation order. Messages without a ``tool_calls`` attribute,
        # or with an empty/None one, contribute nothing.
        invoked: list[str] = [
            call.function
            for msg in state.messages
            for call in (getattr(msg, "tool_calls", None) or ())
        ]
        skill_hits = sum(1 for name in invoked if name == SKILL_TOOL_NAME)
        used_skill = skill_hits > 0

        if not used_skill:
            # On failure, list what *was* called to make the miss debuggable.
            explanation = (
                f"Skill tool was NOT used. "
                f"All tools called: {invoked or 'none'}"
            )
            grade = "I"
        else:
            explanation = f"Skill tool was used ({skill_hits} call(s))"
            grade = "C"

        details = {
            "skill_tool_used": used_skill,
            "skill_call_count": skill_hits,
            "all_tools_called": invoked,
        }
        return Score(
            value=grade,
            answer=f"{skill_hits} skill call(s)",
            explanation=explanation,
            metadata=details,
        )

    return score