Source code for dash_evals.runner.scorers.skill_usage

"""Scorer for verifying skill usage during evaluations."""

from inspect_ai.scorer import Score, Scorer, Target, accuracy, scorer
from inspect_ai.solver import TaskState

# The skill tool name used by Inspect AI's built-in skill() function.
# skill_usage_scorer compares each tool call's function name against this
# value to decide whether the skill tool was invoked.
SKILL_TOOL_NAME = "skill"


@scorer(metrics=[accuracy()])
def skill_usage_scorer() -> Scorer:
    """Build a scorer that passes only when the skill tool was invoked.

    The returned scorer walks ``state.messages``, gathers the function name
    of every tool call it finds, and counts how many of those names equal
    ``SKILL_TOOL_NAME``. This distinguishes runs where the model actually
    called the skill tool from runs where it answered without doing so.
    The ``target`` argument is accepted but never consulted.

    Returns:
        A Scorer whose ``Score.value`` is "C" when at least one skill tool
        call was seen and "I" otherwise. The score's metadata carries
        ``skill_tool_used``, ``skill_call_count`` and ``all_tools_called``.

    Example::

        from dash_evals.runner.scorers import skill_usage_scorer

        Task(
            dataset=my_dataset,
            solver=react(tools=[skill_tool, bash(timeout=120)]),
            scorer=[
                model_graded_fact(),   # Check answer correctness
                skill_usage_scorer(),  # Check skill tool was used
            ],
        )
    """

    async def score(state: TaskState, target: Target) -> Score:
        # Flatten every tool invocation in the transcript, in message order.
        # Messages lacking a ``tool_calls`` attribute, or carrying an
        # empty/None value for it, contribute nothing.
        invoked = [
            call.function
            for msg in state.messages
            for call in (getattr(msg, "tool_calls", None) or ())
        ]

        hits = sum(1 for name in invoked if name == SKILL_TOOL_NAME)
        used = hits > 0

        if not used:
            # Show the tools that were called instead, or 'none' if the
            # transcript contains no tool calls at all.
            shown = invoked if invoked else "none"
            explanation = "Skill tool was NOT used. " f"All tools called: {shown}"
        else:
            explanation = f"Skill tool was used ({hits} call(s))"

        details = {
            "skill_tool_used": used,
            "skill_call_count": hits,
            "all_tools_called": invoked,
        }
        return Score(
            value="C" if used else "I",
            answer=f"{hits} skill call(s)",
            explanation=explanation,
            metadata=details,
        )

    return score