Source code for dash_evals.runner.scorers.mcp_tool_usage

"""Scorer for verifying MCP tool usage during evaluations."""

from inspect_ai.scorer import Score, Scorer, Target, accuracy, scorer
from inspect_ai.solver import TaskState

# Complete list of Dart MCP server tools from:
# https://github.com/dart-lang/ai/tree/main/pkgs/dart_mcp_server
DART_MCP_TOOLS: set[str] = {
    "add_roots",
    "analyze_files",
    "connect_dart_tooling_daemon",
    "create_project",
    "dart_fix",
    "dart_format",
    "flutter_driver",
    "get_active_location",
    "get_app_logs",
    "get_runtime_errors",
    "get_selected_widget",
    "get_widget_tree",
    "hot_reload",
    "hot_restart",
    "hover",
    "launch_app",
    "list_devices",
    "list_running_apps",
    "pub",
    "pub_dev_search",
    "read_package_uris",
    "remove_roots",
    "resolve_workspace_symbol",
    "rip_grep_packages",
    "run_tests",
    "set_widget_selection_mode",
    "signature_help",
    "stop_app",
}



[docs]
@scorer(metrics=[accuracy()])
def mcp_tool_usage(
    mcp_server_name: str = "Dart",
    mcp_tool_names: list[str] | None = None,
    required_tools: list[str] | None = None,
) -> Scorer:
    """
    Scorer that checks if an MCP tool from the specified server was called.

    This scorer examines the message history to determine whether the model
    actually used an MCP tool (vs. answering from its training data).

    Args:
        mcp_server_name: The name prefix of the MCP server tools. Tools matching
                         "{mcp_server_name}_*" pattern will be identified as
                         MCP tools.
        mcp_tool_names: Optional list of specific tool names to identify as MCP
                        tools. If not provided and mcp_server_name is "Dart",
                        defaults to the full DART_MCP_TOOLS list.
        required_tools: Optional list of specific MCP tool names that MUST have
                        been called for a passing score. If provided, the scorer
                        checks that every tool in this list was used. If not
                        provided, any MCP tool usage counts as a pass.

    Returns:
        A Scorer that returns "C" if MCP tool(s) were used as required, "I" otherwise.

    Example::

        from dash_evals.scorers import mcp_tool_usage

        Task(
            dataset=my_dataset,
            solver=react(),
            tools=[dart_mcp_server],
            scorer=[
                includes(ignore_case=True),  # Check answer correctness
                mcp_tool_usage(),             # Uses DART_MCP_TOOLS by default
                # Or check specific tools:
                # mcp_tool_usage(required_tools=["create_project"]),
            ],
        )
    """
    # Default to DART_MCP_TOOLS for Dart server, otherwise use provided list
    if mcp_tool_names is not None:
        known_mcp_tools = set(mcp_tool_names)
    elif mcp_server_name == "Dart":
        known_mcp_tools = DART_MCP_TOOLS
    else:
        known_mcp_tools = set()

    async def score(state: TaskState, target: Target) -> Score:
        # Track all tools called and whether MCP tool was used
        tools_called: list[str] = []
        mcp_tool_used = False
        mcp_tools_called: list[str] = []

        # Look through all messages for tool calls
        for message in state.messages:
            # Check if message has tool_calls attribute (assistant messages with tool use)
            if hasattr(message, "tool_calls") and message.tool_calls:
                for tool_call in message.tool_calls:
                    tool_name = tool_call.function
                    tools_called.append(tool_name)

                    # Check if this is an MCP tool:
                    # 1. Prefixed with server name (e.g., "Dart_search_packages")
                    # 2. OR in the explicit list of known MCP tool names
                    is_mcp_tool = tool_name.startswith(f"{mcp_server_name}_") or (
                        tool_name in known_mcp_tools
                    )
                    if is_mcp_tool:
                        mcp_tool_used = True
                        mcp_tools_called.append(tool_name)

        # Check required_tools if specified
        if required_tools:
            mcp_tools_called_set = set(mcp_tools_called)
            missing_tools = [t for t in required_tools if t not in mcp_tools_called_set]
            if missing_tools:
                explanation = (
                    f"Required MCP tool(s) NOT used: {missing_tools}. "
                    f"MCP tools called: {mcp_tools_called if mcp_tools_called else 'none'}. "
                    f"All tools called: {tools_called if tools_called else 'none'}"
                )
                return Score(
                    value="I",
                    answer=", ".join(mcp_tools_called) if mcp_tools_called else "none",
                    explanation=explanation,
                    metadata={
                        "mcp_server_name": mcp_server_name,
                        "mcp_tool_used": mcp_tool_used,
                        "mcp_tools_called": mcp_tools_called,
                        "all_tools_called": tools_called,
                        "required_tools": required_tools,
                        "missing_tools": missing_tools,
                    },
                )

        # Build explanation
        if mcp_tool_used:
            explanation = (
                f"MCP tool(s) from '{mcp_server_name}' server were used: {mcp_tools_called}"
            )
        else:
            explanation = (
                f"MCP tool from '{mcp_server_name}' server was NOT used. "
                f"All tools called: {tools_called if tools_called else 'none'}"
            )

        return Score(
            value="C" if mcp_tool_used else "I",
            answer=", ".join(mcp_tools_called) if mcp_tools_called else "none",
            explanation=explanation,
            metadata={
                "mcp_server_name": mcp_server_name,
                "mcp_tool_used": mcp_tool_used,
                "mcp_tools_called": mcp_tools_called,
                "all_tools_called": tools_called,
            },
        )

    return score