# Source code for dash_evals.runner.tasks.mcp_tool

"""
MCP Tool Usage Task (Unified)

Tests an agent's ability to use a specific MCP server tool. Consolidates the
former `mcp_create_project` and `mcp_pub_dev_search` tasks into a single
configurable task.

Config keys:
    required_tools: list[str] — MCP tool names the agent should use (for scoring)
    inject_temp_dir: bool — if True, replace {root_path} in sample inputs with a
                     temp directory (needed for create_project-style tasks)
"""

import tempfile

from inspect_ai import Task, task
from inspect_ai.dataset import Dataset, MemoryDataset, Sample
from inspect_ai.scorer import includes

from ..scorers import mcp_tool_usage
from ..solvers import add_system_message
from .task_helpers import (
    append_context_injection,
    append_model_interaction,
    build_task_metadata,
)



# [docs]
@task
def mcp_tool(dataset: Dataset, config: dict) -> Task:
    """
    Unified task for evaluating MCP tool usage.

    Optionally rewrites sample inputs to point at a temp directory, builds the
    solver chain (system message plus whatever `append_context_injection` and
    `append_model_interaction` add for this config), and scores with both a
    case-insensitive `includes` check and the `mcp_tool_usage` scorer.

    Args:
        dataset: Inspect dataset loaded from JSONL.
        config: Task manifest entry with:
            - task_name: name used for the task and for the rewritten dataset
              (optional, defaults to "mcp_tool")
            - required_tools: list of MCP tool names the agent should call
            - inject_temp_dir: if True, replaces {root_path} in inputs; may
              also be set under metadata.task_parameters
            - system_message: custom system prompt (optional)
            - time_limit: passed to Task (optional, defaults to 300)
            - message_limit: passed to Task (optional, defaults to 50)

    Returns:
        The configured Task.
    """
    # Resolve the name once so the dataset and the Task always agree, and a
    # config without task_name behaves the same on both code paths.
    task_name = config.get("task_name", "mcp_tool")
    required_tools = config.get("required_tools", [])

    # inject_temp_dir can be set at the top level of the config or via
    # task.yaml metadata.task_parameters; a truthy value in either enables it.
    task_params = (config.get("metadata") or {}).get("task_parameters") or {}
    inject_temp_dir = config.get("inject_temp_dir", False) or task_params.get(
        "inject_temp_dir", False
    )

    # Pre-process samples if temp directory injection is needed
    active_dataset = dataset
    if inject_temp_dir:
        # A single directory is shared by every sample in this task; it is
        # created eagerly here and is not removed by this function.
        temp_root = tempfile.mkdtemp(prefix="mcp_tool_")
        processed_samples = []
        for sample in dataset:
            # NOTE(review): non-string inputs (e.g. chat message lists) are
            # stringified before substitution, as before — confirm intended.
            input_str = sample.input if isinstance(sample.input, str) else str(sample.input)
            # Copy the sample rather than rebuilding it so fields other than
            # input (choices, sandbox, files, setup, ...) are carried over.
            processed_samples.append(
                sample.model_copy(
                    update={"input": input_str.replace("{root_path}", temp_root)}
                )
            )
        active_dataset = MemoryDataset(samples=processed_samples, name=task_name)

    # Build solver chain
    system_msg = config.get("system_message", "You are a helpful assistant.")
    solver_chain = [add_system_message(system_msg)]
    append_context_injection(solver_chain, config)
    append_model_interaction(solver_chain, config)

    return Task(
        name=task_name,
        dataset=active_dataset,
        solver=solver_chain,
        scorer=[
            includes(ignore_case=True),
            # An empty/missing required_tools list is passed as None.
            mcp_tool_usage(required_tools=required_tools or None),
        ],
        time_limit=config.get("time_limit", 300),
        message_limit=config.get("message_limit", 50),
        metadata=build_task_metadata(config),
    )