# Source code for dash_evals.runner.tasks.mcp_tool

"""
MCP Tool Usage Task (Unified)

Tests an agent's ability to use a specific MCP server tool. Consolidates the
former `mcp_create_project` and `mcp_pub_dev_search` tasks into a single
configurable task.

Config keys:
    required_tools: list[str] — MCP tool names the agent should use (for scoring)
    inject_temp_dir: bool — if True, replace {root_path} in sample inputs with a
                     temp directory (needed for create_project-style tasks)
"""

import tempfile

from inspect_ai import Task, task
from inspect_ai.dataset import Dataset, MemoryDataset, Sample
from inspect_ai.scorer import includes

from ..scorers import mcp_tool_usage
from ..solvers import add_system_message
from .task_helpers import (
    append_context_injection,
    append_model_interaction,
    build_task_metadata,
)


@task
def mcp_tool(dataset: Dataset, config: dict) -> Task:
    """
    Unified task for evaluating MCP tool usage.

    Args:
        dataset: Inspect dataset loaded from JSONL.
        config: Task manifest entry with:
            - task_name: name for the task and (when injecting) the
              rebuilt dataset; defaults to "mcp_tool"
            - required_tools: list of MCP tool names the agent should call
            - inject_temp_dir: if True, replaces {root_path} in inputs
            - system_message: custom system prompt (optional)
            - time_limit / message_limit: optional runtime limits

    Returns:
        A configured inspect_ai Task scoring both answer content
        (``includes``) and MCP tool usage (``mcp_tool_usage``).
    """
    # Resolve the name once so the dataset name and task name cannot
    # diverge. (Previously Task(name=...) hard-indexed config["task_name"]
    # and raised KeyError, while the dataset name silently fell back.)
    task_name = config.get("task_name", "mcp_tool")
    required_tools = config.get("required_tools", [])
    inject_temp_dir = config.get("inject_temp_dir", False)

    # Pre-process samples if temp directory injection is needed.
    active_dataset = dataset
    if inject_temp_dir:
        # NOTE(review): the directory is deliberately not removed here --
        # samples reference it when the task runs later. Presumably an
        # external process cleans these up; confirm.
        temp_root = tempfile.mkdtemp(prefix="mcp_tool_")
        processed_samples = []
        for sample in dataset:
            # Non-str inputs (e.g. chat message lists) are stringified
            # before substitution -- TODO confirm this is intended for
            # message-list inputs.
            input_str = sample.input if isinstance(sample.input, str) else str(sample.input)
            processed_samples.append(
                Sample(
                    input=input_str.replace("{root_path}", temp_root),
                    target=sample.target,
                    id=sample.id,
                    metadata=sample.metadata,
                )
            )
        active_dataset = MemoryDataset(
            samples=processed_samples,
            name=task_name,
        )

    # Build solver chain: system prompt first, then the optional context
    # injection and model-interaction solvers appended by task_helpers.
    system_msg = config.get("system_message", "You are a helpful assistant.")
    solver_chain = [add_system_message(system_msg)]
    append_context_injection(solver_chain, config)
    append_model_interaction(solver_chain, config)

    return Task(
        name=task_name,
        dataset=active_dataset,
        solver=solver_chain,
        scorer=[
            includes(ignore_case=True),
            # Pass None (not []) when no tools are configured so the
            # scorer can distinguish "unconstrained" from "expects none".
            mcp_tool_usage(required_tools=required_tools if required_tools else None),
        ],
        time_limit=config.get("time_limit", 300),
        message_limit=config.get("message_limit", 50),
        metadata=build_task_metadata(config),
    )