"""Thin shim: read InspectEvalSet JSON, build Tasks, call eval_set().
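The JSON file maps ~1:1 to eval_set() kwargs. The 'tasks' key contains
task definitions; each task's dataset is either inline (format "memory":
an InspectDataset with InspectSample objects) or a reference to an external
"json" / "csv" source that is loaded through inspect_ai.dataset.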
"""
import importlib
import json
import logging
from pathlib import Path
import inspect_ai
from inspect_ai.dataset import MemoryDataset, Sample, csv_dataset, json_dataset
from dash_evals.utils.logging import capture_output, setup_logging
# Module-level logger. The run functions below log through the per-job logger
# returned by setup_logging() instead.
logger = logging.getLogger(__name__)
# Keys in the JSON that are NOT eval_set() kwargs.
# They are consumed separately to build Task objects; every other top-level
# manifest key is forwarded to inspect_ai.eval_set() as a keyword argument.
_NON_EVAL_SET_KEYS = {"tasks"}
def _module_itself_missing(exc: ModuleNotFoundError, module_path: str) -> bool:
    """Return True if *exc* says *module_path* (or a parent package) does not exist.

    A ``ModuleNotFoundError`` raised while importing ``a.b.c`` may also come
    from an ``import`` statement *inside* ``a.b.c`` that names some other,
    missing dependency. ``exc.name`` tells the two cases apart.
    """
    missing = exc.name
    if not missing:
        # No module name recorded: keep the historical "module not found" reading.
        return True
    return module_path == missing or module_path.startswith(f"{missing}.")


def _import_task_module(module_path: str, name: str, *, missing_ok: bool = False):
    """Import *module_path* on behalf of task *name*, reporting failures as ValueError.

    Args:
        module_path: Absolute dotted module path to import.
        name: The task function name as written in the manifest (for messages).
        missing_ok: If True, return ``None`` instead of raising when the module
            itself does not exist. A module that exists but fails to import
            because of a missing dependency always raises.

    Raises:
        ValueError: If the path is empty/relative, the module does not exist
            (and ``missing_ok`` is False), or one of its imports is missing.
            The original ``ModuleNotFoundError`` is attached as ``__cause__``.
    """
    if not module_path or module_path.startswith("."):
        raise ValueError(
            f"Invalid module path '{module_path}' for task function '{name}'. "
            f"Expected a non-empty absolute module path."
        )
    try:
        return importlib.import_module(module_path)
    except ModuleNotFoundError as exc:
        if not _module_itself_missing(exc, module_path):
            # The module was found, but something it imports is not installed.
            raise ValueError(
                f"Module '{module_path}' for task function '{name}' could not be "
                f"imported: missing dependency '{exc.name}'."
            ) from exc
        if missing_ok:
            return None
        raise ValueError(
            f"Could not find module '{module_path}' for task function '{name}'. "
            f"Check that the module exists and is importable."
        ) from exc


def _resolve_task_func(name: str):
    """Resolve a task function by name using importlib.

    Supports:
    - Short names: "flutter_code_gen" → dash_evals.runner.tasks.flutter_code_gen
    - Colon syntax: "my_package.tasks:my_task" → import my_package.tasks, get my_task
    - Dotted paths: "dash_evals.runner.tasks.flutter_code_gen.flutter_code_gen"

    For short names, first tries to import a module with the same name.
    If that module does not exist (or has no callable of that name), falls
    back to looking up the function in the tasks package's __init__ (e.g.,
    flutter_bug_fix is exported from bug_fix.py via __init__.py).

    Args:
        name: Task function reference in one of the three forms above.

    Returns:
        The callable task function.

    Raises:
        ValueError: If the module cannot be imported, or it has no *callable*
            attribute with the requested name. Import errors are chained as
            ``__cause__`` so the underlying reason stays visible.
    """
    tasks_package = "dash_evals.runner.tasks"

    if ":" in name or "." in name:
        if ":" in name:
            # Colon syntax: "module.path:function_name"
            module_path, func_name = name.split(":", 1)
        else:
            # Dotted path: last segment is the function name
            module_path, _, func_name = name.rpartition(".")
        module = _import_task_module(module_path, name)
        func = getattr(module, func_name, None)
        # Reject non-callables too (e.g. a submodule that shares the name).
        if not callable(func):
            raise ValueError(f"Module '{module_path}' does not have a function '{func_name}'.")
        return func

    # Short name: try the module with the same name first.
    module = _import_task_module(f"{tasks_package}.{name}", name, missing_ok=True)
    if module is not None:
        func = getattr(module, name, None)
        if callable(func):
            return func

    # Fall back to the tasks package __init__ (handles re-exports
    # like flutter_bug_fix from bug_fix.py).
    package = _import_task_module(tasks_package, name)
    func = getattr(package, name, None)
    # callable() matters here: after importing a same-named submodule, the
    # package attribute is that *module*, which must not be returned as a task.
    if callable(func):
        return func
    raise ValueError(
        f"Could not find task function '{name}'. "
        f"Check that the function exists in dash_evals.runner.tasks "
        f"and is exported in __init__.py."
    )
def _build_dataset(task_def: dict):
    """Build an Inspect AI dataset from a task definition.

    Dispatches on ``task_def["dataset"]["format"]``:
    - ``"memory"`` (default): builds a ``MemoryDataset`` from inline samples.
    - ``"json"``: delegates to ``inspect_ai.dataset.json_dataset(source, **args)``.
    - ``"csv"``: delegates to ``inspect_ai.dataset.csv_dataset(source, **args)``.

    A missing or empty ``dataset`` entry yields an empty ``MemoryDataset``
    named after the task.

    Args:
        task_def: A task entry from the EvalSet JSON manifest.

    Returns:
        An Inspect AI dataset object.

    Raises:
        ValueError: If the dataset entry is not an object, the format is
            unrecognized, or required fields (``source`` for json/csv,
            ``input`` for inline samples) are missing.
    """
    task_name = task_def.get("name", "")
    dataset_def = task_def.get("dataset")
    if not dataset_def:
        return MemoryDataset([], name=task_name)
    if not isinstance(dataset_def, dict):
        raise ValueError(
            f"Task '{task_name}': 'dataset' must be an object, "
            f"got {type(dataset_def).__name__}."
        )

    fmt = dataset_def.get("format", "memory")

    if fmt in ("json", "csv"):
        source = dataset_def.get("source")
        if not source:
            raise ValueError(
                f"Task '{task_name}': dataset format '{fmt}' requires a 'source' field."
            )
        # "args" may be absent or null in the manifest; treat both as no extras.
        extra_args: dict = dataset_def.get("args") or {}
        loader = json_dataset if fmt == "json" else csv_dataset
        return loader(source, **extra_args)

    if fmt != "memory":
        raise ValueError(
            f"Task '{task_name}': unknown dataset format '{fmt}'. "
            f"Expected one of: 'memory', 'json', 'csv'."
        )

    samples = []
    # "samples" may be absent or an explicit null; both mean "no samples".
    for index, raw in enumerate(dataset_def.get("samples") or []):
        # Validate up front so a malformed sample surfaces as the documented
        # ValueError (which callers handle) rather than a KeyError/TypeError.
        if not isinstance(raw, dict) or "input" not in raw:
            raise ValueError(
                f"Task '{task_name}': sample #{index} is missing the required 'input' field."
            )
        samples.append(
            Sample(
                input=raw["input"],
                target=raw.get("target", ""),
                id=raw.get("id"),
                metadata=raw.get("metadata"),
                files=raw.get("files"),
                setup=raw.get("setup"),
                sandbox=raw.get("sandbox"),
            )
        )
    return MemoryDataset(samples, name=dataset_def.get("name", task_name))
# Public entry point.
def run_from_json(manifest_path: str | Path) -> bool:
"""Load an InspectEvalSet JSON, build Tasks, and call eval_set().
Args:
manifest_path: Path to eval_set.json emitted by the Dart CLI.
Returns:
True if any tasks failed, False if all succeeded.
"""
manifest_path = Path(manifest_path)
with open(manifest_path) as f:
raw = json.load(f)
# Support single eval_set or list (one per flutter channel)
manifests = raw if isinstance(raw, list) else [raw]
any_failures = False
for manifest in manifests:
if _run_single_manifest(manifest):
any_failures = True
return any_failures
def _task_config_with_injected_keys(task_def: dict, manifest: dict) -> dict:
    """Return a shallow copy of *task_def* with the keys task functions expect.

    The copy keeps the caller's manifest untouched. Two keys are derived when
    absent:
    - ``task_name``: the Dart CLI emits "name" but task functions use "task_name".
    - ``sandbox_type``: the Dart CLI emits "sandbox" as ["type", "path"] or a
      string (on the task or on the manifest), but task functions check
      "sandbox_type". A plain "local" string is not injected.
    """
    config = dict(task_def)
    if "task_name" not in config and "name" in config:
        config["task_name"] = config["name"]
    if "sandbox_type" not in config:
        sandbox = config.get("sandbox") or manifest.get("sandbox")
        if isinstance(sandbox, list) and len(sandbox) >= 1:
            config["sandbox_type"] = sandbox[0]
        elif isinstance(sandbox, str) and sandbox != "local":
            config["sandbox_type"] = sandbox
    return config


def _manifest_to_eval_set_kwargs(manifest: dict) -> dict:
    """Return the eval_set() keyword arguments contained in *manifest*.

    Every key except those in ``_NON_EVAL_SET_KEYS`` is forwarded. A
    two-element ``sandbox`` list is converted to a tuple, since eval_set
    expects a tuple for ("type", "path").
    """
    kwargs = {k: v for k, v in manifest.items() if k not in _NON_EVAL_SET_KEYS}
    sandbox = kwargs.get("sandbox")
    if isinstance(sandbox, list) and len(sandbox) == 2:
        kwargs["sandbox"] = tuple(sandbox)
    return kwargs


def _run_single_manifest(manifest: dict) -> bool:
    """Run a single InspectEvalSet entry.

    Creates the log directory, builds one Task per entry in
    ``manifest["tasks"]`` and passes the ones that built successfully to
    ``inspect_ai.eval_set()`` together with the remaining manifest keys.
    A task that cannot be resolved or built is logged and skipped so the
    other tasks still run, but it makes the overall result a failure.

    Args:
        manifest: One eval-set object; must contain "log_dir" and "tasks".

    Returns:
        True if any tasks failed (including tasks that could not be built,
        the case where no task could be built at all, and an exception
        raised by eval_set()); False otherwise.

    Raises:
        KeyError: If "log_dir" or "tasks" is missing from the manifest.
    """
    log_dir = manifest["log_dir"]
    Path(log_dir).mkdir(parents=True, exist_ok=True)
    job_logger, log_file_path = setup_logging(Path(log_dir), name="dash_evals")

    # Build Task objects from task definitions
    task_instances: list[inspect_ai.Task] = []
    # Set when a task is dropped with ✗, so a partially broken manifest is
    # not reported as a clean run.
    build_failed = False
    for task_def in manifest["tasks"]:
        task_func_name = task_def.get("func")
        task_name = task_def.get("name", task_func_name or "(unknown)")

        if not task_func_name:
            # Mode 2: hydrate directly from JSON (future). Skipped, not failed.
            job_logger.warning(f" ⚠ {task_name}: no func — Mode 2 hydration not yet supported")
            continue

        try:
            task_func = _resolve_task_func(task_func_name)
            # Build dataset (dispatches on format: memory | json | csv)
            dataset = _build_dataset(task_def)
        except ValueError as e:
            job_logger.warning(f" ✗ {task_name}: {e}")
            build_failed = True
            continue

        task_config = _task_config_with_injected_keys(task_def, manifest)
        try:
            task_instance = task_func(dataset, task_config)
            task_instances.append(task_instance)
            job_logger.info(f" ✓ {task_name} ({len(dataset)} samples)")
        except Exception as e:
            # Boundary handler: one broken task must not abort the others.
            # .exception() logs at ERROR level and keeps the traceback.
            job_logger.exception(f" ✗ {task_name}: {e}")
            build_failed = True

    if not task_instances:
        job_logger.warning("No valid tasks to run")
        return True

    # Build eval_set kwargs from remaining manifest keys
    eval_set_kwargs = _manifest_to_eval_set_kwargs(manifest)

    job_logger.info(f"\n{'=' * 70}\n🚀 RUNNING {len(task_instances)} TASKS\n{'=' * 70}")
    try:
        with capture_output(log_file_path):
            success, _ = inspect_ai.eval_set(
                tasks=task_instances,
                **eval_set_kwargs,
            )
    except Exception as e:
        job_logger.exception(f"Evaluation failed: {e}")
        return True
    return build_failed or not success