Source code for dash_evals.main

# Copyright 2025 The Flutter Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""CLI entry point for running evaluations.

Usage:
    run-evals --json ./eval_set.json
    run-evals --task my_task --model openai/gpt-4o --dataset samples.jsonl
"""

import argparse
import logging
import sys
from pathlib import Path

from dotenv import load_dotenv

# Import sandbox environments to register them with InspectAI
# The @sandboxenv decorator registers the sandbox type when the module is imported
import dash_evals.runner.sandboxes.podman.podman  # noqa: F401  # Registers 'podman'
from dash_evals.runner.args_runner import _run_from_args
from dash_evals.runner.json_runner import run_from_json

# Basic console logger for early startup messages
logging.basicConfig(level=logging.INFO, format="%(message)s")
_startup_logger = logging.getLogger("startup")


[docs] def main(): """Parse command-line arguments and run evaluations.""" # Load .env from the repo root (walks up from cwd). # This populates os.environ with API keys, credentials, etc. # System env vars take precedence over .env values (python-dotenv default). load_dotenv(override=False) parser = argparse.ArgumentParser( description="Run Inspect AI evaluations for the Dart/Flutter plugin.", epilog="Example: run-evals --json ./eval_set.json", ) # ---------- JSON mode (mutually exclusive with direct args) ---------- parser.add_argument( "--json", type=Path, help="Path to eval_set.json (emitted by Dart CLI).", ) # ---------- Direct-args mode ---------- parser.add_argument( "--task", type=str, help="Task function name (e.g. 'flutter_code_gen' or dotted path).", ) parser.add_argument( "--model", type=str, action="append", help="Model to evaluate (can be repeated). Example: openai/gpt-4o", ) parser.add_argument( "--dataset", type=Path, help="Path to a dataset file (JSON/JSONL/CSV).", ) parser.add_argument( "--log-dir", type=Path, help="Directory to write evaluation logs.", ) parser.add_argument( "--sandbox", type=str, nargs=2, metavar=("TYPE", "CONFIG"), help="Sandbox type and config path. Example: podman compose.yaml", ) parser.add_argument( "--max-connections", type=int, help="Maximum concurrent model connections.", ) parser.add_argument( "--max-samples", type=int, help="Maximum concurrent samples per task.", ) parser.add_argument( "--fail-on-error", type=float, help="Proportion of sample errors to tolerate (0.0-1.0).", ) args = parser.parse_args() # Ensure either --json or direct args are provided, but not both. direct_args_provided = any([args.task, args.model, args.dataset]) if args.json and direct_args_provided: parser.error( "Cannot combine --json with --task/--model/--dataset. Use one mode or the other." ) if not args.json and not direct_args_provided: parser.error("Provide either --json or at least --task and --model.") try: if args.json: has_failures = run_from_json(args.json) else: has_failures = _run_from_args(args) except Exception as e: _startup_logger.error(f"Failed to run evaluation: {e}") sys.exit(1) sys.exit(1 if has_failures else 0)
if __name__ == "__main__": main()