# Source code for dash_evals.main
# Copyright 2025 The Flutter Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""CLI entry point for running evaluations.
Usage:
run-evals --json ./eval_set.json
run-evals --task my_task --model openai/gpt-4o --dataset samples.jsonl
"""
import argparse
import logging
import sys
from pathlib import Path
from dotenv import load_dotenv
# Import sandbox environments to register them with InspectAI
# The @sandboxenv decorator registers the sandbox type when the module is imported
import dash_evals.runner.sandboxes.podman.podman # noqa: F401 # Registers 'podman'
from dash_evals.runner.args_runner import _run_from_args
from dash_evals.runner.json_runner import run_from_json
# Basic console logger for early startup messages (argument errors and
# top-level failures in main()) — message-only format, no timestamps.
logging.basicConfig(level=logging.INFO, format="%(message)s")
# Module-level logger used by main() to report fatal startup errors.
_startup_logger = logging.getLogger("startup")
def main():
"""Parse command-line arguments and run evaluations."""
# Load .env from the repo root (walks up from cwd).
# This populates os.environ with API keys, credentials, etc.
# System env vars take precedence over .env values (python-dotenv default).
load_dotenv(override=False)
parser = argparse.ArgumentParser(
description="Run Inspect AI evaluations for the Dart/Flutter plugin.",
epilog="Example: run-evals --json ./eval_set.json",
)
# ---------- JSON mode (mutually exclusive with direct args) ----------
parser.add_argument(
"--json",
type=Path,
help="Path to eval_set.json (emitted by Dart CLI).",
)
# ---------- Direct-args mode ----------
parser.add_argument(
"--task",
type=str,
help="Task function name (e.g. 'flutter_code_gen' or dotted path).",
)
parser.add_argument(
"--model",
type=str,
action="append",
help="Model to evaluate (can be repeated). Example: openai/gpt-4o",
)
parser.add_argument(
"--dataset",
type=Path,
help="Path to a dataset file (JSON/JSONL/CSV).",
)
parser.add_argument(
"--log-dir",
type=Path,
help="Directory to write evaluation logs.",
)
parser.add_argument(
"--sandbox",
type=str,
nargs=2,
metavar=("TYPE", "CONFIG"),
help="Sandbox type and config path. Example: podman compose.yaml",
)
parser.add_argument(
"--max-connections",
type=int,
help="Maximum concurrent model connections.",
)
parser.add_argument(
"--max-samples",
type=int,
help="Maximum concurrent samples per task.",
)
parser.add_argument(
"--fail-on-error",
type=float,
help="Proportion of sample errors to tolerate (0.0-1.0).",
)
args = parser.parse_args()
# Ensure either --json or direct args are provided, but not both.
direct_args_provided = any([args.task, args.model, args.dataset])
if args.json and direct_args_provided:
parser.error(
"Cannot combine --json with --task/--model/--dataset. Use one mode or the other."
)
if not args.json and not direct_args_provided:
parser.error("Provide either --json or at least --task and --model.")
try:
if args.json:
has_failures = run_from_json(args.json)
else:
has_failures = _run_from_args(args)
except Exception as e:
_startup_logger.error(f"Failed to run evaluation: {e}")
sys.exit(1)
sys.exit(1 if has_failures else 0)
# Support direct invocation (`python -m dash_evals.main`) in addition to
# the installed `run-evals` console-script entry point.
if __name__ == "__main__":
    main()