1 change: 1 addition & 0 deletions README.md
@@ -11,6 +11,7 @@ This repo includes

- **eval runner** — Python package for running LLM evaluations with configurable tasks, variants, and models
- **config packages** — Dart and Python packages that resolve dataset YAML into EvalSet JSON for the runner
- **NB**: These packages largely overlap and coexist for backwards compatibility; in time, the Dart package will be deprecated.
- **devals CLI** — Dart CLI for creating and managing dataset samples, tasks, and jobs
- **Evaluation Explorer** — Dart/Flutter app for browsing and analyzing results

7 changes: 4 additions & 3 deletions docs/contributing/packages/dash_evals.md
@@ -41,9 +41,10 @@ src/dash_evals/

1. **Configure**: The Dart `dataset_config_dart` package parses dataset YAML and resolves it into an `EvalSet` JSON manifest
2. **Load**: The Python runner reads the JSON manifest via `json_runner.py`, resolving task functions dynamically with `importlib`
3. **Execute**: Each task function receives its dataset and task definition, producing an `inspect_ai.Task`
4. **Score**: Scorers evaluate model outputs against targets
5. **Log**: Results written to the configured `log_dir`
3. **Hydrate**: Config dicts are converted to Inspect AI objects (datasets, MCP servers, skills) using shared helpers from `dataset_config_python.hydrate`
4. **Execute**: Each task function receives its dataset and task definition, producing an `inspect_ai.Task`
5. **Score**: Scorers evaluate model outputs against targets
6. **Log**: Results written to the configured `log_dir`

Alternatively, the runner can be invoked directly with `--task` and `--model` arguments (via `args_runner.py`), bypassing the Dart config pipeline.
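For orientation, the manifest-driven path can be sketched roughly as follows. The manifest keys (`tasks`, `func`) and the file name are assumptions for illustration; the `importlib` resolution and the `inspect_ai.Task` return type come from the steps above.

```python
import importlib
import json
from pathlib import Path

# Load the EvalSet JSON manifest produced by the Dart resolver.
manifest = json.loads(Path("eval_set.json").read_text())

for task_def in manifest["tasks"]:  # the "tasks" key is an assumption
    # Step 2: resolve the task function dynamically with importlib.
    # The "module.path:function" ref format shown here is illustrative.
    module_path, func_name = task_def["func"].rsplit(":", 1)
    task_func = getattr(importlib.import_module(module_path), func_name)
    # Steps 3 and 4: the task function hydrates its config and dataset
    # and returns an inspect_ai.Task ready for execution.
    task = task_func(task_def)
```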

2 changes: 1 addition & 1 deletion docs/contributing/repository_structure.md
@@ -10,7 +10,7 @@ evals/
│ ├── devals_cli/ # Dart CLI for managing dataset (devals)
│ ├── dataset_config_dart/ # Dart library: YAML → EvalSet JSON
│ ├── dash_evals/ # Python evaluation runner
│ ├── dataset_config_python/ # Python configuration models
│ ├── dataset_config_python/ # Python config: YAML → EvalSet JSON + config → Inspect AI objects
│ └── eval_explorer/ # Dart/Flutter results viewer (Serverpod)
├── tool/ # Utility scripts
├── pubspec.yaml # Dart workspace configuration
24 changes: 14 additions & 10 deletions docs/guides/about_the_framework.md
@@ -18,6 +18,7 @@ YAML config → Dart resolver → JSON manifest → Python runner → Inspect AI
|-------|---------|-------------|
| **YAML config** | — | Your `task.yaml` and `job.yaml` files |
| **Dart resolver** | `dataset_config_dart` | Parses YAML, resolves globs and references, produces a JSON manifest |
| **Hydration** | `dataset_config_python` | Converts config dicts into Inspect AI objects (datasets, MCP servers, skills) |
| **Python runner** | `dash_evals` | Reads the manifest, builds Inspect AI `Task` objects, calls `eval_set()` |
| **Inspect AI** | `inspect_ai` | Runs solver chains, sends prompts, collects responses, runs scorers |
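
To make the hydration row concrete, here is a rough sketch of turning variant config into Inspect AI objects. The helper names and config keys (`name`, `command`, `args`, `url`, `skills`) follow the `dataset_config_python.hydrate` helpers documented in the shared helpers section below; the specific values are invented.

```python
from dataset_config_python.hydrate import create_mcp_servers, get_skill_tool

# Two declarative MCP server entries: a command-based (stdio) server and
# an HTTP server. Values are invented; keys follow the documented config modes.
mcp_configs = [
    {"name": "files", "command": "npx", "args": ["some-mcp-server"]},
    {"name": "api", "url": "https://example.com/mcp"},
]

# With a non-local sandbox_type, command-based servers would auto-select
# the sandbox transport instead of stdio.
servers = create_mcp_servers(mcp_configs, sandbox_type="local")

# Returns a skill tool only when the variant configures skills, else None.
skill_tool = get_skill_tool({"variant": {"skills": ["skills/my_skill"]}})
```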

@@ -148,16 +149,19 @@ calling `submit()`.

## Shared helpers

The `task_helpers.py` module contains functions used across all tasks:

| Helper | What it does |
|--------|-------------|
| `append_context_injection(chain, config)` | Adds a `context_injector` solver if the variant has `files` |
| `append_model_interaction(chain, config)` | Adds `react()` (if tools exist) or `generate()` (if not) |
| `get_skill_tool(config)` | Creates a skill tool if the variant has `skills` configured |
| `build_task_metadata(config)` | Builds the metadata dict for the `Task` object |
| `create_mcp_servers(configs, sandbox_type)` | Creates MCP server objects from variant config |
| `validate_sandbox_tools(config, tool_names)` | Checks that sandbox-requiring tools aren't used on local |
The `task_helpers.py` module contains functions used across all tasks. Some of
these are re-exported from `dataset_config_python.hydrate` — the shared
config-interpretation layer that both `dash_evals` and external consumers (like
yardstick) use to ensure consistent hydration of config into Inspect AI objects.

| Helper | Source | What it does |
|--------|--------|-------------|
| `create_mcp_servers(configs, sandbox_type)` | `dataset_config_python` | Creates MCP server objects from variant config |
| `get_skill_tool(config)` | `dataset_config_python` | Creates a skill tool if the variant has `skills` configured |
| `build_task_metadata(config)` | `dataset_config_python` | Builds the metadata dict for the `Task` object |
| `append_context_injection(chain, config)` | `dash_evals` | Adds a `context_injector` solver if the variant has `files` |
| `append_model_interaction(chain, config)` | `dash_evals` | Adds `react()` (if tools exist) or `generate()` (if not) |
| `validate_sandbox_tools(config, tool_names)` | `dash_evals` | Checks that sandbox-requiring tools aren't used on local |

These helpers mean that most of the variant logic (context injection, MCP tools,
skills) is handled **automatically**. You just need to define the core solver
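
As a sketch of how a task function might compose these helpers (the task shape and dataset argument are hypothetical, and `append_model_interaction` may accept further arguments beyond the two shown in the table):

```python
from inspect_ai import Task

from dash_evals.runner.tasks.task_helpers import (
    append_context_injection,
    append_model_interaction,
    build_task_metadata,
)


def my_task(config: dict, dataset) -> Task:  # hypothetical task shape
    solver_chain: list = []
    # Adds a context_injector solver only if the variant has files.
    append_context_injection(solver_chain, config)
    # Adds react() when tools are configured, plain generate() otherwise.
    append_model_interaction(solver_chain, config)
    return Task(
        dataset=dataset,
        solver=solver_chain,
        metadata=build_task_metadata(config),
    )
```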
1 change: 1 addition & 0 deletions packages/dash_evals/pyproject.toml
@@ -15,6 +15,7 @@ dependencies = [
"openai>=2.8.1,<3.0.0",
"firebase-admin>=6.0.0,<8.0.0",
"pydantic>=2.0.0,<3.0.0",
"dataset-config-python",
]

[project.optional-dependencies]
68 changes: 1 addition & 67 deletions packages/dash_evals/src/dash_evals/runner/json_runner.py
@@ -11,7 +11,7 @@
from pathlib import Path

import inspect_ai
from inspect_ai.dataset import MemoryDataset, Sample, csv_dataset, json_dataset
from dataset_config_python.hydrate import build_dataset as _build_dataset

from dash_evals.utils.logging import capture_output, setup_logging

@@ -94,74 +94,8 @@ def _resolve_task_func(name: str):
return func


def _build_dataset(task_def: dict):
"""Build an Inspect AI dataset from a task definition.

Dispatches on ``task_def["dataset"]["format"]``:

- ``"memory"`` (default): builds a ``MemoryDataset`` from inline samples.
- ``"json"``: delegates to ``inspect_ai.dataset.json_dataset(source, **args)``.
- ``"csv"``: delegates to ``inspect_ai.dataset.csv_dataset(source, **args)``.

Args:
task_def: A task entry from the EvalSet JSON manifest.

Returns:
An Inspect AI dataset object.

Raises:
ValueError: If the dataset format is unrecognized or required fields
(e.g. ``source`` for json/csv) are missing.
"""
dataset_def = task_def.get("dataset")
task_name = task_def.get("name", "")

if not dataset_def:
return MemoryDataset([], name=task_name)

fmt = dataset_def.get("format", "memory")
extra_args: dict = dataset_def.get("args") or {}

if fmt == "json":
source = dataset_def.get("source")
if not source:
raise ValueError(
f"Task '{task_name}': dataset format 'json' requires a 'source' field."
)
return json_dataset(source, **extra_args)

if fmt == "csv":
source = dataset_def.get("source")
if not source:
raise ValueError(
f"Task '{task_name}': dataset format 'csv' requires a 'source' field."
)
return csv_dataset(source, **extra_args)

if fmt == "memory":
raw_samples = dataset_def.get("samples", [])
samples = []
for raw in raw_samples:
sample = Sample(
input=raw["input"],
target=raw.get("target", ""),
id=raw.get("id"),
metadata=raw.get("metadata"),
files=raw.get("files"),
setup=raw.get("setup"),
sandbox=raw.get("sandbox"),
)
samples.append(sample)

return MemoryDataset(
samples,
name=dataset_def.get("name", task_name),
)

raise ValueError(
f"Task '{task_name}': unknown dataset format '{fmt}'. "
f"Expected one of: 'memory', 'json', 'csv'."
)


def run_from_json(manifest_path: str | Path) -> bool:
176 changes: 10 additions & 166 deletions packages/dash_evals/src/dash_evals/runner/tasks/task_helpers.py
@@ -11,18 +11,22 @@

from __future__ import annotations

import importlib
from typing import Any, cast

from typing import cast

# Re-export config-interpretation helpers from the shared package.
# These are the single source of truth for interpreting config dicts
# as Inspect AI objects; both dash_evals and yardstick use them.
from dataset_config_python.hydrate import ( # noqa: F401
build_task_metadata,
create_mcp_servers,
get_skill_tool,
)
from inspect_ai.agent import react
from inspect_ai.solver import Solver, generate
from inspect_ai.tool import (
MCPServer,
Tool,
mcp_server_http,
mcp_server_sandbox,
mcp_server_stdio,
skill,
)

from dash_evals.runner.solvers import context_injector
@@ -66,127 +70,6 @@ def validate_sandbox_tools(config: dict, tool_names: list[str]) -> None:
)


def _resolve_mcp_ref(ref: str) -> MCPServer:
"""Resolve a Python import reference to an MCPServer object.

Supports ``"module.path:variable_name"`` format.

Args:
ref: Import reference (e.g. ``"my_package.mcp:staging_server"``).

Returns:
The resolved MCPServer object.
"""
if ":" not in ref:
raise ValueError(
f"Invalid MCP server ref '{ref}'. Expected format: 'module.path:variable_name'"
)
module_path, attr_name = ref.rsplit(":", 1)
try:
module = importlib.import_module(module_path)
except ImportError as e:
raise ImportError(
f"Could not import module '{module_path}' for MCP server ref '{ref}': {e}"
) from e
try:
server = getattr(module, attr_name)
except AttributeError as e:
raise AttributeError(
f"Module '{module_path}' has no attribute '{attr_name}' "
f"(referenced by MCP server ref '{ref}')"
) from e
return server


def create_mcp_servers(
mcp_configs: list[dict],
sandbox_type: str = "local",
) -> list[MCPServer]:
"""Create MCP server objects from variant config.

Supports three modes per entry:
- **Declarative stdio/sandbox**: dict with ``command``, ``args``, etc.
- **Declarative HTTP**: dict with ``url``, and optionally ``authorization``/``headers``.
- **Python ref**: dict with ``ref`` key pointing to a pre-built MCPServer.

Transport is auto-selected when not explicit:
- If ``url`` is present → ``mcp_server_http``
- If sandbox is non-local → ``mcp_server_sandbox``
- Otherwise → ``mcp_server_stdio``

Args:
mcp_configs: List of MCP server config dicts from variant_config.
sandbox_type: The sandbox type for the current eval run.

Returns:
List of MCPServer objects.
"""
servers: list[MCPServer] = []
for cfg in mcp_configs:
# Ref mode — import a pre-built MCPServer from Python
if cfg.get("ref"):
servers.append(_resolve_mcp_ref(cfg["ref"]))
continue

# HTTP mode — url-based server
url = cfg.get("url")
if url:
name = cfg.get("name", url)
authorization = cfg.get("authorization") or cfg.get("auth")
headers = cfg.get("headers")
servers.append(
mcp_server_http(
url=url,
name=name,
authorization=authorization,
headers=headers,
)
)
continue

# Stdio / sandbox mode — command-based server
command = cfg.get("command")
if not command:
raise ValueError(
f"MCP server config missing 'command' or 'url' for server "
f"'{cfg.get('name', 'unknown')}': {cfg}"
)

name = cfg.get("name", command)
args = cfg.get("args", [])
env = cfg.get("env")
cwd = cfg.get("cwd")

transport = cfg.get("transport")
if transport is None:
transport = "sandbox" if sandbox_type != "local" else "stdio"

if transport == "stdio":
servers.append(
mcp_server_stdio(
name=name,
command=command,
args=args,
env=env,
cwd=cwd,
)
)
elif transport == "sandbox":
servers.append(
mcp_server_sandbox(
name=name,
command=command,
args=args,
env=env,
cwd=cwd,
)
)
else:
raise ValueError(f"Unknown MCP transport '{transport}' for server '{name}'")

return servers


# Backwards-compatible alias
def create_mcp_server(config: dict | None = None):
"""Create the default Dart MCP server (backwards-compatible alias)."""
@@ -202,28 +85,6 @@ def create_dart_mcp_server():
return create_mcp_server()


def build_task_metadata(config: dict) -> dict:
"""Build task metadata dictionary from manifest config.

Args:
config: Task manifest entry with 'variant', 'save_examples', etc.

Returns:
Metadata dictionary for Task.
"""
metadata: dict[str, Any] = {}
variant = config.get("variant", {})
if variant:
metadata["variant_config"] = variant

if config.get("save_examples") and config.get("examples_dir"):
metadata["save_examples"] = True
metadata["examples_dir"] = config["examples_dir"]
metadata["task_variant"] = config.get("task_name", "unknown")

return metadata


def append_context_injection(solver_chain: list, config: dict) -> None:
"""Append context injection solver if the variant has context files.

Expand All @@ -238,23 +99,6 @@ def append_context_injection(solver_chain: list, config: dict) -> None:
solver_chain.append(context_injector(context_files))


def get_skill_tool(config: dict) -> Tool | None:
"""Create the skill tool if the variant has skills configured.

Args:
config: Task manifest entry with 'variant' key.

Returns:
The skill Tool, or None if no skills are configured.
"""
variant = config.get("variant", {})
# Support both old "skill_paths" and new "skills" key
skill_paths = variant.get("skills") or variant.get("skill_paths", [])
if skill_paths:
return skill(skill_paths)
return None


def append_model_interaction(
solver_chain: list,
config: dict,