From b23c6567e6f03b9f11e8be88f36ff2f62b84cd2d Mon Sep 17 00:00:00 2001 From: PeteFedora Date: Thu, 28 May 2026 17:08:27 +0200 Subject: [PATCH] feat: inject workspace file context into LLM prompts for data-aware code generation --- treesearch/minimal_agent.py | 76 ++++++++++++-- utils/workspace_context.py | 203 ++++++++++++++++++++++++++++++++++++ 2 files changed, 269 insertions(+), 10 deletions(-) create mode 100644 utils/workspace_context.py diff --git a/treesearch/minimal_agent.py b/treesearch/minimal_agent.py index 7a1f9d0..21111c5 100644 --- a/treesearch/minimal_agent.py +++ b/treesearch/minimal_agent.py @@ -22,6 +22,11 @@ from treesearch.utils.response import strip_markdown_fences from utils.log import _ROOT_LOGGER from utils.path import mkdir +from utils.workspace_context import ( + find_project_root, + find_workspace_dir, + get_workspace_context_block, +) logger = _ROOT_LOGGER.getChild("nodeAgent") @@ -29,6 +34,7 @@ class MinimalAgent: """A minimal agent class that only contains what's needed for processing nodes""" + def __init__( self, task_desc: str, @@ -44,6 +50,14 @@ def __init__( self.evaluation_metrics = evaluation_metrics self.stage_name = stage_name self._out_dir = mkdir(Path(cfg.out_dir)) + + # Dynamically discover project root and workspace directories + # (these can differ between Docker, pip, and local runs) + self._project_root = find_project_root() + self._workspace_dir = find_workspace_dir() + logger.info(f"Project root: {self._project_root}") + logger.info(f"Workspace dir: {self._workspace_dir}") + logger.info("Agent initialized!") # Setup MCP connections for documentation search @@ -77,8 +91,15 @@ def _prompt_environment(self): ] pkg_str = ", ".join([f"`{p}`" for p in pkgs]) + # Dynamically build workspace context so the LLM knows where files are + workspace_block = get_workspace_context_block( + project_root=self._project_root, + workspace_dir=self._workspace_dir, + ) + env_prompt = { - "Installed Packages": f"Your solution can use the following machine learning packages: {pkg_str}. You MUST use these libraries as much as possible instead of implementing from scratch." + "Installed Packages": f"Your solution can use the following machine learning packages: {pkg_str}. You MUST use these libraries as much as possible instead of implementing from scratch.", + "Workspace Context": workspace_block, } return env_prompt @@ -94,12 +115,20 @@ def _prompt_impl_guideline(self): "4. Environment Setup:", " - Create working directory: `working_dir = os.path.join(os.getcwd(), 'working'); os.makedirs(working_dir, exist_ok=True)`", f" - Complete execution within {humanize.naturaldelta(self.cfg.exec.timeout)}", - "5. Data Tracking:", + "5. Data Loading:", + " - Use the `dataloader` package for all standard datasets:", + " ```python", + " from dataloader.loaders.registry import _run_loader", + " df = _run_loader('MovieLens100K') # Downloads & caches automatically", + " ```", + " - For custom data files: place them in the workspace directory and reference by filename", + " (code runs inside the workspace, so just use the filename directly)", + "6. Data Tracking:", " - Track all relevant data points (e.g., metrics, losses)", - "6. Evaluation:", + "7. Evaluation:", f" - Metrics: {', '.join(self.evaluation_metrics) if self.evaluation_metrics else 'Choose appropriate metrics'}", " - Print metrics during execution for monitoring", - "7. API Verification (CRITICAL):", + "8. API Verification (CRITICAL):", " - Check constructor signatures before use", " - Verify object attributes exist (e.g., SplitData structure)", " - Use only public APIs (no underscore-prefixed methods)", @@ -287,6 +316,12 @@ def _new_node(self, plan: str, code: str, parent: Optional[Node] = None): async def plan_and_code_query(self, prompt, retries=3) -> tuple[str, str]: """Generate a natural language plan + code in the same LLM call and split them apart.""" + # Build workspace context block for the system prompt + workspace_block = get_workspace_context_block( + project_root=self._project_root, + workspace_dir=self._workspace_dir, + ) + plan_and_code_result = ( await Query(tool_budget=40) .with_mcp(self._mcp_docs) @@ -305,7 +340,10 @@ async def plan_and_code_query(self, prompt, retries=3) -> tuple[str, str]: "- Data structures and return types\n" "\n" "In 'nl_text', include '## Documentation Verified' section listing all verified methods.\n" - "Search for examples and Verify critical details in documentation." + "Search for examples and Verify critical details in documentation.\n" + "\n" + "IMPORTANT — File system awareness:\n" + f"{workspace_block}" ) .run(prompt, PlanAndCode) ) @@ -316,22 +354,32 @@ async def plan_and_code_query(self, prompt, retries=3) -> tuple[str, str]: async def _select_datasets(self) -> list[str]: """Select appropriate datasets for the research task using LLM.""" + # Check for any custom/local data files the user might be referencing + workspace_block = get_workspace_context_block( + project_root=self._project_root, + workspace_dir=self._workspace_dir, + ) + prompt: Prompt = { "Instruction:": ( f"You are a recommender system researcher selecting datasets for a research task.\n\n" f"Research task:\n{self.task_desc}\n\n" "Instructions:\n" "1. Check if the research task specifies any datasets\n" - "2. If specified, select those datasets; otherwise choose appropriate ones from the list below\n" - "3. Return only a list of dataset identifiers\n\n" - f"Available datasets:\n{get_datasets_table()}" + "2. Check the 'Workspace Context' below — if a data file matching the task is found, " + "the user likely wants to use that file (e.g., if they mention 'movielens.csv' and it exists on disk)\n" + "3. If the task doesn't specify a dataset, choose appropriate ones from the available datasets below\n" + "4. Return ONLY dataset identifiers from the list below, OR the filename if a local file is to be used\n\n" + f"Workspace Context:\n{workspace_block}\n\n" + f"Available datasets (use these identifiers if no local file matches):\n{get_datasets_table()}" ) } result = ( await Query() .with_mcp(self._mcp_docs) .with_system( - "Search OmniRec documentation for dataset characteristics and usage patterns if needed." + "Search OmniRec documentation for dataset characteristics and usage patterns if needed. " + "If the user mentioned a dataset by name, check if there's a matching file on disk in the workspace context." ) .run(prompt, SelectDatasets) ) @@ -339,12 +387,19 @@ async def _select_datasets(self) -> list[str]: async def _set_code_requirements(self): logger.info("Engineering code requirements...") + workspace_block = get_workspace_context_block( + project_root=self._project_root, + workspace_dir=self._workspace_dir, + ) requirements_prompt = f""" You are an expert recommender systems researcher defining experiment requirements. Research task: {self.task_desc} Selected datasets: {self.selected_datasets} + Files available in workspace: + {workspace_block} + Generate requirements that specify critical aspects of the experiment that must be fulfilled. PRINCIPLES: @@ -356,11 +411,12 @@ async def _set_code_requirements(self): 2. Abstraction: State objectives and constraints at an appropriate level - Avoid excessive implementation details (exact formulas, nested conditional logic, code-level instructions) - Include critical technical specifications where they matter (framework to use, specific datasets, evaluation metrics, split ratios) + - If there are local data files, include a requirement about using the correct file path 3. Atomicity: Each requirement should test one distinct aspect of the experiment 4. Coverage: Include requirements for all essential aspects: - - Data loading and preprocessing + - Data loading and preprocessing (use correct paths for local files or dataloader for remote datasets) - Experimental methodology (data splitting, reproducibility requirements) - Model/algorithm selection and configuration — ALWAYS include a requirement that OmniRec must be used for all recommender system functionality; raw backend libraries (Lenskit, RecBole, etc.) must not be called directly - Training procedures diff --git a/utils/workspace_context.py b/utils/workspace_context.py new file mode 100644 index 0000000..553079f --- /dev/null +++ b/utils/workspace_context.py @@ -0,0 +1,203 @@ +""" +Workspace context utilities. + +Provides the LLM planner with awareness of what custom data files the user +placed in the workspace directory. Since the workspace can be in different +locations (Docker: /app/out/workspace, local: ./out/workspace), +this module dynamically discovers the correct paths. +""" +import os +from pathlib import Path +from typing import Optional + +from utils.log import _ROOT_LOGGER + +logger = _ROOT_LOGGER.getChild("workspace_context") + + +def find_project_root() -> Path: + """ + Find the project root directory by looking for sentinel files + (pyproject.toml, setup.py, .git). + + Returns: + Path: The absolute path to the project root. + """ + cwd = Path.cwd().resolve() + for parent in [cwd] + list(cwd.parents): + sentinels = [ + parent / "pyproject.toml", + parent / "setup.py", + parent / "setup.cfg", + parent / ".git", + parent / "main.py", + ] + for sentinel in sentinels: + if sentinel.exists(): + logger.debug(f"Found project root at {parent}") + return parent + + logger.warning(f"Could not determine project root, falling back to {cwd}") + return cwd + + +def find_workspace_dir() -> Optional[Path]: + """ + Find the current workspace directory by checking common locations. + + The workspace is where code gets executed (Interpreter writes runfile.py here). + It can be: + - ./out/workspace (when running locally via `uv run main.py`) + - /app/out/workspace (when running inside Docker) + - A custom path set via ARL_out_dir env var + + Returns: + Optional[Path]: Path to the workspace directory, or None if not found. + """ + env_out_dir = os.environ.get("ARL_out_dir") + if env_out_dir: + candidate = Path(env_out_dir).resolve() / "workspace" + if candidate.exists(): + return candidate + + candidates = [ + Path.cwd() / "out" / "workspace", + Path.cwd().resolve() / "out" / "workspace", + Path("/app/out/workspace"), + ] + + if Path("/app").exists(): + candidates.append(Path("/app/out/workspace")) + + for candidate in candidates: + if candidate.exists(): + logger.debug(f"Found workspace at {candidate}") + return candidate + + logger.debug("No workspace directory found yet (will be created during execution)") + return None + + +def scan_workspace_for_data_files(workspace_dir: Optional[Path] = None) -> str: + """ + Scan ONLY the workspace directory for user-placed data files. + + Standard datasets should be loaded via the dataloader package. + Only custom/user-provided files (CSV, Parquet, JSON, etc.) placed + directly in the workspace are listed here. + Internal files (.pkl, .py, .log) are ignored automatically. + + Args: + workspace_dir: The workspace directory to scan. If None, auto-detect. + + Returns: + str: A formatted block describing available files, or empty string if none found. + """ + if workspace_dir is None: + workspace_dir = find_workspace_dir() + + if workspace_dir is None or not workspace_dir.exists(): + return "" + + relevant_extensions = { + ".csv": "CSV data file", + ".tsv": "TSV data file", + ".parquet": "Parquet data file", + ".json": "JSON data file", + ".jsonl": "JSONL data file", + } + + found_files: list[tuple[Path, str]] = [] + + # Only scan the workspace directory itself (non-recursive) + # to avoid picking up runfile.py, working/, etc. + for item in workspace_dir.iterdir(): + if item.is_file() and item.suffix.lower() in relevant_extensions: + found_files.append((item, relevant_extensions.get(item.suffix.lower(), "Data file"))) + + if not found_files: + return "" + + lines = ["## Custom data files in workspace", ""] + for fpath, desc in sorted(found_files, key=lambda x: x[0].name): + size_str = _format_size(fpath.stat().st_size) + lines.append(f"- `{fpath.name}` — {desc} ({size_str})") + lines.append("") + lines.append("To use a custom file in your code, reference it by its filename") + lines.append("(the code runs inside the workspace directory):") + lines.append("```python") + lines.append("df = pd.read_csv('filename.csv')") + lines.append("```") + lines.append("") + + return "\n".join(lines) + + +def _format_size(size_bytes: int) -> str: + """Format byte size into human-readable string.""" + for unit in ["B", "KB", "MB", "GB"]: + if size_bytes < 1024: + return f"{size_bytes:.1f} {unit}" + size_bytes /= 1024 + return f"{size_bytes:.1f} TB" + + +def get_workspace_context_block( + project_root: Optional[Path] = None, + workspace_dir: Optional[Path] = None, +) -> str: + """ + Get a formatted context block about the workspace that can be injected + into LLM prompts. + + This tells the planner: + - Where the project root and workspace are + - How to use the dataloader package for standard datasets + - What custom data files the user placed in the workspace + + Args: + project_root: Override for project root (auto-detected if None). + workspace_dir: Override for workspace dir (auto-detected if None). + + Returns: + str: A context block to inject into prompts. + """ + if project_root is None: + project_root = find_project_root() + if workspace_dir is None: + workspace_dir = find_workspace_dir() + + parts = [ + "## Workspace & File Context", + "", + f"- **Project root:** `{project_root}`", + f"- **Code execution workspace:** `{workspace_dir if workspace_dir else 'Not yet created (will be ./out/workspace)'}`", + "", + "### How to access data files", + "", + "When your code runs, the working directory is the code execution workspace listed above.", + "", + "You have two options to load datasets:", + "", + "1. **Use the `dataloader` package** (recommended for standard datasets):", + " ```python", + " from dataloader.loaders.registry import _run_loader", + " df = _run_loader('MovieLens100K') # Downloads & caches automatically", + " ```", + " Available datasets include: MovieLens100K, MovieLens1M, MovieLens10M, MovieLens20M,", + " MovieLens25M, MovieLensLatest, MovieLensLatestSmall, MovieLens1BSynthetic,", + " Amazon2014*, Amazon2018*, Amazon2023*, Yelp2023, Gowalla, BeerAdvocate, etc.", + "", + "2. **Use custom files placed in the workspace** (listed below):", + "", + ] + + # Only scan the workspace for user-placed custom files + custom_files_block = scan_workspace_for_data_files(workspace_dir) + if custom_files_block: + parts.append(custom_files_block) + else: + parts.append(" *(No custom data files found in workspace.)*") + parts.append("") + + return "\n".join(parts)