diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d0ffdb8..c867f6d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,9 +5,57 @@ on: branches: [main, develop] pull_request: +env: + FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true + jobs: + lint: + name: Lint & Format + runs-on: ubuntu-latest + permissions: + contents: write # Needed for auto-commit + + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + + - name: Set up Rust + uses: dtolnay/rust-toolchain@stable + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + cache-dependency-glob: "pyproject.toml" + + - name: Set up Python + run: uv python install 3.12 + + - name: Sync dependencies + run: uv sync --all-extras + + - name: Run pre-commit hooks + id: precommit + run: uv run pre-commit run --all-files + + - name: Check README sync + run: uv run python scripts/sync_readme.py --check + + - name: Auto-commit formatting fixes + if: failure() + uses: stefanzweifel/git-auto-commit-action@v5 + with: + commit_message: "style: auto-fix formatting and linting issues" + commit_options: "--no-verify" + # Fail the job even if we commit so the user knows they need to pull + + - name: Fail if pre-commit failed + if: steps.precommit.outcome == 'failure' + run: exit 1 + test: name: ${{ matrix.os }} / py${{ matrix.python-version }} + needs: lint runs-on: ${{ matrix.os }} strategy: @@ -41,12 +89,6 @@ jobs: - name: Check native import run: uv run python -c "import scriber._native; print('native ok')" - - name: Rust format check - run: cargo fmt --check - - - name: Rust clippy - run: cargo clippy --all-targets -- -D warnings - - name: Rust tests run: cargo test @@ -54,4 +96,4 @@ jobs: run: uv run pytest - name: CLI smoke - run: uv run scriber . --only-tree --output - \ No newline at end of file + run: uv run scriber . 
--only-tree --output - diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index da08b96..2378218 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -5,6 +5,9 @@ on: tags: - "v*" +env: + FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true + jobs: build: name: Build ${{ matrix.os }} @@ -82,4 +85,4 @@ jobs: - name: Publish uses: pypa/gh-action-pypi-publish@release/v1 with: - packages-dir: dist-artifacts \ No newline at end of file + packages-dir: dist-artifacts diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3ec7f23 --- /dev/null +++ b/.gitignore @@ -0,0 +1,57 @@ +# Environments +.env +.venv +env/ +venv/ +ENV/ + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Rust +target/ +**/*.rs.bk + +# Caches and tooling +.pytest_cache/ +.ruff_cache/ +.mypy_cache/ +.coverage +htmlcov/ +.tox/ +.nox/ + +# Scriber specific +.scriber/ +scriber_pack.md +*.scriber_pack.md +context.md + +# IDEs and Editors +.idea/ +.vscode/ +*.swp +*.swo +*~ +.DS_Store diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..12a0146 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,30 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-toml + +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.3.5 + hooks: + - id: ruff + args: [ --fix ] + - id: ruff-format + +- repo: local + hooks: + - id: cargo-fmt + name: cargo fmt + entry: cargo fmt --manifest-path Cargo.toml -- + language: system + types: [rust] + pass_filenames: true + - id: cargo-clippy + name: cargo clippy + entry: cargo clippy --manifest-path Cargo.toml -- -D warnings + language: system + types: [rust] + 
pass_filenames: false diff --git a/CHANGELOG.md b/CHANGELOG.md index bcd7481..dd220be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,13 +5,29 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [2.1.0] - 2026-05-31 + +### Added +- **Frontend Graph Tracking**: Added dependency parsing support for modern frontend frameworks (`.vue`, `.svelte`, `.astro`), HTML templates, and CSS stylesheets within JS/TS graph construction. +- **Packaging Profiles (`--profile`)**: Added `default`, `audit`, `debug`, `refactor`, and `docs` profiles to quickly bias the file scoring and inclusion criteria without manually tweaking config options. +- **CLI Introspection**: Added `--explain` flag as an alias. Enhanced `--why` output to show estimated token cost, content mode, and omission reasons for any target file. +- **Automated README Sync**: Added `scripts/sync_readme.py` tool to automatically sync CLI arguments, profiles documentation, and version tags across the `README.md`. +- **AI-Native Navigation & Optimization**: Implemented XML anchors for symbols, aggressive test file quarantine, and support file pruning to keep focused mode clean and strictly token-capped. +- **Dependency Limiting**: Introduced `top_dependencies` (defaulting to 10) in the configuration to limit the width of the graph traversal and pull in only the highest-confidence dependencies per file. +- **Version Alignment**: Synchronized Python and Rust crate versions. `scriber --version` now reports both Python and native API versions. + +### Fixed +- **Cache Stability**: Fixed graph warm-cache edge generation and stale import cache validation (now strictly validating `mtime` and `size`). 
+- **Resilience & Scanners**: Added pure-Python fallback for `read_text_lossy`, optimized scanner ordering (whitelist before binary check), and corrected the test role classifier to prevent false positives on files naturally named `tests.py`. +- **Excerpt Fallback Bug**: Fixed rendering and token estimations for empty excerpt files; they now correctly fall back to outline AST structures or full content if budget allows. + ## [2.0.0] - 2026-05-30 ### Added -- **⚡ Native Rust Acceleration (`scriber._native`)**: Full transition of filesystem scanning, high-performance file reading/writing, and binary classification to a compiled Rust extension built using Maturin and PyO3. -- **🌳 Fast Parallel Scanner**: Re-engineered directory scanning utilizing the `WalkBuilder` from the `ignore` crate, fully respecting `.gitignore` rules with blazing fast native execution. -- **🧪 Rigorous Verification & Equivalence Testing**: Comprehensive suite of regression and equivalence tests validating 100% exact matching behavior between Rust and Python scanner modules. -- **📦 Multi-Platform Binary Wheels**: CI/CD integration using `PyO3/maturin-action` to compile and distribute native wheels across Linux, macOS, and Windows. +- **Native Rust Acceleration (`scriber._native`)**: Full transition of filesystem scanning, high-performance file reading/writing, and binary classification to a compiled Rust extension built using Maturin and PyO3. +- **Fast Parallel Scanner**: Re-engineered directory scanning utilizing the `WalkBuilder` from the `ignore` crate, fully respecting `.gitignore` rules with blazing fast native execution. +- **Rigorous Verification & Equivalence Testing**: Comprehensive suite of regression and equivalence tests validating 100% exact matching behavior between Rust and Python scanner modules. +- **Multi-Platform Binary Wheels**: CI/CD integration using `PyO3/maturin-action` to compile and distribute native wheels across Linux, macOS, and Windows. 
## [1.1.2] - 2025-09-30 @@ -53,7 +69,7 @@ The CLI now falls back to simple text-based output if `rich` is not installed. ### Added - Configured a GitHub Actions pipeline for automated testing and releases. -- `-v` and `--version` to scriber app +- `-v` and `--version` to scriber app - The `--config` flag now accepts a path to a `pyproject.toml` file, providing more flexibility for monorepo configurations. ### Fixed @@ -68,4 +84,4 @@ The CLI now falls back to simple text-based output if `rich` is not installed. - **Clipboard Integration**: Enabled copying the generated project structure to the clipboard. - **Command-Line Interface**: Created a command-line tool with a configurable `init` command for saving settings to `pyproject.toml`. - **Configuration**: Introduced `pyproject.toml` as the single source of truth for project metadata and configuration. -- **Testing**: Added a test suite using `pytest` to ensure core functionality and CLI commands work as expected. \ No newline at end of file +- **Testing**: Added a test suite using `pytest` to ensure core functionality and CLI commands work as expected. diff --git a/Cargo.toml b/Cargo.toml index 91e0426..a57d2f4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "project-scriber-native" -version = "2.0.0" +version = "2.1.0" edition = "2021" [lib] @@ -16,4 +16,3 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" memchr = "2.7" regex = "1.10" - diff --git a/README.md b/README.md index 7dc7d1a..e711a7b 100644 --- a/README.md +++ b/README.md @@ -9,13 +9,13 @@ PyPI Version

-An intelligent tool to map, analyze, and compile project source code into a single, context-optimized text file for Large Language Models (LLMs). **Version 2.0** brings advanced dependency graph analysis, strict whitelist-based file inclusion, zero-dependency lightweight execution, and progress tracking! +An intelligent tool to map, analyze, and compile project source code into a single, context-optimized text file for Large Language Models (LLMs). **Version 2** brings advanced dependency graph analysis, strict whitelist-based file inclusion, zero-dependency lightweight execution, and progress tracking! ----- ## 📖 Table of Contents -- [🤔 Why ProjectScriber 2.0?](#-why-projectscriber-20) +- [🤔 Why ProjectScriber?](#-why-projectscriber) - [✨ Key Features](#-key-features) - [🚀 Quick Start](#-quick-start) - [💾 Installation](#-installation) @@ -25,14 +25,14 @@ An intelligent tool to map, analyze, and compile project source code into a sing ----- -## 🤔 Why ProjectScriber 2.0? +## 🤔 Why ProjectScriber? When working with Large Language Models, providing the full context of a codebase is crucial for getting accurate analysis, documentation, or refactoring suggestions. However, blindly pasting an entire project wastes tokens and introduces noise. -**ProjectScriber 2.0** automates context building using a **Whitelist-First** philosophy and an **Intelligent Scoring Engine**. It analyzes your codebase's dependency graph (e.g., Python imports), determines which files are most relevant to the code you're working on, and bundles them into a single, clean markdown file, strictly respecting your token budgets and file-type configurations. +**ProjectScriber** automates context building using a **Whitelist-First** philosophy and an **Intelligent Scoring Engine**. 
It analyzes your codebase's dependency graph (e.g., Python imports), determines which files are most relevant to the code you're working on, and bundles them into a single, clean markdown file, strictly respecting your token budgets and file-type configurations.

- 📁 Your Codebase → 📦 ProjectScriber 2.0 → 📋 LLM-Ready Context + 📁 Your Codebase → 📦 ProjectScriber → 📋 LLM-Ready Context

----- @@ -123,32 +123,78 @@ uv pip install project-scriber ### CLI Options + | Option | Description | |:---|:---| -| `paths` | Project file/folder paths used as seeds. Defaults to current directory `.`. | -| `--config [path]` | Path to `pyproject.toml`. Its parent directory becomes the project root. | -| `--path-base [base]`| Base for relative paths: `project` (default) or `cwd`. | -| `--format [md, txt]` | Output format. Defaults to `md` (Markdown). | -| `--output [file]` | Output file path. Use `-` for stdout. | -| `--dry-run` | Show pack summary without writing the output file. | -| `--open` | Open the generated file in the default editor. | -| `--validate-config`| Validate the `[tool.scriber]` configuration and exit. | -| `--only-tree` | Render only the scored tree/map, without any file contents. | -| `--[no-]modules` | Enable/Disable automatic related module selection (dependency graph scanning). | -| `--[no-]support` | Enable/Disable support files (like `.env.example`, `.github/workflows`). | -| `--support-content` | Override support file content policy (`full`, `auto`, `tree_only`). | +| `paths` | Project file/folder paths used as seeds. Defaults to current directory. | +| `--profile` | Preset configuration profile. | +| `--config` | Path to pyproject.toml. Its parent directory becomes the project root. | +| `--path-base` | Base directory for relative paths when --config is used. | +| `--format` | Output format. | +| `--output` | Output file path, relative to project root unless absolute. Use '-' for stdout. | +| `--only-tree` | Render only scored tree/map, without file contents. | +| `--modules` | Enable automatic related module selection. | +| `--no-modules` | Disable automatic related module selection. | +| `--support` | Enable support files. | +| `--no-support` | Disable support files. | +| `--support-content` | Override default support file content policy. | | `--max-files` | Maximum number of files in the pack. 
| -| `--max-tokens` | Approximate token budget using char-based estimation. `0` disables budget. | -| `--min-score` | Minimum relevance score (0-100) for non-seed files to be included. | -| `--init` | Append a default `[tool.scriber]` config to `pyproject.toml` and exit. | -| `--force` | Force overwrite of the config block when used with `--init`. | -| `--version` | Show program's version number and exit. | +| `--max-tokens` | Approximate token budget for included file contents. 0 disables budget. | +| `--min-score` | Minimum score for non-seed files. | +| `--init` | Append a default [tool.scriber] config to pyproject.toml and exit. | +| `--force` | Allow --init to append even if [tool.scriber] already exists. | +| `--project` | Force project snapshot mode. | +| `--explain, --explain-selection` | Explain reason for file selection in detail. | +| `--explain-graph` | Print relation graph statistics and relations. | +| `--why` | Print exactly which rules/edges pulled the specified file into the pack. | +| `--graph-json` | Export the RelationGraph as a JSON file to the specified path. | +| `--validate-config` | Validate pyproject.toml scriber config. | +| `--dry-run` | Perform a dry run without saving the pack file. | +| `--open` | Open the output file automatically after creation. | +| `--timings` | Show execution timings for each phase. | +| `--version` | Show version information and exit. | + + + +### Profiles + +ProjectScriber comes with several preset profiles to quickly bias the file scoring and inclusion criteria: + +| Profile | Description | +|:---|:---| +| `default` | Standard scoring behavior. | +| `audit` | Boosts tests, config files, CI environments, and dependency files. Assumes full support content inclusion. | +| `debug` | Boosts direct/reverse dependencies, tests, runtime support, and files close to the seed path. | +| `refactor` | Boosts files within the same package, related tests, and direct dependencies. 
| +| `docs` | Heavily boosts documentation files while suppressing test and code file scores. Assumes tree_only support content by default. | + + +----- + +## πŸ› οΈ IDE Integrations + +### PyCharm / IntelliJ IDEA (External Tools) + +You can integrate ProjectScriber directly into PyCharm's right-click context menu to quickly generate LLM context packs for any selected file or folder! + +1. Open **Settings / Preferences** βž” **Tools** βž” **External Tools**. +2. Click the **`+`** button to add a new tool. +3. Configure it as follows: + +* **Name:** `Scriber` +* **Group:** `External Tools` +* **Description:** `Runs ProjectScriber on the selected directory and copies output to clipboard` +* **Program:** `scriber` *(or the absolute path to your `scriber.exe` e.g., `C:\Tools\Python\Python313\Scripts\scriber.exe`)* +* **Arguments:** `"$FilePath$" --config $ProjectFileDir$/pyproject.toml` +* **Working directory:** `$ProjectFileDir$` + +Now, you can simply right-click any file or directory in your Project tree, select **External Tools** βž” **Scriber**, and the context pack will be generated instantly based on your project configuration! ----- ## βš™οΈ Configuration -ProjectScriber 2.0 configures itself through the standard `pyproject.toml` using the `[tool.scriber]` table. +ProjectScriber 2.1.0 configures itself through the standard `pyproject.toml` using the `[tool.scriber]` table. Generate the default block using: ```shell @@ -217,7 +263,7 @@ patterns = [ ``` ### Whitelist Policy -ProjectScriber 2.0 uses a strict **whitelist** approach: +ProjectScriber 2.1.0 uses a strict **whitelist** approach: 1. Files must match either a `code_pattern` or a `support_pattern` to be considered. 2. Unrecognized extensions and binary files are automatically excluded, keeping your LLM context safe from binary garbage. 3. Lock files are included in the tree by default, but their contents are omitted to save tokens. @@ -246,4 +292,4 @@ Contributions are welcome! 3. 
**Run Tests**: ```shell uv run pytest - ``` \ No newline at end of file + ``` diff --git a/assets/scriber_name.svg b/assets/scriber_name.svg index 2c84186..b7cbe31 100644 --- a/assets/scriber_name.svg +++ b/assets/scriber_name.svg @@ -2,4 +2,4 @@ ProjectScriber - \ No newline at end of file + diff --git a/pyproject.toml b/pyproject.toml index 6339246..fd78d83 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "project-scriber" -version = "2.0.0" +version = "2.1.0" description = "Scriber 2.0: build intelligent code packs from one or more project paths." readme = "README.md" requires-python = ">=3.10" @@ -34,6 +34,7 @@ dependencies = [ dev = [ "pytest>=8", "maturin>=1.7,<2", + "pre-commit", ] [project.scripts] @@ -54,8 +55,8 @@ format = "md" output = ".scriber/scriber_pack.md" only_tree = false use_gitignore = true -max_files = 60 -max_tokens = 100000 +max_files = 0 +max_tokens = 0 min_score = 45 path_style = "project-relative" allow_external_paths = false @@ -93,6 +94,7 @@ include_tests = true include_same_package = true include_parent_entrypoints = true include_project_configs = true +top_dependencies = 10 content_min_score = 50 tree_min_score = 30 diff --git a/rust/scriber_native/src/import.rs b/rust/scriber_native/src/import.rs index c86a963..66b751c 100644 --- a/rust/scriber_native/src/import.rs +++ b/rust/scriber_native/src/import.rs @@ -355,7 +355,9 @@ pub fn build_import_graph( let base_normalized = normalize_posix_path(&raw_base); let mut resolved = false; - let extensions = vec!["", ".ts", ".tsx", ".js", ".jsx", ".d.ts"]; + let extensions = vec![ + "", ".ts", ".tsx", ".js", ".jsx", ".d.ts", ".vue", ".svelte", ".astro", ".json", + ]; for ext in extensions { let cand = if ext.is_empty() { base_normalized.clone() @@ -376,7 +378,15 @@ pub fn build_import_graph( } if !resolved { - let index_names = vec!["index.ts", "index.tsx", "index.js", "index.jsx"]; + let index_names = vec![ + "index.ts", + 
"index.tsx", + "index.js", + "index.jsx", + "index.vue", + "index.svelte", + "index.astro", + ]; for idx in index_names { let cand = format!("{}/{}", base_normalized, idx); if let Some(target) = absolute_to_file.get(&cand) { @@ -620,3 +630,79 @@ pub fn build_import_graph( Ok(edges) } + +#[pyclass] +#[derive(Clone, Debug)] +pub struct NativeRelationEdge { + #[pyo3(get)] + pub source: String, + #[pyo3(get)] + pub target: String, + #[pyo3(get)] + pub kind: String, + #[pyo3(get)] + pub weight: f64, + #[pyo3(get)] + pub confidence: f64, + #[pyo3(get)] + pub evidence: Option, + #[pyo3(get)] + pub line: Option, + #[pyo3(get)] + pub analyzer: String, +} + +#[pymethods] +impl NativeRelationEdge { + #[new] + #[pyo3(signature = (source, target, kind, weight, confidence, evidence, line, analyzer))] + #[allow(clippy::too_many_arguments)] + fn new( + source: String, + target: String, + kind: String, + weight: f64, + confidence: f64, + evidence: Option, + line: Option, + analyzer: String, + ) -> Self { + NativeRelationEdge { + source, + target, + kind, + weight, + confidence, + evidence, + line, + analyzer, + } + } +} + +#[pyfunction] +pub fn build_relation_graph( + root: &str, + files: Vec, + python_source_roots: Vec, + python_module_init_files: Vec, +) -> PyResult> { + let import_edges = + build_import_graph(root, files, python_source_roots, python_module_init_files)?; + + let mut relation_edges = Vec::with_capacity(import_edges.len()); + for edge in import_edges { + relation_edges.push(NativeRelationEdge { + source: edge.from, + target: edge.to, + kind: "import".to_string(), // we map everything to "import" for now to match python + weight: 1.0, + confidence: 0.98, + evidence: None, + line: None, + analyzer: "imports:native".to_string(), + }); + } + + Ok(relation_edges) +} diff --git a/rust/scriber_native/src/lib.rs b/rust/scriber_native/src/lib.rs index 4b854dd..90d4285 100644 --- a/rust/scriber_native/src/lib.rs +++ b/rust/scriber_native/src/lib.rs @@ -76,6 +76,7 @@ fn 
build_info() -> PyResult { fn _native(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_function(wrap_pyfunction!(read_text, m)?)?; @@ -84,6 +85,7 @@ fn _native(_py: Python, m: &PyModule) -> PyResult<()> { m.add_function(wrap_pyfunction!(read_many_text, m)?)?; m.add_function(wrap_pyfunction!(scan_project, m)?)?; m.add_function(wrap_pyfunction!(import::build_import_graph, m)?)?; + m.add_function(wrap_pyfunction!(import::build_relation_graph, m)?)?; m.add_function(wrap_pyfunction!(score::score_candidates_native, m)?)?; m.add_function(wrap_pyfunction!(render::render_tree, m)?)?; m.add_function(wrap_pyfunction!(native_api_version, m)?)?; diff --git a/rust/scriber_native/src/score.rs b/rust/scriber_native/src/score.rs index a4efbff..e2c3c5a 100644 --- a/rust/scriber_native/src/score.rs +++ b/rust/scriber_native/src/score.rs @@ -1,4 +1,4 @@ -use crate::import::NativeImportEdge; +use crate::import::NativeRelationEdge; use crate::scan::NativeFileInfo; use pyo3::prelude::*; use std::collections::{HashMap, HashSet}; @@ -64,6 +64,14 @@ pub struct NativePackOptions { pub documentation_score: i32, #[pyo3(get, set)] pub shared_dependency_bonus: i32, + #[pyo3(get, set)] + pub entrypoint_file_score: i32, + #[pyo3(get, set)] + pub code_file_score: i32, + #[pyo3(get, set)] + pub test_file_score: i32, + #[pyo3(get, set)] + pub other_file_score: i32, // Module flags #[pyo3(get, set)] @@ -82,6 +90,8 @@ pub struct NativePackOptions { pub include_project_configs: bool, #[pyo3(get, set)] pub depth: usize, + #[pyo3(get, set)] + pub top_dependencies: usize, // Support file scanning #[pyo3(get, set)] @@ -116,6 +126,10 @@ impl NativePackOptions { runtime_support_score = 50, documentation_score = 45, shared_dependency_bonus = 10, + entrypoint_file_score = 90, + code_file_score = 80, + test_file_score = 60, + other_file_score = 40, modules_enabled = true, include_direct_dependencies = true, 
include_reverse_dependencies = true, @@ -124,6 +138,7 @@ impl NativePackOptions { include_tests = true, include_project_configs = true, depth = 2, + top_dependencies = 10, support_enabled = true, entrypoint_patterns = Vec::new(), test_roots = Vec::new(), @@ -148,6 +163,10 @@ impl NativePackOptions { runtime_support_score: i32, documentation_score: i32, shared_dependency_bonus: i32, + entrypoint_file_score: i32, + code_file_score: i32, + test_file_score: i32, + other_file_score: i32, modules_enabled: bool, include_direct_dependencies: bool, include_reverse_dependencies: bool, @@ -156,6 +175,7 @@ impl NativePackOptions { include_tests: bool, include_project_configs: bool, depth: usize, + top_dependencies: usize, support_enabled: bool, entrypoint_patterns: Vec, test_roots: Vec, @@ -179,6 +199,10 @@ impl NativePackOptions { runtime_support_score, documentation_score, shared_dependency_bonus, + entrypoint_file_score, + code_file_score, + test_file_score, + other_file_score, modules_enabled, include_direct_dependencies, include_reverse_dependencies, @@ -187,6 +211,7 @@ impl NativePackOptions { include_tests, include_project_configs, depth, + top_dependencies, support_enabled, entrypoint_patterns, test_roots, @@ -311,9 +336,15 @@ fn is_test_file(rel: &str, test_roots: &[String]) -> bool { .unwrap_or(std::ffi::OsStr::new("")) .to_string_lossy() .to_lowercase(); - for part in p.components().filter_map(|c| c.as_os_str().to_str()) { - if test_roots.contains(&part.to_string()) { - return true; + let components: Vec<_> = p + .components() + .filter_map(|c| c.as_os_str().to_str()) + .collect(); + if components.len() > 1 { + for part in &components[0..components.len() - 1] { + if test_roots.contains(&part.to_string()) { + return true; + } } } name.starts_with("test_") || name.ends_with("_test.py") || name.ends_with(".test.py") @@ -351,37 +382,127 @@ fn is_near_seed(support_file: &str, seed: &str) -> bool { || seed_parent.starts_with(sf_parent) } -fn walk_neighbors( - edges: 
&HashMap>, +use std::cmp::Ordering; +use std::collections::BinaryHeap; + +#[derive(Debug, Clone)] +struct QueueState { + strength: f64, + depth: usize, + node: String, +} + +impl Eq for QueueState {} + +impl PartialEq for QueueState { + fn eq(&self, other: &Self) -> bool { + self.strength == other.strength && self.depth == other.depth && self.node == other.node + } +} + +impl Ord for QueueState { + fn cmp(&self, other: &Self) -> Ordering { + self.strength + .partial_cmp(&other.strength) + .unwrap_or(Ordering::Equal) + .then_with(|| other.depth.cmp(&self.depth)) + } +} + +impl PartialOrd for QueueState { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +fn walk_weighted_neighbors( + edges: &[NativeRelationEdge], start: &str, depth: usize, -) -> HashMap { - let mut found = HashMap::new(); - let mut frontier = HashSet::new(); - frontier.insert(start.to_string()); - let mut visited = HashSet::new(); - visited.insert(start.to_string()); - - for distance in 1..=depth { - let mut next_frontier = HashSet::new(); - for item in frontier { - if let Some(neighbors) = edges.get(&item) { - for neighbor in neighbors { - if visited.contains(neighbor) { - continue; + top_dependencies: usize, + reverse: bool, +) -> HashMap { + let mut adj: HashMap> = HashMap::new(); + for edge in edges { + let u = if reverse { &edge.target } else { &edge.source }; + let v = if reverse { &edge.source } else { &edge.target }; + adj.entry(u.clone()).or_default().push((v.clone(), edge)); + } + + if top_dependencies > 0 { + for edges_from_u in adj.values_mut() { + if edges_from_u.len() > top_dependencies { + edges_from_u.sort_by(|a, b| { + let str_a = a.1.weight * a.1.confidence; + let str_b = b.1.weight * b.1.confidence; + str_b.partial_cmp(&str_a).unwrap_or(Ordering::Equal) + }); + edges_from_u.truncate(top_dependencies); + } + } + } + + let mut max_strength: HashMap = HashMap::new(); + max_strength.insert(start.to_string(), 1.0); + + let mut best_at_state: 
HashMap<(String, usize), f64> = HashMap::new(); + best_at_state.insert((start.to_string(), 0), 1.0); + + let mut heap = BinaryHeap::new(); + heap.push(QueueState { + strength: 1.0, + depth: 0, + node: start.to_string(), + }); + + while let Some(QueueState { + strength: u_str, + depth: u_depth, + node: u, + }) = heap.pop() + { + if u_str < *best_at_state.get(&(u.clone(), u_depth)).unwrap_or(&0.0) { + continue; + } + + if u_depth >= depth { + continue; + } + + if let Some(neighbors) = adj.get(&u) { + for (neighbor, edge) in neighbors { + let edge_str = if edge.kind == "import" || edge.kind == "reexport" { + if u_depth == 0 { + 1.0 + } else { + 0.88 } - visited.insert(neighbor.clone()); - found.insert(neighbor.clone(), distance); - next_frontier.insert(neighbor.clone()); + } else { + edge.weight * edge.confidence + }; + + let next_str = u_str * edge_str; + let next_depth = u_depth + 1; + + if next_str > *max_strength.get(neighbor).unwrap_or(&0.0) { + max_strength.insert(neighbor.clone(), next_str); + } + + let state_key = (neighbor.clone(), next_depth); + if next_str > *best_at_state.get(&state_key).unwrap_or(&0.0) { + best_at_state.insert(state_key, next_str); + heap.push(QueueState { + strength: next_str, + depth: next_depth, + node: neighbor.clone(), + }); } } } - frontier = next_frontier; - if frontier.is_empty() { - break; - } } - found + + max_strength.remove(start); + max_strength } fn support_base_score(file: &NativeFileInfo, options: &NativePackOptions) -> i32 { @@ -429,7 +550,7 @@ fn matches_entrypoint(rel: &str, entrypoint_patterns: &[String]) -> bool { pub fn score_candidates_native( files: Vec, seeds_list: Vec, - edges: Vec, + edges: Vec, options: NativePackOptions, ) -> PyResult> { let mut mapped_files = HashMap::new(); @@ -450,28 +571,30 @@ pub fn score_candidates_native( // Build graph edges maps let mut graph_imports: HashMap> = HashMap::new(); let mut graph_imported_by: HashMap> = HashMap::new(); - for edge in edges { - graph_imports - 
.entry(edge.from.clone()) - .or_default() - .insert(edge.to.clone()); - graph_imported_by - .entry(edge.to.clone()) - .or_default() - .insert(edge.from.clone()); + for edge in &edges { + if edge.kind == "import" || edge.kind == "reexport" { + graph_imports + .entry(edge.source.clone()) + .or_default() + .insert(edge.target.clone()); + graph_imported_by + .entry(edge.target.clone()) + .or_default() + .insert(edge.source.clone()); + } } if options.mode == "project_snapshot" { for (rel, c) in &mut mapped_files { if c.info.kind == "code" { if matches_entrypoint(rel, &options.entrypoint_patterns) { - c.score = 90; + c.score = options.entrypoint_file_score; add_reason(c, "entrypoint", "entrypoint file", None); } else if is_test_file(rel, &options.test_roots) { - c.score = 60; + c.score = options.test_file_score; add_reason(c, "test_file", "test file", None); } else { - c.score = 80; + c.score = options.code_file_score; add_reason(c, "code_file", "code file", None); } } else if c.info.kind == "support" && options.support_enabled { @@ -531,10 +654,16 @@ pub fn score_candidates_native( for seed_rel in &seed_files { // Direct dependencies if options.include_direct_dependencies { - for (dep, distance) in walk_neighbors(&graph_imports, seed_rel, options.depth) { + for (dep, strength) in walk_weighted_neighbors( + &edges, + seed_rel, + options.depth, + options.top_dependencies, + false, + ) { let score = std::cmp::max( options.tree_min_score, - options.direct_dependency_score - ((distance as i32 - 1) * 10), + (options.direct_dependency_score as f64 * strength) as i32, ); if let Some(c) = mapped_files.get_mut(&dep) { c.score = std::cmp::max(c.score, score); @@ -551,12 +680,16 @@ pub fn score_candidates_native( // Reverse dependencies if options.include_reverse_dependencies { - for (dep, distance) in - walk_neighbors(&graph_imported_by, seed_rel, options.depth) - { + for (dep, strength) in walk_weighted_neighbors( + &edges, + seed_rel, + options.depth, + options.top_dependencies, 
+ true, + ) { let score = std::cmp::max( options.tree_min_score, - options.reverse_dependency_score - ((distance as i32 - 1) * 10), + (options.reverse_dependency_score as f64 * strength) as i32, ); if let Some(c) = mapped_files.get_mut(&dep) { c.score = std::cmp::max(c.score, score); diff --git a/scripts/bench_scan.py b/scripts/bench_scan.py new file mode 100644 index 0000000..25226c3 --- /dev/null +++ b/scripts/bench_scan.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +import statistics +import time +from pathlib import Path + +from scriber.core.config import load_config +from scriber.scanner.scan import scan_project as scan_rust +from scriber.scanner.scan_py import scan_project as scan_python + + +def bench(name, fn, rounds=10): + times = [] + for _ in range(rounds): + start = time.perf_counter() + result = fn() + times.append(time.perf_counter() - start) + + print(f"{name}:") + print(f" files: {len(result)}") + print(f" min: {min(times):.4f}s") + print(f" avg: {statistics.mean(times):.4f}s") + print(f" p95: {sorted(times)[int(len(times) * 0.95) - 1]:.4f}s") + + +def main(): + root = Path.cwd() + config = load_config(root / "pyproject.toml") + + bench("python scan", lambda: scan_python(root, config)) + bench("rust scan", lambda: scan_rust(root, config)) + + +if __name__ == "__main__": + main() diff --git a/scripts/sync_readme.py b/scripts/sync_readme.py new file mode 100644 index 0000000..152b4f9 --- /dev/null +++ b/scripts/sync_readme.py @@ -0,0 +1,143 @@ +import argparse +import sys +import re +from pathlib import Path + +# Must be run from project root, or specify path +try: + import tomli +except ImportError: + import tomllib as tomli + + +def get_version(root: Path) -> str: + with open(root / "pyproject.toml", "rb") as f: + data = tomli.load(f) + return data["project"]["version"] + + +def generate_cli_options() -> str: + # We must import scriber to build the parser + # Assume we run it inside the environment + from scriber.cli.main import 
def generate_profiles() -> str:
    """Build the Markdown "Profiles" section for the README.

    The section is a heading, a one-sentence introduction and a two-column
    table with one row per entry of ``PROFILE_CHOICES``. A profile without
    an entry in the local description table gets an empty description cell.

    Returns:
        The section as a single newline-joined string.
    """
    # Imported lazily, as in the rest of this script, so the module can be
    # loaded without the scriber package being importable.
    from scriber.core.profiles import PROFILE_CHOICES

    summaries = {
        "default": "Standard scoring behavior.",
        "audit": "Boosts tests, config files, CI environments, and dependency files. Assumes full support content inclusion.",
        "debug": "Boosts direct/reverse dependencies, tests, runtime support, and files close to the seed path.",
        "refactor": "Boosts files within the same package, related tests, and direct dependencies.",
        "docs": "Heavily boosts documentation files while suppressing test and code file scores. Assumes tree_only support content by default.",
    }

    header = [
        "### Profiles",
        "",
        "ProjectScriber comes with several preset profiles to quickly bias the file scoring and inclusion criteria:",
        "",
        "| Profile | Description |",
        "|:---|:---|",
    ]
    rows = [
        f"| `{profile}` | {summaries.get(profile, '')} |"
        for profile in PROFILE_CHOICES
    ]
    return "\n".join(header + rows)
if __name__ == "__main__":
    # Command-line entry point: --write rewrites README.md, --check only
    # reports staleness; with neither flag the usage text is printed.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--write", action="store_true", help="Write changes to README.md"
    )
    cli.add_argument(
        "--check", action="store_true", help="Check if README.md is up to date"
    )
    options = cli.parse_args()

    # The project root is the parent of the directory holding this script.
    project_root = Path(__file__).parent.parent

    if options.write or options.check:
        # --write takes precedence when both flags are given.
        succeeded = sync_readme(project_root, write=options.write)
        sys.exit(0 if succeeded else 1)

    cli.print_help()
def _select_content_mode(cand, policy, is_seed, has_full_budget):
    """Pick the content mode for one candidate; the first matching rule wins."""
    if is_seed:
        return "full"

    content_policy = cand.file.content_policy
    if content_policy == "tree_only":
        return "tree"
    if content_policy == "full" and policy.mode != "focused":
        return "full"

    # Score-driven promotion to full content is only possible while the
    # running total of "full" tokens is still below the full-code budget.
    if has_full_budget:
        if cand.token_estimate <= 1200 and cand.score >= 80:
            return "full"
        if cand.score >= 85 and cand.token_estimate <= 2400:
            return "full"

    if cand.score >= 75:
        return "excerpt"
    if cand.score >= 45:
        return "outline"
    return "tree"


def allocate_budget(
    candidates: list[Candidate], policy: BudgetPolicy, explicit_seeds: set
) -> list[PackItem]:
    """Turn candidates into pack items, assigning each a content mode.

    Candidates are processed in the given order. Files whose relative path
    is in ``explicit_seeds`` are always "full"; otherwise the file's content
    policy, its score and token estimate, and the remaining full-code budget
    (``target_tokens * full_code_budget_ratio``) decide between "full",
    "excerpt", "outline" and "tree".

    Args:
        candidates: Candidates to convert, in output order.
        policy: Budget settings; ``target_tokens``, ``full_code_budget_ratio``
            and ``mode`` are read here.
        explicit_seeds: Relative paths of explicitly requested files.

    Returns:
        One ``PackItem`` per candidate, with ids ``F001``, ``F002``, ...
    """
    full_budget = int(policy.target_tokens * policy.full_code_budget_ratio)
    spent_on_full = 0
    packed = []

    for position, cand in enumerate(candidates, start=1):
        mode: ContentMode = _select_content_mode(
            cand,
            policy,
            is_seed=cand.file.relative in explicit_seeds,
            has_full_budget=spent_on_full < full_budget,
        )

        # Only full-content items count against the full-code budget.
        if mode == "full":
            spent_on_full += cand.token_estimate

        packed.append(
            PackItem(
                file=cand.file,
                score=cand.score,
                role=getattr(cand, "role", "unknown"),
                content_mode=mode,
                reason=cand.reason_summary,
                reasons=cand.reasons,
                relation_evidence=[],
                token_estimate=cand.token_estimate,
                utility=cand.utility,
                raw_score=cand.raw_score,
                item_id=f"F{position:03d}",
            )
        )

    return packed
"code_patterns": config.code_patterns, "support_patterns": config.support_patterns, @@ -35,49 +35,62 @@ def get_config_hash(config: ScriberConfig) -> str: class ScriberCache: def __init__(self, config: ScriberConfig, project_root: Path): self.enabled = config.cache.enabled - self.cache_dir = project_root / config.cache.dir + self.project_root = project_root.resolve() + self.cache_dir = self.project_root / config.cache.dir self.files_cache_path = self.cache_dir / "files.json" - self.graph_cache_path = self.cache_dir / "import_graph.json" + self.imports_cache_path = self.cache_dir / "imports_v2.json" + self.relations_cache_path = self.cache_dir / "relations_v1.jsonl" self.config_hash = get_config_hash(config) self.python_version = f"{sys.version_info.major}.{sys.version_info.minor}" - + + self.reads = 0 + self.hits = 0 + self.writes = 0 + self.files_data: dict[str, dict[str, Any]] = {} - self.graph_data: dict[str, list[str]] = {} + self.imports_data: dict[str, dict[str, Any]] = {} self._load() def _load(self) -> None: if not self.enabled: return - + try: if self.files_cache_path.exists(): with self.files_cache_path.open("r", encoding="utf-8") as f: self.files_data = json.load(f) - if self.graph_cache_path.exists(): - with self.graph_cache_path.open("r", encoding="utf-8") as f: - self.graph_data = json.load(f) + if self.imports_cache_path.exists(): + with self.imports_cache_path.open("r", encoding="utf-8") as f: + self.imports_data = json.load(f) + # relations_v1.jsonl will be append-only or rewritten on save, we don't load it entirely into memory for now except Exception: # Silently fallback to empty cache on read errors self.files_data = {} - self.graph_data = {} + self.imports_data = {} - def get_file(self, rel_path: Path, mtime_ns: int, size: int) -> dict[str, Any] | None: + def get_file( + self, rel_path: Path, mtime_ns: int, size: int + ) -> dict[str, Any] | None: if not self.enabled: return None - + key = rel_path.as_posix() entry = self.files_data.get(key) if 
entry is None: return None - - if (entry.get("mtime_ns") == mtime_ns and - entry.get("size") == size and - entry.get("python_version") == self.python_version and - entry.get("config_hash") == self.config_hash): + + if ( + entry.get("mtime_ns") == mtime_ns + and entry.get("size") == size + and entry.get("python_version") == self.python_version + and entry.get("config_hash") == self.config_hash + ): return entry.get("data") return None - def set_file(self, rel_path: Path, mtime_ns: int, size: int, data: dict[str, Any]) -> None: + def set_file( + self, rel_path: Path, mtime_ns: int, size: int, data: dict[str, Any] + ) -> None: if not self.enabled: return key = rel_path.as_posix() @@ -86,50 +99,102 @@ def set_file(self, rel_path: Path, mtime_ns: int, size: int, data: dict[str, Any "size": size, "python_version": self.python_version, "config_hash": self.config_hash, - "data": data + "data": data, } - def get_imports(self, rel_path: Path) -> set[Path] | None: + def get_imports(self, rel_path: Path, mtime_ns: int, size: int) -> set[Path] | None: if not self.enabled: return None + self.reads += 1 key = rel_path.as_posix() - imports = self.graph_data.get(key) + imports = self.imports_data.get(key) if imports is not None: - return {Path(p) for p in imports} + if ( + imports.get("mtime_ns") == mtime_ns + and imports.get("size") == size + and imports.get("config_hash") == self.config_hash + ): + self.hits += 1 + return {Path(p) for p in imports.get("targets", [])} return None def set_imports(self, rel_path: Path, imports: set[Path]) -> None: if not self.enabled: return + self.writes += 1 key = rel_path.as_posix() - self.graph_data[key] = [p.as_posix() for p in sorted(imports)] + try: + stat = (self.project_root / rel_path).stat() + mtime_ns = stat.st_mtime_ns + size = stat.st_size + except OSError: + mtime_ns = 0 + size = 0 + self.imports_data[key] = { + "mtime_ns": mtime_ns, + "size": size, + "config_hash": self.config_hash, + "targets": [p.as_posix() for p in 
sorted(imports)], + } + + def add_import_edge(self, source: Path, target: Path) -> None: + if not self.enabled: + return + self.writes += 1 + key = source.as_posix() + target_str = target.as_posix() + if key not in self.imports_data: + try: + stat = (self.project_root / source).stat() + mtime_ns = stat.st_mtime_ns + size = stat.st_size + except OSError: + mtime_ns = 0 + size = 0 + self.imports_data[key] = { + "mtime_ns": mtime_ns, + "size": size, + "config_hash": self.config_hash, + "targets": [target_str], + } + else: + if target_str not in self.imports_data[key].get("targets", []): + self.imports_data[key].setdefault("targets", []).append(target_str) + self.imports_data[key]["targets"].sort() def save(self, active_files: set[Path] | None = None) -> None: if not self.enabled: return - + try: self.cache_dir.mkdir(parents=True, exist_ok=True) - + # Simple cleanup mechanism: # 1. Prune stale cache entries (entries for files no longer in active_files) if active_files is not None: active_keys = {p.as_posix() for p in active_files} - self.files_data = {k: v for k, v in self.files_data.items() if k in active_keys} - self.graph_data = {k: v for k, v in self.graph_data.items() if k in active_keys} + self.files_data = { + k: v for k, v in self.files_data.items() if k in active_keys + } + self.imports_data = { + k: v for k, v in self.imports_data.items() if k in active_keys + } # 2. 
def _emit(line: str = "") -> None:
    """Write one line to stderr, where all introspection output goes."""
    print(line, file=sys.stderr)


def _export_graph_json(pack, destination) -> None:
    """Write every edge of ``pack.graph`` to ``destination`` as JSON."""
    import json

    edges_data = [
        {
            "source": str(edge.source),
            "target": str(edge.target),
            "kind": edge.kind,
            "weight": edge.weight,
            "confidence": edge.confidence,
            "evidence": edge.evidence,
            "line": edge.line,
            "analyzer": edge.analyzer,
        }
        for edge in pack.graph.edges
    ]
    json_path = Path(destination)
    try:
        # Serialize before touching the file so a serialization error cannot
        # leave a truncated JSON file behind.
        payload = json.dumps({"edges": edges_data}, indent=2)
        json_path.write_text(payload, encoding="utf-8")
    except Exception as e:
        # Best effort: the export is a side product, so report and continue.
        _emit(f"Error exporting relation graph to JSON: {e}")
    else:
        _emit(f"Exported relation graph to {json_path}")


def _print_graph_summary(pack) -> None:
    """Print edge totals, per-kind counts, node count and average degree."""
    from collections import Counter

    edges = pack.graph.edges
    total_edges = len(edges)
    kind_counts = Counter(edge.kind for edge in edges)

    nodes = set()
    for edge in edges:
        nodes.add(edge.source)
        nodes.add(edge.target)
    unique_nodes = len(nodes)
    # Each edge contributes to the degree of two nodes.
    avg_degree = (total_edges * 2.0 / unique_nodes) if unique_nodes > 0 else 0.0

    _emit("\n========================================")
    _emit("SCRIBER RELATION GRAPH EXPLANATION")
    _emit("========================================")
    _emit(f"Total Edges: {total_edges}")
    _emit("Edges by Kind:")
    for kind, count in kind_counts.most_common():
        _emit(f" - {kind.ljust(20)}: {count}")
    _emit(f"Unique Nodes: {unique_nodes}")
    _emit(f"Average Degree: {avg_degree:.2f}")
    _emit("========================================\n")


def _find_candidate(candidates, query: str):
    """Return the candidate best matching ``query``, or ``None``.

    Matching is case-insensitive on forward-slash paths. An exact match on
    the relative or absolute path wins, then a match on trailing path
    components, then the first plain substring match. Without the ranking a
    query such as ``main.py`` could resolve to ``domain.py``.
    """
    needle = query.replace("\\", "/").lower()
    suffix = "/" + needle.lstrip("/")

    best = None
    best_rank = 3
    for cand in candidates:
        rel_str = cand.file.relative.as_posix().lower()
        abs_str = cand.file.absolute.as_posix().lower()
        if needle in (rel_str, abs_str):
            return cand
        if rel_str.endswith(suffix) or abs_str.endswith(suffix):
            rank = 1
        elif needle in rel_str or needle in abs_str:
            rank = 2
        else:
            continue
        # Strict comparison keeps the earliest candidate within a rank.
        if rank < best_rank:
            best, best_rank = cand, rank
    return best


def _print_why(pack, query: str) -> None:
    """Print score, selection reasons and incoming edges for one file."""
    candidates_or_items = getattr(pack, "candidates", getattr(pack, "items", []))
    target_c = _find_candidate(candidates_or_items, query)

    if target_c is None:
        _emit(
            f"\nCould not find file matching '{query}' in the analyzed candidates."
        )
        return

    _emit("\n========================================")
    _emit(f"WHY WAS '{target_c.file.relative}' INCLUDED?")
    _emit("========================================")
    _emit(f"Score: {target_c.score}")
    # Candidates and pack items expose different optional attributes, so
    # each one is printed only when present.
    if hasattr(target_c, "role"):
        _emit(f"Role: {target_c.role}")
    if hasattr(target_c, "token_estimate"):
        _emit(f"Token Cost: {target_c.token_estimate}")
    if hasattr(target_c, "content_mode"):
        _emit(f"Content Mode: {target_c.content_mode}")
    if hasattr(target_c, "omitted_reason") and target_c.omitted_reason:
        _emit(f"Omitted Reason: {target_c.omitted_reason}")

    reasons = getattr(target_c, "reasons", [])
    if reasons:
        _emit("Selection Reasons:")
        for r in reasons:
            _emit(f" - {r}")
    else:
        reason_summary = getattr(
            target_c, "reason_summary", getattr(target_c, "reason", "None")
        )
        _emit(f"Selection Reasons: {reason_summary}")

    incoming = [
        edge for edge in pack.graph.edges if edge.target == target_c.file.relative
    ]
    if incoming:
        _emit("\nIncoming Relation Edges:")
        for edge in sorted(incoming, key=lambda e: (e.kind, str(e.source))):
            ev = f" ({edge.evidence})" if edge.evidence else ""
            _emit(
                f" - {edge.source} -> [this file] (kind: {edge.kind}, weight: {edge.weight}, confidence: {edge.confidence}){ev}"
            )
    else:
        _emit("\nNo incoming relation edges found in graph.")
    _emit("========================================\n")


def handle_introspection(args, pack) -> None:
    """Run the graph introspection actions requested on the command line.

    Up to three independent actions run in this order, all reporting on
    stderr: export the relation graph as JSON (``args.graph_json``), print
    graph statistics (``args.explain_graph``), and explain why one file was
    selected (``args.why``).

    Args:
        args: Parsed CLI namespace; ``graph_json``, ``explain_graph`` and
            ``why`` are read.
        pack: Build result exposing ``graph.edges`` and either
            ``candidates`` or ``items``.
    """
    if args.graph_json:
        _export_graph_json(pack, args.graph_json)

    if args.explain_graph:
        _print_graph_summary(pack)

    if args.why:
        _print_why(pack, args.why)
Its parent directory becomes the project root.", + ) + parser.add_argument( + "--path-base", + choices=["project", "cwd"], + default="project", + help="Base directory for relative paths when --config is used.", + ) + parser.add_argument( + "--format", choices=["md", "txt"], dest="output_format", help="Output format." + ) + parser.add_argument( + "--output", + help="Output file path, relative to project root unless absolute. Use '-' for stdout.", + ) + parser.add_argument( + "--only-tree", + action="store_true", + help="Render only scored tree/map, without file contents.", + ) + parser.add_argument( + "--modules", + dest="modules", + action="store_true", + help="Enable automatic related module selection.", + ) + parser.add_argument( + "--no-modules", + dest="modules", + action="store_false", + help="Disable automatic related module selection.", + ) parser.set_defaults(modules=None) - parser.add_argument("--support", dest="support", action="store_true", help="Enable support files.") - parser.add_argument("--no-support", dest="support", action="store_false", help="Disable support files.") + parser.add_argument( + "--support", dest="support", action="store_true", help="Enable support files." + ) + parser.add_argument( + "--no-support", + dest="support", + action="store_false", + help="Disable support files.", + ) parser.set_defaults(support=None) - parser.add_argument("--support-content", choices=["full", "auto", "tree_only"], help="Override default support file content policy.") - parser.add_argument("--max-files", type=int, help="Maximum number of files in the pack.") - parser.add_argument("--max-tokens", type=int, help="Approximate token budget for included file contents. 
0 disables budget.") - parser.add_argument("--min-score", type=int, help="Minimum score for non-seed files.") - parser.add_argument("--init", action="store_true", help="Append a default [tool.scriber] config to pyproject.toml and exit.") - parser.add_argument("--force", action="store_true", help="Allow --init to append even if [tool.scriber] already exists.") - parser.add_argument("--project", action="store_true", help="Force project snapshot mode.") - parser.add_argument("--explain-selection", action="store_true", help="Explain reason for file selection in detail.") - parser.add_argument("--validate-config", action="store_true", help="Validate pyproject.toml scriber config.") - parser.add_argument("--dry-run", action="store_true", help="Perform a dry run without saving the pack file.") - parser.add_argument("--open", action="store_true", help="Open the output file automatically after creation.") - parser.add_argument("--timings", action="store_true", help="Show execution timings for each phase.") - parser.add_argument("--version", action="store_true", help="Show version information and exit.") + parser.add_argument( + "--support-content", + choices=["full", "auto", "tree_only"], + help="Override default support file content policy.", + ) + parser.add_argument( + "--max-files", type=int, help="Maximum number of files in the pack." + ) + parser.add_argument( + "--max-tokens", + type=int, + help="Approximate token budget for included file contents. 0 disables budget.", + ) + parser.add_argument( + "--min-score", type=int, help="Minimum score for non-seed files." + ) + parser.add_argument( + "--init", + action="store_true", + help="Append a default [tool.scriber] config to pyproject.toml and exit.", + ) + parser.add_argument( + "--force", + action="store_true", + help="Allow --init to append even if [tool.scriber] already exists.", + ) + parser.add_argument( + "--project", action="store_true", help="Force project snapshot mode." 
+ ) + parser.add_argument( + "--explain", + "--explain-selection", + dest="explain_selection", + action="store_true", + help="Explain reason for file selection in detail.", + ) + parser.add_argument( + "--explain-graph", + action="store_true", + help="Print relation graph statistics and relations.", + ) + parser.add_argument( + "--why", + help="Print exactly which rules/edges pulled the specified file into the pack.", + ) + parser.add_argument( + "--graph-json", + help="Export the RelationGraph as a JSON file to the specified path.", + ) + parser.add_argument( + "--validate-config", + action="store_true", + help="Validate pyproject.toml scriber config.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Perform a dry run without saving the pack file.", + ) + parser.add_argument( + "--open", + action="store_true", + help="Open the output file automatically after creation.", + ) + parser.add_argument( + "--timings", action="store_true", help="Show execution timings for each phase." + ) + parser.add_argument( + "--version", action="store_true", help="Show version information and exit." 
+ ) return parser @@ -61,12 +286,19 @@ def main(argv: Sequence[str] | None = None) -> int: try: if args.version: from scriber import __version__ + print(f"scriber {__version__}") from scriber.native import is_native_available, require_native + if is_native_available(): native = require_native() if hasattr(native, "build_info"): - print(f"native {native.build_info()}") + api_ver = ( + native.native_api_version() + if hasattr(native, "native_api_version") + else "unknown" + ) + print(f"native {native.build_info()} (API v{api_ver})") return 0 if args.validate_config: @@ -82,7 +314,7 @@ def main(argv: Sequence[str] | None = None) -> int: else: config = load_config(config_path) issues = validate_config(config, raw_data, config_path) - + if not issues: print("Scriber config is valid.", file=sys.stderr) return 0 @@ -96,7 +328,10 @@ def main(argv: Sequence[str] | None = None) -> int: else: warnings += 1 print(f"[{severity}] {issue.message}", file=sys.stderr) - print(f"\nValidation completed: {errors} error(s), {warnings} warning(s)", file=sys.stderr) + print( + f"\nValidation completed: {errors} error(s), {warnings} warning(s)", + file=sys.stderr, + ) return 1 if errors > 0 else 0 except Exception as exc: print(f"Error: Failed to parse pyproject.toml: {exc}", file=sys.stderr) @@ -110,9 +345,11 @@ def main(argv: Sequence[str] | None = None) -> int: if args.dry_run: from scriber.packer.pack import build_pack from scriber.core.config import apply_overrides + pack = build_pack( args.paths or ["."], config_path=args.config, + profile=args.profile, output=args.output, output_format=args.output_format, only_tree=True if args.only_tree else None, @@ -129,9 +366,32 @@ def main(argv: Sequence[str] | None = None) -> int: sys.stderr.write("\r".ljust(80) + "\r") sys.stderr.flush() - code_count = len([c for c in pack.candidates if c.file.kind == "code" and c.include_content]) - support_count = len([c for c in pack.candidates if c.file.kind == "support" and c.include_content]) - 
total_count = len(pack.candidates) + is_llm_pack = hasattr(pack, "items") + items = getattr(pack, "items", getattr(pack, "candidates", [])) + if is_llm_pack: + code_count = len( + [ + c + for c in items + if c.file.kind == "code" and c.content_mode != "tree" + ] + ) + support_count = len( + [ + c + for c in items + if c.file.kind == "support" and c.content_mode != "tree" + ] + ) + total_count = len([c for c in items if c.content_mode != "tree"]) + else: + code_count = len( + [c for c in items if c.file.kind == "code" and c.include_content] + ) + support_count = len( + [c for c in items if c.file.kind == "support" and c.include_content] + ) + total_count = len([c for c in items if c.include_content]) print("Scriber dry-run completed.", file=sys.stderr) print("----------------------------------------", file=sys.stderr) @@ -139,13 +399,47 @@ def main(argv: Sequence[str] | None = None) -> int: print(f" Code files selected: {code_count}", file=sys.stderr) print(f" Support files selected: {support_count}", file=sys.stderr) print(f" Total files in pack: {total_count}", file=sys.stderr) - print(f" Estimated tokens: {pack.total_tokens}", file=sys.stderr) - if args.timings and pack.timings: - print("----------------------------------------", file=sys.stderr) - print("Timings:", file=sys.stderr) - for phase, duration in pack.timings.items(): - print(f" {phase.replace('_', ' ').ljust(15)}: {duration:.4f}s", file=sys.stderr) - print(f" total: {sum(pack.timings.values()):.4f}s", file=sys.stderr) + total_tokens = getattr( + pack, "budget_actual", getattr(pack, "total_tokens", 0) + ) + print(f" Estimated tokens: {total_tokens}", file=sys.stderr) + if args.timings: + if pack.stats: + print("----------------------------------------", file=sys.stderr) + print("Stats:", file=sys.stderr) + if "graph_edges_built" in pack.stats: + print( + f" Graph edges built: {pack.stats['graph_edges_built']}", + file=sys.stderr, + ) + print( + f" Graph cache reads: {pack.stats['graph_cache_reads']}", 
+ file=sys.stderr, + ) + print( + f" Graph cache hits: {pack.stats['graph_cache_hits']}", + file=sys.stderr, + ) + print( + f" Graph cache writes: {pack.stats['graph_cache_writes']}", + file=sys.stderr, + ) + print( + f" Graph source: {pack.stats['graph_source']}", + file=sys.stderr, + ) + if pack.timings: + print("----------------------------------------", file=sys.stderr) + print("Timings:", file=sys.stderr) + for phase, duration in pack.timings.items(): + print( + f" {phase.replace('_', ' ').ljust(15)}: {duration:.4f}s", + file=sys.stderr, + ) + print( + f" total: {sum(pack.timings.values()):.4f}s", + file=sys.stderr, + ) config = load_config(pack.config_path) config = apply_overrides(config, output=args.output) @@ -154,11 +448,14 @@ def main(argv: Sequence[str] | None = None) -> int: output_path = pack.project_root / output_path print(f" Proposed output path: {output_path}", file=sys.stderr) print("----------------------------------------", file=sys.stderr) + if args.explain_graph or args.why or args.graph_json: + handle_introspection(args, pack) return 0 output, pack = build_and_write_pack( args.paths or ["."], config_path=args.config, + profile=args.profile, output=args.output, output_format=args.output_format, only_tree=True if args.only_tree else None, @@ -177,36 +474,78 @@ def main(argv: Sequence[str] | None = None) -> int: sys.stderr.write("\r".ljust(80) + "\r") sys.stderr.flush() + is_llm_pack = hasattr(pack, "items") + items = getattr(pack, "items", getattr(pack, "candidates", [])) + code_count = 0 support_count = 0 omitted_count = 0 - for cand in pack.candidates: - if cand.include_content: - if cand.file.kind == "code": - code_count += 1 - elif cand.file.kind == "support": - support_count += 1 + + for cand in items: + if is_llm_pack: + if cand.content_mode != "tree": + if cand.file.kind == "code": + code_count += 1 + elif cand.file.kind == "support": + support_count += 1 + else: + omitted_count += 1 else: - omitted_count += 1 + if cand.include_content: 
+ if cand.file.kind == "code": + code_count += 1 + elif cand.file.kind == "support": + support_count += 1 + else: + omitted_count += 1 sys.stderr.write("Scriber build completed.\n") sys.stderr.write("----------------------------------------\n") sys.stderr.write(f" Code files included: {code_count}\n") sys.stderr.write(f" Support files included: {support_count}\n") sys.stderr.write(f" Files omitted/skipped: {omitted_count}\n") - sys.stderr.write(f" Estimated tokens: {pack.total_tokens}\n") + total_tokens = getattr(pack, "budget_actual", getattr(pack, "total_tokens", 0)) + sys.stderr.write(f" Estimated tokens: {total_tokens}\n") sys.stderr.write("----------------------------------------\n") - if args.timings and pack.timings: - sys.stderr.write("Timings:\n") - for phase, duration in pack.timings.items(): - sys.stderr.write(f" - {phase.replace('_', ' ').ljust(15)}: {duration:.4f}s\n") - sys.stderr.write(f" - total: {sum(pack.timings.values()):.4f}s\n") - sys.stderr.write("----------------------------------------\n") + if args.timings: + if pack.stats: + sys.stderr.write("Stats:\n") + if "graph_edges_built" in pack.stats: + sys.stderr.write( + f" - Graph edges built: {pack.stats['graph_edges_built']}\n" + ) + sys.stderr.write( + f" - Graph cache reads: {pack.stats['graph_cache_reads']}\n" + ) + sys.stderr.write( + f" - Graph cache hits: {pack.stats['graph_cache_hits']}\n" + ) + sys.stderr.write( + f" - Graph cache writes: {pack.stats['graph_cache_writes']}\n" + ) + sys.stderr.write( + f" - Graph source: {pack.stats['graph_source']}\n" + ) + sys.stderr.write("----------------------------------------\n") + if pack.timings: + sys.stderr.write("Timings:\n") + for phase, duration in pack.timings.items(): + sys.stderr.write( + f" - {phase.replace('_', ' ').ljust(15)}: {duration:.4f}s\n" + ) + sys.stderr.write( + f" - total: {sum(pack.timings.values()):.4f}s\n" + ) + sys.stderr.write("----------------------------------------\n") + + if args.explain_graph or args.why or 
args.graph_json: + handle_introspection(args, pack) if output is not None: print(f"Scriber pack written to: {output}") if args.open: from scriber.core.open_file import open_path + open_path(output) return 0 except ScriberError as exc: diff --git a/src/scriber/core/config.py b/src/scriber/core/config.py index fed0545..f0e1fb0 100644 --- a/src/scriber/core/config.py +++ b/src/scriber/core/config.py @@ -9,7 +9,14 @@ except ModuleNotFoundError: # pragma: no cover import tomli as tomllib # type: ignore[no-redef] -from .models import CacheConfig, ModuleConfig, PythonConfig, ScriberConfig, SupportContentConfig, TokenConfig +from .models import ( + CacheConfig, + ModuleConfig, + PythonConfig, + ScriberConfig, + SupportContentConfig, + TokenConfig, +) DEFAULT_CODE_PATTERNS = [ "**/*.py", @@ -26,6 +33,15 @@ "**/*.cpp", "**/*.h", "**/*.hpp", + "**/*.html", + "**/*.htm", + "**/*.vue", + "**/*.svelte", + "**/*.astro", + "**/*.css", + "**/*.scss", + "**/*.sass", + "**/*.less", ] DEFAULT_SUPPORT_PATTERNS = [ @@ -81,6 +97,7 @@ "pnpm-lock.yaml", "yarn.lock", "**/*.svg", + "**/*.json", ] DEFAULT_SUPPORT_FULL = [ @@ -104,6 +121,7 @@ "README.md", "Cargo.toml", "go.mod", + "**/*.json", ] DEFAULT_SUPPORT_TREE_ONLY = [ @@ -147,8 +165,8 @@ output = ".scriber/scriber_pack.md" only_tree = false use_gitignore = true -max_files = 60 -max_tokens = 100000 +max_files = 0 +max_tokens = 0 min_score = 45 path_style = "project-relative" allow_external_paths = false @@ -186,6 +204,7 @@ include_same_package = true include_parent_entrypoints = true include_project_configs = true +top_dependencies = 10 content_min_score = 50 tree_min_score = 30 @@ -228,7 +247,9 @@ def load_config(config_path: Path) -> ScriberConfig: config.max_tokens = int(data.get("max_tokens", config.max_tokens)) config.min_score = int(data.get("min_score", config.min_score)) config.path_style = str(data.get("path_style", config.path_style)) - config.allow_external_paths = bool(data.get("allow_external_paths", 
config.allow_external_paths)) + config.allow_external_paths = bool( + data.get("allow_external_paths", config.allow_external_paths) + ) code_files = data.get("code_files", {}) if isinstance(code_files, dict) and isinstance(code_files.get("patterns"), list): @@ -243,9 +264,19 @@ def load_config(config_path: Path) -> ScriberConfig: if isinstance(content, dict): config.support_content = SupportContentConfig( default=content.get("default", config.support_content.default), - full=[str(item) for item in content.get("full", config.support_content.full)], - tree_only=[str(item) for item in content.get("tree_only", config.support_content.tree_only)], - auto_max_bytes=int(content.get("auto_max_bytes", config.support_content.auto_max_bytes)), + full=[ + str(item) + for item in content.get("full", config.support_content.full) + ], + tree_only=[ + str(item) + for item in content.get( + "tree_only", config.support_content.tree_only + ) + ], + auto_max_bytes=int( + content.get("auto_max_bytes", config.support_content.auto_max_bytes) + ), ) if not config.support_content.full: config.support_content.full = list(DEFAULT_SUPPORT_FULL) @@ -265,14 +296,49 @@ def load_config(config_path: Path) -> ScriberConfig: config.modules_config = ModuleConfig( enabled=bool(modules.get("enabled", config.modules_config.enabled)), depth=int(modules.get("depth", config.modules_config.depth)), - include_direct_dependencies=bool(modules.get("include_direct_dependencies", config.modules_config.include_direct_dependencies)), - include_reverse_dependencies=bool(modules.get("include_reverse_dependencies", config.modules_config.include_reverse_dependencies)), - include_tests=bool(modules.get("include_tests", config.modules_config.include_tests)), - include_same_package=bool(modules.get("include_same_package", config.modules_config.include_same_package)), - include_parent_entrypoints=bool(modules.get("include_parent_entrypoints", config.modules_config.include_parent_entrypoints)), - 
include_project_configs=bool(modules.get("include_project_configs", config.modules_config.include_project_configs)), - content_min_score=int(modules.get("content_min_score", config.modules_config.content_min_score)), - tree_min_score=int(modules.get("tree_min_score", config.modules_config.tree_min_score)), + include_direct_dependencies=bool( + modules.get( + "include_direct_dependencies", + config.modules_config.include_direct_dependencies, + ) + ), + include_reverse_dependencies=bool( + modules.get( + "include_reverse_dependencies", + config.modules_config.include_reverse_dependencies, + ) + ), + include_tests=bool( + modules.get("include_tests", config.modules_config.include_tests) + ), + include_same_package=bool( + modules.get( + "include_same_package", config.modules_config.include_same_package + ) + ), + include_parent_entrypoints=bool( + modules.get( + "include_parent_entrypoints", + config.modules_config.include_parent_entrypoints, + ) + ), + include_project_configs=bool( + modules.get( + "include_project_configs", + config.modules_config.include_project_configs, + ) + ), + top_dependencies=int( + modules.get("top_dependencies", config.modules_config.top_dependencies) + ), + content_min_score=int( + modules.get( + "content_min_score", config.modules_config.content_min_score + ) + ), + tree_min_score=int( + modules.get("tree_min_score", config.modules_config.tree_min_score) + ), scoring=scoring, ) config.modules = config.modules_config.enabled @@ -280,17 +346,34 @@ def load_config(config_path: Path) -> ScriberConfig: python = data.get("python", {}) if isinstance(python, dict): config.python = PythonConfig( - source_roots=[str(item) for item in python.get("source_roots", config.python.source_roots)], - test_roots=[str(item) for item in python.get("test_roots", config.python.test_roots)], - module_init_files=[str(item) for item in python.get("module_init_files", config.python.module_init_files)], - entrypoint_patterns=[str(item) for item in 
python.get("entrypoint_patterns", config.python.entrypoint_patterns)], + source_roots=[ + str(item) + for item in python.get("source_roots", config.python.source_roots) + ], + test_roots=[ + str(item) for item in python.get("test_roots", config.python.test_roots) + ], + module_init_files=[ + str(item) + for item in python.get( + "module_init_files", config.python.module_init_files + ) + ], + entrypoint_patterns=[ + str(item) + for item in python.get( + "entrypoint_patterns", config.python.entrypoint_patterns + ) + ], ) tokens = data.get("tokens", {}) if isinstance(tokens, dict): config.tokens = TokenConfig( estimator=str(tokens.get("estimator", config.tokens.estimator)), - chars_per_token=int(tokens.get("chars_per_token", config.tokens.chars_per_token)), + chars_per_token=int( + tokens.get("chars_per_token", config.tokens.chars_per_token) + ), ) cache = data.get("cache", {}) @@ -348,25 +431,31 @@ class ConfigIssue: def validate_raw_config(raw_data: dict[str, Any]) -> list[ConfigIssue]: issues: list[ConfigIssue] = [] - + # 1. check if raw_data contains tool.scriber tool = raw_data.get("tool", {}) if isinstance(raw_data, dict) else {} if not isinstance(tool, dict): issues.append(ConfigIssue("error", "[tool] in pyproject.toml must be a table.")) return issues - + data = tool.get("scriber", {}) if isinstance(tool, dict) else {} if not data: - issues.append(ConfigIssue("warning", "[tool.scriber] section is missing or empty.")) + issues.append( + ConfigIssue("warning", "[tool.scriber] section is missing or empty.") + ) return issues - + if not isinstance(data, dict): issues.append(ConfigIssue("error", "[tool.scriber] must be a table.")) return issues # 2. check output format if "format" in data and data["format"] not in {"md", "txt"}: - issues.append(ConfigIssue("error", f"Invalid format: '{data['format']}'. Must be 'md' or 'txt'.")) + issues.append( + ConfigIssue( + "error", f"Invalid format: '{data['format']}'. Must be 'md' or 'txt'." + ) + ) # 4. 
check support_content default support_files = data.get("support_files", {}) @@ -375,7 +464,12 @@ def validate_raw_config(raw_data: dict[str, Any]) -> list[ConfigIssue]: if isinstance(content, dict) and "default" in content: val = content["default"] if val not in {"full", "auto", "tree_only"}: - issues.append(ConfigIssue("error", f"Invalid support_files.content.default: '{val}'. Must be 'full', 'auto', or 'tree_only'.")) + issues.append( + ConfigIssue( + "error", + f"Invalid support_files.content.default: '{val}'. Must be 'full', 'auto', or 'tree_only'.", + ) + ) # 5. check numeric values >= 0 for field in ["max_files", "max_tokens", "min_score"]: @@ -383,20 +477,57 @@ def validate_raw_config(raw_data: dict[str, Any]) -> list[ConfigIssue]: try: val = int(data[field]) if val < 0: - issues.append(ConfigIssue("error", f"{field} must be a number >= 0. Got: {val}")) + issues.append( + ConfigIssue( + "error", f"{field} must be a number >= 0. Got: {val}" + ) + ) + except (ValueError, TypeError): + issues.append( + ConfigIssue( + "error", f"{field} must be an integer. Got: {data[field]}" + ) + ) + + modules = data.get("modules", {}) + if isinstance(modules, dict): + if "top_dependencies" in modules: + try: + val = int(modules["top_dependencies"]) + if val < 0: + issues.append( + ConfigIssue( + "error", + f"modules.top_dependencies must be a number >= 0. Got: {val}", + ) + ) except (ValueError, TypeError): - issues.append(ConfigIssue("error", f"{field} must be an integer. Got: {data[field]}")) + issues.append( + ConfigIssue( + "error", + f"modules.top_dependencies must be an integer. Got: {modules['top_dependencies']}", + ) + ) # 6. 
check patterns are list of strings def check_pattern_list(parent_dict: dict[str, Any], path_name: str) -> None: if "patterns" in parent_dict: patterns = parent_dict["patterns"] if not isinstance(patterns, list): - issues.append(ConfigIssue("error", f"{path_name}.patterns must be a list of strings.")) + issues.append( + ConfigIssue( + "error", f"{path_name}.patterns must be a list of strings." + ) + ) else: for item in patterns: if not isinstance(item, str): - issues.append(ConfigIssue("error", f"Pattern in {path_name}.patterns must be a string. Got: {item}")) + issues.append( + ConfigIssue( + "error", + f"Pattern in {path_name}.patterns must be a string. Got: {item}", + ) + ) code_files = data.get("code_files", {}) if isinstance(code_files, dict): @@ -406,7 +537,7 @@ def check_pattern_list(parent_dict: dict[str, Any], path_name: str) -> None: if isinstance(support_files, dict): check_pattern_list(support_files, "support_files") - + # Check support_files.content full and tree_only patterns content = support_files.get("content", {}) if isinstance(content, dict): @@ -414,11 +545,21 @@ def check_pattern_list(parent_dict: dict[str, Any], path_name: str) -> None: if field in content: patterns = content[field] if not isinstance(patterns, list): - issues.append(ConfigIssue("error", f"support_files.content.{field} must be a list of strings.")) + issues.append( + ConfigIssue( + "error", + f"support_files.content.{field} must be a list of strings.", + ) + ) else: for item in patterns: if not isinstance(item, str): - issues.append(ConfigIssue("error", f"Pattern in support_files.content.{field} must be a string. Got: {item}")) + issues.append( + ConfigIssue( + "error", + f"Pattern in support_files.content.{field} must be a string. 
Got: {item}", + ) + ) elif "support_files" in data: issues.append(ConfigIssue("error", "support_files must be a table.")) @@ -430,18 +571,29 @@ def check_pattern_list(parent_dict: dict[str, Any], path_name: str) -> None: return issues -def validate_config(config: ScriberConfig, raw_data: dict[str, Any], config_path: Path | None = None) -> list[ConfigIssue]: + +def validate_config( + config: ScriberConfig, raw_data: dict[str, Any], config_path: Path | None = None +) -> list[ConfigIssue]: issues = validate_raw_config(raw_data) - + # Check output path is not a directory output_path = config.output if not output_path.is_absolute() and config_path: output_path = config_path.parent / output_path - + if output_path.suffix == "" and not str(output_path).endswith("-"): - issues.append(ConfigIssue("warning", f"Output path '{output_path}' has no extension. Is it a directory?")) + issues.append( + ConfigIssue( + "warning", + f"Output path '{output_path}' has no extension. Is it a directory?", + ) + ) if output_path.exists() and output_path.is_dir(): - issues.append(ConfigIssue("error", f"Output path '{output_path}' points to an existing directory.")) + issues.append( + ConfigIssue( + "error", f"Output path '{output_path}' points to an existing directory." 
+ ) + ) return issues - diff --git a/src/scriber/core/init_config.py b/src/scriber/core/init_config.py index f345fe8..0809771 100644 --- a/src/scriber/core/init_config.py +++ b/src/scriber/core/init_config.py @@ -9,7 +9,7 @@ def replace_existing_tool_scriber_block(content: str, default_block: str) -> str lines = content.splitlines() new_lines = [] in_scriber = False - + for line in lines: stripped = line.strip() if stripped.startswith("[") and stripped.endswith("]"): @@ -19,10 +19,10 @@ def replace_existing_tool_scriber_block(content: str, default_block: str) -> str continue else: in_scriber = False - + if not in_scriber: new_lines.append(line) - + cleaned = "\n".join(new_lines).strip() if cleaned: return cleaned + "\n\n" + default_block + "\n" @@ -35,23 +35,27 @@ def init_project(config_path: str | None = None, force: bool = False) -> Path: path = path / "pyproject.toml" if not path.is_absolute(): path = Path.cwd() / path - + if path.exists(): content = path.read_text(encoding="utf-8") has_scriber = "[tool.scriber]" in content - + if has_scriber and not force: - raise ScriberError(f"Scriber config already exists. Use --force to replace it.") - + raise ScriberError( + "Scriber config already exists. Use --force to replace it." 
+ ) + if has_scriber: - new_content = replace_existing_tool_scriber_block(content, DEFAULT_CONFIG_BLOCK) + new_content = replace_existing_tool_scriber_block( + content, DEFAULT_CONFIG_BLOCK + ) else: if content and not content.endswith("\n"): content += "\n" new_content = content + "\n" + DEFAULT_CONFIG_BLOCK + "\n" - + path.write_text(new_content, encoding="utf-8") else: path.write_text(DEFAULT_CONFIG_BLOCK + "\n", encoding="utf-8") - + return path diff --git a/src/scriber/core/models.py b/src/scriber/core/models.py index c521c49..ca40354 100644 --- a/src/scriber/core/models.py +++ b/src/scriber/core/models.py @@ -2,7 +2,7 @@ from dataclasses import dataclass, field from pathlib import Path -from typing import Literal +from typing import Any, Literal FileKind = Literal["code", "support", "other"] ContentPolicy = Literal["full", "auto", "tree_only"] @@ -10,7 +10,6 @@ PackMode = Literal["focused", "project_snapshot"] - DEFAULT_SCORING: dict[str, int] = { "seed_file": 100, "seed_folder_file": 100, @@ -26,6 +25,10 @@ "documentation": 45, "name_similarity": 45, "shared_dependency_bonus": 10, + "entrypoint_file": 90, + "code_file": 80, + "test_file": 60, + "other_file": 40, } @@ -39,6 +42,7 @@ class ModuleConfig: include_same_package: bool = True include_parent_entrypoints: bool = True include_project_configs: bool = True + top_dependencies: int = 10 content_min_score: int = 50 tree_min_score: int = 30 scoring: dict[str, int] = field(default_factory=lambda: dict(DEFAULT_SCORING)) @@ -50,7 +54,14 @@ class PythonConfig: test_roots: list[str] = field(default_factory=lambda: ["tests", "test"]) module_init_files: list[str] = field(default_factory=lambda: ["__init__.py"]) entrypoint_patterns: list[str] = field( - default_factory=lambda: ["main.py", "app.py", "asgi.py", "wsgi.py", "routes.py", "router.py"] + default_factory=lambda: [ + "main.py", + "app.py", + "asgi.py", + "wsgi.py", + "routes.py", + "router.py", + ] ) @@ -108,13 +119,24 @@ class FileNode: is_binary: bool = 
False support_category: str | None = None content_policy: ContentPolicy = "auto" - _cached_text: str | None = field(default=None, init=False, repr=False, compare=False, hash=False) + _cached_text: str | None = field( + default=None, init=False, repr=False, compare=False, hash=False + ) def read_text(self) -> str: if self._cached_text is not None: return self._cached_text - from scriber.native import require_native - text = require_native().read_text(str(self.absolute)) + + try: + from scriber.native import is_native_available, require_native + + if is_native_available(): + text = require_native().read_text(str(self.absolute)) + else: + text = self.absolute.read_text(encoding="utf-8", errors="replace") + except Exception: + text = self.absolute.read_text(encoding="utf-8", errors="replace") + object.__setattr__(self, "_cached_text", text) return text @@ -141,12 +163,12 @@ class Candidate: reason_counts: dict[str, int] = field(default_factory=dict) reason_examples: dict[str, list[Path]] = field(default_factory=dict) reason_summary: str = "" + utility: float = 0.0 + raw_score: float = 0.0 + role: str = "unknown" -@dataclass(slots=True) -class ModuleGraph: - imports: dict[Path, set[Path]] = field(default_factory=dict) - imported_by: dict[Path, set[Path]] = field(default_factory=dict) +from scriber.graph.model import RelationEdge, RelationGraph, ModuleGraph # noqa: E402 @dataclass(slots=True) @@ -160,8 +182,88 @@ class ScriberPack: output_format: OutputFormat mode: PackMode total_tokens: int = 0 + stats: dict[str, Any] = field(default_factory=dict) timings: dict[str, float] = field(default_factory=dict) @property def included_paths(self) -> list[Path]: return [candidate.file.relative for candidate in self.candidates] + + +ContentMode = Literal["full", "excerpt", "outline", "tree", "omit"] + +FileRole = Literal[ + "entrypoint", + "orchestrator", + "model", + "config", + "graph", + "ranker", + "renderer", + "scanner", + "language_adapter", + "native_adapter", + "test", + 
"support", + "docs", + "generated", + "unknown", +] + + +@dataclass(frozen=True, slots=True) +class FileRef: + path: Path + kind: FileKind + language: str + size_bytes: int + token_estimate: int + role: FileRole = "unknown" + + +@dataclass(frozen=True, slots=True) +class FileOutline: + path: Path + language: str + purpose: str | None + imports: list[str] + exports: list[str] + classes: list[str] + functions: list[str] + constants: list[str] + notes: list[str] + token_estimate: int + + +@dataclass(slots=True) +class PackItem: + file: FileNode + score: int + role: FileRole + content_mode: ContentMode + reason: str + reasons: list[str] + relation_evidence: list[RelationEdge] + outline: FileOutline | None = None + content: str | None = None + excerpts: list[str] = field(default_factory=list) + token_estimate: int = 0 + item_id: str = "" + utility: float = 0.0 + raw_score: float = 0.0 + + +@dataclass(slots=True) +class LlmPack: + project_root: Path + config_path: Path + profile: str + mode: PackMode + goal: str | None + budget_target: int + budget_actual: int + items: list[PackItem] + graph: RelationGraph + stats: dict[str, Any] + warnings: list[str] + timings: dict[str, float] = field(default_factory=dict) diff --git a/src/scriber/core/profiles.py b/src/scriber/core/profiles.py new file mode 100644 index 0000000..c8de117 --- /dev/null +++ b/src/scriber/core/profiles.py @@ -0,0 +1,46 @@ +from __future__ import annotations +from copy import deepcopy +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from scriber.core.models import ScriberConfig + +PROFILE_CHOICES = ("default", "audit", "debug", "refactor", "docs") + + +def apply_profile(config: ScriberConfig, profile: str) -> ScriberConfig: + if profile == "default" or not profile: + return config + + cfg = deepcopy(config) + scoring = cfg.modules_config.scoring + + if profile == "audit": + scoring["test_file"] = 80 + scoring["project_config"] = 90 + scoring["dependency_file"] = 90 + scoring["runtime_support"] = 85 
+ scoring["documentation"] = 70 + + elif profile == "debug": + scoring["direct_dependency"] = 90 + scoring["reverse_dependency"] = 80 + scoring["test_file"] = 70 + scoring["runtime_support"] = 80 + scoring["support_near_seed"] = 80 + + elif profile == "refactor": + scoring["same_package"] = 80 + scoring["related_test"] = 90 + scoring["test_file"] = 75 + scoring["direct_dependency"] = 60 + + elif profile == "docs": + scoring["documentation"] = 95 + scoring["project_config"] = 50 + scoring["dependency_file"] = 30 + scoring["test_file"] = 10 + scoring["code_file"] = 30 + cfg.support_content.default = "tree_only" + + return cfg diff --git a/src/scriber/core/root.py b/src/scriber/core/root.py index b8042b5..93c6743 100644 --- a/src/scriber/core/root.py +++ b/src/scriber/core/root.py @@ -16,7 +16,9 @@ def resolve_config_path(paths: list[str], explicit_config: str | None = None) -> if not config.exists(): raise ScriberError(f"Config not found: {config}") if config.name != "pyproject.toml": - raise ScriberError("Scriber 2.0 expects --config to point to pyproject.toml") + raise ScriberError( + "Scriber 2.0 expects --config to point to pyproject.toml" + ) return config starts: list[Path] = [] @@ -44,7 +46,9 @@ def resolve_config_path(paths: list[str], explicit_config: str | None = None) -> if candidate.exists(): return candidate.resolve() - raise ScriberError("No pyproject.toml found. Run `scriber init` or pass `--config /path/to/pyproject.toml`.") + raise ScriberError( + "No pyproject.toml found. Run `scriber init` or pass `--config /path/to/pyproject.toml`." 
+ ) def project_root_from_config(config_path: Path) -> Path: diff --git a/src/scriber/core/symbols.py b/src/scriber/core/symbols.py new file mode 100644 index 0000000..8877930 --- /dev/null +++ b/src/scriber/core/symbols.py @@ -0,0 +1,22 @@ +from dataclasses import dataclass, field +from pathlib import Path + + +@dataclass(slots=True) +class SymbolNode: + name: str + kind: str # "class" or "function" + line_start: int + line_end: int + parent_name: str | None = None + + +@dataclass(slots=True) +class SymbolIndex: + symbols_by_file: dict[Path, list[SymbolNode]] = field(default_factory=dict) + + def add_symbol(self, file_path: Path, symbol: SymbolNode) -> None: + self.symbols_by_file.setdefault(file_path, []).append(symbol) + + def get_symbols(self, file_path: Path) -> list[SymbolNode]: + return self.symbols_by_file.get(file_path, []) diff --git a/src/scriber/engine/ranker.py b/src/scriber/engine/ranker.py new file mode 100644 index 0000000..623f880 --- /dev/null +++ b/src/scriber/engine/ranker.py @@ -0,0 +1,152 @@ +from __future__ import annotations +from pathlib import Path +import math +from collections import deque, defaultdict +from scriber.core.models import FileNode, RelationGraph, ScriberConfig, Candidate +from scriber.engine.roles import classify_file_role, ROLE_SCORE + +RELATION_WEIGHT = { + "import": 90, + "reexport": 80, + "test_of": 78, + "entrypoint_to_module": 75, + "config_refs_code": 58, + "env_key": 52, + "doc_mentions_code": 42, + "git_cochange": 40, + "same_package": 28, + "same_dir": 20, + "name_similarity": 18, + "semantic_similarity": 15, +} + + +def rank_context( + files: dict[Path, FileNode], + graph: RelationGraph, + seeds: list[Path], + config: ScriberConfig, + mode: str, +) -> list[Candidate]: + candidates = [] + + explicit_seeds = {s for s in seeds} + + distances = {} + if mode == "focused": + adj_out = defaultdict(list) + adj_in = defaultdict(list) + for edge in graph.edges: + adj_out[edge.source].append(edge.target) + 
adj_in[edge.target].append(edge.source) + + q_out = deque() + q_in = deque() + dist_out = {} + dist_in = {} + + for s in explicit_seeds: + if s in files: + dist_out[s] = 0 + dist_in[s] = 0 + q_out.append(s) + q_in.append(s) + + while q_out: + curr = q_out.popleft() + d = dist_out[curr] + for nbr in adj_out[curr]: + if nbr not in dist_out: + dist_out[nbr] = d + 1 + q_out.append(nbr) + + while q_in: + curr = q_in.popleft() + d = dist_in[curr] + for nbr in adj_in[curr]: + if nbr not in dist_in: + dist_in[nbr] = d + 1 + q_in.append(nbr) + + for rel in files.keys(): + d_out = dist_out.get(rel, 999) + d_in = dist_in.get(rel, 999) + distances[rel] = min(d_out, d_in) + + for rel, node in files.items(): + role = classify_file_role(node, graph) + role_score = ROLE_SCORE.get(role, 20) + + relation_score = 0.0 + incoming = graph.incoming.get(rel, []) + for edge in incoming: + weight = RELATION_WEIGHT.get(edge.kind, 10) * edge.weight * edge.confidence + relation_score += weight + + centrality_bonus = 0 + evidence_bonus = len(incoming) * 2 + noise_penalty = 0 + + if node.language in {"json", "lock", "svg"}: + noise_penalty += 50 + + if mode == "focused": + dist = distances.get(rel, 999) + if dist == 0: + decay = 1.0 + seed_bonus = 100 + max_score = 100 + elif dist == 1: + decay = 1.0 + seed_bonus = 0 + max_score = 79 + elif dist == 2: + decay = 0.5 + seed_bonus = 0 + max_score = 74 + else: + decay = 0.1 + seed_bonus = 0 + max_score = 44 + else: + decay = 1.0 + seed_bonus = 100 if rel in explicit_seeds else 0 + max_score = 100 + + if mode == "focused" and role == "test" and rel not in explicit_seeds: + noise_penalty += 80 + max_score = min( + max_score, 44 + ) # Force test files to tree mode unless specifically targeted + + raw_score = ( + role_score + + relation_score + + seed_bonus + + centrality_bonus + + evidence_bonus + - noise_penalty + ) * decay + + token_estimate = node.size_bytes // 4 + utility = raw_score / math.sqrt(token_estimate + 200) + + c = Candidate( + file=node, 
+ score=int( + min(max_score, max(0, raw_score)) + ), # clamp to distance-based max_score + reasons=[f"Role {role}: {role_score}", f"Relations: {relation_score:.1f}"], + include_content=False, + token_estimate=token_estimate, + ) + + object.__setattr__(c, "utility", utility) + object.__setattr__(c, "raw_score", raw_score) + object.__setattr__(c, "role", role) + + candidates.append(c) + + # Primary sort by utility, then score + candidates.sort(key=lambda c: (getattr(c, "utility", 0), c.score), reverse=True) + return candidates diff --git a/src/scriber/engine/roles.py b/src/scriber/engine/roles.py new file mode 100644 index 0000000..8607b56 --- /dev/null +++ b/src/scriber/engine/roles.py @@ -0,0 +1,72 @@ +from __future__ import annotations +from scriber.core.models import FileNode, FileRole, RelationGraph + +ROLE_SCORE: dict[FileRole, int] = { + "entrypoint": 95, + "orchestrator": 95, + "graph": 90, + "ranker": 90, + "renderer": 90, + "model": 88, + "config": 82, + "scanner": 75, + "native_adapter": 65, + "language_adapter": 65, + "test": 55, + "support": 45, + "docs": 35, + "generated": 5, + "unknown": 20, +} + + +def _is_test_path(rel: str, test_roots: set[str]) -> bool: + parts = rel.split("/") + name = parts[-1] + parent_parts = parts[:-1] + return ( + any(part in test_roots for part in parent_parts) + or name.startswith("test_") + or name.endswith("_test.py") + or name.endswith(".test.py") + ) + + +def classify_file_role(file: FileNode, graph: RelationGraph) -> FileRole: + rel = file.relative.as_posix().lower() + + if rel in {"cli/main.py", "src/scriber/cli/main.py", "src/main.py", "main.py"}: + return "entrypoint" + if "orchestrator" in rel or "pack.py" in rel or "build.py" in rel: + return "orchestrator" + if "core/models.py" in rel or "model.py" in rel: + return "model" + if "core/config.py" in rel or "config.py" in rel: + return "config" + if file.kind == "code" and _is_test_path(rel, {"tests", "test"}): + return "test" + if "languages/" in rel: + return 
"language_adapter" + if "graph/" in rel: + return "graph" + if "ranker.py" in rel or "scorer.py" in rel: + return "ranker" + if "renderer" in rel or "llm_report" in rel: + return "renderer" + if "scanner/" in rel: + return "scanner" + if ( + rel.endswith("native.py") + or "rust/scriber_native/" in rel + or ("native" in rel and file.language == "rust") + ): + return "native_adapter" + if "readme" in rel or rel.startswith("docs"): + return "docs" + if ( + rel in {"pyproject.toml", "package.json", "cargo.toml"} + or file.kind == "support" + ): + return "support" + + return "unknown" diff --git a/src/scriber/engine/scorer.py b/src/scriber/engine/scorer.py index 0dbfb24..db018a8 100644 --- a/src/scriber/engine/scorer.py +++ b/src/scriber/engine/scorer.py @@ -3,14 +3,23 @@ from pathlib import Path from scriber.core.matchers import match_pattern -from scriber.core.models import Candidate, FileNode, ModuleGraph, ScriberConfig, SeedPath +from scriber.core.models import ( + Candidate, + FileNode, + ModuleGraph, + ScriberConfig, + SeedPath, + RelationEdge, +) def _score(config: ScriberConfig, key: str) -> int: return int(config.modules_config.scoring.get(key, 0)) -def _add_reason(candidate: Candidate, kind: str, label: str, example: Path | None = None) -> None: +def _add_reason( + candidate: Candidate, kind: str, label: str, example: Path | None = None +) -> None: candidate.reason_counts[kind] = candidate.reason_counts.get(kind, 0) + 1 if example is not None: if kind not in candidate.reason_examples: @@ -89,18 +98,22 @@ def _add( candidates[rel] = existing else: existing.score = max(existing.score, score) - + _add_reason(existing, kind, label, example=seed) if seed is not None: existing.seed_sources.add(seed) def _is_test_file(rel: Path, config: ScriberConfig) -> bool: - parts = rel.parts + parts = rel.parts[:-1] if len(rel.parts) > 1 else () name = rel.name.lower() if any(part in set(config.python.test_roots) for part in parts): return True - return name.startswith("test_") 
or name.endswith("_test.py") or name.endswith(".test.py") + return ( + name.startswith("test_") + or name.endswith("_test.py") + or name.endswith(".test.py") + ) def _name_related(a: Path, b: Path) -> bool: @@ -111,7 +124,68 @@ def _name_related(a: Path, b: Path) -> bool: return a_stem in b_stem or b_stem in a_stem -def _walk_neighbors(edges: dict[Path, set[Path]], start: Path, depth: int) -> dict[Path, int]: +def _walk_weighted_neighbors( + edges: list[RelationEdge], + start: Path, + depth_limit: int, + top_dependencies: int, + reverse: bool = False, +) -> dict[Path, float]: + import heapq + + adj: dict[Path, list[tuple[Path, RelationEdge]]] = {} + for edge in edges: + u = edge.target if reverse else edge.source + v = edge.source if reverse else edge.target + adj.setdefault(u, []).append((v, edge)) + + if top_dependencies > 0: + for u, edges_from_u in adj.items(): + if len(edges_from_u) > top_dependencies: + edges_from_u.sort( + key=lambda item: item[1].weight * item[1].confidence, reverse=True + ) + adj[u] = edges_from_u[:top_dependencies] + + queue = [(-1.0, 0, start)] + max_strength: dict[Path, float] = {start: 1.0} + best_at_state: dict[tuple[Path, int], float] = {(start, 0): 1.0} + + while queue: + neg_str, depth, u = heapq.heappop(queue) + u_str = -neg_str + + if u_str < best_at_state.get((u, depth), 0.0): + continue + + if depth >= depth_limit: + continue + + for neighbor, edge in adj.get(u, []): + if edge.kind in {"import", "reexport"}: + edge_str = 1.0 if depth == 0 else 0.88 + else: + edge_str = edge.weight * edge.confidence + + next_str = u_str * edge_str + next_depth = depth + 1 + + if next_str > max_strength.get(neighbor, 0.0): + max_strength[neighbor] = next_str + + if next_str > best_at_state.get((neighbor, next_depth), 0.0): + best_at_state[(neighbor, next_depth)] = next_str + heapq.heappush(queue, (-next_str, next_depth, neighbor)) + + if start in max_strength: + del max_strength[start] + + return max_strength + + +def _walk_neighbors( + edges: 
dict[Path, set[Path]], start: Path, depth: int +) -> dict[Path, int]: found: dict[Path, int] = {} frontier = {start} visited = {start} @@ -136,7 +210,12 @@ def _support_base_score(file: FileNode, config: ScriberConfig) -> int: return _score(config, "project_config") if category == "dependency file": return _score(config, "dependency_file") - if category in {"runtime support", "runtime config", "ci support", "tooling config"}: + if category in { + "runtime support", + "runtime config", + "ci support", + "tooling config", + }: return _score(config, "runtime_support") if category == "documentation": return _score(config, "documentation") @@ -147,11 +226,18 @@ def _is_near_seed(support_file: Path, seed: Path) -> bool: if support_file.parent == Path("."): return True seed_parent = seed.parent - return support_file.parent == seed_parent or support_file.parent in seed_parent.parents or seed_parent in support_file.parent.parents + return ( + support_file.parent == seed_parent + or support_file.parent in seed_parent.parents + or seed_parent in support_file.parent.parents + ) def _matches_entrypoint(rel: Path, config: ScriberConfig) -> bool: - return any(match_pattern(rel.name, pattern) for pattern in config.python.entrypoint_patterns) + return any( + match_pattern(rel.name, pattern) + for pattern in config.python.entrypoint_patterns + ) def score_candidates_project_snapshot( @@ -165,17 +251,45 @@ def score_candidates_project_snapshot( for rel, file in files.items(): if file.kind == "code": if _matches_entrypoint(rel, config): - _add(candidates, files, rel, 90, "entrypoint", "entrypoint file") + _add( + candidates, + files, + rel, + _score(config, "entrypoint_file"), + "entrypoint", + "entrypoint file", + ) elif _is_test_file(rel, config): - _add(candidates, files, rel, 60, "test_file", "test file") + _add( + candidates, + files, + rel, + _score(config, "test_file"), + "test_file", + "test file", + ) else: - _add(candidates, files, rel, 80, "code_file", "code file") + _add( 
+ candidates, + files, + rel, + _score(config, "code_file"), + "code_file", + "code file", + ) elif file.kind == "support" and config.support: base = _support_base_score(file, config) category = file.support_category or "support file" _add(candidates, files, rel, base, "project_support", category) elif file.kind == "other": - _add(candidates, files, rel, 40, "other_file", "other file") + _add( + candidates, + files, + rel, + _score(config, "other_file"), + "other_file", + "other file", + ) for candidate in candidates.values(): candidate.reason_summary = _build_reason_summary(candidate) @@ -183,16 +297,37 @@ def score_candidates_project_snapshot( filtered = [ candidate for rel, candidate in candidates.items() - if candidate.score >= config.min_score or candidate.score >= config.modules_config.tree_min_score + if candidate.score >= config.min_score + or candidate.score >= config.modules_config.tree_min_score ] - filtered.sort(key=lambda item: (-item.score, item.file.kind != "code", item.file.relative.as_posix())) + filtered.sort( + key=lambda item: ( + -item.score, + item.file.kind != "code", + item.file.relative.as_posix(), + ) + ) if config.max_files > 0 and len(filtered) > config.max_files: - pinned = [c for c in filtered if c.file.relative.name in {"pyproject.toml", "README.md"}] - rest = [c for c in filtered if c.file.relative.name not in {"pyproject.toml", "README.md"}] + pinned = [ + c + for c in filtered + if c.file.relative.name in {"pyproject.toml", "README.md"} + ] + rest = [ + c + for c in filtered + if c.file.relative.name not in {"pyproject.toml", "README.md"} + ] remaining = max(0, config.max_files - len(pinned)) filtered = pinned + rest[:remaining] - filtered.sort(key=lambda item: (-item.score, item.file.kind != "code", item.file.relative.as_posix())) + filtered.sort( + key=lambda item: ( + -item.score, + item.file.kind != "code", + item.file.relative.as_posix(), + ) + ) return filtered @@ -206,7 +341,9 @@ def score_candidates( mode: str = "focused", 
) -> list[Candidate]: if mode == "project_snapshot": - return score_candidates_project_snapshot(files=files, graph=graph, config=config) + return score_candidates_project_snapshot( + files=files, graph=graph, config=config + ) candidates: dict[Path, Candidate] = {} scoring = config.modules_config @@ -216,43 +353,131 @@ def score_candidates( for seed in seeds: for rel in seed.expanded_files: key = "seed_folder_file" if seed.is_dir else "seed_file" - reason = f"file inside seed folder `{seed.relative.as_posix()}`" if seed.is_dir else "seed file" - _add(candidates, files, rel, _score(config, key), "seed_folder_file" if seed.is_dir else "seed_file", reason, seed=rel) + reason = ( + f"file inside seed folder `{seed.relative.as_posix()}`" + if seed.is_dir + else "seed file" + ) + _add( + candidates, + files, + rel, + _score(config, key), + "seed_folder_file" if seed.is_dir else "seed_file", + reason, + seed=rel, + ) if config.modules and scoring.enabled: for seed_rel in seed_files: if scoring.include_direct_dependencies: - for dep, distance in _walk_neighbors(graph.imports, seed_rel, scoring.depth).items(): - score = max(scoring.tree_min_score, _score(config, "direct_dependency") - ((distance - 1) * 10)) - _add(candidates, files, dep, score, "direct_dependency", f"direct dependency of `{seed_rel.as_posix()}`", seed=seed_rel) + for dep, strength in _walk_weighted_neighbors( + graph.edges, seed_rel, scoring.depth, scoring.top_dependencies + ).items(): + score = max( + scoring.tree_min_score, + int(_score(config, "direct_dependency") * strength), + ) + _add( + candidates, + files, + dep, + score, + "direct_dependency", + f"direct dependency of `{seed_rel.as_posix()}`", + seed=seed_rel, + ) if scoring.include_reverse_dependencies: - for dep, distance in _walk_neighbors(graph.imported_by, seed_rel, scoring.depth).items(): - score = max(scoring.tree_min_score, _score(config, "reverse_dependency") - ((distance - 1) * 10)) - _add(candidates, files, dep, score, 
"reverse_dependency", f"imports seed `{seed_rel.as_posix()}`", seed=seed_rel) + for dep, strength in _walk_weighted_neighbors( + graph.edges, + seed_rel, + scoring.depth, + scoring.top_dependencies, + reverse=True, + ).items(): + score = max( + scoring.tree_min_score, + int(_score(config, "reverse_dependency") * strength), + ) + _add( + candidates, + files, + dep, + score, + "reverse_dependency", + f"imports seed `{seed_rel.as_posix()}`", + seed=seed_rel, + ) if scoring.include_same_package: seed_parent = seed_rel.parent for rel, file in files.items(): - if file.kind == "code" and rel.parent == seed_parent and rel not in seed_set: - _add(candidates, files, rel, _score(config, "same_package"), "same_package", f"same package as `{seed_rel.as_posix()}`", seed=seed_rel) + if ( + file.kind == "code" + and rel.parent == seed_parent + and rel not in seed_set + ): + _add( + candidates, + files, + rel, + _score(config, "same_package"), + "same_package", + f"same package as `{seed_rel.as_posix()}`", + seed=seed_rel, + ) if scoring.include_parent_entrypoints: for rel, file in files.items(): if file.kind == "code" and _matches_entrypoint(rel, config): - if rel.parent == Path(".") or rel.parent in seed_rel.parents or seed_rel.parent in rel.parents: - _add(candidates, files, rel, _score(config, "parent_entrypoint"), "parent_entrypoint", f"parent/entrypoint near `{seed_rel.as_posix()}`", seed=seed_rel) + if ( + rel.parent == Path(".") + or rel.parent in seed_rel.parents + or seed_rel.parent in rel.parents + ): + _add( + candidates, + files, + rel, + _score(config, "parent_entrypoint"), + "parent_entrypoint", + f"parent/entrypoint near `{seed_rel.as_posix()}`", + seed=seed_rel, + ) if scoring.include_tests: for rel, file in files.items(): if file.kind != "code" or not _is_test_file(rel, config): continue - if _name_related(rel, seed_rel) or seed_rel in graph.imports.get(rel, set()): - _add(candidates, files, rel, _score(config, "related_test"), "related_test", f"related test for 
`{seed_rel.as_posix()}`", seed=seed_rel) + if _name_related(rel, seed_rel) or seed_rel in graph.imports.get( + rel, set() + ): + _add( + candidates, + files, + rel, + _score(config, "related_test"), + "related_test", + f"related test for `{seed_rel.as_posix()}`", + seed=seed_rel, + ) for rel, file in files.items(): - if file.kind == "code" and rel not in seed_set and _name_related(rel, seed_rel): - _add(candidates, files, rel, _score(config, "name_similarity"), "name_similarity", f"name similarity with `{seed_rel.as_posix()}`", seed=seed_rel) + if ( + file.kind == "code" + and rel not in seed_set + and _name_related(rel, seed_rel) + ): + _add( + candidates, + files, + rel, + _score(config, "name_similarity"), + "name_similarity", + f"name similarity with `{seed_rel.as_posix()}`", + seed=seed_rel, + ) if config.support: for rel, file in files.items(): @@ -261,24 +486,52 @@ def score_candidates( base = _support_base_score(file, config) reason = file.support_category or "support file" if rel.name == "pyproject.toml": - _add(candidates, files, rel, _score(config, "project_config"), "project_support", "project config/root file") + _add( + candidates, + files, + rel, + _score(config, "project_config"), + "project_support", + "project config/root file", + ) continue added = False for seed_rel in seed_files: if _is_near_seed(rel, seed_rel): - _add(candidates, files, rel, max(base, _score(config, "support_near_seed")), "support_near_seed", f"{reason} near `{seed_rel.as_posix()}`", seed=seed_rel) + _add( + candidates, + files, + rel, + max(base, _score(config, "support_near_seed")), + "support_near_seed", + f"{reason} near `{seed_rel.as_posix()}`", + seed=seed_rel, + ) added = True - if not added and file.relative.parent == Path(".") and scoring.include_project_configs: + if ( + not added + and file.relative.parent == Path(".") + and scoring.include_project_configs + ): _add(candidates, files, rel, base, "project_support", reason) else: if config.support: pyproject = 
files.get(Path("pyproject.toml")) if pyproject: - _add(candidates, files, Path("pyproject.toml"), _score(config, "project_config"), "project_support", "project config/root file") + _add( + candidates, + files, + Path("pyproject.toml"), + _score(config, "project_config"), + "project_support", + "project config/root file", + ) for candidate in candidates.values(): if len(candidate.seed_sources) > 1: - candidate.score = min(100, candidate.score + _score(config, "shared_dependency_bonus")) + candidate.score = min( + 100, candidate.score + _score(config, "shared_dependency_bonus") + ) _add_reason(candidate, "shared_dependency", "shared by multiple seed paths") for candidate in candidates.values(): @@ -288,15 +541,39 @@ def score_candidates( filtered = [ candidate for rel, candidate in candidates.items() - if rel in required or candidate.score >= config.min_score or candidate.score >= config.modules_config.tree_min_score + if rel in required + or candidate.score >= config.min_score + or candidate.score >= config.modules_config.tree_min_score ] - filtered.sort(key=lambda item: (-item.score, item.file.kind != "code", item.file.relative.as_posix())) + filtered.sort( + key=lambda item: ( + -item.score, + item.file.kind != "code", + item.file.relative.as_posix(), + ) + ) if config.max_files > 0 and len(filtered) > config.max_files: - seeds_first = [candidate for candidate in filtered if candidate.file.relative in required or candidate.file.relative.name in {"pyproject.toml", "README.md"}] - rest = [candidate for candidate in filtered if candidate.file.relative not in required and candidate.file.relative.name not in {"pyproject.toml", "README.md"}] + seeds_first = [ + candidate + for candidate in filtered + if candidate.file.relative in required + or candidate.file.relative.name in {"pyproject.toml", "README.md"} + ] + rest = [ + candidate + for candidate in filtered + if candidate.file.relative not in required + and candidate.file.relative.name not in {"pyproject.toml", 
"README.md"} + ] remaining = max(0, config.max_files - len(seeds_first)) filtered = seeds_first + rest[:remaining] - filtered.sort(key=lambda item: (-item.score, item.file.kind != "code", item.file.relative.as_posix())) + filtered.sort( + key=lambda item: ( + -item.score, + item.file.kind != "code", + item.file.relative.as_posix(), + ) + ) return filtered diff --git a/src/scriber/graph/analyzers/__init__.py b/src/scriber/graph/analyzers/__init__.py new file mode 100644 index 0000000..307bcf6 --- /dev/null +++ b/src/scriber/graph/analyzers/__init__.py @@ -0,0 +1,29 @@ +from pathlib import Path +from typing import Any +from scriber.graph.indexes import GraphIndexes +from scriber.graph.analyzers.tests import TestsAnalyzer +from scriber.graph.analyzers.package import PackageAnalyzer +from scriber.graph.analyzers.env import EnvAnalyzer +from scriber.graph.analyzers.config_refs import ConfigRefsAnalyzer +from scriber.graph.analyzers.docs import DocsAnalyzer + + +def generate_cheap_relations( + files: dict[Path, Any], edge_cls: Any, is_native: bool = False +) -> list[Any]: + indexes = GraphIndexes.build(files) + config = None # Passed as None for these simple analyzers + + analyzers = [ + TestsAnalyzer(), + PackageAnalyzer(), + EnvAnalyzer(), + ConfigRefsAnalyzer(), + DocsAnalyzer(), + ] + + edges = [] + for analyzer in analyzers: + edges.extend(analyzer.analyze(files, indexes, config, edge_cls, is_native)) + + return edges diff --git a/src/scriber/graph/analyzers/base.py b/src/scriber/graph/analyzers/base.py new file mode 100644 index 0000000..827d388 --- /dev/null +++ b/src/scriber/graph/analyzers/base.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from typing import Iterable, Protocol +from pathlib import Path + +from scriber.core.models import FileNode, ScriberConfig +from scriber.graph.indexes import GraphIndexes +from scriber.graph.model import RelationEdge + + +class RelationAnalyzer(Protocol): + name: str + + def analyze( + self, files: dict[Path, 
FileNode], indexes: GraphIndexes, config: ScriberConfig + ) -> Iterable[RelationEdge]: ... diff --git a/src/scriber/graph/analyzers/config_refs.py b/src/scriber/graph/analyzers/config_refs.py new file mode 100644 index 0000000..e7e0f17 --- /dev/null +++ b/src/scriber/graph/analyzers/config_refs.py @@ -0,0 +1,55 @@ +from __future__ import annotations +from typing import Iterable, Any +from pathlib import Path +from scriber.core.models import FileNode, ScriberConfig +from scriber.graph.indexes import GraphIndexes + + +def is_config_file(f: FileNode) -> bool: + name = f.relative.name.lower() + return name in { + "pyproject.toml", + "setup.py", + "package.json", + "dockerfile", + } or f.relative.suffix.lower() in {".toml", ".yaml", ".yml", ".json"} + + +class ConfigRefsAnalyzer: + name = "config_refs" + + def analyze( + self, + files: dict[Path, FileNode], + indexes: GraphIndexes, + config: ScriberConfig | None, + edge_cls: Any, + is_native: bool, + ) -> Iterable: + edges = [] + for rel, node in files.items(): + if is_config_file(node): + try: + content = node.absolute.read_text(encoding="utf-8", errors="ignore") + for crel, cnode in files.items(): + if cnode.kind == "code": + if crel.as_posix() in content or ( + len(crel.name) > 4 + and crel.name != "__init__.py" + and crel.name in content + ): + edges.append( + edge_cls( + source=str(rel) if is_native else rel, + target=str(crel) if is_native else crel, + kind="config_refs_code", + weight=0.6, + confidence=0.8, + evidence=f"Config {rel.name} references {crel.name}", + line=None, + analyzer="config_refs:indexed", + ) + ) + except Exception: + pass + return edges diff --git a/src/scriber/graph/analyzers/docs.py b/src/scriber/graph/analyzers/docs.py new file mode 100644 index 0000000..ca623f7 --- /dev/null +++ b/src/scriber/graph/analyzers/docs.py @@ -0,0 +1,49 @@ +from __future__ import annotations +from typing import Iterable, Any +from pathlib import Path +from scriber.core.models import FileNode, ScriberConfig +from 
scriber.graph.indexes import GraphIndexes + + +class DocsAnalyzer: + name = "docs" + + def analyze( + self, + files: dict[Path, FileNode], + indexes: GraphIndexes, + config: ScriberConfig | None, + edge_cls: Any, + is_native: bool, + ) -> Iterable: + edges = [] + for rel, node in files.items(): + name_lower = node.relative.name.lower() + if ( + name_lower in {"readme.md", "readme.txt", "readme"} + or "doc" in name_lower + ): + try: + content = node.absolute.read_text(encoding="utf-8", errors="ignore") + for crel, cnode in files.items(): + if cnode.kind == "code": + if crel.as_posix() in content or ( + len(crel.name) > 4 + and crel.name != "__init__.py" + and crel.name in content + ): + edges.append( + edge_cls( + source=str(rel) if is_native else rel, + target=str(crel) if is_native else crel, + kind="doc_mentions_code", + weight=0.42, + confidence=0.8, + evidence=f"{node.relative.name} mentions {crel.name}", + line=None, + analyzer="docs:indexed", + ) + ) + except Exception: + pass + return edges diff --git a/src/scriber/graph/analyzers/env.py b/src/scriber/graph/analyzers/env.py new file mode 100644 index 0000000..566b679 --- /dev/null +++ b/src/scriber/graph/analyzers/env.py @@ -0,0 +1,66 @@ +from __future__ import annotations +from typing import Iterable, Any +from pathlib import Path +import re +from scriber.core.models import FileNode, ScriberConfig +from scriber.graph.indexes import GraphIndexes + + +class EnvAnalyzer: + name = "env" + + def analyze( + self, + files: dict[Path, FileNode], + indexes: GraphIndexes, + config: ScriberConfig | None, + edge_cls: Any, + is_native: bool, + ) -> Iterable: + edges = [] + file_envs = {} + for rel, node in files.items(): + if node.kind != "code": + continue + try: + content = node.absolute.read_text(encoding="utf-8", errors="ignore") + keys = self.extract_env_keys(content) + if keys: + file_envs[rel] = keys + for k in keys: + indexes.env_key_to_files.setdefault(k, []).append(node) + except Exception: + pass + + for key, 
nodes in indexes.env_key_to_files.items(): + for i, n1 in enumerate(nodes): + for j, n2 in enumerate(nodes): + if i == j: + continue + edges.append( + edge_cls( + source=str(n1.relative) if is_native else n1.relative, + target=str(n2.relative) if is_native else n2.relative, + kind="env_key", + weight=0.4, + confidence=0.9, + evidence=f"Shared env key: {key}", + line=None, + analyzer="env:indexed", + ) + ) + return edges + + def extract_env_keys(self, content: str) -> set[str]: + keys = set() + for match in re.finditer( + r'os\.environ(?:\[|\.get\()[\'"]([A-Za-z0-9_]+)[\'"]', content + ): + keys.add(match.group(1)) + for match in re.finditer(r'os\.getenv\([\'"]([A-Za-z0-9_]+)[\'"]\)', content): + keys.add(match.group(1)) + for match in re.finditer( + r'process\.env(?:\[[\'"]([A-Za-z0-9_]+)[\'"]\]|\.([A-Za-z0-9_]+))', content + ): + keys.add(match.group(1) or match.group(2)) + return keys diff --git a/src/scriber/graph/analyzers/package.py b/src/scriber/graph/analyzers/package.py new file mode 100644 index 0000000..b7f7c2f --- /dev/null +++ b/src/scriber/graph/analyzers/package.py @@ -0,0 +1,42 @@ +from __future__ import annotations +from typing import Iterable, Any +from pathlib import Path +from scriber.core.models import FileNode, ScriberConfig +from scriber.graph.indexes import GraphIndexes + + +class PackageAnalyzer: + name = "package" + + def analyze( + self, + files: dict[Path, FileNode], + indexes: GraphIndexes, + config: ScriberConfig | None, + edge_cls: Any, + is_native: bool, + ) -> Iterable: + edges = [] + for d, siblings in indexes.by_dir.items(): + code_siblings = [s for s in siblings if s.kind == "code"] + for s1 in code_siblings: + count = 0 + for s2 in code_siblings: + if s1 == s2: + continue + count += 1 + if count > 8: + break + edges.append( + edge_cls( + source=str(s1.relative) if is_native else s1.relative, + target=str(s2.relative) if is_native else s2.relative, + kind="same_package", + weight=0.5, + confidence=1.0, + evidence=None, + 
line=None, + analyzer="package:indexed", + ) + ) + return edges diff --git a/src/scriber/graph/analyzers/tests.py b/src/scriber/graph/analyzers/tests.py new file mode 100644 index 0000000..24d7f0f --- /dev/null +++ b/src/scriber/graph/analyzers/tests.py @@ -0,0 +1,60 @@ +from __future__ import annotations +from typing import Iterable, Any +from pathlib import Path +from scriber.core.models import FileNode, ScriberConfig +from scriber.graph.indexes import GraphIndexes + + +class TestsAnalyzer: + name = "tests" + + def analyze( + self, + files: dict[Path, FileNode], + indexes: GraphIndexes, + config: ScriberConfig | None, + edge_cls: Any, + is_native: bool, + ) -> Iterable: + edges = [] + for rel, node in files.items(): + if node.kind != "code": + continue + stem = rel.stem.lower() + name = rel.name.lower() + clean_stem = ( + stem.replace("test_", "").replace("_test", "").replace(".test", "") + ) + is_test = ( + name.startswith("test_") + or name.endswith("_test.py") + or ".test." in name + ) + + if is_test and clean_stem: + targets = indexes.by_clean_stem.get(clean_stem, []) + for target_node in targets: + if target_node.relative == rel: + continue + target_name = target_node.relative.name.lower() + target_is_test = ( + target_name.startswith("test_") + or target_name.endswith("_test.py") + or ".test." 
in target_name + ) + if not target_is_test: + edges.append( + edge_cls( + source=str(rel) if is_native else rel, + target=str(target_node.relative) + if is_native + else target_node.relative, + kind="test_of", + weight=0.85, + confidence=0.9, + evidence=f"test filename {rel.name} matches {target_node.relative.name}", + line=None, + analyzer="tests:indexed", + ) + ) + return edges diff --git a/src/scriber/graph/builder.py b/src/scriber/graph/builder.py index a181441..33f2c98 100644 --- a/src/scriber/graph/builder.py +++ b/src/scriber/graph/builder.py @@ -1,13 +1,25 @@ from __future__ import annotations +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from scriber.cache import ScriberCache from pathlib import Path -from scriber.core.models import FileNode, ModuleGraph, ScriberConfig -from scriber.graph.languages.python import build_module_map, parse_python_imports, resolve_import_record -from scriber.scanner.files import read_text_lossy +from scriber.core.models import FileNode, ScriberConfig +from scriber.graph.model import ModuleGraph, RelationEdge +from scriber.graph.languages.python import ( + build_module_map, + parse_python_imports, + resolve_import_record, +) -def build_graph(files: dict[Path, FileNode], config: ScriberConfig) -> ModuleGraph: +def build_graph( + files: dict[Path, FileNode], + config: ScriberConfig, + cache: ScriberCache | None = None, +) -> ModuleGraph: graph = ModuleGraph() if not files: return graph @@ -22,15 +34,26 @@ def build_graph(files: dict[Path, FileNode], config: ScriberConfig) -> ModuleGra dir_to_files.setdefault(node.absolute.parent, []).append(node) sample = next(iter(files.values())) - root = Path(sample.absolute.as_posix()[:len(sample.absolute.as_posix()) - len(sample.relative.as_posix())]).resolve() + root = Path( + sample.absolute.as_posix()[ + : len(sample.absolute.as_posix()) - len(sample.relative.as_posix()) + ] + ).resolve() - from scriber.cache import ScriberCache - cache = ScriberCache(config, root) + if cache is 
None: + from scriber.cache import ScriberCache + + cache = ScriberCache(config, root) module_to_path, path_to_module = build_module_map(files, config.python) for rel, file in files.items(): - if file.kind != "code" or file.is_binary or file.language not in {"python", "javascript", "typescript", "rust", "go", "c", "cpp"}: + if ( + file.kind != "code" + or file.is_binary + or file.language + not in {"python", "javascript", "typescript", "rust", "go", "c", "cpp"} + ): continue try: @@ -42,12 +65,20 @@ def build_graph(files: dict[Path, FileNode], config: ScriberConfig) -> ModuleGra cached_data = cache.get_file(rel, mtime_ns, size) if cached_data is not None: - cached_imports = cache.get_imports(rel) + cached_imports = cache.get_imports(rel, mtime_ns, size) if cached_imports is not None: for target in cached_imports: if target in files: - graph.imports.setdefault(rel, set()).add(target) - graph.imported_by.setdefault(target, set()).add(rel) + graph.add_edge( + RelationEdge( + source=rel, + target=target, + kind="import", + weight=1.0, + confidence=0.98, + analyzer=f"imports:{file.language}", + ) + ) continue resolved_set = set() @@ -72,7 +103,11 @@ def build_graph(files: dict[Path, FileNode], config: ScriberConfig) -> ModuleGra resolved_set.add(target) elif file.language in {"javascript", "typescript", "react"}: - from scriber.graph.languages.javascript import parse_javascript_imports, resolve_javascript_import + from scriber.graph.languages.javascript import ( + parse_javascript_imports, + resolve_javascript_import, + ) + try: source = file.read_text() except OSError: @@ -85,7 +120,11 @@ def build_graph(files: dict[Path, FileNode], config: ScriberConfig) -> ModuleGra resolved_set.add(target) elif file.language == "rust": - from scriber.graph.languages.rust import parse_rust_imports, resolve_rust_import + from scriber.graph.languages.rust import ( + parse_rust_imports, + resolve_rust_import, + ) + try: source = file.read_text() except OSError: @@ -99,6 +138,7 @@ def 
build_graph(files: dict[Path, FileNode], config: ScriberConfig) -> ModuleGra elif file.language == "go": from scriber.graph.languages.go import parse_go_imports, resolve_go_import + try: source = file.read_text() except OSError: @@ -111,7 +151,11 @@ def build_graph(files: dict[Path, FileNode], config: ScriberConfig) -> ModuleGra resolved_set.add(target) elif file.language in {"c", "cpp"}: - from scriber.graph.languages.cpp import parse_cpp_includes, resolve_cpp_include + from scriber.graph.languages.cpp import ( + parse_cpp_includes, + resolve_cpp_include, + ) + try: source = file.read_text() except OSError: @@ -123,10 +167,17 @@ def build_graph(files: dict[Path, FileNode], config: ScriberConfig) -> ModuleGra continue resolved_set.add(target) - for target in resolved_set: - graph.imports.setdefault(rel, set()).add(target) - graph.imported_by.setdefault(target, set()).add(rel) + graph.add_edge( + RelationEdge( + source=rel, + target=target, + kind="import", + weight=1.0, + confidence=0.98, + analyzer=f"imports:{file.language}", + ) + ) cache.set_imports(rel, resolved_set) diff --git a/src/scriber/graph/indexes.py b/src/scriber/graph/indexes.py new file mode 100644 index 0000000..905d952 --- /dev/null +++ b/src/scriber/graph/indexes.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +import re +from dataclasses import dataclass, field +from pathlib import Path + +from scriber.core.models import FileNode + + +@dataclass(slots=True) +class GraphIndexes: + by_dir: dict[Path, list[FileNode]] = field(default_factory=dict) + by_stem: dict[str, list[FileNode]] = field(default_factory=dict) + by_clean_stem: dict[str, list[FileNode]] = field(default_factory=dict) + by_language: dict[str, list[FileNode]] = field(default_factory=dict) + env_key_to_files: dict[str, list[FileNode]] = field(default_factory=dict) + config_tokens: dict[Path, set[str]] = field(default_factory=dict) + doc_tokens: dict[Path, set[str]] = field(default_factory=dict) + + @classmethod + def 
build(cls, files: dict[Path, FileNode]) -> GraphIndexes: + indexes = cls() + + for rel, node in files.items(): + indexes.by_dir.setdefault(rel.parent, []).append(node) + indexes.by_stem.setdefault(rel.stem, []).append(node) + + clean_stem = re.sub(r"[^a-zA-Z0-9]", "", rel.stem).lower() + if clean_stem: + indexes.by_clean_stem.setdefault(clean_stem, []).append(node) + + indexes.by_language.setdefault(node.language, []).append(node) + + # Simple indexing for .env and docs is done per analyzer as needed, + # but we can initialize the dicts here. + + return indexes diff --git a/src/scriber/graph/languages/cpp.py b/src/scriber/graph/languages/cpp.py index 5c19732..2de541a 100644 --- a/src/scriber/graph/languages/cpp.py +++ b/src/scriber/graph/languages/cpp.py @@ -19,20 +19,18 @@ def parse_cpp_includes(source: str) -> list[str]: def resolve_cpp_include( - include_spec: str, - current_file: FileNode, - absolute_to_file: dict[Path, FileNode] + include_spec: str, current_file: FileNode, absolute_to_file: dict[Path, FileNode] ) -> set[Path]: """Resolve a C/C++ include specifier to a project file path.""" resolved = set() parent = current_file.absolute.parent - + # 1. 
Try resolving relative to current file's directory try: candidate = (parent / include_spec).resolve(strict=False) except Exception: candidate = parent / include_spec - + node = absolute_to_file.get(candidate) if node and not node.is_binary: resolved.add(node.relative) diff --git a/src/scriber/graph/languages/extractor.py b/src/scriber/graph/languages/extractor.py new file mode 100644 index 0000000..2d7dc75 --- /dev/null +++ b/src/scriber/graph/languages/extractor.py @@ -0,0 +1,77 @@ +import ast +from pathlib import Path +from typing import Any +from scriber.core.symbols import SymbolNode, SymbolIndex + + +class PythonSymbolVisitor(ast.NodeVisitor): + def __init__(self, file_path: Path, index: SymbolIndex): + self.file_path = file_path + self.index = index + self.current_parent: str | None = None + + def visit_ClassDef(self, node: ast.ClassDef) -> Any: + start = node.lineno + end = getattr(node, "end_lineno", start) + + symbol = SymbolNode( + name=node.name, + kind="class", + line_start=start, + line_end=end, + parent_name=self.current_parent, + ) + self.index.add_symbol(self.file_path, symbol) + + old_parent = self.current_parent + self.current_parent = node.name + self.generic_visit(node) + self.current_parent = old_parent + + def visit_FunctionDef(self, node: ast.FunctionDef) -> Any: + start = node.lineno + end = getattr(node, "end_lineno", start) + + symbol = SymbolNode( + name=node.name, + kind="function", + line_start=start, + line_end=end, + parent_name=self.current_parent, + ) + self.index.add_symbol(self.file_path, symbol) + + old_parent = self.current_parent + self.current_parent = node.name + self.generic_visit(node) + self.current_parent = old_parent + + def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> Any: + start = node.lineno + end = getattr(node, "end_lineno", start) + + symbol = SymbolNode( + name=node.name, + kind="function", + line_start=start, + line_end=end, + parent_name=self.current_parent, + ) + 
self.index.add_symbol(self.file_path, symbol) + + old_parent = self.current_parent + self.current_parent = node.name + self.generic_visit(node) + self.current_parent = old_parent + + +def extract_python_symbols( + file_path: Path, source_code: str, index: SymbolIndex +) -> None: + try: + tree = ast.parse(source_code, filename=str(file_path)) + visitor = PythonSymbolVisitor(file_path, index) + visitor.visit(tree) + except Exception: + # Gracefully handle syntactically invalid or unparseable files + pass diff --git a/src/scriber/graph/languages/go.py b/src/scriber/graph/languages/go.py index 25b5fab..49bd77f 100644 --- a/src/scriber/graph/languages/go.py +++ b/src/scriber/graph/languages/go.py @@ -6,7 +6,7 @@ IMPORT_SINGLE_RE = re.compile(r'\bimport\s+[\'"]([^\'"]+)[\'"]') -IMPORT_BLOCK_RE = re.compile(r'\bimport\s*\(([^)]+)\)') +IMPORT_BLOCK_RE = re.compile(r"\bimport\s*\(([^)]+)\)") def parse_go_imports(source: str) -> list[str]: @@ -25,24 +25,29 @@ def parse_go_imports(source: str) -> list[str]: return imports -def resolve_go_import(import_spec: str, current_file: FileNode, dir_to_files: dict[Path, list[FileNode]], project_root: Path) -> set[Path]: +def resolve_go_import( + import_spec: str, + current_file: FileNode, + dir_to_files: dict[Path, list[FileNode]], + project_root: Path, +) -> set[Path]: resolved = set() go_mod_path = project_root / "go.mod" module_name = None if go_mod_path.exists(): try: content = go_mod_path.read_text(encoding="utf-8") - m = re.search(r'^\s*module\s+(\S+)', content, re.MULTILINE) + m = re.search(r"^\s*module\s+(\S+)", content, re.MULTILINE) if m: module_name = m.group(1) except Exception: pass if module_name and import_spec.startswith(module_name): - rel_spec = import_spec[len(module_name):].lstrip("/") + rel_spec = import_spec[len(module_name) :].lstrip("/") target_dir = (project_root / rel_spec).resolve() for node in dir_to_files.get(target_dir, []): if node.language == "go": resolved.add(node.relative) - + return resolved diff 
--git a/src/scriber/graph/languages/javascript.py b/src/scriber/graph/languages/javascript.py index 9ca43f7..385e918 100644 --- a/src/scriber/graph/languages/javascript.py +++ b/src/scriber/graph/languages/javascript.py @@ -21,7 +21,9 @@ def parse_javascript_imports(source: str) -> list[str]: return imports -def resolve_javascript_import(import_spec: str, current_file: FileNode, absolute_to_file: dict[Path, FileNode]) -> set[Path]: +def resolve_javascript_import( + import_spec: str, current_file: FileNode, absolute_to_file: dict[Path, FileNode] +) -> set[Path]: resolved = set() if not import_spec.startswith("."): return resolved @@ -31,8 +33,19 @@ def resolve_javascript_import(import_spec: str, current_file: FileNode, absolute base_path = Path(os.path.abspath(parent / import_spec)) except Exception: base_path = (parent / import_spec).resolve(strict=False) - - extensions = ["", ".ts", ".tsx", ".js", ".jsx", ".d.ts"] + + extensions = [ + "", + ".ts", + ".tsx", + ".js", + ".jsx", + ".d.ts", + ".vue", + ".svelte", + ".astro", + ".json", + ] for ext in extensions: candidate = base_path.with_name(base_path.name + ext) if ext else base_path node = absolute_to_file.get(candidate) @@ -41,7 +54,15 @@ def resolve_javascript_import(import_spec: str, current_file: FileNode, absolute return resolved # Try index files - for index_name in ["index.ts", "index.tsx", "index.js", "index.jsx"]: + for index_name in [ + "index.ts", + "index.tsx", + "index.js", + "index.jsx", + "index.vue", + "index.svelte", + "index.astro", + ]: candidate = base_path / index_name node = absolute_to_file.get(candidate) if node and not node.is_binary: diff --git a/src/scriber/graph/languages/python.py b/src/scriber/graph/languages/python.py index 60af766..cc8fc1e 100644 --- a/src/scriber/graph/languages/python.py +++ b/src/scriber/graph/languages/python.py @@ -25,11 +25,15 @@ def parse_python_imports(path: Path, source: str) -> list[ImportRecord]: for node in ast.walk(tree): if isinstance(node, 
ast.Import): for alias in node.names: - imports.append(ImportRecord(kind="import", module=alias.name, names=(), level=0)) + imports.append( + ImportRecord(kind="import", module=alias.name, names=(), level=0) + ) elif isinstance(node, ast.ImportFrom): module = node.module or "" names = tuple(alias.name for alias in node.names if alias.name != "*") - imports.append(ImportRecord(kind="from", module=module, names=names, level=node.level)) + imports.append( + ImportRecord(kind="from", module=module, names=names, level=node.level) + ) return imports @@ -55,7 +59,11 @@ def module_name_for_file(file: FileNode, python: PythonConfig) -> str | None: if file.language != "python": return None rel = file.relative - roots = sorted(python.source_roots, key=lambda item: 0 if item == "." else len(item), reverse=True) + roots = sorted( + python.source_roots, + key=lambda item: 0 if item == "." else len(item), + reverse=True, + ) for source_root in roots: if not _is_under(rel, source_root): continue @@ -73,7 +81,9 @@ def module_name_for_file(file: FileNode, python: PythonConfig) -> str | None: return None -def build_module_map(files: dict[Path, FileNode], python: PythonConfig) -> tuple[dict[str, Path], dict[Path, str]]: +def build_module_map( + files: dict[Path, FileNode], python: PythonConfig +) -> tuple[dict[str, Path], dict[Path, str]]: module_to_path: dict[str, Path] = {} path_to_module: dict[Path, str] = {} for rel, file in files.items(): @@ -85,7 +95,9 @@ def build_module_map(files: dict[Path, FileNode], python: PythonConfig) -> tuple return module_to_path, path_to_module -def resolve_relative_module(current_module: str, current_is_init: bool, record: ImportRecord) -> str: +def resolve_relative_module( + current_module: str, current_is_init: bool, record: ImportRecord +) -> str: if record.level <= 0: return record.module if current_is_init: @@ -114,7 +126,11 @@ def resolve_import_record( if record.kind == "import": candidates.append(record.module) else: - base = 
resolve_relative_module(current_module, current_is_init, record) if record.level else record.module + base = ( + resolve_relative_module(current_module, current_is_init, record) + if record.level + else record.module + ) for name in record.names: if base: candidates.append(f"{base}.{name}") diff --git a/src/scriber/graph/languages/rust.py b/src/scriber/graph/languages/rust.py index 14feecc..07a9f73 100644 --- a/src/scriber/graph/languages/rust.py +++ b/src/scriber/graph/languages/rust.py @@ -5,8 +5,8 @@ from scriber.core.models import FileNode -MOD_RE = re.compile(r'\bmod\s+(\w+)\s*;') -USE_RE = re.compile(r'\buse\s+([^;]+)\s*;') +MOD_RE = re.compile(r"\bmod\s+(\w+)\s*;") +USE_RE = re.compile(r"\buse\s+([^;]+)\s*;") def parse_rust_imports(source: str) -> list[tuple[str, str]]: @@ -28,15 +28,14 @@ def parse_rust_imports(source: str) -> list[tuple[str, str]]: return imports -def resolve_rust_import(kind: str, spec: str, current_file: FileNode, absolute_to_file: dict[Path, FileNode]) -> set[Path]: +def resolve_rust_import( + kind: str, spec: str, current_file: FileNode, absolute_to_file: dict[Path, FileNode] +) -> set[Path]: resolved = set() parent = current_file.absolute.parent if kind == "mod": - candidates = [ - parent / f"{spec}.rs", - parent / spec / "mod.rs" - ] + candidates = [parent / f"{spec}.rs", parent / spec / "mod.rs"] for cand in candidates: node = absolute_to_file.get(cand) if node: @@ -65,7 +64,7 @@ def resolve_rust_import(kind: str, spec: str, current_file: FileNode, absolute_t module_path = crate_root / Path(*sub_parts[:end]) candidates = [ module_path.with_name(module_path.name + ".rs"), - module_path / "mod.rs" + module_path / "mod.rs", ] for cand in candidates: node = absolute_to_file.get(cand) @@ -80,7 +79,7 @@ def resolve_rust_import(kind: str, spec: str, current_file: FileNode, absolute_t module_path = crate_root / Path(*sub_parts[:end]) candidates = [ module_path.with_name(module_path.name + ".rs"), - module_path / "mod.rs" + module_path / 
"mod.rs", ] for cand in candidates: node = absolute_to_file.get(cand) @@ -95,7 +94,7 @@ def resolve_rust_import(kind: str, spec: str, current_file: FileNode, absolute_t module_path = crate_root / Path(*sub_parts[:end]) candidates = [ module_path.with_name(module_path.name + ".rs"), - module_path / "mod.rs" + module_path / "mod.rs", ] for cand in candidates: node = absolute_to_file.get(cand) diff --git a/src/scriber/graph/model.py b/src/scriber/graph/model.py new file mode 100644 index 0000000..f0996e1 --- /dev/null +++ b/src/scriber/graph/model.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Literal + +RelationKind = Literal[ + "import", + "reexport", + "call", + "type_reference", + "inherits", + "implements", + "test_of", + "fixture_for", + "config_refs_code", + "env_key", + "doc_mentions_symbol", + "doc_mentions_code", + "same_package", + "same_dir", + "name_similarity", + "git_cochange", + "semantic_similarity", + "entrypoint_to_module", +] + + +@dataclass(frozen=True, slots=True) +class RelationEdge: + source: Path + target: Path + kind: RelationKind + weight: float = 1.0 + confidence: float = 1.0 + evidence: str | None = None + line: int | None = None + analyzer: str = "unknown" + + +@dataclass(slots=True) +class RelationGraph: + edges: list[RelationEdge] = field(default_factory=list) + outgoing: dict[Path, list[RelationEdge]] = field(default_factory=dict) + incoming: dict[Path, list[RelationEdge]] = field(default_factory=dict) + imports: dict[Path, set[Path]] = field(default_factory=dict) + imported_by: dict[Path, set[Path]] = field(default_factory=dict) + + def add_edge(self, edge: RelationEdge) -> None: + self.edges.append(edge) + self.outgoing.setdefault(edge.source, []).append(edge) + self.incoming.setdefault(edge.target, []).append(edge) + + if edge.kind in {"import", "reexport"}: + self.imports.setdefault(edge.source, set()).add(edge.target) + 
self.imported_by.setdefault(edge.target, set()).add(edge.source) + + +@dataclass(slots=True) +class ModuleGraph(RelationGraph): + pass diff --git a/src/scriber/native.py b/src/scriber/native.py index 08b415c..64b494c 100644 --- a/src/scriber/native.py +++ b/src/scriber/native.py @@ -14,6 +14,7 @@ def _load_native() -> Any: raise _IMPORT_ERROR try: from scriber import _native + _NATIVE_MODULE = _native return _NATIVE_MODULE except ImportError as e: @@ -35,7 +36,9 @@ def require_native() -> Any: try: native = _load_native() if hasattr(native, "native_api_version") and native.native_api_version() != 1: - raise RuntimeError("Niezgodna wersja natywnego backendu Scriber (oczekiwano wersji 1).") + raise RuntimeError( + "Niezgodna wersja natywnego backendu Scriber (oczekiwano wersji 1)." + ) return native except ImportError as e: raise ImportError( diff --git a/src/scriber/outline/__init__.py b/src/scriber/outline/__init__.py new file mode 100644 index 0000000..b9c7888 --- /dev/null +++ b/src/scriber/outline/__init__.py @@ -0,0 +1,14 @@ +from scriber.core.models import FileNode, FileOutline +from scriber.outline.base import Outliner +from scriber.outline.generic import GenericOutliner +from scriber.outline.python import PythonOutliner + +_outliners: dict[str, Outliner] = { + "python": PythonOutliner(), +} +_generic = GenericOutliner() + + +def generate_outline(file: FileNode, content: str) -> FileOutline: + outliner = _outliners.get(file.language, _generic) + return outliner.outline(file, content) diff --git a/src/scriber/outline/base.py b/src/scriber/outline/base.py new file mode 100644 index 0000000..32affb4 --- /dev/null +++ b/src/scriber/outline/base.py @@ -0,0 +1,7 @@ +from __future__ import annotations +from typing import Protocol +from scriber.core.models import FileNode, FileOutline + + +class Outliner(Protocol): + def outline(self, file: FileNode, content: str) -> FileOutline: ... 
diff --git a/src/scriber/outline/generic.py b/src/scriber/outline/generic.py new file mode 100644 index 0000000..eaabd36 --- /dev/null +++ b/src/scriber/outline/generic.py @@ -0,0 +1,21 @@ +from __future__ import annotations +from scriber.core.models import FileNode, FileOutline +from scriber.outline.base import Outliner + + +class GenericOutliner(Outliner): + def outline(self, file: FileNode, content: str) -> FileOutline: + return FileOutline( + path=file.relative, + language=file.language, + purpose=None, + imports=[], + exports=[], + classes=[], + functions=[], + constants=[], + notes=[ + "Static outline not implemented for this language. Showing generic info." + ], + token_estimate=20, + ) diff --git a/src/scriber/outline/python.py b/src/scriber/outline/python.py new file mode 100644 index 0000000..4056ba9 --- /dev/null +++ b/src/scriber/outline/python.py @@ -0,0 +1,40 @@ +from __future__ import annotations +import ast +from scriber.core.models import FileNode, FileOutline +from scriber.outline.base import Outliner + + +class PythonOutliner(Outliner): + def outline(self, file: FileNode, content: str) -> FileOutline: + classes = [] + functions = [] + imports = [] + try: + tree = ast.parse(content) + for node in tree.body: + if isinstance(node, ast.ClassDef): + classes.append(node.name) + elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + functions.append(node.name) + elif isinstance(node, ast.Import): + for alias in node.names: + imports.append(alias.name) + elif isinstance(node, ast.ImportFrom): + module = node.module or "" + for alias in node.names: + imports.append(f"{module}.{alias.name}") + except SyntaxError: + pass + + return FileOutline( + path=file.relative, + language="python", + purpose=None, + imports=imports[:20], + exports=[], + classes=classes, + functions=functions, + constants=[], + notes=[], + token_estimate=len(classes) * 5 + len(functions) * 3 + len(imports) * 2, + ) diff --git a/src/scriber/packer/pack.py 
b/src/scriber/packer/pack.py index 2e7011c..7469bc5 100644 --- a/src/scriber/packer/pack.py +++ b/src/scriber/packer/pack.py @@ -6,16 +6,23 @@ from scriber.core.config import apply_overrides, load_config from scriber.core.errors import ScriberError from scriber.core.models import Candidate, FileNode, ScriberPack, SeedPath -from scriber.core.root import ensure_inside_root, project_root_from_config, rel_to_root, resolve_config_path +from scriber.core.root import ( + ensure_inside_root, + project_root_from_config, + rel_to_root, + resolve_config_path, +) from scriber.engine.scorer import score_candidates -from scriber.graph.builder import build_graph from scriber.rendering.renderer import render_pack -from scriber.scanner.files import classify_file, is_text_readable, read_text_lossy +from scriber.scanner.files import classify_file, is_text_readable from scriber.tokens import estimate_tokens from scriber.scanner.scan import scan_project +from scriber.core.models import LlmPack -def _resolve_input(path_value: str, root: Path, allow_external: bool, path_base: str = "cwd") -> Path: +def _resolve_input( + path_value: str, root: Path, allow_external: bool, path_base: str = "cwd" +) -> Path: path = Path(path_value).expanduser() if not path.is_absolute(): if path_base == "project": @@ -35,7 +42,9 @@ def _resolve_input(path_value: str, root: Path, allow_external: bool, path_base: return path.resolve() -def _ensure_seed_file(path: Path, root: Path, files: dict[Path, FileNode], config) -> FileNode: +def _ensure_seed_file( + path: Path, root: Path, files: dict[Path, FileNode], config +) -> FileNode: rel = rel_to_root(path, root) existing = files.get(rel) if existing is not None: @@ -59,11 +68,19 @@ def _ensure_seed_file(path: Path, root: Path, files: dict[Path, FileNode], confi return node -def _expand_seed(path: Path, root: Path, files: dict[Path, FileNode], config) -> SeedPath: +def _expand_seed( + path: Path, root: Path, files: dict[Path, FileNode], config +) -> SeedPath: rel 
= rel_to_root(path, root) if path.is_file(): node = _ensure_seed_file(path, root, files, config) - return SeedPath(original=Path(path), absolute=path, relative=rel, is_dir=False, expanded_files=[node.relative]) + return SeedPath( + original=Path(path), + absolute=path, + relative=rel, + is_dir=False, + expanded_files=[node.relative], + ) expanded: list[Path] = [] for file_rel, node in files.items(): @@ -75,11 +92,26 @@ def _expand_seed(path: Path, root: Path, files: dict[Path, FileNode], config) -> expanded.append(file_rel) expanded.sort(key=lambda item: item.as_posix()) if not expanded: - raise ScriberError(f"No readable project files found inside seed folder: {rel.as_posix()}") - return SeedPath(original=Path(path), absolute=path, relative=rel, is_dir=True, expanded_files=expanded) + raise ScriberError( + f"No readable project files found inside seed folder: {rel.as_posix()}" + ) + return SeedPath( + original=Path(path), + absolute=path, + relative=rel, + is_dir=True, + expanded_files=expanded, + ) -def _decide_content(candidate: Candidate, *, config, only_tree: bool, budget_left: int | None, is_seed: bool) -> tuple[bool, str | None, str | None, int]: +def _decide_content( + candidate: Candidate, + *, + config, + only_tree: bool, + budget_left: int | None, + is_seed: bool, +) -> tuple[bool, str | None, str | None, int]: if only_tree: return False, None, "only-tree mode", 0 file = candidate.file @@ -126,9 +158,16 @@ def _decide_content(candidate: Candidate, *, config, only_tree: bool, budget_lef def _apply_content_policy(pack: ScriberPack, config) -> None: if pack.mode == "focused": - explicit_seed_files = {rel for seed in pack.seed_paths for rel in seed.expanded_files} + explicit_seed_files = { + rel for seed in pack.seed_paths for rel in seed.expanded_files + } else: - explicit_seed_files = {rel for seed in pack.seed_paths if not seed.is_dir for rel in seed.expanded_files} + explicit_seed_files = { + rel + for seed in pack.seed_paths + if not seed.is_dir + for 
rel in seed.expanded_files + } budget_left = config.max_tokens if config.max_tokens > 0 else None total = 0 for candidate in pack.candidates: @@ -151,31 +190,26 @@ def _apply_content_policy(pack: ScriberPack, config) -> None: pack.total_tokens = total -def build_pack( - paths: list[str] | None = None, - *, - config_path: str | None = None, - output: str | None = None, - output_format: str | None = None, - only_tree: bool | None = None, - modules: bool | None = None, - support: bool | None = None, - max_files: int | None = None, - max_tokens: int | None = None, - min_score: int | None = None, - support_content: str | None = None, - progress_callback: Callable[[str], None] | None = None, - project: bool | None = None, - path_base: str = "project", -) -> ScriberPack: - from time import perf_counter - timings = {} - - t_start = perf_counter() - paths = paths or ["."] +def _load_and_apply_config( + paths, + config_path, + profile, + output, + output_format, + only_tree, + modules, + support, + max_files, + max_tokens, + min_score, + support_content, +): resolved_config = resolve_config_path(paths, config_path) root = project_root_from_config(resolved_config) config = load_config(resolved_config) + from scriber.core.profiles import apply_profile + + config = apply_profile(config, profile) config = apply_overrides( config, output=output, @@ -188,60 +222,106 @@ def build_pack( min_score=min_score, support_content=support_content, ) - timings["config_load"] = perf_counter() - t_start + return resolved_config, root, config + + +def _scan_files(paths, root, config, path_base, progress_callback): + if progress_callback: + progress_callback("Skanowanie plikow...") + from scriber.native import is_native_available - t_scan = perf_counter() - if progress_callback: progress_callback("Skanowanie plikow...") - from scriber.native import require_native, is_native_available native_files = None if is_native_available(): from scriber.scanner.scan import scan_project_with_native + files, 
native_files = scan_project_with_native(root, config) else: files = scan_project(root, config) - resolved_inputs = [_resolve_input(item, root, config.allow_external_paths, path_base) for item in paths] + resolved_inputs = [ + _resolve_input(item, root, config.allow_external_paths, path_base) + for item in paths + ] seeds = [_expand_seed(path, root, files, config) for path in resolved_inputs] - timings["scan"] = perf_counter() - t_scan - # Detect mode is_project_snapshot = False - if project: - is_project_snapshot = True - else: - for path in resolved_inputs: - if path == root: - is_project_snapshot = True - break - mode = "project_snapshot" if is_project_snapshot else "focused" + for path in resolved_inputs: + if path == root: + is_project_snapshot = True + break + + return files, native_files, seeds, is_project_snapshot + + +def _build_graph_and_score( + mode, files, seeds, native_files, root, config, progress_callback +): + from time import perf_counter + + timings = {} + stats = {} + from scriber.native import is_native_available - # Use native code pack builder if available if is_native_available(): + from scriber.native import require_native + native = require_native() - + t_graph = perf_counter() - if progress_callback: progress_callback("Budowanie grafu modulow (natywnie)...") - + if progress_callback: + progress_callback("Budowanie grafu modulow (natywnie)...") + assert native_files is not None - - edges = native.build_import_graph( + + edges = native.build_relation_graph( str(root), native_files, config.python.source_roots, - config.python.module_init_files + config.python.module_init_files, ) - - from scriber.core.models import ModuleGraph + + from scriber.graph.analyzers import generate_cheap_relations + + edges.extend( + generate_cheap_relations(files, native.NativeRelationEdge, is_native=True) + ) + + from scriber.cache import ScriberCache + + cache = ScriberCache(config, root) + + from scriber.core.models import ModuleGraph, RelationEdge + graph = 
ModuleGraph() for edge in edges: - from_path = Path(getattr(edge, "from")) - to_path = Path(edge.to) - graph.imports.setdefault(from_path, set()).add(to_path) - graph.imported_by.setdefault(to_path, set()).add(from_path) - + from_path = Path(getattr(edge, "source")) + to_path = Path(edge.target) + py_edge = RelationEdge( + source=from_path, + target=to_path, + kind=edge.kind, + weight=edge.weight, + confidence=edge.confidence, + evidence=edge.evidence, + line=edge.line, + analyzer=edge.analyzer, + ) + graph.add_edge(py_edge) + if py_edge.kind in {"import", "reexport"}: + cache.add_import_edge(from_path, to_path) + + cache.save(set(files.keys())) + + stats["graph_edges_built"] = len(edges) + stats["graph_source"] = "native" + stats["graph_cache_reads"] = cache.reads + stats["graph_cache_hits"] = cache.hits + stats["graph_cache_writes"] = cache.writes + timings["graph_build"] = perf_counter() - t_graph - + t_score = perf_counter() - if progress_callback: progress_callback("Ocenianie zaleznosci (natywnie)...") + if progress_callback: + progress_callback("Ocenianie zaleznosci (natywnie)...") scoring = config.modules_config.scoring opts = native.NativePackOptions( mode=mode, @@ -262,6 +342,10 @@ def build_pack( runtime_support_score=scoring.get("runtime_support", 50), documentation_score=scoring.get("documentation", 45), shared_dependency_bonus=scoring.get("shared_dependency_bonus", 10), + entrypoint_file_score=scoring.get("entrypoint_file", 90), + code_file_score=scoring.get("code_file", 80), + test_file_score=scoring.get("test_file", 60), + other_file_score=scoring.get("other_file", 40), modules_enabled=config.modules, include_direct_dependencies=config.modules_config.include_direct_dependencies, include_reverse_dependencies=config.modules_config.include_reverse_dependencies, @@ -270,18 +354,16 @@ def build_pack( include_tests=config.modules_config.include_tests, include_project_configs=config.modules_config.include_project_configs, depth=config.modules_config.depth, 
+ top_dependencies=config.modules_config.top_dependencies, support_enabled=config.support, entrypoint_patterns=config.python.entrypoint_patterns, test_roots=config.python.test_roots, ) - + rs_candidates = native.score_candidates_native( - native_files, - [seed.relative.as_posix() for seed in seeds], - edges, - opts + native_files, [seed.relative.as_posix() for seed in seeds], edges, opts ) - + candidates = [] for rc in rs_candidates: rel = Path(rc.path) @@ -299,15 +381,184 @@ def build_pack( timings["scoring"] = perf_counter() - t_score else: t_graph = perf_counter() - if progress_callback: progress_callback("Budowanie grafu modulow...") - graph = build_graph(files, config) + if progress_callback: + progress_callback("Budowanie grafu modulow...") + from scriber.cache import ScriberCache + + cache = ScriberCache(config, root) + from scriber.graph.builder import build_graph + + graph = build_graph(files, config, cache) + + from scriber.graph.analyzers import generate_cheap_relations + from scriber.core.models import RelationEdge + + cheap_edges = generate_cheap_relations(files, RelationEdge, is_native=False) + for edge in cheap_edges: + graph.add_edge(edge) + + stats["graph_edges_built"] = len(graph.edges) + stats["graph_source"] = "python" + stats["graph_cache_reads"] = cache.reads + stats["graph_cache_hits"] = cache.hits + stats["graph_cache_writes"] = cache.writes + timings["graph_build"] = perf_counter() - t_graph - + t_score = perf_counter() - if progress_callback: progress_callback("Ocenianie zaleznosci...") - candidates = score_candidates(files=files, seeds=seeds, graph=graph, config=config, mode=mode) + if progress_callback: + progress_callback("Ocenianie zaleznosci...") + candidates = score_candidates( + files=files, seeds=seeds, graph=graph, config=config, mode=mode + ) timings["scoring"] = perf_counter() - t_score + return candidates, graph, timings, stats + + +def build_pack( + paths: list[str] | None = None, + *, + config_path: str | None = None, + 
profile: str | None = None, + output: str | None = None, + output_format: str | None = None, + only_tree: bool | None = None, + modules: bool | None = None, + support: bool | None = None, + max_files: int | None = None, + max_tokens: int | None = None, + min_score: int | None = None, + support_content: str | None = None, + progress_callback: Callable[[str], None] | None = None, + project: bool | None = None, + path_base: str = "project", +) -> ScriberPack | LlmPack: + from time import perf_counter + + t_start = perf_counter() + paths = paths or ["."] + resolved_config, root, config = _load_and_apply_config( + paths, + config_path, + profile, + output, + output_format, + only_tree, + modules, + support, + max_files, + max_tokens, + min_score, + support_content, + ) + t_config_load = perf_counter() - t_start + + t_scan = perf_counter() + files, native_files, seeds, is_project_snapshot = _scan_files( + paths, root, config, path_base, progress_callback + ) + t_scan_time = perf_counter() - t_scan + + mode = "project_snapshot" if (project or is_project_snapshot) else "focused" + + if profile == "full": + mode = "project_snapshot" + elif profile == "focused-gpt": + mode = "focused" + + candidates, graph, sub_timings, stats = _build_graph_and_score( + mode, files, seeds, native_files, root, config, progress_callback + ) + + if profile in {"gpt", "focused-gpt", "full"}: + from scriber.engine.ranker import rank_context + from scriber.budget.allocator import allocate_budget, BudgetPolicy + from time import perf_counter + + t_rank = perf_counter() + if progress_callback: + progress_callback("Rankowanie kontekstu...") + seed_paths = [seed for p in seeds for seed in p.expanded_files] + new_candidates = rank_context(files, graph, seed_paths, config, mode) + sub_timings["rank_context"] = perf_counter() - t_rank + + t_budget = perf_counter() + if progress_callback: + progress_callback("Alokacja budzetu...") + policy = BudgetPolicy( + target_tokens=config.max_tokens if 
config.max_tokens > 0 else 30000, + hard_limit_tokens=config.max_tokens if config.max_tokens > 0 else 100000, + mode=mode, + ) + if mode == "focused": + explicit_seeds = {seed for p in seeds for seed in p.expanded_files} + else: + explicit_seeds = { + seed for p in seeds if not p.is_dir for seed in p.expanded_files + } + + items = allocate_budget(new_candidates, policy, explicit_seeds) + sub_timings["budget_allocation"] = perf_counter() - t_budget + + t_content = perf_counter() + if progress_callback: + progress_callback("Czytanie i outline...") + from scriber.outline import generate_outline + + actual_tokens = 0 + for item in items: + if item.content_mode == "full": + try: + item.content = item.file.read_text() + actual_tokens += item.token_estimate + except Exception: + item.content_mode = "tree" + elif item.content_mode in ("outline", "excerpt"): + try: + content = item.file.read_text() + item.outline = generate_outline(item.file, content) + has_outline_symbols = bool( + item.outline.classes + or item.outline.functions + or item.outline.constants + or item.outline.imports + ) + if item.content_mode == "excerpt" and not has_outline_symbols: + if actual_tokens + item.token_estimate <= policy.target_tokens: + item.content_mode = "full" + item.content = content + actual_tokens += item.token_estimate + else: + item.content_mode = "tree" + else: + actual_tokens += item.outline.token_estimate + except Exception: + item.content_mode = "tree" + + sub_timings["content_read"] = perf_counter() - t_content + + stats["input_paths"] = paths + pack = LlmPack( + project_root=root, + config_path=resolved_config, + profile=profile, + mode=mode, + goal=None, + budget_target=policy.target_tokens, + budget_actual=actual_tokens, + items=items, + graph=graph, + stats=stats, + warnings=[], + ) + pack.timings = { + "config_load": t_config_load, + "scan": t_scan_time, + **sub_timings, + } + return pack + pack = ScriberPack( project_root=root, config_path=resolved_config, @@ -317,18 
+568,27 @@ def build_pack( only_tree=config.only_tree, output_format=config.format, mode=mode, + stats=stats, ) - + t_content = perf_counter() - if progress_callback: progress_callback("Aplikowanie regul zawartosci...") + if progress_callback: + progress_callback("Aplikowanie regul zawartosci...") _apply_content_policy(pack, config) - timings["content_read"] = perf_counter() - t_content - - pack.timings = timings + t_content_time = perf_counter() - t_content + + pack.timings = { + "config_load": t_config_load, + "scan": t_scan_time, + "content_read": t_content_time, + **sub_timings, + } return pack -def build_and_write_pack(paths: list[str] | None = None, **kwargs) -> tuple[Path | None, ScriberPack]: +def build_and_write_pack( + paths: list[str] | None = None, **kwargs +) -> tuple[Path | None, ScriberPack | LlmPack]: explain_selection = kwargs.pop("explain_selection", False) pack = build_pack(paths, **kwargs) config_path = resolve_config_path(paths or ["."], kwargs.get("config_path")) @@ -346,11 +606,23 @@ def build_and_write_pack(paths: list[str] | None = None, **kwargs) -> tuple[Path support_content=kwargs.get("support_content"), ) progress = kwargs.get("progress_callback") - if progress: progress("Renderowanie Markdown...") - rendered = render_pack(pack, explain_selection=explain_selection) + if progress: + progress("Renderowanie Markdown...") + + if isinstance(pack, LlmPack): + from scriber.renderer.llm_report import render_llm_report + import io + + buf = io.StringIO() + render_llm_report(pack, buf) + rendered = buf.getvalue() + else: + rendered = render_pack(pack, explain_selection=explain_selection) + output = config.output if str(output) == "-": import sys + try: sys.stdout.buffer.write(rendered.encode("utf-8")) sys.stdout.flush() @@ -360,6 +632,14 @@ def build_and_write_pack(paths: list[str] | None = None, **kwargs) -> tuple[Path if not output.is_absolute(): output = pack.project_root / output output.parent.mkdir(parents=True, exist_ok=True) - from 
scriber.native import require_native - require_native().write_text(str(output), rendered) + try: + from scriber.native import is_native_available, require_native + + if is_native_available(): + require_native().write_text(str(output), rendered) + else: + output.write_text(rendered, encoding="utf-8") + except Exception: + output.write_text(rendered, encoding="utf-8") + return output, pack diff --git a/src/scriber/renderer/llm_report.py b/src/scriber/renderer/llm_report.py new file mode 100644 index 0000000..b720af1 --- /dev/null +++ b/src/scriber/renderer/llm_report.py @@ -0,0 +1,233 @@ +from __future__ import annotations +from typing import TextIO +from collections import defaultdict +import re + +from scriber.core.models import LlmPack, PackItem, FileOutline + + +def render_llm_report(pack: LlmPack, out: TextIO) -> None: + out.write("# Scriber Pack v3\n\n") + + out.write("\n") + out.write("You are reading a generated codebase context pack.\n") + out.write("Prefer facts from , , and blocks.\n") + out.write("If a file is tree_only or omitted, do not infer its contents.\n") + out.write("When proposing patches, cite file IDs and line ranges.\n") + out.write("\n\n") + + out.write('\n') + out.write("project:\n") + out.write(f" mode: {pack.mode}\n") + out.write(f" goal: {pack.goal or 'null'}\n") + out.write(f" target_tokens: {pack.budget_target}\n") + out.write(f" actual_tokens: {pack.budget_actual}\n") + + input_paths = pack.stats.get("input_paths", []) + if input_paths: + out.write(" analyzed_targets:\n") + for p in input_paths: + out.write(f" - {p}\n") + out.write("\n") + + out.write("read_order:\n") + for item in pack.items: + if item.content_mode not in ("tree", "omit"): + out.write(f" - {item.item_id} # {item.file.relative.as_posix()}\n") + + out.write("\nfiles:\n") + for item in pack.items: + if item.content_mode in ("omit",): + continue + out.write(f" {item.item_id}:\n") + out.write(f" path: {item.file.relative.as_posix()}\n") + out.write(f" role: {item.role}\n") 
+ out.write(f" mode: {item.content_mode}\n") + out.write(f" score: {item.score}\n") + out.write(f" utility: {item.utility:.2f}\n") + out.write(f" tokens: {item.token_estimate}\n") + if item.outline and item.outline.purpose: + out.write(f" purpose: {item.outline.purpose}\n") + out.write("\n\n") + + out.write("## Architecture map\n") + out.write("```\n") + _render_tree(pack.items, out) + out.write("```\n\n") + + out.write("\n") + _render_graph(pack, out) + out.write("\n\n") + + warnings = _generate_warnings(pack) + if warnings: + out.write("## Pack quality warnings\n\n") + for w in warnings: + out.write(f"- {w}\n") + out.write("\n") + + out.write("## Files Content\n\n") + + for item in pack.items: + if item.content_mode in ("tree", "omit"): + continue + + out.write( + f'\n' + ) + + if item.outline and item.outline.purpose: + out.write("\n") + out.write(f"{item.outline.purpose}\n") + out.write("\n\n") + + if item.outline: + _render_symbols_manifest(item.outline, out) + + if item.content_mode == "full" and item.content: + out.write(f'```{item.file.language} linenums="1"\n') + out.write( + _add_line_numbers( + item.content, item.file.relative.as_posix(), item.file.language + ) + ) + if not item.content.endswith("\n"): + out.write("\n") + out.write("```\n") + + elif item.content_mode == "excerpt": + if item.excerpts: + for excerpt in item.excerpts: + out.write(f"```{item.file.language}\n") + out.write(excerpt) + out.write("\n```\n\n") + elif item.outline: + _render_outline_fallback(item, out) + else: + out.write("_Excerpt unavailable; falling back to metadata only._\n\n") + + elif item.content_mode == "outline" and item.outline: + _render_outline_fallback(item, out) + + out.write("\n\n") + + +def _add_line_numbers(content: str, path: str, language: str) -> str: + lines = content.splitlines() + out = [] + out.append(f"# file: {path}") + out.append(f"# lines: 1-{len(lines)}") + for i, line in enumerate(lines, 1): + if language in ("python", "py"): + m = 
re.match(r"^(\s*)(class|def|async def)\s+([a-zA-Z0-9_]+)", line) + if m: + indent, _, name = m.groups() + out.append(f'{i:04d} {indent}# ') + out.append(f"{i:04d} {line}") + return "\n".join(out) + + +def _render_symbols_manifest(outline: FileOutline, out: TextIO) -> None: + symbols = [] + if outline.classes: + symbols.extend(outline.classes) + if outline.functions: + symbols.extend(outline.functions) + if not symbols: + return + + out.write("\n") + for sym in symbols: + out.write(f"- {sym}\n") + out.write("\n\n") + + +def _render_outline_fallback(item: PackItem, out: TextIO) -> None: + out.write("```python\n") + out.write(f"# Outline for {item.file.relative.name}\n") + if item.outline.classes: + out.write("Classes: " + ", ".join(item.outline.classes) + "\n") + if item.outline.functions: + out.write("Functions: " + ", ".join(item.outline.functions) + "\n") + if item.outline.imports: + out.write("Imports: " + ", ".join(item.outline.imports) + "\n") + out.write("```\n\n") + + +def _generate_warnings(pack: LlmPack) -> list[str]: + warnings = [] + empty_excerpts = sum( + 1 for i in pack.items if i.content_mode == "excerpt" and not i.excerpts + ) + if empty_excerpts > 0: + warnings.append( + f"{empty_excerpts} files are marked excerpt but have no excerpts (falling back to outline)." 
+ ) + + unknown_roles = sum(1 for i in pack.items if i.role == "unknown") + if unknown_roles > 0: + warnings.append(f"{unknown_roles} files have role=unknown.") + + return warnings + + +def _render_tree(items: list[PackItem], out: TextIO) -> None: + tree = {} + item_map = {item.file.relative.as_posix(): item for item in items} + + for item in items: + parts = item.file.relative.parts + curr = tree + for part in parts: + if part not in curr: + curr[part] = {} + curr = curr[part] + + def print_node(path_parts, current_dict, prefix=""): + keys = sorted(current_dict.keys()) + for i, k in enumerate(keys): + is_last = i == len(keys) - 1 + child_prefix = prefix + (" " if is_last else "β”‚ ") + connector = "└── " if is_last else "β”œβ”€β”€ " + + full_path = "/".join(path_parts + (k,)) + item = item_map.get(full_path) + + if item: + badge = f"[{item.item_id} {item.role} {item.content_mode} score={item.score}]" + name_str = f"{prefix}{connector}{k}" + out.write(f"{name_str:<50} {badge}\n") + else: + out.write(f"{prefix}{connector}{k}/\n") + print_node(path_parts + (k,), current_dict[k], child_prefix) + + out.write(".\n") + print_node((), tree, "") + + +def _render_graph(pack: LlmPack, out: TextIO) -> None: + included_paths = {item.file.relative for item in pack.items} + item_id_map = {item.file.relative: item.item_id for item in pack.items} + + groups = defaultdict(list) + for edge in pack.graph.edges: + if edge.source in included_paths and edge.target in included_paths: + key = (edge.source, edge.target, edge.kind) + groups[key].append(edge) + + sorted_groups = sorted( + groups.items(), key=lambda x: (x[0][0].as_posix(), x[0][1].as_posix()) + ) + + for (source, target, kind), edges in sorted_groups: + count = len(edges) + max_conf = max(e.confidence for e in edges) + analyzers = sorted({e.analyzer for e in edges}) + + s_id = item_id_map[source] + t_id = item_id_map[target] + + analyzer_str = ",".join(analyzers) + out.write( + f"{s_id} -> {t_id} [{kind}] x{count} 
(analyzers=[{analyzer_str}], conf={max_conf:.2f})\n" + ) diff --git a/src/scriber/rendering/renderer.py b/src/scriber/rendering/renderer.py index 657cd55..77b6e96 100644 --- a/src/scriber/rendering/renderer.py +++ b/src/scriber/rendering/renderer.py @@ -26,7 +26,11 @@ def _table(candidates: list[Candidate], explain_selection: bool = False) -> str: return "_None._\n" lines = ["| Score | Content | Path | Reason |", "|---:|---|---|---|"] for candidate in candidates: - reason = "; ".join(candidate.reasons) if explain_selection else candidate.reason_summary + reason = ( + "; ".join(candidate.reasons) + if explain_selection + else candidate.reason_summary + ) lines.append( f"| {candidate.score} | {_escape_table(_content_flag(candidate))} | `{_escape_table(_path(candidate.file.relative))}` | {_escape_table(reason)} |" ) @@ -65,31 +69,39 @@ def render_module_graph(pack: ScriberPack) -> str: imports = len(pack.graph.imports.get(path, set()) & included) if imports > 0: import_counts.append((path, imports)) - + imported_by = len(pack.graph.imported_by.get(path, set()) & included) if imported_by > 0: imported_by_counts.append((path, imported_by)) - + import_counts.sort(key=lambda x: (-x[1], x[0].as_posix())) imported_by_counts.sort(key=lambda x: (-x[1], x[0].as_posix())) - + lines.append("Top 5 files with most dependencies:") for path, count in import_counts[:5]: lines.append(f"- `{_path(path)}`: imports {count} included files") - + lines.append("") lines.append("Top 5 most imported files:") for path, count in imported_by_counts[:5]: lines.append(f"- `{_path(path)}`: imported by {count} included files") - + return "\n".join(lines).strip() or "No module graph available." 
for seed in pack.seed_paths: for seed_file in seed.expanded_files: lines.append(_path(seed_file)) - imports = sorted(pack.graph.imports.get(seed_file, set()) & included, key=lambda item: item.as_posix()) - imported_by = sorted(pack.graph.imported_by.get(seed_file, set()) & included, key=lambda item: item.as_posix()) - edges = [("imports", item) for item in imports] + [("imported by", item) for item in imported_by] + imports = sorted( + pack.graph.imports.get(seed_file, set()) & included, + key=lambda item: item.as_posix(), + ) + imported_by = sorted( + pack.graph.imported_by.get(seed_file, set()) & included, + key=lambda item: item.as_posix(), + ) + edges = [("imports", item) for item in imports] + [ + ("imported by", item) for item in imported_by + ] for index, (kind, target) in enumerate(edges): branch = "└──" if index == len(edges) - 1 else "β”œβ”€β”€" lines.append(f"{branch} {kind} {_path(target)}") @@ -100,7 +112,23 @@ def render_module_graph(pack: ScriberPack) -> str: def _language_fence(language: str) -> str: - if language in {"python", "rust", "javascript", "typescript", "go", "java", "kotlin", "c", "cpp", "toml", "yaml", "json", "markdown", "dockerfile", "ini"}: + if language in { + "python", + "rust", + "javascript", + "typescript", + "go", + "java", + "kotlin", + "c", + "cpp", + "toml", + "yaml", + "json", + "markdown", + "dockerfile", + "ini", + }: return language return "text" @@ -133,7 +161,7 @@ def render_summary(pack: ScriberPack) -> str: f"- Content files: `{content_count}`", f"- Tree-only files: `{tree_only_count}`", f"- Estimated tokens: `{pack.total_tokens}`", - "" + "", ] return "\n".join(lines) @@ -154,15 +182,19 @@ def render_summary_text(pack: ScriberPack) -> str: f"Content files: {content_count}", f"Tree-only files: {tree_only_count}", f"Estimated tokens: {pack.total_tokens}", - "" + "", ] return "\n".join(lines) def render_markdown(pack: ScriberPack, explain_selection: bool = False) -> str: code = [candidate for candidate in 
pack.candidates if candidate.file.kind == "code"] - support = [candidate for candidate in pack.candidates if candidate.file.kind == "support"] - other = [candidate for candidate in pack.candidates if candidate.file.kind == "other"] + support = [ + candidate for candidate in pack.candidates if candidate.file.kind == "support" + ] + other = [ + candidate for candidate in pack.candidates if candidate.file.kind == "other" + ] lines: list[str] = [] lines.append("# Scriber 2.0 Pack") @@ -172,7 +204,9 @@ def render_markdown(pack: ScriberPack, explain_selection: bool = False) -> str: lines.append("## Project") lines.append("") lines.append(f"Root: `{pack.project_root}`") - lines.append(f"Config: `{pack.config_path.relative_to(pack.project_root).as_posix()}`") + lines.append( + f"Config: `{pack.config_path.relative_to(pack.project_root).as_posix()}`" + ) lines.append(f"Format: `{pack.output_format}`") lines.append(f"Only tree: `{str(pack.only_tree).lower()}`") lines.append("") @@ -214,7 +248,9 @@ def render_markdown(pack: ScriberPack, explain_selection: bool = False) -> str: lines.append(f"### `{_path(candidate.file.relative)}`") lines.append("") if not candidate.include_content: - lines.append(f"_Content omitted: {candidate.omitted_reason or 'not selected for content'}._") + lines.append( + f"_Content omitted: {candidate.omitted_reason or 'not selected for content'}._" + ) continue content = candidate.content or "" fence = _fence_for(content) @@ -235,7 +271,9 @@ def render_text(pack: ScriberPack, explain_selection: bool = False) -> str: lines.append(render_summary_text(pack).rstrip()) lines.append("") lines.append(f"PROJECT ROOT: {pack.project_root}") - lines.append(f"CONFIG: {pack.config_path.relative_to(pack.project_root).as_posix()}") + lines.append( + f"CONFIG: {pack.config_path.relative_to(pack.project_root).as_posix()}" + ) lines.append(f"FORMAT: {pack.output_format}") lines.append(f"ONLY TREE: {str(pack.only_tree).lower()}") lines.append("") @@ -245,7 +283,11 @@ def 
render_text(pack: ScriberPack, explain_selection: bool = False) -> str: lines.append("") lines.append("INCLUDED FILES") for candidate in pack.candidates: - reason = "; ".join(candidate.reasons) if explain_selection else candidate.reason_summary + reason = ( + "; ".join(candidate.reasons) + if explain_selection + else candidate.reason_summary + ) lines.append(f"[{candidate.score:03d}] {_path(candidate.file.relative)}") lines.append(f" kind: {candidate.file.kind}") lines.append(f" content: {_content_flag(candidate)}") @@ -265,7 +307,9 @@ def render_text(pack: ScriberPack, explain_selection: bool = False) -> str: lines.append("") lines.append(f"--- FILE: {_path(candidate.file.relative)} ---") if not candidate.include_content: - lines.append(f"[content omitted: {candidate.omitted_reason or 'not selected for content'}]") + lines.append( + f"[content omitted: {candidate.omitted_reason or 'not selected for content'}]" + ) continue lines.append(candidate.content or "") lines.append("") diff --git a/src/scriber/scanner/files.py b/src/scriber/scanner/files.py index f203dde..701d093 100644 --- a/src/scriber/scanner/files.py +++ b/src/scriber/scanner/files.py @@ -2,7 +2,7 @@ from pathlib import Path -from scriber.core.matchers import match_pattern, matches_any +from scriber.core.matchers import matches_any from scriber.core.models import ContentPolicy, FileKind, FileNode, ScriberConfig LANGUAGE_BY_SUFFIX = { @@ -39,10 +39,15 @@ def is_probably_binary(path: Path) -> bool: from scriber.native import require_native + try: return require_native().is_probably_binary(str(path)) except Exception: - return True + try: + chunk = path.read_bytes()[:4096] + return b"\0" in chunk + except OSError: + return True def language_for(path: Path) -> str: @@ -54,19 +59,63 @@ def language_for(path: Path) -> str: def support_category(rel: Path) -> str: s = rel.as_posix() name = rel.name - if name == "pyproject.toml" or name.endswith(".toml") or name in {"setup.py", "setup.cfg", "tox.ini", 
"pytest.ini", "mypy.ini", "ruff.toml", ".ruff.toml"}: + if ( + name == "pyproject.toml" + or name.endswith(".toml") + or name + in { + "setup.py", + "setup.cfg", + "tox.ini", + "pytest.ini", + "mypy.ini", + "ruff.toml", + ".ruff.toml", + } + ): return "project config" - if name.endswith(".lock") or name in {"requirements.txt", "poetry.lock", "uv.lock", "Pipfile", "Pipfile.lock", "package.json", "package-lock.json", "pnpm-lock.yaml", "yarn.lock", "Cargo.toml", "Cargo.lock", "go.mod", "go.sum"} or s.startswith("requirements/"): + if ( + name.endswith(".lock") + or name + in { + "requirements.txt", + "poetry.lock", + "uv.lock", + "Pipfile", + "Pipfile.lock", + "package.json", + "package-lock.json", + "pnpm-lock.yaml", + "yarn.lock", + "Cargo.toml", + "Cargo.lock", + "go.mod", + "go.sum", + } + or s.startswith("requirements/") + ): return "dependency file" - if name.startswith("README") or name in {"CHANGELOG.md", "CONTRIBUTING.md"} or s.startswith("docs/"): + if ( + name.startswith("README") + or name in {"CHANGELOG.md", "CONTRIBUTING.md"} + or s.startswith("docs/") + ): return "documentation" - if name.startswith("Dockerfile") or name.startswith("docker-compose") or name.startswith("compose"): + if ( + name.startswith("Dockerfile") + or name.startswith("docker-compose") + or name.startswith("compose") + ): return "runtime support" if s.startswith(".github/workflows/") or name == ".gitlab-ci.yml": return "ci support" if name.startswith(".env") or s.startswith("config/") or s.startswith("settings/"): return "runtime config" - if name in {".pre-commit-config.yaml", "tsconfig.json"} or name.startswith("vite.config") or name.startswith("webpack.config"): + if ( + name in {".pre-commit-config.yaml", "tsconfig.json"} + or name.startswith("vite.config") + or name.startswith("webpack.config") + ): return "tooling config" return "support file" @@ -87,7 +136,6 @@ def classify_file(path: Path, root: Path, config: ScriberConfig) -> FileNode | N if matches_any(rel_s, 
config.hard_ignore_patterns): return None - binary = is_probably_binary(path) kind: FileKind = "other" category = None policy: ContentPolicy = "auto" @@ -101,6 +149,8 @@ def classify_file(path: Path, root: Path, config: ScriberConfig) -> FileNode | N else: return None + binary = is_probably_binary(path) + try: size = path.stat().st_size except OSError: @@ -135,8 +185,11 @@ def is_text_readable(path: Path) -> bool: def read_text_lossy(path: Path) -> str: - from scriber.native import require_native - return require_native().read_text(str(path)) - - + try: + from scriber.native import is_native_available, require_native + if is_native_available(): + return require_native().read_text(str(path)) + except Exception: + pass + return path.read_text(encoding="utf-8", errors="replace") diff --git a/src/scriber/scanner/scan.py b/src/scriber/scanner/scan.py index e2fa8a4..7b51c1b 100644 --- a/src/scriber/scanner/scan.py +++ b/src/scriber/scanner/scan.py @@ -6,11 +6,23 @@ def scan_project(root: Path, config: ScriberConfig) -> dict[Path, FileNode]: - files, _ = scan_project_with_native(root, config) - return files + try: + from scriber.native import is_native_available + if is_native_available(): + files, _ = scan_project_with_native(root, config) + return files + except Exception: + pass -def scan_project_with_native(root: Path, config: ScriberConfig) -> tuple[dict[Path, FileNode], list]: + from scriber.scanner.scan_py import scan_project as scan_project_py + + return scan_project_py(root, config) + + +def scan_project_with_native( + root: Path, config: ScriberConfig +) -> tuple[dict[Path, FileNode], list]: root = root.resolve() native = require_native() @@ -23,12 +35,13 @@ def scan_project_with_native(root: Path, config: ScriberConfig) -> tuple[dict[Pa config.support_content.full, config.support_content.tree_only, config.support_content.default, - config.support + config.support, ) files: dict[Path, FileNode] = {} - + from scriber.cache import ScriberCache + cache = 
ScriberCache(config, root) active_files: set[Path] = set() @@ -46,7 +59,7 @@ def scan_project_with_native(root: Path, config: ScriberConfig) -> tuple[dict[Pa size_bytes=cached_data["size_bytes"], is_binary=cached_data["is_binary"], support_category=cached_data["support_category"], - content_policy=cached_data["content_policy"] + content_policy=cached_data["content_policy"], ) files[node.relative] = node else: @@ -58,18 +71,23 @@ def scan_project_with_native(root: Path, config: ScriberConfig) -> tuple[dict[Pa size_bytes=item.size_bytes, is_binary=item.is_binary, support_category=item.support_category, - content_policy=item.content_policy + content_policy=item.content_policy, ) files[node.relative] = node - cache.set_file(rel, item.mtime_ns, item.size_bytes, { - "relative": node.relative.as_posix(), - "kind": node.kind, - "language": node.language, - "size_bytes": node.size_bytes, - "is_binary": node.is_binary, - "support_category": node.support_category, - "content_policy": node.content_policy - }) + cache.set_file( + rel, + item.mtime_ns, + item.size_bytes, + { + "relative": node.relative.as_posix(), + "kind": node.kind, + "language": node.language, + "size_bytes": node.size_bytes, + "is_binary": node.is_binary, + "support_category": node.support_category, + "content_policy": node.content_policy, + }, + ) cache.save(active_files) return files, native_files diff --git a/src/scriber/scanner/scan_py.py b/src/scriber/scanner/scan_py.py index 2c0ebae..e4a5818 100644 --- a/src/scriber/scanner/scan_py.py +++ b/src/scriber/scanner/scan_py.py @@ -10,10 +10,15 @@ def scan_project(root: Path, config: ScriberConfig) -> dict[Path, FileNode]: root = root.resolve() - gitignore = SimpleGitIgnore.from_file(root / ".gitignore") if config.use_gitignore else SimpleGitIgnore([]) + gitignore = ( + SimpleGitIgnore.from_file(root / ".gitignore") + if config.use_gitignore + else SimpleGitIgnore([]) + ) files: dict[Path, FileNode] = {} - + from scriber.cache import ScriberCache + cache = 
ScriberCache(config, root) active_files: set[Path] = set() @@ -23,10 +28,14 @@ def scan_project(root: Path, config: ScriberConfig) -> dict[Path, FileNode]: kept_dirs: list[str] = [] for dirname in dirnames: - child_rel = (rel_dir / dirname) if rel_dir.as_posix() != "." else Path(dirname) + child_rel = ( + (rel_dir / dirname) if rel_dir.as_posix() != "." else Path(dirname) + ) if should_hard_ignore(child_rel, config): continue - if config.use_gitignore and gitignore.ignores(child_rel.as_posix(), is_dir=True): + if config.use_gitignore and gitignore.ignores( + child_rel.as_posix(), is_dir=True + ): continue kept_dirs.append(dirname) dirnames[:] = kept_dirs @@ -47,33 +56,40 @@ def scan_project(root: Path, config: ScriberConfig) -> dict[Path, FileNode]: continue active_files.add(rel) - + cached_data = cache.get_file(rel, mtime_ns, size) if cached_data is not None: node = FileNode( - absolute=(root / Path(cached_data["relative"])).resolve(strict=False), + absolute=(root / Path(cached_data["relative"])).resolve( + strict=False + ), relative=Path(cached_data["relative"]), kind=cached_data["kind"], language=cached_data["language"], size_bytes=cached_data["size_bytes"], is_binary=cached_data["is_binary"], support_category=cached_data["support_category"], - content_policy=cached_data["content_policy"] + content_policy=cached_data["content_policy"], ) files[node.relative] = node else: node = classify_file(path, root, config) if node is not None: files[node.relative] = node - cache.set_file(rel, mtime_ns, size, { - "relative": node.relative.as_posix(), - "kind": node.kind, - "language": node.language, - "size_bytes": node.size_bytes, - "is_binary": node.is_binary, - "support_category": node.support_category, - "content_policy": node.content_policy - }) + cache.set_file( + rel, + mtime_ns, + size, + { + "relative": node.relative.as_posix(), + "kind": node.kind, + "language": node.language, + "size_bytes": node.size_bytes, + "is_binary": node.is_binary, + "support_category": 
node.support_category, + "content_policy": node.content_policy, + }, + ) cache.save(active_files) return files diff --git a/tests/test_cache.py b/tests/test_cache.py index 5c141bf..1b87a51 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -1,9 +1,8 @@ from __future__ import annotations -import json from pathlib import Path from scriber.core.models import ScriberConfig -from scriber.cache import ScriberCache, get_config_hash +from scriber.cache import ScriberCache def test_cache_functionality(tmp_path: Path) -> None: @@ -11,33 +10,47 @@ def test_cache_functionality(tmp_path: Path) -> None: # Ensure cache is enabled config.cache.enabled = True config.cache.dir = ".scriber/cache" - + cache = ScriberCache(config, tmp_path) - + rel_path = Path("src/main.py") - mtime = 123456789 - size = 1000 - data = {"kind": "code", "language": "python", "size_bytes": 1000, "is_binary": False, "support_category": None, "content_policy": "auto", "absolute": "src/main.py", "relative": "src/main.py"} - + (tmp_path / "src").mkdir(exist_ok=True) + (tmp_path / rel_path).write_text("print('hello')") + + stat = (tmp_path / rel_path).stat() + mtime = stat.st_mtime_ns + size = stat.st_size + + data = { + "kind": "code", + "language": "python", + "size_bytes": 1000, + "is_binary": False, + "support_category": None, + "content_policy": "auto", + "absolute": "src/main.py", + "relative": "src/main.py", + } + assert cache.get_file(rel_path, mtime, size) is None - + cache.set_file(rel_path, mtime, size, data) assert cache.get_file(rel_path, mtime, size) == data - + # Check imports cache imports = {Path("src/auth.py"), Path("src/db.py")} - assert cache.get_imports(rel_path) is None + assert cache.get_imports(rel_path, mtime, size) is None cache.set_imports(rel_path, imports) - assert cache.get_imports(rel_path) == imports - + assert cache.get_imports(rel_path, mtime, size) == imports + # Save cache cache.save(active_files={rel_path}) - + # Check that cache files were created assert 
(tmp_path / ".scriber/cache/files.json").exists() - assert (tmp_path / ".scriber/cache/import_graph.json").exists() - + assert (tmp_path / ".scriber/cache/imports_v2.json").exists() + # Reload cache and check if retrieved properly new_cache = ScriberCache(config, tmp_path) assert new_cache.get_file(rel_path, mtime, size) == data - assert new_cache.get_imports(rel_path) == imports + assert new_cache.get_imports(rel_path, mtime, size) == imports diff --git a/tests/test_config_schema.py b/tests/test_config_schema.py index ab377de..a071d34 100644 --- a/tests/test_config_schema.py +++ b/tests/test_config_schema.py @@ -6,7 +6,8 @@ def test_config_schema_parsing(tmp_path: Path) -> None: config_file = tmp_path / "pyproject.toml" - config_file.write_text(""" + config_file.write_text( + """ [tool.scriber] format = "txt" max_tokens = 50000 @@ -33,30 +34,32 @@ def test_config_schema_parsing(tmp_path: Path) -> None: [tool.scriber.hard_ignore] patterns = [".git/**", "node_modules/**"] -""".strip(), encoding="utf-8") +""".strip(), + encoding="utf-8", + ) config = load_config(config_file) - + assert config.format == "txt" assert config.max_tokens == 50000 assert config.max_files == 30 assert config.only_tree is True assert config.allow_external_paths is True - + assert config.modules is False assert config.modules_config.enabled is False assert config.modules_config.content_min_score == 40 - + assert config.code_patterns == ["**/*.py", "**/*.rs"] - + assert config.support is True assert config.support_patterns == ["pyproject.toml", "Dockerfile"] - + assert config.support_content.default == "tree_only" assert config.support_content.auto_max_bytes == 20000 assert config.support_content.full == ["pyproject.toml"] assert config.support_content.tree_only == ["Dockerfile"] - + assert config.hard_ignore_patterns == [".git/**", "node_modules/**"] diff --git a/tests/test_fixes.py b/tests/test_fixes.py new file mode 100644 index 0000000..176e127 --- /dev/null +++ b/tests/test_fixes.py @@ 
-0,0 +1,166 @@ +from pathlib import Path +from unittest.mock import patch + +from scriber.core.config import ScriberConfig +from scriber.core.models import FileNode, ModuleGraph +from scriber.engine.roles import classify_file_role +from scriber.engine.scorer import _is_test_file +from scriber.scanner.files import classify_file, read_text_lossy + + +def test_role_classifier_does_not_mark_production_tests_analyzer_as_test(): + config = ScriberConfig() + config.python.test_roots = ["tests", "test"] + rel = Path("src/scriber/graph/analyzers/tests.py") + assert not _is_test_file(rel, config) + + rel2 = Path("tests/test_something.py") + assert _is_test_file(rel2, config) + + +def test_classify_file_does_not_binary_check_unmatched_files(): + config = ScriberConfig() + config.code_patterns = ["**/*.py"] + config.support = False + + with patch("scriber.scanner.files.is_probably_binary") as mock_binary: + # Not a match to any pattern + res = classify_file(Path("/fake/file.unknown"), Path("/fake"), config) + assert res is None + mock_binary.assert_not_called() + + +def test_read_text_lossy_without_native(tmp_path): + p = tmp_path / "test.txt" + p.write_bytes(b"hello \xff world") # invalid utf-8 + + with patch("scriber.native.is_native_available", return_value=False): + content = read_text_lossy(p) + assert "hello \ufffd world" in content or "hello world" in content + + +def test_classify_file_role_does_not_mark_graph_analyzers_tests_py_as_test(): + file = FileNode( + absolute=Path("/src/scriber/graph/analyzers/tests.py"), + relative=Path("src/scriber/graph/analyzers/tests.py"), + kind="code", + language="python", + size_bytes=100, + ) + graph = ModuleGraph() + role = classify_file_role(file, graph) + assert role != "test" + + +def test_read_text_lossy_falls_back_when_native_read_raises(tmp_path): + p = tmp_path / "test.txt" + p.write_bytes(b"hello") + + with patch("scriber.native.is_native_available", return_value=True): + with patch("scriber.native.require_native") as 
mock_require: + mock_require.return_value.read_text.side_effect = Exception( + "Native read failed" + ) + content = read_text_lossy(p) + assert content == "hello" + + +def test_import_cache_works_with_custom_cache_dir(tmp_path): + config = ScriberConfig() + config.cache.dir = "custom/cache/dir" + + from scriber.cache import ScriberCache + + cache = ScriberCache(config, tmp_path) + + assert cache.cache_dir == tmp_path / "custom" / "cache" / "dir" + + f1 = tmp_path / "a.py" + f2 = tmp_path / "b.py" + f1.write_text("import b") + f2.write_text("") + + cache.set_imports(Path("a.py"), {Path("b.py")}) + assert cache.imports_data["a.py"]["targets"] == ["b.py"] + + +def test_project_snapshot_docs_profile_changes_code_and_test_scores(): + from scriber.core.profiles import apply_profile + + config = ScriberConfig() + config = apply_profile(config, "docs") + + from scriber.engine.scorer import score_candidates_project_snapshot + + files = { + Path("app.py"): FileNode( + Path("/app.py"), Path("app.py"), "code", "python", 100 + ), + Path("test_app.py"): FileNode( + Path("/test_app.py"), Path("test_app.py"), "code", "python", 100 + ), + Path("utils.py"): FileNode( + Path("/utils.py"), Path("utils.py"), "code", "python", 100 + ), + } + graph = ModuleGraph() + # Mocking minimums so we see all files in output + config.min_score = 0 + config.modules_config.tree_min_score = 0 + + candidates = score_candidates_project_snapshot( + files=files, graph=graph, config=config + ) + + c_app = next(c for c in candidates if c.file.relative.name == "app.py") + assert c_app.score == config.modules_config.scoring.get("entrypoint_file", 90) + + c_test = next(c for c in candidates if c.file.relative.name == "test_app.py") + assert c_test.score == config.modules_config.scoring.get("test_file", 60) + + c_utils = next(c for c in candidates if c.file.relative.name == "utils.py") + assert c_utils.score == config.modules_config.scoring.get("code_file", 80) + + +def 
test_native_project_snapshot_uses_profile_code_and_test_scores(): + from scriber.core.profiles import apply_profile + + config = ScriberConfig() + config = apply_profile(config, "docs") + from scriber.native import is_native_available, require_native + + if not is_native_available(): + return # skip if native not built + + native = require_native() + scoring = config.modules_config.scoring + opts = native.NativePackOptions( + mode="project_snapshot", + max_files=10, + min_score=0, + tree_min_score=0, + entrypoint_patterns=config.python.entrypoint_patterns, + test_roots=config.python.test_roots, + entrypoint_file_score=scoring.get("entrypoint_file", 90), + code_file_score=scoring.get("code_file", 80), + test_file_score=scoring.get("test_file", 60), + other_file_score=scoring.get("other_file", 40), + ) + assert opts.entrypoint_file_score == scoring.get("entrypoint_file", 90) + assert opts.test_file_score == scoring.get("test_file", 60) + assert opts.code_file_score == scoring.get("code_file", 80) + assert opts.other_file_score == scoring.get("other_file", 40) + + +def test_llm_pack_gpt_profile_does_not_access_missing_outline_symbols(tmp_path): + from scriber.packer.pack import build_pack + + config_path = tmp_path / "pyproject.toml" + config_path.write_text("") + + code_path = tmp_path / "test.py" + code_path.write_text("def my_func(): pass") + + # Just verify it builds without Exception on outline.symbols + pack = build_pack(paths=[str(code_path)], profile="gpt", path_base="cwd") + assert pack is not None diff --git a/tests/test_init_config.py b/tests/test_init_config.py index 59d0e85..1b98815 100644 --- a/tests/test_init_config.py +++ b/tests/test_init_config.py @@ -26,7 +26,8 @@ def test_replace_existing_block() -> None: version = "2" """.strip() - expected = """ + expected = ( + """ [build-system] requires = ["setuptools>=61"] @@ -35,7 +36,9 @@ def test_replace_existing_block() -> None: [tool.scriber] version = "2" -""".strip() + "\n" +""".strip() + + "\n" + ) 
res = replace_existing_tool_scriber_block(content, default_block) assert res == expected @@ -44,7 +47,7 @@ def test_replace_existing_block() -> None: def test_init_project_file_missing(tmp_path: Path) -> None: config_path = tmp_path / "pyproject.toml" assert not config_path.exists() - + path = init_project(str(config_path)) assert path == config_path.resolve() assert config_path.exists() @@ -54,7 +57,7 @@ def test_init_project_file_missing(tmp_path: Path) -> None: def test_init_project_exists_no_scriber(tmp_path: Path) -> None: config_path = tmp_path / "pyproject.toml" config_path.write_text("[build-system]\n", encoding="utf-8") - + init_project(str(config_path)) content = config_path.read_text(encoding="utf-8") assert "[build-system]" in content @@ -64,26 +67,30 @@ def test_init_project_exists_no_scriber(tmp_path: Path) -> None: def test_init_project_exists_with_scriber_raises(tmp_path: Path) -> None: config_path = tmp_path / "pyproject.toml" config_path.write_text("[tool.scriber]\nversion = '1'\n", encoding="utf-8") - + with pytest.raises(ScriberError, match="Scriber config already exists"): init_project(str(config_path)) def test_init_project_exists_with_scriber_force(tmp_path: Path) -> None: config_path = tmp_path / "pyproject.toml" - config_path.write_text(""" + config_path.write_text( + """ [build-system] requires = ["setuptools>=61"] [tool.scriber] version = '1' -""".strip() + "\n", encoding="utf-8") - +""".strip() + + "\n", + encoding="utf-8", + ) + init_project(str(config_path), force=True) content = config_path.read_text(encoding="utf-8") assert "[build-system]" in content assert "[tool.scriber]" in content assert "version = '1'" not in content # must be replaced with the default block - + # Ensure there is exactly one [tool.scriber] header in pyproject.toml assert content.count("[tool.scriber]") == 1 diff --git a/tests/test_languages.py b/tests/test_languages.py index 5f53f23..fc4f737 100644 --- a/tests/test_languages.py +++ b/tests/test_languages.py @@ 
-7,31 +7,33 @@ def test_javascript_typescript_graph(tmp_path: Path) -> None: config = ScriberConfig() - + auth_path = tmp_path / "src/auth.ts" auth_path.parent.mkdir(parents=True, exist_ok=True) auth_path.write_text("export class Auth {}", encoding="utf-8") - + main_path = tmp_path / "src/main.ts" - main_path.write_text("import { Auth } from './auth';\nimport 'lodash';", encoding="utf-8") - + main_path.write_text( + "import { Auth } from './auth';\nimport 'lodash';", encoding="utf-8" + ) + files = { Path("src/auth.ts"): FileNode( absolute=auth_path.resolve(), relative=Path("src/auth.ts"), kind="code", language="typescript", - size_bytes=auth_path.stat().st_size + size_bytes=auth_path.stat().st_size, ), Path("src/main.ts"): FileNode( absolute=main_path.resolve(), relative=Path("src/main.ts"), kind="code", language="typescript", - size_bytes=main_path.stat().st_size - ) + size_bytes=main_path.stat().st_size, + ), } - + graph = build_graph(files, config) assert Path("src/auth.ts") in graph.imports[Path("src/main.ts")] assert Path("src/main.ts") in graph.imported_by[Path("src/auth.ts")] @@ -39,34 +41,36 @@ def test_javascript_typescript_graph(tmp_path: Path) -> None: def test_rust_graph(tmp_path: Path) -> None: config = ScriberConfig() - + cargo_toml = tmp_path / "Cargo.toml" cargo_toml.write_text("[package]\nname = 'test'", encoding="utf-8") - + auth_path = tmp_path / "src/auth.rs" auth_path.parent.mkdir(parents=True, exist_ok=True) auth_path.write_text("pub struct Auth;", encoding="utf-8") - + main_path = tmp_path / "src/main.rs" - main_path.write_text("mod auth;\nuse crate::auth::Auth;\nuse super::unrelated;", encoding="utf-8") - + main_path.write_text( + "mod auth;\nuse crate::auth::Auth;\nuse super::unrelated;", encoding="utf-8" + ) + files = { Path("src/auth.rs"): FileNode( absolute=auth_path.resolve(), relative=Path("src/auth.rs"), kind="code", language="rust", - size_bytes=auth_path.stat().st_size + size_bytes=auth_path.stat().st_size, ), Path("src/main.rs"): 
FileNode( absolute=main_path.resolve(), relative=Path("src/main.rs"), kind="code", language="rust", - size_bytes=main_path.stat().st_size - ) + size_bytes=main_path.stat().st_size, + ), } - + graph = build_graph(files, config) assert Path("src/auth.rs") in graph.imports[Path("src/main.rs")] assert Path("src/main.rs") in graph.imported_by[Path("src/auth.rs")] @@ -74,35 +78,37 @@ def test_rust_graph(tmp_path: Path) -> None: def test_go_graph(tmp_path: Path) -> None: config = ScriberConfig() - + go_mod = tmp_path / "go.mod" go_mod.write_text("module github.com/user/project\n", encoding="utf-8") - + db_path = tmp_path / "pkg/db/db.go" db_path.parent.mkdir(parents=True, exist_ok=True) db_path.write_text("package db\n", encoding="utf-8") - + main_path = tmp_path / "cmd/main.go" main_path.parent.mkdir(parents=True, exist_ok=True) - main_path.write_text('package main\nimport "github.com/user/project/pkg/db"\n', encoding="utf-8") - + main_path.write_text( + 'package main\nimport "github.com/user/project/pkg/db"\n', encoding="utf-8" + ) + files = { Path("pkg/db/db.go"): FileNode( absolute=db_path.resolve(), relative=Path("pkg/db/db.go"), kind="code", language="go", - size_bytes=db_path.stat().st_size + size_bytes=db_path.stat().st_size, ), Path("cmd/main.go"): FileNode( absolute=main_path.resolve(), relative=Path("cmd/main.go"), kind="code", language="go", - size_bytes=main_path.stat().st_size - ) + size_bytes=main_path.stat().st_size, + ), } - + graph = build_graph(files, config) assert Path("pkg/db/db.go") in graph.imports[Path("cmd/main.go")] assert Path("cmd/main.go") in graph.imported_by[Path("pkg/db/db.go")] @@ -110,14 +116,17 @@ def test_go_graph(tmp_path: Path) -> None: def test_cpp_graph(tmp_path: Path) -> None: config = ScriberConfig() - + header_path = tmp_path / "src/auth.h" header_path.parent.mkdir(parents=True, exist_ok=True) header_path.write_text("class Auth {};", encoding="utf-8") - + main_path = tmp_path / "src/main.cpp" - main_path.write_text('#include 
"auth.h"\n#include \n#include "utils/helper.hpp"', encoding="utf-8") - + main_path.write_text( + '#include "auth.h"\n#include \n#include "utils/helper.hpp"', + encoding="utf-8", + ) + helper_path = tmp_path / "src/utils/helper.hpp" helper_path.parent.mkdir(parents=True, exist_ok=True) helper_path.write_text("void helper();", encoding="utf-8") @@ -128,27 +137,26 @@ def test_cpp_graph(tmp_path: Path) -> None: relative=Path("src/auth.h"), kind="code", language="c", - size_bytes=header_path.stat().st_size + size_bytes=header_path.stat().st_size, ), Path("src/main.cpp"): FileNode( absolute=main_path.resolve(), relative=Path("src/main.cpp"), kind="code", language="cpp", - size_bytes=main_path.stat().st_size + size_bytes=main_path.stat().st_size, ), Path("src/utils/helper.hpp"): FileNode( absolute=helper_path.resolve(), relative=Path("src/utils/helper.hpp"), kind="code", language="cpp", - size_bytes=helper_path.stat().st_size - ) + size_bytes=helper_path.stat().st_size, + ), } - + graph = build_graph(files, config) assert Path("src/auth.h") in graph.imports[Path("src/main.cpp")] assert Path("src/main.cpp") in graph.imported_by[Path("src/auth.h")] assert Path("src/utils/helper.hpp") in graph.imports[Path("src/main.cpp")] assert Path("src/main.cpp") in graph.imported_by[Path("src/utils/helper.hpp")] - diff --git a/tests/test_native.py b/tests/test_native.py index 643d795..39d55c5 100644 --- a/tests/test_native.py +++ b/tests/test_native.py @@ -18,22 +18,22 @@ def test_native_read_write(tmp_path: Path) -> None: native = require_native() test_file = tmp_path / "test.txt" content = "Hello, native Rust world!\nWith some special characters: Ε‚Γ³Δ…dΕΊΕ›\n" - + native.write_text(str(test_file), content) assert test_file.exists() - + read_back = native.read_text(str(test_file)) assert read_back == content def test_native_binary_check(tmp_path: Path) -> None: native = require_native() - + # Test text file txt_file = tmp_path / "normal.txt" txt_file.write_text("Hello world", 
encoding="utf-8") assert not native.is_probably_binary(str(txt_file)) - + # Test binary file bin_file = tmp_path / "binary.bin" bin_file.write_bytes(b"Hello\x00world") @@ -47,19 +47,21 @@ def test_native_scan_matches_python_scan(tmp_path: Path) -> None: (tmp_path / "src" / "helper.py").write_text("import sys", encoding="utf-8") (tmp_path / "src" / "binary.dat").write_bytes(b"\x00\x01\x02") (tmp_path / "README.md").write_text("# Test Project", encoding="utf-8") - (tmp_path / "pyproject.toml").write_text("[tool.scriber]\nversion='2'", encoding="utf-8") - + (tmp_path / "pyproject.toml").write_text( + "[tool.scriber]\nversion='2'", encoding="utf-8" + ) + # Hidden dir and ignored patterns (tmp_path / ".git").mkdir() (tmp_path / ".git" / "config").write_text("git config", encoding="utf-8") - + config = ScriberConfig( use_gitignore=True, code_patterns=["**/*.py"], support_patterns=["pyproject.toml", "README.md", "requirements.txt"], hard_ignore_patterns=[".git/**", "**/binary.dat"], ) - + # Create gitignore (tmp_path / ".gitignore").write_text("*.pyc\n", encoding="utf-8") @@ -71,7 +73,7 @@ def test_native_scan_matches_python_scan(tmp_path: Path) -> None: for path, rust_node in rust_result.items(): py_node = python_result[path] - + # Verify fields match exactly assert rust_node.relative == py_node.relative assert rust_node.kind == py_node.kind @@ -86,7 +88,9 @@ def test_native_no_support(tmp_path: Path) -> None: (tmp_path / "src").mkdir() (tmp_path / "src" / "main.py").write_text("print('hello')", encoding="utf-8") (tmp_path / "README.md").write_text("# Test Project", encoding="utf-8") - (tmp_path / "pyproject.toml").write_text("[tool.scriber]\nversion='2'", encoding="utf-8") + (tmp_path / "pyproject.toml").write_text( + "[tool.scriber]\nversion='2'", encoding="utf-8" + ) config = ScriberConfig( support=False, @@ -177,10 +181,11 @@ def test_native_graph_matches_python_graph_mixed_project(tmp_path: Path) -> None config = make_config() python_files = scan_python(tmp_path, 
config) - + from scriber.graph.builder import build_graph as build_python_graph + py_graph = build_python_graph(python_files, config) - + native = require_native() native_files = native.scan_project( str(tmp_path), @@ -191,22 +196,38 @@ def test_native_graph_matches_python_graph_mixed_project(tmp_path: Path) -> None config.support_content.full, config.support_content.tree_only, config.support_content.default, - config.support + config.support, ) - edges = native.build_import_graph( + edges = native.build_relation_graph( str(tmp_path), native_files, config.python.source_roots, - config.python.module_init_files + config.python.module_init_files, ) rs_imports = {} for edge in edges: - rs_imports.setdefault(Path(getattr(edge, "from")), set()).add(Path(edge.to)) - + if ( + edge.kind == "import" + or edge.kind == "mod" + or edge.kind == "use" + or edge.kind == "include" + ): + rs_imports.setdefault(Path(getattr(edge, "source")), set()).add( + Path(edge.target) + ) + for path, targets in py_graph.imports.items(): file = python_files[path] - if file.language in {"python", "javascript", "typescript", "rust", "go", "c", "cpp"}: + if file.language in { + "python", + "javascript", + "typescript", + "rust", + "go", + "c", + "cpp", + }: rs_targets = rs_imports.get(path, set()) assert rs_targets == targets @@ -214,22 +235,26 @@ def test_native_graph_matches_python_graph_mixed_project(tmp_path: Path) -> None def test_native_scoring_matches_python_for_focused_pack(tmp_path: Path) -> None: make_mixed_project(tmp_path) config = make_config() - + python_files = scan_python(tmp_path, config) from scriber.graph.builder import build_graph as build_python_graph + py_graph = build_python_graph(python_files, config) - + from scriber.engine.scorer import score_candidates as score_python from scriber.core.models import SeedPath + seed = SeedPath( original=Path("src/main.py"), absolute=(tmp_path / "src/main.py").resolve(), relative=Path("src/main.py"), is_dir=False, - 
expanded_files=[Path("src/main.py")] + expanded_files=[Path("src/main.py")], ) - py_candidates = score_python(files=python_files, seeds=[seed], graph=py_graph, config=config, mode="focused") - + py_candidates = score_python( + files=python_files, seeds=[seed], graph=py_graph, config=config, mode="focused" + ) + native = require_native() native_files = native.scan_project( str(tmp_path), @@ -240,15 +265,15 @@ def test_native_scoring_matches_python_for_focused_pack(tmp_path: Path) -> None: config.support_content.full, config.support_content.tree_only, config.support_content.default, - config.support + config.support, ) - edges = native.build_import_graph( + edges = native.build_relation_graph( str(tmp_path), native_files, config.python.source_roots, - config.python.module_init_files + config.python.module_init_files, ) - + scoring = config.modules_config.scoring opts = native.NativePackOptions( mode="focused", @@ -281,17 +306,14 @@ def test_native_scoring_matches_python_for_focused_pack(tmp_path: Path) -> None: entrypoint_patterns=config.python.entrypoint_patterns, test_roots=config.python.test_roots, ) - + rs_candidates = native.score_candidates_native( - native_files, - ["src/main.py"], - edges, - opts + native_files, ["src/main.py"], edges, opts ) - + py_map = {c.file.relative.as_posix(): c for c in py_candidates} rs_map = {c.path: c for c in rs_candidates} - + assert set(py_map.keys()) == set(rs_map.keys()) for path, py_c in py_map.items(): rs_c = rs_map[path] @@ -308,12 +330,13 @@ def test_native_render_tree_matches_python() -> None: "pyproject.toml", "README.md", ] - + from scriber.rendering.renderer import render_tree as render_python_tree + py_tree = render_python_tree([Path(p) for p in paths]) - + rs_tree = native.render_tree(paths) - + assert rs_tree.strip() == py_tree.strip() @@ -324,14 +347,16 @@ def test_default_toml_and_lock_support(tmp_path: Path) -> None: # Create dummy files (tmp_path / "src").mkdir() (tmp_path / "src" / 
"main.py").write_text("print('hello')", encoding="utf-8") - (tmp_path / "pyproject.toml").write_text("[tool.scriber]\nversion='2'", encoding="utf-8") + (tmp_path / "pyproject.toml").write_text( + "[tool.scriber]\nversion='2'", encoding="utf-8" + ) (tmp_path / "some_random_config.toml").write_text("a = 1", encoding="utf-8") (tmp_path / "some_random_lockfile.lock").write_text("lock", encoding="utf-8") # Load default config config = load_config(tmp_path / "pyproject.toml") config.use_gitignore = False - + # Assert that **/*.toml and **/*.lock are in support patterns assert "**/*.toml" in config.support_patterns assert "**/*.toml" in config.support_content.full @@ -362,7 +387,7 @@ def test_native_import_complex_python(tmp_path: Path) -> None: (tmp_path / "src" / "b.py").write_text("class B: pass", encoding="utf-8") (tmp_path / "src" / "c.py").write_text("class C: pass", encoding="utf-8") (tmp_path / "src" / "d.py").write_text("class D: pass", encoding="utf-8") - + import_test_content = """ import os, sys import math as m, json @@ -374,7 +399,9 @@ def test_native_import_complex_python(tmp_path: Path) -> None: from .c import D """ (tmp_path / "src" / "main.py").write_text(import_test_content, encoding="utf-8") - (tmp_path / "pyproject.toml").write_text("[tool.scriber]\nversion='2'", encoding="utf-8") + (tmp_path / "pyproject.toml").write_text( + "[tool.scriber]\nversion='2'", encoding="utf-8" + ) config = ScriberConfig( use_gitignore=False, @@ -383,8 +410,9 @@ def test_native_import_complex_python(tmp_path: Path) -> None: ) from scriber.scanner.scan import scan_project - files = scan_project(tmp_path, config) - + + scan_project(tmp_path, config) + native = require_native() native_files = native.scan_project( str(tmp_path), @@ -395,28 +423,22 @@ def test_native_import_complex_python(tmp_path: Path) -> None: config.support_content.full, config.support_content.tree_only, config.support_content.default, - config.support + config.support, ) - edges = 
native.build_import_graph( + edges = native.build_relation_graph( str(tmp_path), native_files, config.python.source_roots, - config.python.module_init_files + config.python.module_init_files, ) - imports = {Path(getattr(edge, "from")): set() for edge in edges} + imports = {Path(getattr(edge, "source")): set() for edge in edges} for edge in edges: - imports[Path(getattr(edge, "from"))].add(Path(edge.to)) - + if edge.kind == "import": + imports[Path(getattr(edge, "source"))].add(Path(edge.target)) + main_path = Path("src/main.py") assert main_path in imports - - expected_imports = { - Path("src/a.py"), - Path("src/b.py"), - Path("src/c.py") - } - assert imports[main_path] == expected_imports - - + expected_imports = {Path("src/a.py"), Path("src/b.py"), Path("src/c.py")} + assert imports[main_path] == expected_imports diff --git a/tests/test_scriber.py b/tests/test_scriber.py index 8ddf870..abd832a 100644 --- a/tests/test_scriber.py +++ b/tests/test_scriber.py @@ -63,17 +63,25 @@ def make_project(tmp_path: Path) -> Path: write(tmp_path / "poetry.lock", "very large lock in real life\n") write(tmp_path / "Dockerfile", "FROM python:3.12\n") write(tmp_path / "src/app/__init__.py", "") - write(tmp_path / "src/app/auth.py", "from .session import Session\nfrom .config import SETTINGS\n\nclass Auth: pass\n") + write( + tmp_path / "src/app/auth.py", + "from .session import Session\nfrom .config import SETTINGS\n\nclass Auth: pass\n", + ) write(tmp_path / "src/app/session.py", "class Session: pass\n") write(tmp_path / "src/app/config.py", "SETTINGS = {}\n") write(tmp_path / "src/app/main.py", "from app.auth import Auth\n") write(tmp_path / "src/api/routes.py", "from app.auth import Auth\n") - write(tmp_path / "tests/test_auth.py", "from app.auth import Auth\n\ndef test_auth():\n assert Auth\n") + write( + tmp_path / "tests/test_auth.py", + "from app.auth import Auth\n\ndef test_auth():\n assert Auth\n", + ) write(tmp_path / "src/app/unrelated.py", "VALUE = 1\n") return tmp_path 
-def test_build_pack_includes_seed_dependencies_reverse_tests_and_support(tmp_path: Path, monkeypatch) -> None: +def test_build_pack_includes_seed_dependencies_reverse_tests_and_support( + tmp_path: Path, monkeypatch +) -> None: project = make_project(tmp_path) monkeypatch.chdir(project) @@ -90,7 +98,9 @@ def test_build_pack_includes_seed_dependencies_reverse_tests_and_support(tmp_pat assert "requirements.txt" in paths assert "poetry.lock" in paths - by_path = {candidate.file.relative.as_posix(): candidate for candidate in pack.candidates} + by_path = { + candidate.file.relative.as_posix(): candidate for candidate in pack.candidates + } assert by_path["src/app/auth.py"].score == 100 assert by_path["src/app/session.py"].score >= 80 assert by_path["src/api/routes.py"].score >= 80 @@ -119,11 +129,18 @@ def test_multiple_paths_promote_shared_dependency(tmp_path: Path, monkeypatch) - write(tmp_path / "src/app/billing.py", "from .config import SETTINGS\n") monkeypatch.chdir(project) - pack = build_pack(["src/app/auth.py", "src/app/billing.py"], config_path="pyproject.toml") - by_path = {candidate.file.relative.as_posix(): candidate for candidate in pack.candidates} + pack = build_pack( + ["src/app/auth.py", "src/app/billing.py"], config_path="pyproject.toml" + ) + by_path = { + candidate.file.relative.as_posix(): candidate for candidate in pack.candidates + } assert "src/app/config.py" in by_path assert by_path["src/app/config.py"].score == 100 - assert any("shared by multiple seed paths" in reason for reason in by_path["src/app/config.py"].reasons) + assert any( + "shared by multiple seed paths" in reason + for reason in by_path["src/app/config.py"].reasons + ) def test_no_modules_keeps_seed_and_pyproject(tmp_path: Path, monkeypatch) -> None: @@ -155,8 +172,10 @@ def test_project_snapshot_mode(tmp_path: Path, monkeypatch) -> None: pack = build_pack(["."], config_path="pyproject.toml") assert pack.mode == "project_snapshot" - by_path = 
{candidate.file.relative.as_posix(): candidate for candidate in pack.candidates} - + by_path = { + candidate.file.relative.as_posix(): candidate for candidate in pack.candidates + } + # Entrypoint (e.g., src/app/main.py matches main.py pattern) assert by_path["src/app/main.py"].score == 90 assert by_path["src/app/main.py"].reason_summary == "entrypoint file" @@ -172,7 +191,7 @@ def test_project_snapshot_mode(tmp_path: Path, monkeypatch) -> None: # Support files assert by_path["README.md"].score == 45 assert by_path["README.md"].reason_summary == "project support file" - + # Ensure no near-seed duplication in project snapshot mode assert "near" not in by_path["README.md"].reason_summary assert "shared by multiple seed paths" not in by_path["README.md"].reasons @@ -180,6 +199,7 @@ def test_project_snapshot_mode(tmp_path: Path, monkeypatch) -> None: def test_dry_run_and_open_cli(tmp_path: Path, monkeypatch) -> None: from scriber.cli.main import main + project = make_project(tmp_path) monkeypatch.chdir(project) @@ -219,4 +239,3 @@ def test_no_support_excludes_support_files_folder_seed(tmp_path: Path) -> None: paths = {c.file.relative.as_posix() for c in pack.candidates} assert "README.md" not in paths assert "pyproject.toml" not in paths - diff --git a/tests/test_symbols.py b/tests/test_symbols.py new file mode 100644 index 0000000..324df0a --- /dev/null +++ b/tests/test_symbols.py @@ -0,0 +1,44 @@ +from pathlib import Path +from scriber.core.symbols import SymbolIndex +from scriber.graph.languages.extractor import extract_python_symbols + + +def test_extract_python_symbols() -> None: + code = """ +class MyClass: + def __init__(self): + pass + + async def my_method(self): + pass + +def global_function(): + pass +""" + index = SymbolIndex() + file_path = Path("src/dummy.py") + + extract_python_symbols(file_path, code, index) + + symbols = index.get_symbols(file_path) + assert len(symbols) == 4 + + # Check Class + class_sym = next(s for s in symbols if s.name == 
"MyClass") + assert class_sym.kind == "class" + assert class_sym.parent_name is None + + # Check Constructor + init_sym = next(s for s in symbols if s.name == "__init__") + assert init_sym.kind == "function" + assert init_sym.parent_name == "MyClass" + + # Check Async Method + method_sym = next(s for s in symbols if s.name == "my_method") + assert method_sym.kind == "function" + assert method_sym.parent_name == "MyClass" + + # Check Global Function + func_sym = next(s for s in symbols if s.name == "global_function") + assert func_sym.kind == "function" + assert func_sym.parent_name is None diff --git a/tests/test_tokens.py b/tests/test_tokens.py index fe1e2b2..35c6339 100644 --- a/tests/test_tokens.py +++ b/tests/test_tokens.py @@ -21,11 +21,14 @@ def test_token_estimation_custom_config() -> None: def test_token_estimation_parsing_from_config(tmp_path: Path) -> None: config_file = tmp_path / "pyproject.toml" - config_file.write_text(""" + config_file.write_text( + """ [tool.scriber.tokens] estimator = "chars" chars_per_token = 5 -""".strip(), encoding="utf-8") +""".strip(), + encoding="utf-8", + ) config = load_config(config_file) assert config.tokens.estimator == "chars" diff --git a/tests/test_top_deps.py b/tests/test_top_deps.py new file mode 100644 index 0000000..5e9123c --- /dev/null +++ b/tests/test_top_deps.py @@ -0,0 +1,39 @@ +def test_top_dependencies_limits_graph_traversal(): + from pathlib import Path + from scriber.core.models import RelationEdge + from scriber.engine.scorer import _walk_weighted_neighbors + + start = Path("app.py") + edges = [] + # Create 15 outgoing edges with varying strengths + for i in range(15): + target = Path(f"dep_{i}.py") + edges.append( + RelationEdge( + source=start, + target=target, + kind="import", + weight=1.0, + confidence=0.1 * i, # Higher i = higher confidence + evidence=[], + line=i, + analyzer="test", + ) + ) + + # Unlimited dependencies (0) + result_unlimited = _walk_weighted_neighbors( + edges, start, 
depth_limit=1, top_dependencies=0 + ) + assert len(result_unlimited) == 15 + + # Top 5 dependencies + result_top5 = _walk_weighted_neighbors( + edges, start, depth_limit=1, top_dependencies=5 + ) + assert len(result_top5) == 5 + + # Verify the ones with highest confidence were picked + # The edges have confidence 0.0 to 1.4. The top 5 should be from 1.0 to 1.4 (dep_10 to dep_14) + expected_deps = {Path(f"dep_{i}.py") for i in range(10, 15)} + assert set(result_top5.keys()) == expected_deps