From 1cd58b81fded9fbc1bb1a105631db33768a05dfb Mon Sep 17 00:00:00 2001
From: SunneV
Date: Sun, 31 May 2026 08:15:54 +0200
Subject: [PATCH 1/6] scriber_pack.md output file optimization
---
CHANGELOG.md | 12 +
pyproject.toml | 6 +-
rust/scriber_native/src/import.rs | 76 ++++++
rust/scriber_native/src/lib.rs | 2 +
rust/scriber_native/src/score.rs | 162 +++++++++----
src/scriber/__init__.py | 5 +-
src/scriber/budget/allocator.py | 66 ++++++
src/scriber/cache.py | 72 ++++--
src/scriber/cli/main.py | 204 ++++++++++++++--
src/scriber/core/config.py | 4 +-
src/scriber/core/models.py | 98 +++++++-
src/scriber/core/symbols.py | 21 ++
src/scriber/engine/ranker.py | 134 +++++++++++
src/scriber/engine/roles.py | 53 +++++
src/scriber/engine/scorer.py | 60 ++++-
src/scriber/graph/analyzers/__init__.py | 26 +++
src/scriber/graph/analyzers/base.py | 15 ++
src/scriber/graph/analyzers/config_refs.py | 35 +++
src/scriber/graph/analyzers/docs.py | 32 +++
src/scriber/graph/analyzers/env.py | 50 ++++
src/scriber/graph/analyzers/package.py | 30 +++
src/scriber/graph/analyzers/tests.py | 36 +++
src/scriber/graph/builder.py | 22 +-
src/scriber/graph/indexes.py | 37 +++
src/scriber/graph/languages/extractor.py | 74 ++++++
src/scriber/graph/model.py | 58 +++++
src/scriber/outline/__init__.py | 13 ++
src/scriber/outline/base.py | 7 +
src/scriber/outline/generic.py | 18 ++
src/scriber/outline/python.py | 39 ++++
src/scriber/packer/pack.py | 258 +++++++++++++++++----
src/scriber/renderer/llm_report.py | 216 +++++++++++++++++
src/scriber/scanner/files.py | 6 +-
src/scriber/scanner/scan.py | 12 +-
tests/test_cache.py | 2 +-
tests/test_native.py | 14 +-
tests/test_symbols.py | 43 ++++
37 files changed, 1856 insertions(+), 162 deletions(-)
create mode 100644 src/scriber/budget/allocator.py
create mode 100644 src/scriber/core/symbols.py
create mode 100644 src/scriber/engine/ranker.py
create mode 100644 src/scriber/engine/roles.py
create mode 100644 src/scriber/graph/analyzers/__init__.py
create mode 100644 src/scriber/graph/analyzers/base.py
create mode 100644 src/scriber/graph/analyzers/config_refs.py
create mode 100644 src/scriber/graph/analyzers/docs.py
create mode 100644 src/scriber/graph/analyzers/env.py
create mode 100644 src/scriber/graph/analyzers/package.py
create mode 100644 src/scriber/graph/analyzers/tests.py
create mode 100644 src/scriber/graph/indexes.py
create mode 100644 src/scriber/graph/languages/extractor.py
create mode 100644 src/scriber/graph/model.py
create mode 100644 src/scriber/outline/__init__.py
create mode 100644 src/scriber/outline/base.py
create mode 100644 src/scriber/outline/generic.py
create mode 100644 src/scriber/outline/python.py
create mode 100644 src/scriber/renderer/llm_report.py
create mode 100644 tests/test_symbols.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
index bcd7481..e5e4f20 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [2.1.0] - 2026-05-31
+
+### Added
+- **🎯 AI-Native Navigation (P1)**: Implemented line-numbered code fences and symbol-level XML anchors for classes and functions in full mode, allowing AI to navigate and apply Search & Replace diffs flawlessly.
+- **Ultra-Focused Mode Optimization**: Focused mode (`scriber <path>`) now acts as a precise surgical tool, cutting out unnecessary contextual noise.
+- **🛡️ Support Files Pruning**: Support files (`pyproject.toml`, `README.md`, Dockerfiles) are no longer granted automatic `full` mode immunity when running focused scans. They now decay to tree mode unless explicitly targeted.
+- **🧪 Test File Quarantine**: Test modules are heavily penalized in focused mode, dropping out of full/excerpt context to keep the generated pack laser-focused on actual implementation logic.
+
+### Fixed
+- **Excerpt Fallback Bug**: Fixed a critical bug where `excerpt` files failed to render and completely dropped their token estimates, resulting in `_Excerpt unavailable_` placeholders. They now correctly fall back to outline AST structures and compute tokens accurately.
+- **Graph Token Hard-Capping**: Re-engineered token budgeting with rigid distance-based hard caps in `ranker.py` (Max scores: 100/79/74/44 for Dist 0/1/2/3+ respectively). Focused mode is now reliably ~45% of the full project token size, completely eliminating distant `full` mode leaks.
+
## [2.0.0] - 2026-05-30
### Added
diff --git a/pyproject.toml b/pyproject.toml
index 6339246..010424e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
[project]
name = "project-scriber"
-version = "2.0.0"
+version = "2.1.0"
description = "Scriber 2.0: build intelligent code packs from one or more project paths."
readme = "README.md"
requires-python = ">=3.10"
@@ -54,8 +54,8 @@ format = "md"
output = ".scriber/scriber_pack.md"
only_tree = false
use_gitignore = true
-max_files = 60
-max_tokens = 100000
+max_files = 0
+max_tokens = 0
min_score = 45
path_style = "project-relative"
allow_external_paths = false
diff --git a/rust/scriber_native/src/import.rs b/rust/scriber_native/src/import.rs
index c86a963..e2c2327 100644
--- a/rust/scriber_native/src/import.rs
+++ b/rust/scriber_native/src/import.rs
@@ -620,3 +620,79 @@ pub fn build_import_graph(
Ok(edges)
}
+
+#[pyclass]
+#[derive(Clone, Debug)]
+pub struct NativeRelationEdge {
+ #[pyo3(get)]
+ pub source: String,
+ #[pyo3(get)]
+ pub target: String,
+ #[pyo3(get)]
+ pub kind: String,
+ #[pyo3(get)]
+ pub weight: f64,
+ #[pyo3(get)]
+ pub confidence: f64,
+ #[pyo3(get)]
+ pub evidence: Option,
+ #[pyo3(get)]
+ pub line: Option,
+ #[pyo3(get)]
+ pub analyzer: String,
+}
+
+#[pymethods]
+impl NativeRelationEdge {
+ #[new]
+ #[pyo3(signature = (source, target, kind, weight, confidence, evidence, line, analyzer))]
+ #[allow(clippy::too_many_arguments)]
+ fn new(
+ source: String,
+ target: String,
+ kind: String,
+ weight: f64,
+ confidence: f64,
+ evidence: Option,
+ line: Option,
+ analyzer: String,
+ ) -> Self {
+ NativeRelationEdge {
+ source,
+ target,
+ kind,
+ weight,
+ confidence,
+ evidence,
+ line,
+ analyzer,
+ }
+ }
+}
+
+#[pyfunction]
+pub fn build_relation_graph(
+ root: &str,
+ files: Vec,
+ python_source_roots: Vec,
+ python_module_init_files: Vec,
+) -> PyResult> {
+ let import_edges =
+ build_import_graph(root, files, python_source_roots, python_module_init_files)?;
+
+ let mut relation_edges = Vec::with_capacity(import_edges.len());
+ for edge in import_edges {
+ relation_edges.push(NativeRelationEdge {
+ source: edge.from,
+ target: edge.to,
+ kind: "import".to_string(), // we map everything to "import" for now to match python
+ weight: 1.0,
+ confidence: 0.98,
+ evidence: None,
+ line: None,
+ analyzer: "imports:native".to_string(),
+ });
+ }
+
+ Ok(relation_edges)
+}
diff --git a/rust/scriber_native/src/lib.rs b/rust/scriber_native/src/lib.rs
index 4b854dd..90d4285 100644
--- a/rust/scriber_native/src/lib.rs
+++ b/rust/scriber_native/src/lib.rs
@@ -76,6 +76,7 @@ fn build_info() -> PyResult {
fn _native(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::()?;
m.add_class::()?;
+ m.add_class::()?;
m.add_class::()?;
m.add_class::()?;
m.add_function(wrap_pyfunction!(read_text, m)?)?;
@@ -84,6 +85,7 @@ fn _native(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_function(wrap_pyfunction!(read_many_text, m)?)?;
m.add_function(wrap_pyfunction!(scan_project, m)?)?;
m.add_function(wrap_pyfunction!(import::build_import_graph, m)?)?;
+ m.add_function(wrap_pyfunction!(import::build_relation_graph, m)?)?;
m.add_function(wrap_pyfunction!(score::score_candidates_native, m)?)?;
m.add_function(wrap_pyfunction!(render::render_tree, m)?)?;
m.add_function(wrap_pyfunction!(native_api_version, m)?)?;
diff --git a/rust/scriber_native/src/score.rs b/rust/scriber_native/src/score.rs
index a4efbff..f7d5ef4 100644
--- a/rust/scriber_native/src/score.rs
+++ b/rust/scriber_native/src/score.rs
@@ -1,4 +1,4 @@
-use crate::import::NativeImportEdge;
+use crate::import::NativeRelationEdge;
use crate::scan::NativeFileInfo;
use pyo3::prelude::*;
use std::collections::{HashMap, HashSet};
@@ -351,39 +351,117 @@ fn is_near_seed(support_file: &str, seed: &str) -> bool {
|| seed_parent.starts_with(sf_parent)
}
-fn walk_neighbors(
- edges: &HashMap>,
+use std::cmp::Ordering;
+use std::collections::BinaryHeap;
+
+#[derive(Debug, Clone)]
+struct QueueState {
+ strength: f64,
+ depth: usize,
+ node: String,
+}
+
+impl Eq for QueueState {}
+
+impl PartialEq for QueueState {
+ fn eq(&self, other: &Self) -> bool {
+ self.strength == other.strength && self.depth == other.depth && self.node == other.node
+ }
+}
+
+impl Ord for QueueState {
+ fn cmp(&self, other: &Self) -> Ordering {
+ self.strength
+ .partial_cmp(&other.strength)
+ .unwrap_or(Ordering::Equal)
+ .then_with(|| other.depth.cmp(&self.depth))
+ }
+}
+
+impl PartialOrd for QueueState {
+ fn partial_cmp(&self, other: &Self) -> Option {
+ Some(self.cmp(other))
+ }
+}
+
+fn walk_weighted_neighbors(
+ edges: &[NativeRelationEdge],
start: &str,
depth: usize,
-) -> HashMap {
- let mut found = HashMap::new();
- let mut frontier = HashSet::new();
- frontier.insert(start.to_string());
- let mut visited = HashSet::new();
- visited.insert(start.to_string());
-
- for distance in 1..=depth {
- let mut next_frontier = HashSet::new();
- for item in frontier {
- if let Some(neighbors) = edges.get(&item) {
- for neighbor in neighbors {
- if visited.contains(neighbor) {
- continue;
+ reverse: bool,
+) -> HashMap {
+ let mut adj: HashMap> = HashMap::new();
+ for edge in edges {
+ let u = if reverse { &edge.target } else { &edge.source };
+ let v = if reverse { &edge.source } else { &edge.target };
+ adj.entry(u.clone()).or_default().push((v.clone(), edge));
+ }
+
+ let mut max_strength: HashMap = HashMap::new();
+ max_strength.insert(start.to_string(), 1.0);
+
+ let mut best_at_state: HashMap<(String, usize), f64> = HashMap::new();
+ best_at_state.insert((start.to_string(), 0), 1.0);
+
+ let mut heap = BinaryHeap::new();
+ heap.push(QueueState {
+ strength: 1.0,
+ depth: 0,
+ node: start.to_string(),
+ });
+
+ while let Some(QueueState {
+ strength: u_str,
+ depth: u_depth,
+ node: u,
+ }) = heap.pop()
+ {
+ if u_str < *best_at_state.get(&(u.clone(), u_depth)).unwrap_or(&0.0) {
+ continue;
+ }
+
+ if u_depth >= depth {
+ continue;
+ }
+
+ if let Some(neighbors) = adj.get(&u) {
+ for (neighbor, edge) in neighbors {
+ let edge_str = if edge.kind == "import" || edge.kind == "reexport" {
+ if u_depth == 0 {
+ 1.0
+ } else {
+ 0.88
}
- visited.insert(neighbor.clone());
- found.insert(neighbor.clone(), distance);
- next_frontier.insert(neighbor.clone());
+ } else {
+ edge.weight * edge.confidence
+ };
+
+ let next_str = u_str * edge_str;
+ let next_depth = u_depth + 1;
+
+ if next_str > *max_strength.get(neighbor).unwrap_or(&0.0) {
+ max_strength.insert(neighbor.clone(), next_str);
+ }
+
+ let state_key = (neighbor.clone(), next_depth);
+ if next_str > *best_at_state.get(&state_key).unwrap_or(&0.0) {
+ best_at_state.insert(state_key, next_str);
+ heap.push(QueueState {
+ strength: next_str,
+ depth: next_depth,
+ node: neighbor.clone(),
+ });
}
}
}
- frontier = next_frontier;
- if frontier.is_empty() {
- break;
- }
}
- found
+
+ max_strength.remove(start);
+ max_strength
}
+
+
fn support_base_score(file: &NativeFileInfo, options: &NativePackOptions) -> i32 {
let cat = file.support_category.as_deref().unwrap_or("support file");
match cat {
@@ -429,7 +507,7 @@ fn matches_entrypoint(rel: &str, entrypoint_patterns: &[String]) -> bool {
pub fn score_candidates_native(
files: Vec,
seeds_list: Vec,
- edges: Vec,
+ edges: Vec,
options: NativePackOptions,
) -> PyResult> {
let mut mapped_files = HashMap::new();
@@ -450,15 +528,17 @@ pub fn score_candidates_native(
// Build graph edges maps
let mut graph_imports: HashMap> = HashMap::new();
let mut graph_imported_by: HashMap> = HashMap::new();
- for edge in edges {
- graph_imports
- .entry(edge.from.clone())
- .or_default()
- .insert(edge.to.clone());
- graph_imported_by
- .entry(edge.to.clone())
- .or_default()
- .insert(edge.from.clone());
+ for edge in &edges {
+ if edge.kind == "import" || edge.kind == "reexport" {
+ graph_imports
+ .entry(edge.source.clone())
+ .or_default()
+ .insert(edge.target.clone());
+ graph_imported_by
+ .entry(edge.target.clone())
+ .or_default()
+ .insert(edge.source.clone());
+ }
}
if options.mode == "project_snapshot" {
@@ -531,10 +611,12 @@ pub fn score_candidates_native(
for seed_rel in &seed_files {
// Direct dependencies
if options.include_direct_dependencies {
- for (dep, distance) in walk_neighbors(&graph_imports, seed_rel, options.depth) {
+ for (dep, strength) in
+ walk_weighted_neighbors(&edges, seed_rel, options.depth, false)
+ {
let score = std::cmp::max(
options.tree_min_score,
- options.direct_dependency_score - ((distance as i32 - 1) * 10),
+ (options.direct_dependency_score as f64 * strength) as i32,
);
if let Some(c) = mapped_files.get_mut(&dep) {
c.score = std::cmp::max(c.score, score);
@@ -551,12 +633,12 @@ pub fn score_candidates_native(
// Reverse dependencies
if options.include_reverse_dependencies {
- for (dep, distance) in
- walk_neighbors(&graph_imported_by, seed_rel, options.depth)
+ for (dep, strength) in
+ walk_weighted_neighbors(&edges, seed_rel, options.depth, true)
{
let score = std::cmp::max(
options.tree_min_score,
- options.reverse_dependency_score - ((distance as i32 - 1) * 10),
+ (options.reverse_dependency_score as f64 * strength) as i32,
);
if let Some(c) = mapped_files.get_mut(&dep) {
c.score = std::cmp::max(c.score, score);
diff --git a/src/scriber/__init__.py b/src/scriber/__init__.py
index 1aef752..b32d113 100644
--- a/src/scriber/__init__.py
+++ b/src/scriber/__init__.py
@@ -1,8 +1,9 @@
-"""ProjectScriber 2.0."""
+"""ProjectScriber 2.1."""
from .packer.pack import build_pack, build_and_write_pack
from .core.models import ScriberPack
__all__ = ["build_pack", "build_and_write_pack", "ScriberPack"]
-__version__ = "2.0.0"
+__version__ = "2.1.0"
+
diff --git a/src/scriber/budget/allocator.py b/src/scriber/budget/allocator.py
new file mode 100644
index 0000000..80b945b
--- /dev/null
+++ b/src/scriber/budget/allocator.py
@@ -0,0 +1,66 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any
+from scriber.core.models import Candidate, ContentMode, PackItem, FileRole
+
+@dataclass(slots=True)
+class BudgetPolicy:
+ target_tokens: int
+ hard_limit_tokens: int
+ mode: str = "full"
+ header_budget_ratio: float = 0.12
+ graph_budget_ratio: float = 0.08
+ full_code_budget_ratio: float = 0.55
+ outline_budget_ratio: float = 0.20
+ reserve_ratio: float = 0.05
+
+def allocate_budget(candidates: list[Candidate], policy: BudgetPolicy, explicit_seeds: set) -> list[PackItem]:
+ items = []
+
+ current_tokens = 0
+ full_budget = int(policy.target_tokens * policy.full_code_budget_ratio)
+
+ for i, c in enumerate(candidates):
+ item_id = f"F{i+1:03d}"
+ role = getattr(c, "role", "unknown")
+
+ mode: ContentMode = "tree"
+
+ is_seed = c.file.relative in explicit_seeds
+
+ if is_seed:
+ mode = "full"
+ elif c.file.content_policy == "tree_only":
+ mode = "tree"
+ elif c.file.content_policy == "full" and policy.mode != "focused":
+ mode = "full"
+ elif c.token_estimate <= 1200 and c.score >= 80 and current_tokens < full_budget:
+ mode = "full"
+ elif c.score >= 85 and c.token_estimate <= 2400 and current_tokens < full_budget:
+ mode = "full"
+ elif c.score >= 75:
+ mode = "excerpt"
+ elif c.score >= 45:
+ mode = "outline"
+ else:
+ mode = "tree"
+
+ if mode == "full":
+ current_tokens += c.token_estimate
+
+ item = PackItem(
+ file=c.file,
+ score=c.score,
+ role=role,
+ content_mode=mode,
+ reason=c.reason_summary,
+ reasons=c.reasons,
+ relation_evidence=[],
+ token_estimate=c.token_estimate,
+ utility=c.utility,
+ raw_score=c.raw_score,
+ item_id=item_id
+ )
+ items.append(item)
+
+ return items
diff --git a/src/scriber/cache.py b/src/scriber/cache.py
index f96f0fc..d899aa8 100644
--- a/src/scriber/cache.py
+++ b/src/scriber/cache.py
@@ -37,12 +37,17 @@ def __init__(self, config: ScriberConfig, project_root: Path):
self.enabled = config.cache.enabled
self.cache_dir = project_root / config.cache.dir
self.files_cache_path = self.cache_dir / "files.json"
- self.graph_cache_path = self.cache_dir / "import_graph.json"
+ self.imports_cache_path = self.cache_dir / "imports_v2.json"
+ self.relations_cache_path = self.cache_dir / "relations_v1.jsonl"
self.config_hash = get_config_hash(config)
self.python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
+ self.reads = 0
+ self.hits = 0
+ self.writes = 0
+
self.files_data: dict[str, dict[str, Any]] = {}
- self.graph_data: dict[str, list[str]] = {}
+ self.imports_data: dict[str, dict[str, Any]] = {}
self._load()
def _load(self) -> None:
@@ -53,13 +58,14 @@ def _load(self) -> None:
if self.files_cache_path.exists():
with self.files_cache_path.open("r", encoding="utf-8") as f:
self.files_data = json.load(f)
- if self.graph_cache_path.exists():
- with self.graph_cache_path.open("r", encoding="utf-8") as f:
- self.graph_data = json.load(f)
+ if self.imports_cache_path.exists():
+ with self.imports_cache_path.open("r", encoding="utf-8") as f:
+ self.imports_data = json.load(f)
+ # relations_v1.jsonl will be append-only or rewritten on save, we don't load it entirely into memory for now
except Exception:
# Silently fallback to empty cache on read errors
self.files_data = {}
- self.graph_data = {}
+ self.imports_data = {}
def get_file(self, rel_path: Path, mtime_ns: int, size: int) -> dict[str, Any] | None:
if not self.enabled:
@@ -92,17 +98,57 @@ def set_file(self, rel_path: Path, mtime_ns: int, size: int, data: dict[str, Any
def get_imports(self, rel_path: Path) -> set[Path] | None:
if not self.enabled:
return None
+ self.reads += 1
key = rel_path.as_posix()
- imports = self.graph_data.get(key)
+ imports = self.imports_data.get(key)
if imports is not None:
- return {Path(p) for p in imports}
+ self.hits += 1
+ return {Path(p) for p in imports.get("targets", [])}
return None
def set_imports(self, rel_path: Path, imports: set[Path]) -> None:
if not self.enabled:
return
+ self.writes += 1
key = rel_path.as_posix()
- self.graph_data[key] = [p.as_posix() for p in sorted(imports)]
+ try:
+ stat = (self.cache_dir.parent.parent / rel_path).stat()
+ mtime_ns = stat.st_mtime_ns
+ size = stat.st_size
+ except OSError:
+ mtime_ns = 0
+ size = 0
+ self.imports_data[key] = {
+ "mtime_ns": mtime_ns,
+ "size": size,
+ "config_hash": self.config_hash,
+ "targets": [p.as_posix() for p in sorted(imports)]
+ }
+
+ def add_import_edge(self, source: Path, target: Path) -> None:
+ if not self.enabled:
+ return
+ self.writes += 1
+ key = source.as_posix()
+ target_str = target.as_posix()
+ if key not in self.imports_data:
+ try:
+ stat = (self.cache_dir.parent.parent / source).stat()
+ mtime_ns = stat.st_mtime_ns
+ size = stat.st_size
+ except OSError:
+ mtime_ns = 0
+ size = 0
+ self.imports_data[key] = {
+ "mtime_ns": mtime_ns,
+ "size": size,
+ "config_hash": self.config_hash,
+ "targets": [target_str]
+ }
+ else:
+ if target_str not in self.imports_data[key].get("targets", []):
+ self.imports_data[key].setdefault("targets", []).append(target_str)
+ self.imports_data[key]["targets"].sort()
def save(self, active_files: set[Path] | None = None) -> None:
if not self.enabled:
@@ -116,7 +162,7 @@ def save(self, active_files: set[Path] | None = None) -> None:
if active_files is not None:
active_keys = {p.as_posix() for p in active_files}
self.files_data = {k: v for k, v in self.files_data.items() if k in active_keys}
- self.graph_data = {k: v for k, v in self.graph_data.items() if k in active_keys}
+ self.imports_data = {k: v for k, v in self.imports_data.items() if k in active_keys}
# 2. Enforce absolute limit of max 1000 entries to prevent infinite growth
if len(self.files_data) > 1000:
@@ -125,11 +171,11 @@ def save(self, active_files: set[Path] | None = None) -> None:
to_remove = sorted_keys[:len(sorted_keys) - 1000]
for k in to_remove:
self.files_data.pop(k, None)
- self.graph_data.pop(k, None)
+ self.imports_data.pop(k, None)
with self.files_cache_path.open("w", encoding="utf-8") as f:
json.dump(self.files_data, f, indent=2)
- with self.graph_cache_path.open("w", encoding="utf-8") as f:
- json.dump(self.graph_data, f, indent=2)
+ with self.imports_cache_path.open("w", encoding="utf-8") as f:
+ json.dump(self.imports_data, f, indent=2)
except Exception:
pass # Fail silently on write errors to not interrupt execution
diff --git a/src/scriber/cli/main.py b/src/scriber/cli/main.py
index c943226..7071f7f 100644
--- a/src/scriber/cli/main.py
+++ b/src/scriber/cli/main.py
@@ -11,6 +11,109 @@
from scriber.core.root import resolve_config_path
from scriber.packer.pack import build_and_write_pack
+def handle_introspection(args, pack) -> None:
+ import json
+
+ # 1. Export Graph JSON if requested
+ if args.graph_json:
+ edges_data = []
+ for edge in pack.graph.edges:
+ edges_data.append({
+ "source": str(edge.source),
+ "target": str(edge.target),
+ "kind": edge.kind,
+ "weight": edge.weight,
+ "confidence": edge.confidence,
+ "evidence": edge.evidence,
+ "line": edge.line,
+ "analyzer": edge.analyzer
+ })
+
+ graph_data = {"edges": edges_data}
+ json_path = Path(args.graph_json)
+ try:
+ with open(json_path, "w", encoding="utf-8") as f:
+ json.dump(graph_data, f, indent=2)
+ print(f"Exported relation graph to {json_path}", file=sys.stderr)
+ except Exception as e:
+ print(f"Error exporting relation graph to JSON: {e}", file=sys.stderr)
+
+ # 2. Explain Graph
+ if args.explain_graph:
+ edges = pack.graph.edges
+ total_edges = len(edges)
+
+ # Group by kind
+ kind_counts = {}
+ for edge in edges:
+ kind_counts[edge.kind] = kind_counts.get(edge.kind, 0) + 1
+
+ # Get unique nodes
+ nodes = set()
+ for edge in edges:
+ nodes.add(edge.source)
+ nodes.add(edge.target)
+ unique_nodes = len(nodes)
+ avg_degree = (total_edges * 2.0 / unique_nodes) if unique_nodes > 0 else 0.0
+
+ print("\n========================================", file=sys.stderr)
+ print("SCRIBER RELATION GRAPH EXPLANATION", file=sys.stderr)
+ print("========================================", file=sys.stderr)
+ print(f"Total Edges: {total_edges}", file=sys.stderr)
+ print("Edges by Kind:", file=sys.stderr)
+ for kind, count in sorted(kind_counts.items(), key=lambda x: x[1], reverse=True):
+ print(f" - {kind.ljust(20)}: {count}", file=sys.stderr)
+ print(f"Unique Nodes: {unique_nodes}", file=sys.stderr)
+ print(f"Average Degree: {avg_degree:.2f}", file=sys.stderr)
+ print("========================================\n", file=sys.stderr)
+
+ # 3. Why
+ if args.why:
+ why_target = args.why.replace("\\", "/").lower()
+ target_c = None
+
+ candidates_or_items = getattr(pack, "candidates", getattr(pack, "items", []))
+ for c in candidates_or_items:
+ rel_str = c.file.relative.as_posix().lower()
+ abs_str = c.file.absolute.as_posix().lower()
+ if why_target in rel_str or why_target in abs_str:
+ target_c = c
+ break
+
+ if not target_c:
+ print(f"\nCould not find file matching '{args.why}' in the analyzed candidates.", file=sys.stderr)
+ return
+
+ print("\n========================================", file=sys.stderr)
+ print(f"WHY WAS '{target_c.file.relative}' INCLUDED?", file=sys.stderr)
+ print("========================================", file=sys.stderr)
+ print(f"Score: {target_c.score}", file=sys.stderr)
+ if hasattr(target_c, "role"):
+ print(f"Role: {target_c.role}", file=sys.stderr)
+
+ reasons = getattr(target_c, "reasons", [])
+ if reasons:
+ print("Selection Reasons:", file=sys.stderr)
+ for r in reasons:
+ print(f" - {r}", file=sys.stderr)
+ else:
+ reason_summary = getattr(target_c, "reason_summary", getattr(target_c, "reason", "None"))
+ print(f"Selection Reasons: {reason_summary}", file=sys.stderr)
+
+ incoming = []
+ for edge in pack.graph.edges:
+ if edge.target == target_c.file.relative:
+ incoming.append(edge)
+
+ if incoming:
+ print("\nIncoming Relation Edges:", file=sys.stderr)
+ for edge in sorted(incoming, key=lambda e: (e.kind, str(e.source))):
+ ev = f" ({edge.evidence})" if edge.evidence else ""
+ print(f" - {edge.source} -> [this file] (kind: {edge.kind}, weight: {edge.weight}, confidence: {edge.confidence}){ev}", file=sys.stderr)
+ else:
+ print("\nNo incoming relation edges found in graph.", file=sys.stderr)
+ print("========================================\n", file=sys.stderr)
+
@@ -27,6 +130,7 @@ def build_parser() -> argparse.ArgumentParser:
description="Scriber 2.0: build an intelligent code pack from one or more project paths.",
)
parser.add_argument("paths", nargs="*", help="Project file/folder paths used as seeds. Defaults to current directory.")
+ parser.add_argument("--profile", choices=["gpt", "focused-gpt", "full"], default="gpt", help="Preset configuration profile (gpt, focused-gpt, full).")
parser.add_argument("--config", help="Path to pyproject.toml. Its parent directory becomes the project root.")
parser.add_argument("--path-base", choices=["project", "cwd"], default="project", help="Base directory for relative paths when --config is used.")
parser.add_argument("--format", choices=["md", "txt"], dest="output_format", help="Output format.")
@@ -46,6 +150,9 @@ def build_parser() -> argparse.ArgumentParser:
parser.add_argument("--force", action="store_true", help="Allow --init to append even if [tool.scriber] already exists.")
parser.add_argument("--project", action="store_true", help="Force project snapshot mode.")
parser.add_argument("--explain-selection", action="store_true", help="Explain reason for file selection in detail.")
+ parser.add_argument("--explain-graph", action="store_true", help="Print relation graph statistics and relations.")
+ parser.add_argument("--why", help="Print exactly which rules/edges pulled the specified file into the pack.")
+ parser.add_argument("--graph-json", help="Export the RelationGraph as a JSON file to the specified path.")
parser.add_argument("--validate-config", action="store_true", help="Validate pyproject.toml scriber config.")
parser.add_argument("--dry-run", action="store_true", help="Perform a dry run without saving the pack file.")
parser.add_argument("--open", action="store_true", help="Open the output file automatically after creation.")
@@ -113,6 +220,7 @@ def main(argv: Sequence[str] | None = None) -> int:
pack = build_pack(
args.paths or ["."],
config_path=args.config,
+ profile=args.profile,
output=args.output,
output_format=args.output_format,
only_tree=True if args.only_tree else None,
@@ -129,9 +237,16 @@ def main(argv: Sequence[str] | None = None) -> int:
sys.stderr.write("\r".ljust(80) + "\r")
sys.stderr.flush()
- code_count = len([c for c in pack.candidates if c.file.kind == "code" and c.include_content])
- support_count = len([c for c in pack.candidates if c.file.kind == "support" and c.include_content])
- total_count = len(pack.candidates)
+ is_llm_pack = hasattr(pack, "items")
+ items = getattr(pack, "items", getattr(pack, "candidates", []))
+ if is_llm_pack:
+ code_count = len([c for c in items if c.file.kind == "code" and c.content_mode != "tree"])
+ support_count = len([c for c in items if c.file.kind == "support" and c.content_mode != "tree"])
+ total_count = len([c for c in items if c.content_mode != "tree"])
+ else:
+ code_count = len([c for c in items if c.file.kind == "code" and c.include_content])
+ support_count = len([c for c in items if c.file.kind == "support" and c.include_content])
+ total_count = len([c for c in items if c.include_content])
print("Scriber dry-run completed.", file=sys.stderr)
print("----------------------------------------", file=sys.stderr)
@@ -139,13 +254,24 @@ def main(argv: Sequence[str] | None = None) -> int:
print(f" Code files selected: {code_count}", file=sys.stderr)
print(f" Support files selected: {support_count}", file=sys.stderr)
print(f" Total files in pack: {total_count}", file=sys.stderr)
- print(f" Estimated tokens: {pack.total_tokens}", file=sys.stderr)
- if args.timings and pack.timings:
- print("----------------------------------------", file=sys.stderr)
- print("Timings:", file=sys.stderr)
- for phase, duration in pack.timings.items():
- print(f" {phase.replace('_', ' ').ljust(15)}: {duration:.4f}s", file=sys.stderr)
- print(f" total: {sum(pack.timings.values()):.4f}s", file=sys.stderr)
+ total_tokens = getattr(pack, "budget_actual", getattr(pack, "total_tokens", 0))
+ print(f" Estimated tokens: {total_tokens}", file=sys.stderr)
+ if args.timings:
+ if pack.stats:
+ print("----------------------------------------", file=sys.stderr)
+ print("Stats:", file=sys.stderr)
+ if "graph_edges_built" in pack.stats:
+ print(f" Graph edges built: {pack.stats['graph_edges_built']}", file=sys.stderr)
+ print(f" Graph cache reads: {pack.stats['graph_cache_reads']}", file=sys.stderr)
+ print(f" Graph cache hits: {pack.stats['graph_cache_hits']}", file=sys.stderr)
+ print(f" Graph cache writes: {pack.stats['graph_cache_writes']}", file=sys.stderr)
+ print(f" Graph source: {pack.stats['graph_source']}", file=sys.stderr)
+ if pack.timings:
+ print("----------------------------------------", file=sys.stderr)
+ print("Timings:", file=sys.stderr)
+ for phase, duration in pack.timings.items():
+ print(f" {phase.replace('_', ' ').ljust(15)}: {duration:.4f}s", file=sys.stderr)
+ print(f" total: {sum(pack.timings.values()):.4f}s", file=sys.stderr)
config = load_config(pack.config_path)
config = apply_overrides(config, output=args.output)
@@ -154,11 +280,14 @@ def main(argv: Sequence[str] | None = None) -> int:
output_path = pack.project_root / output_path
print(f" Proposed output path: {output_path}", file=sys.stderr)
print("----------------------------------------", file=sys.stderr)
+ if args.explain_graph or args.why or args.graph_json:
+ handle_introspection(args, pack)
return 0
output, pack = build_and_write_pack(
args.paths or ["."],
config_path=args.config,
+ profile=args.profile,
output=args.output,
output_format=args.output_format,
only_tree=True if args.only_tree else None,
@@ -177,31 +306,58 @@ def main(argv: Sequence[str] | None = None) -> int:
sys.stderr.write("\r".ljust(80) + "\r")
sys.stderr.flush()
+ is_llm_pack = hasattr(pack, "items")
+ items = getattr(pack, "items", getattr(pack, "candidates", []))
+
code_count = 0
support_count = 0
omitted_count = 0
- for cand in pack.candidates:
- if cand.include_content:
- if cand.file.kind == "code":
- code_count += 1
- elif cand.file.kind == "support":
- support_count += 1
+
+ for cand in items:
+ if is_llm_pack:
+ if cand.content_mode != "tree":
+ if cand.file.kind == "code":
+ code_count += 1
+ elif cand.file.kind == "support":
+ support_count += 1
+ else:
+ omitted_count += 1
else:
- omitted_count += 1
+ if cand.include_content:
+ if cand.file.kind == "code":
+ code_count += 1
+ elif cand.file.kind == "support":
+ support_count += 1
+ else:
+ omitted_count += 1
sys.stderr.write("Scriber build completed.\n")
sys.stderr.write("----------------------------------------\n")
sys.stderr.write(f" Code files included: {code_count}\n")
sys.stderr.write(f" Support files included: {support_count}\n")
sys.stderr.write(f" Files omitted/skipped: {omitted_count}\n")
- sys.stderr.write(f" Estimated tokens: {pack.total_tokens}\n")
+ total_tokens = getattr(pack, "budget_actual", getattr(pack, "total_tokens", 0))
+ sys.stderr.write(f" Estimated tokens: {total_tokens}\n")
sys.stderr.write("----------------------------------------\n")
- if args.timings and pack.timings:
- sys.stderr.write("Timings:\n")
- for phase, duration in pack.timings.items():
- sys.stderr.write(f" - {phase.replace('_', ' ').ljust(15)}: {duration:.4f}s\n")
- sys.stderr.write(f" - total: {sum(pack.timings.values()):.4f}s\n")
- sys.stderr.write("----------------------------------------\n")
+ if args.timings:
+ if pack.stats:
+ sys.stderr.write("Stats:\n")
+ if "graph_edges_built" in pack.stats:
+ sys.stderr.write(f" - Graph edges built: {pack.stats['graph_edges_built']}\n")
+ sys.stderr.write(f" - Graph cache reads: {pack.stats['graph_cache_reads']}\n")
+ sys.stderr.write(f" - Graph cache hits: {pack.stats['graph_cache_hits']}\n")
+ sys.stderr.write(f" - Graph cache writes: {pack.stats['graph_cache_writes']}\n")
+ sys.stderr.write(f" - Graph source: {pack.stats['graph_source']}\n")
+ sys.stderr.write("----------------------------------------\n")
+ if pack.timings:
+ sys.stderr.write("Timings:\n")
+ for phase, duration in pack.timings.items():
+ sys.stderr.write(f" - {phase.replace('_', ' ').ljust(15)}: {duration:.4f}s\n")
+ sys.stderr.write(f" - total: {sum(pack.timings.values()):.4f}s\n")
+ sys.stderr.write("----------------------------------------\n")
+
+ if args.explain_graph or args.why or args.graph_json:
+ handle_introspection(args, pack)
if output is not None:
print(f"Scriber pack written to: {output}")
diff --git a/src/scriber/core/config.py b/src/scriber/core/config.py
index fed0545..73895fc 100644
--- a/src/scriber/core/config.py
+++ b/src/scriber/core/config.py
@@ -147,8 +147,8 @@
output = ".scriber/scriber_pack.md"
only_tree = false
use_gitignore = true
-max_files = 60
-max_tokens = 100000
+max_files = 0
+max_tokens = 0
min_score = 45
path_style = "project-relative"
allow_external_paths = false
diff --git a/src/scriber/core/models.py b/src/scriber/core/models.py
index c521c49..c093382 100644
--- a/src/scriber/core/models.py
+++ b/src/scriber/core/models.py
@@ -2,7 +2,7 @@
from dataclasses import dataclass, field
from pathlib import Path
-from typing import Literal
+from typing import Any, Literal
FileKind = Literal["code", "support", "other"]
ContentPolicy = Literal["full", "auto", "tree_only"]
@@ -113,8 +113,16 @@ class FileNode:
def read_text(self) -> str:
+ # Return the file's text, reading it at most once per instance.
if self._cached_text is not None:
return self._cached_text
- from scriber.native import require_native
- text = require_native().read_text(str(self.absolute))
+
+ # Prefer the native reader when it reports itself available; if it is not
+ # available, or anything in the native path raises, read via pathlib as
+ # UTF-8 with undecodable bytes replaced.
+ try:
+ from scriber.native import is_native_available, require_native
+ if is_native_available():
+ text = require_native().read_text(str(self.absolute))
+ else:
+ text = self.absolute.read_text(encoding="utf-8", errors="replace")
+ except Exception:
+ text = self.absolute.read_text(encoding="utf-8", errors="replace")
+
+ # Remember the text so later calls hit the early return above.
object.__setattr__(self, "_cached_text", text)
return text
@@ -141,12 +149,12 @@ class Candidate:
reason_counts: dict[str, int] = field(default_factory=dict)
reason_examples: dict[str, list[Path]] = field(default_factory=dict)
reason_summary: str = ""
+ utility: float = 0.0
+ raw_score: float = 0.0
+ role: str = "unknown"
-@dataclass(slots=True)
-class ModuleGraph:
- imports: dict[Path, set[Path]] = field(default_factory=dict)
- imported_by: dict[Path, set[Path]] = field(default_factory=dict)
+from scriber.graph.model import RelationKind, RelationEdge, RelationGraph, ModuleGraph
@dataclass(slots=True)
@@ -160,8 +168,84 @@ class ScriberPack:
output_format: OutputFormat
mode: PackMode
total_tokens: int = 0
+ stats: dict[str, Any] = field(default_factory=dict)
timings: dict[str, float] = field(default_factory=dict)
@property
def included_paths(self) -> list[Path]:
return [candidate.file.relative for candidate in self.candidates]
+
+
+# Allowed values for PackItem.content_mode.
+ContentMode = Literal["full", "excerpt", "outline", "tree", "omit"]
+
+# Role labels that can be attached to a file (see FileRef.role and PackItem.role).
+FileRole = Literal[
+ "entrypoint",
+ "orchestrator",
+ "model",
+ "config",
+ "graph",
+ "ranker",
+ "renderer",
+ "scanner",
+ "language_adapter",
+ "native_adapter",
+ "test",
+ "support",
+ "docs",
+ "generated",
+ "unknown",
+]
+
+@dataclass(frozen=True, slots=True)
+class FileRef:
+ """Immutable description of a file: path, kind, language, byte size, token estimate and role."""
+ path: Path
+ kind: FileKind
+ language: str
+ size_bytes: int
+ token_estimate: int
+ # Defaults to "unknown" when no role has been assigned.
+ role: FileRole = "unknown"
+
+@dataclass(frozen=True, slots=True)
+class FileOutline:
+ """Immutable structural summary of one file (imports, exports, classes, functions, constants, notes)."""
+ path: Path
+ language: str
+ # Optional free-text purpose; the outliners visible in this patch always pass None.
+ purpose: str | None
+ imports: list[str]
+ exports: list[str]
+ classes: list[str]
+ functions: list[str]
+ constants: list[str]
+ notes: list[str]
+ token_estimate: int
+
+@dataclass(slots=True)
+class PackItem:
+ """One entry of an LlmPack: a file with its score, role, content mode, reasons and relation evidence."""
+ file: FileNode
+ score: int
+ role: FileRole
+ content_mode: ContentMode
+ reason: str
+ reasons: list[str]
+ relation_evidence: list[RelationEdge]
+ # Optional payloads; all default to empty / None.
+ outline: FileOutline | None = None
+ content: str | None = None
+ excerpts: list[str] = field(default_factory=list)
+ token_estimate: int = 0
+ item_id: str = ""
+ utility: float = 0.0
+ raw_score: float = 0.0
+
+@dataclass(slots=True)
+class LlmPack:
+ """A pack result: project/config paths, profile, mode, goal, token budget, items, graph, stats, warnings, timings."""
+ project_root: Path
+ config_path: Path
+ profile: str
+ mode: PackMode
+ goal: str | None
+ # Token budget requested vs. budget actually used (the CLI summary reads budget_actual).
+ budget_target: int
+ budget_actual: int
+ items: list[PackItem]
+ graph: RelationGraph
+ stats: dict[str, Any]
+ warnings: list[str]
+ timings: dict[str, float] = field(default_factory=dict)
diff --git a/src/scriber/core/symbols.py b/src/scriber/core/symbols.py
new file mode 100644
index 0000000..fa127b0
--- /dev/null
+++ b/src/scriber/core/symbols.py
@@ -0,0 +1,21 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+
+@dataclass(slots=True)
+class SymbolNode:
+ """A named class or function found in a source file, with its line span and enclosing symbol."""
+ name: str
+ kind: str # "class" or "function"
+ line_start: int
+ line_end: int
+ # Name of the enclosing class/function, or None for a top-level symbol.
+ parent_name: str | None = None
+
+
+@dataclass(slots=True)
+class SymbolIndex:
+    """Registry mapping each file path to the symbols recorded for it, in insertion order."""
+
+    symbols_by_file: dict[Path, list[SymbolNode]] = field(default_factory=dict)
+
+    def add_symbol(self, file_path: Path, symbol: SymbolNode) -> None:
+        """Append ``symbol`` to the list kept for ``file_path``, creating that list on first use."""
+        bucket = self.symbols_by_file.get(file_path)
+        if bucket is None:
+            bucket = []
+            self.symbols_by_file[file_path] = bucket
+        bucket.append(symbol)
+
+    def get_symbols(self, file_path: Path) -> list[SymbolNode]:
+        """Return the symbols recorded for ``file_path``; an unknown path yields a new empty list."""
+        if file_path in self.symbols_by_file:
+            return self.symbols_by_file[file_path]
+        return []
diff --git a/src/scriber/engine/ranker.py b/src/scriber/engine/ranker.py
new file mode 100644
index 0000000..cdc8474
--- /dev/null
+++ b/src/scriber/engine/ranker.py
@@ -0,0 +1,134 @@
+from __future__ import annotations
+from pathlib import Path
+import math
+from collections import deque, defaultdict
+from scriber.core.models import FileNode, RelationGraph, ScriberConfig, Candidate
+from scriber.engine.roles import classify_file_role, ROLE_SCORE
+
+# Base weight per relation kind. rank_context multiplies this by the edge's own
+# weight and confidence; kinds missing from the table fall back to 10 there.
+RELATION_WEIGHT = {
+ "import": 90,
+ "reexport": 80,
+ "test_of": 78,
+ "entrypoint_to_module": 75,
+ "config_refs_code": 58,
+ "env_key": 52,
+ "doc_mentions_code": 42,
+ "git_cochange": 40,
+ "same_package": 28,
+ "same_dir": 20,
+ "name_similarity": 18,
+ "semantic_similarity": 15,
+}
+
+def _bfs_distances(adjacency: dict[Path, list[Path]], starts: list[Path]) -> dict[Path, int]:
+    """Breadth-first hop counts from ``starts`` (distance 0) to every node reachable via ``adjacency``."""
+    hops: dict[Path, int] = {start: 0 for start in starts}
+    pending = deque(hops)
+    while pending:
+        current = pending.popleft()
+        for neighbour in adjacency.get(current, ()):
+            if neighbour not in hops:
+                hops[neighbour] = hops[current] + 1
+                pending.append(neighbour)
+    return hops
+
+
+def rank_context(files: dict[Path, FileNode], graph: RelationGraph, seeds: list[Path], config: ScriberConfig, mode: str) -> list[Candidate]:
+    """Score every file and return candidates sorted by utility, then score, both descending.
+
+    Args:
+        files: Project-relative path -> file node for every file to rank.
+        graph: Relation graph; ``graph.edges`` drives seed distances and
+            ``graph.incoming`` drives the relation score.
+        seeds: Explicitly requested paths.
+        config: Accepted for interface compatibility; not consulted here.
+        mode: ``"focused"`` enables distance-based decay and score caps around
+            the seeds; any other value scores files without decay.
+
+    Returns:
+        One ``Candidate`` per file with ``include_content=False``; ``utility``,
+        ``raw_score`` and ``role`` are populated on each candidate.
+    """
+    unreachable = 999
+    explicit_seeds = set(seeds)
+    focused = mode == "focused"
+
+    distances: dict[Path, int] = {}
+    if focused:
+        adj_out: dict[Path, list[Path]] = defaultdict(list)
+        adj_in: dict[Path, list[Path]] = defaultdict(list)
+        for edge in graph.edges:
+            adj_out[edge.source].append(edge.target)
+            adj_in[edge.target].append(edge.source)
+
+        # Only seeds that are actually part of the scanned file set anchor the search.
+        starts = [seed for seed in explicit_seeds if seed in files]
+        dist_out = _bfs_distances(adj_out, starts)
+        dist_in = _bfs_distances(adj_in, starts)
+
+        # A file is as close as its nearest seed in either edge direction.
+        for rel in files:
+            distances[rel] = min(dist_out.get(rel, unreachable), dist_in.get(rel, unreachable))
+
+    candidates: list[Candidate] = []
+    for rel, node in files.items():
+        role = classify_file_role(node, graph)
+        role_score = ROLE_SCORE.get(role, 20)
+
+        incoming = graph.incoming.get(rel, [])
+        relation_score = 0.0
+        for edge in incoming:
+            relation_score += RELATION_WEIGHT.get(edge.kind, 10) * edge.weight * edge.confidence
+
+        centrality_bonus = 0
+        evidence_bonus = len(incoming) * 2
+        noise_penalty = 0
+
+        if node.language in {"json", "lock", "svg"}:
+            noise_penalty += 50
+
+        if focused:
+            dist = distances.get(rel, unreachable)
+            if dist == 0:
+                decay, seed_bonus, max_score = 1.0, 100, 100
+            elif dist == 1:
+                decay, seed_bonus, max_score = 1.0, 0, 79
+            elif dist == 2:
+                decay, seed_bonus, max_score = 0.5, 0, 74
+            else:
+                decay, seed_bonus, max_score = 0.1, 0, 44
+        else:
+            decay = 1.0
+            seed_bonus = 100 if rel in explicit_seeds else 0
+            max_score = 100
+
+        if focused and role == "test" and rel not in explicit_seeds:
+            noise_penalty += 80
+            max_score = min(max_score, 44)  # Force test files to tree mode unless specifically targeted
+
+        raw_score = (role_score + relation_score + seed_bonus + centrality_bonus + evidence_bonus - noise_penalty) * decay
+
+        token_estimate = node.size_bytes // 4
+        # The +200 keeps tiny files from dominating the utility ranking.
+        utility = raw_score / math.sqrt(token_estimate + 200)
+
+        candidates.append(Candidate(
+            file=node,
+            score=int(min(max_score, max(0, raw_score))),  # clamp to distance-based max_score
+            reasons=[f"Role {role}: {role_score}", f"Relations: {relation_score:.1f}"],
+            include_content=False,
+            token_estimate=token_estimate,
+            utility=utility,
+            raw_score=raw_score,
+            role=role,
+        ))
+
+    # Primary sort by utility, then score
+    candidates.sort(key=lambda c: (c.utility, c.score), reverse=True)
+    return candidates
diff --git a/src/scriber/engine/roles.py b/src/scriber/engine/roles.py
new file mode 100644
index 0000000..2f319c8
--- /dev/null
+++ b/src/scriber/engine/roles.py
@@ -0,0 +1,53 @@
+from __future__ import annotations
+from pathlib import Path
+from scriber.core.models import FileNode, FileRole, RelationGraph
+
+# Base score per file role. rank_context uses it as the starting score of a
+# file and falls back to 20 for a role that is not listed here.
+ROLE_SCORE: dict[FileRole, int] = {
+ "entrypoint": 95,
+ "orchestrator": 95,
+ "graph": 90,
+ "ranker": 90,
+ "renderer": 90,
+ "model": 88,
+ "config": 82,
+ "scanner": 75,
+ "native_adapter": 65,
+ "language_adapter": 65,
+ "test": 55,
+ "support": 45,
+ "docs": 35,
+ "generated": 5,
+ "unknown": 20,
+}
+
+def classify_file_role(file: FileNode, graph: RelationGraph) -> FileRole:
+    """Classify a file into a ``FileRole`` from its lower-cased project-relative path.
+
+    Rules are checked in order and the first match wins. Test detection runs
+    right after the exact entrypoint paths, so files such as
+    ``tests/test_config.py`` or ``tests/test_pack.py`` are labelled ``"test"``
+    instead of being captured by the ``config.py`` / ``pack.py`` / ``model.py``
+    substring rules below it.
+
+    Args:
+        file: Node whose ``relative`` path, ``kind`` and ``language`` are inspected.
+        graph: Accepted for interface compatibility; not consulted here.
+
+    Returns:
+        The first matching role, or ``"unknown"`` when no rule applies.
+    """
+    rel = file.relative.as_posix().lower()
+
+    if rel in {"cli/main.py", "src/scriber/cli/main.py", "src/main.py", "main.py"}:
+        return "entrypoint"
+    # Must precede the filename-substring rules: test files are usually named
+    # after the module they cover and would otherwise inherit that module's role.
+    if "test" in rel and file.kind == "code":
+        return "test"
+    if "orchestrator" in rel or "pack.py" in rel or "build.py" in rel:
+        return "orchestrator"
+    if "core/models.py" in rel or "model.py" in rel:
+        return "model"
+    if "core/config.py" in rel or "config.py" in rel:
+        return "config"
+    if "languages/" in rel:
+        return "language_adapter"
+    if "graph/" in rel:
+        return "graph"
+    if "ranker.py" in rel or "scorer.py" in rel:
+        return "ranker"
+    if "renderer" in rel or "llm_report" in rel:
+        return "renderer"
+    if "scanner/" in rel:
+        return "scanner"
+    if rel.endswith("native.py") or "rust/scriber_native/" in rel or ("native" in rel and file.language == "rust"):
+        return "native_adapter"
+    if "readme" in rel or rel.startswith("docs"):
+        return "docs"
+    if rel in {"pyproject.toml", "package.json", "cargo.toml"} or file.kind == "support":
+        return "support"
+
+    return "unknown"
diff --git a/src/scriber/engine/scorer.py b/src/scriber/engine/scorer.py
index 0dbfb24..dee0cca 100644
--- a/src/scriber/engine/scorer.py
+++ b/src/scriber/engine/scorer.py
@@ -3,7 +3,7 @@
from pathlib import Path
from scriber.core.matchers import match_pattern
-from scriber.core.models import Candidate, FileNode, ModuleGraph, ScriberConfig, SeedPath
+from scriber.core.models import Candidate, FileNode, ModuleGraph, ScriberConfig, SeedPath, RelationEdge
def _score(config: ScriberConfig, key: str) -> int:
@@ -111,6 +111,56 @@ def _name_related(a: Path, b: Path) -> bool:
return a_stem in b_stem or b_stem in a_stem
+def _walk_weighted_neighbors(
+    edges: list[RelationEdge],
+    start: Path,
+    depth_limit: int,
+    reverse: bool = False
+) -> dict[Path, float]:
+    """Return the strongest path strength from ``start`` to each node within ``depth_limit`` hops.
+
+    A path's strength is the product of its edge strengths. ``import`` and
+    ``reexport`` edges count 1.0 on the first hop and 0.88 on later hops; any
+    other edge counts ``weight * confidence``. With ``reverse=True`` edges are
+    followed from target to source. ``start`` itself is not part of the result.
+    """
+    import heapq
+
+    neighbours: dict[Path, list[tuple[Path, RelationEdge]]] = {}
+    for rel_edge in edges:
+        if reverse:
+            origin, destination = rel_edge.target, rel_edge.source
+        else:
+            origin, destination = rel_edge.source, rel_edge.target
+        neighbours.setdefault(origin, []).append((destination, rel_edge))
+
+    strongest: dict[Path, float] = {start: 1.0}
+    # Best strength seen per (node, depth); used to discard stale heap entries.
+    best_seen: dict[tuple[Path, int], float] = {(start, 0): 1.0}
+    # Max-heap emulated with negated strengths.
+    heap = [(-1.0, 0, start)]
+
+    while heap:
+        negated, hops, node = heapq.heappop(heap)
+        strength = -negated
+
+        stale = strength < best_seen.get((node, hops), 0.0)
+        if stale or hops >= depth_limit:
+            continue
+
+        following = hops + 1
+        for target, rel_edge in neighbours.get(node, []):
+            if rel_edge.kind in {"import", "reexport"}:
+                factor = 0.88 if hops else 1.0
+            else:
+                factor = rel_edge.weight * rel_edge.confidence
+            candidate = strength * factor
+
+            if candidate > strongest.get(target, 0.0):
+                strongest[target] = candidate
+
+            state = (target, following)
+            if candidate > best_seen.get(state, 0.0):
+                best_seen[state] = candidate
+                heapq.heappush(heap, (-candidate, following, target))
+
+    strongest.pop(start, None)
+    return strongest
+
+
def _walk_neighbors(edges: dict[Path, set[Path]], start: Path, depth: int) -> dict[Path, int]:
found: dict[Path, int] = {}
frontier = {start}
@@ -222,13 +272,13 @@ def score_candidates(
if config.modules and scoring.enabled:
for seed_rel in seed_files:
if scoring.include_direct_dependencies:
- for dep, distance in _walk_neighbors(graph.imports, seed_rel, scoring.depth).items():
- score = max(scoring.tree_min_score, _score(config, "direct_dependency") - ((distance - 1) * 10))
+ for dep, strength in _walk_weighted_neighbors(graph.edges, seed_rel, scoring.depth, reverse=False).items():
+ score = max(scoring.tree_min_score, int(_score(config, "direct_dependency") * strength))
_add(candidates, files, dep, score, "direct_dependency", f"direct dependency of `{seed_rel.as_posix()}`", seed=seed_rel)
if scoring.include_reverse_dependencies:
- for dep, distance in _walk_neighbors(graph.imported_by, seed_rel, scoring.depth).items():
- score = max(scoring.tree_min_score, _score(config, "reverse_dependency") - ((distance - 1) * 10))
+ for dep, strength in _walk_weighted_neighbors(graph.edges, seed_rel, scoring.depth, reverse=True).items():
+ score = max(scoring.tree_min_score, int(_score(config, "reverse_dependency") * strength))
_add(candidates, files, dep, score, "reverse_dependency", f"imports seed `{seed_rel.as_posix()}`", seed=seed_rel)
if scoring.include_same_package:
diff --git a/src/scriber/graph/analyzers/__init__.py b/src/scriber/graph/analyzers/__init__.py
new file mode 100644
index 0000000..129c757
--- /dev/null
+++ b/src/scriber/graph/analyzers/__init__.py
@@ -0,0 +1,26 @@
+from pathlib import Path
+from typing import Any
+from scriber.graph.indexes import GraphIndexes
+from scriber.graph.analyzers.tests import TestsAnalyzer
+from scriber.graph.analyzers.package import PackageAnalyzer
+from scriber.graph.analyzers.env import EnvAnalyzer
+from scriber.graph.analyzers.config_refs import ConfigRefsAnalyzer
+from scriber.graph.analyzers.docs import DocsAnalyzer
+
+def generate_cheap_relations(files: dict[Path, Any], edge_cls: Any, is_native: bool = False) -> list[Any]:
+    """Run every built-in analyzer over ``files`` and return their edges, concatenated in analyzer order.
+
+    One ``GraphIndexes`` instance is built up front and shared by all analyzers.
+    ``edge_cls`` and ``is_native`` are forwarded to each analyzer unchanged.
+    """
+    shared_indexes = GraphIndexes.build(files)
+    analyzer_types = (
+        TestsAnalyzer,
+        PackageAnalyzer,
+        EnvAnalyzer,
+        ConfigRefsAnalyzer,
+        DocsAnalyzer,
+    )
+
+    collected: list[Any] = []
+    for analyzer_type in analyzer_types:
+        # Passed as None for these simple analyzers
+        collected.extend(analyzer_type().analyze(files, shared_indexes, None, edge_cls, is_native))
+    return collected
diff --git a/src/scriber/graph/analyzers/base.py b/src/scriber/graph/analyzers/base.py
new file mode 100644
index 0000000..9abe43f
--- /dev/null
+++ b/src/scriber/graph/analyzers/base.py
@@ -0,0 +1,15 @@
+from __future__ import annotations
+
+from typing import Iterable, Protocol
+from pathlib import Path
+
+from scriber.core.models import FileNode, ScriberConfig
+from scriber.graph.indexes import GraphIndexes
+from scriber.graph.model import RelationEdge
+
+
+class RelationAnalyzer(Protocol):
+    """Structural interface shared by the relation analyzers.
+
+    The ``analyze`` signature mirrors how ``generate_cheap_relations`` calls
+    every analyzer: the file map, the shared indexes, an optional config
+    (``None`` is passed there), the class used to instantiate edges, and a flag
+    telling the analyzer to pass paths as strings for the native edge class.
+    """
+
+    name: str
+
+    def analyze(
+        self,
+        files: dict[Path, FileNode],
+        indexes: GraphIndexes,
+        config: ScriberConfig | None,
+        edge_cls: type,
+        is_native: bool,
+    ) -> Iterable[RelationEdge]:
+        ...
diff --git a/src/scriber/graph/analyzers/config_refs.py b/src/scriber/graph/analyzers/config_refs.py
new file mode 100644
index 0000000..481b213
--- /dev/null
+++ b/src/scriber/graph/analyzers/config_refs.py
@@ -0,0 +1,35 @@
+from __future__ import annotations
+from typing import Iterable, Any
+from pathlib import Path
+from scriber.core.models import FileNode, ScriberConfig
+from scriber.graph.indexes import GraphIndexes
+
+def is_config_file(f: FileNode) -> bool:
+    """Tell whether ``f`` is a configuration file, judged by well-known file name or by suffix (case-insensitive)."""
+    known_names = {"pyproject.toml", "setup.py", "package.json", "dockerfile"}
+    if f.relative.name.lower() in known_names:
+        return True
+    config_suffixes = {".toml", ".yaml", ".yml", ".json"}
+    return f.relative.suffix.lower() in config_suffixes
+
+class ConfigRefsAnalyzer:
+    """Emit ``config_refs_code`` edges from configuration files to the code files they mention."""
+
+    name = "config_refs"
+
+    def analyze(self, files: dict[Path, FileNode], indexes: GraphIndexes, config: ScriberConfig | None, edge_cls: Any, is_native: bool) -> Iterable:
+        """Scan every config file's text for references to code files.
+
+        A code file counts as referenced when its project-relative POSIX path
+        occurs in the config text, or when its bare file name does (names of
+        four characters or fewer and ``__init__.py`` are ignored for the
+        bare-name check). A config file never produces an edge to itself.
+
+        Args:
+            files: Project-relative path -> file node.
+            indexes: Unused by this analyzer.
+            config: Unused by this analyzer.
+            edge_cls: Callable used to build each edge.
+            is_native: When true, source/target are passed as strings.
+
+        Returns:
+            The list of edges created with ``edge_cls``.
+        """
+        edges = []
+        code_paths = [crel for crel, cnode in files.items() if cnode.kind == "code"]
+        for rel, node in files.items():
+            if not is_config_file(node):
+                continue
+            try:
+                content = node.absolute.read_text(encoding="utf-8", errors="ignore")
+            except (OSError, ValueError):
+                # Best effort: an unreadable config file contributes no edges.
+                continue
+            for crel in code_paths:
+                if crel == rel:
+                    # e.g. setup.py naming itself must not create a self-loop.
+                    continue
+                by_path = crel.as_posix() in content
+                by_name = len(crel.name) > 4 and crel.name != "__init__.py" and crel.name in content
+                if not (by_path or by_name):
+                    continue
+                edges.append(edge_cls(
+                    source=str(rel) if is_native else rel,
+                    target=str(crel) if is_native else crel,
+                    kind="config_refs_code",
+                    weight=0.6,
+                    confidence=0.8,
+                    evidence=f"Config {rel.name} references {crel.name}",
+                    line=None,
+                    analyzer="config_refs:indexed"
+                ))
+        return edges
diff --git a/src/scriber/graph/analyzers/docs.py b/src/scriber/graph/analyzers/docs.py
new file mode 100644
index 0000000..6afc72e
--- /dev/null
+++ b/src/scriber/graph/analyzers/docs.py
@@ -0,0 +1,32 @@
+from __future__ import annotations
+from typing import Iterable, Any
+from pathlib import Path
+from scriber.core.models import FileNode, ScriberConfig
+from scriber.graph.indexes import GraphIndexes
+
+class DocsAnalyzer:
+    """Emit ``doc_mentions_code`` edges from README/doc files to the code files they mention."""
+
+    name = "docs"
+
+    def analyze(self, files: dict[Path, FileNode], indexes: GraphIndexes, config: ScriberConfig | None, edge_cls: Any, is_native: bool) -> Iterable:
+        """Scan documentation files for references to code files.
+
+        A file is treated as documentation when its lower-cased name is
+        ``readme.md`` / ``readme.txt`` / ``readme`` or contains ``doc``. A code
+        file counts as mentioned when its project-relative POSIX path occurs in
+        the text, or when its bare file name does (names of four characters or
+        fewer and ``__init__.py`` are ignored for the bare-name check). A file
+        never produces an edge to itself.
+
+        Args:
+            files: Project-relative path -> file node.
+            indexes: Unused by this analyzer.
+            config: Unused by this analyzer.
+            edge_cls: Callable used to build each edge.
+            is_native: When true, source/target are passed as strings.
+
+        Returns:
+            The list of edges created with ``edge_cls``.
+        """
+        edges = []
+        code_paths = [crel for crel, cnode in files.items() if cnode.kind == "code"]
+        for rel, node in files.items():
+            name_lower = node.relative.name.lower()
+            # NOTE(review): the "doc" substring also matches names such as
+            # "dockerfile" or "docs.py" - confirm that is intended.
+            if not (name_lower in {"readme.md", "readme.txt", "readme"} or "doc" in name_lower):
+                continue
+            try:
+                content = node.absolute.read_text(encoding="utf-8", errors="ignore")
+            except (OSError, ValueError):
+                # Best effort: an unreadable document contributes no edges.
+                continue
+            for crel in code_paths:
+                if crel == rel:
+                    # A code file matched by the "doc" rule must not link to itself.
+                    continue
+                by_path = crel.as_posix() in content
+                by_name = len(crel.name) > 4 and crel.name != "__init__.py" and crel.name in content
+                if not (by_path or by_name):
+                    continue
+                edges.append(edge_cls(
+                    source=str(rel) if is_native else rel,
+                    target=str(crel) if is_native else crel,
+                    kind="doc_mentions_code",
+                    weight=0.42,
+                    confidence=0.8,
+                    evidence=f"{node.relative.name} mentions {crel.name}",
+                    line=None,
+                    analyzer="docs:indexed"
+                ))
+        return edges
diff --git a/src/scriber/graph/analyzers/env.py b/src/scriber/graph/analyzers/env.py
new file mode 100644
index 0000000..f4eb938
--- /dev/null
+++ b/src/scriber/graph/analyzers/env.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+from typing import Iterable, Any
+from pathlib import Path
+import re
+from scriber.core.models import FileNode, ScriberConfig
+from scriber.graph.indexes import GraphIndexes
+
+class EnvAnalyzer:
+    """Link code files that read the same environment variable with ``env_key`` edges."""
+
+    name = "env"
+
+    def analyze(self, files: dict[Path, FileNode], indexes: GraphIndexes, config: ScriberConfig | None, edge_cls: Any, is_native: bool) -> Iterable:
+        """Record which code files read which env keys, then pair up files sharing a key.
+
+        Every code file is read from disk and its env keys are appended to
+        ``indexes.env_key_to_files``. For each key, one edge is emitted per
+        ordered pair of distinct entries in that key's list.
+
+        Args:
+            files: Project-relative path -> file node.
+            indexes: Its ``env_key_to_files`` mapping is updated in place.
+            config: Unused by this analyzer.
+            edge_cls: Callable used to build each edge.
+            is_native: When true, source/target are passed as strings.
+
+        Returns:
+            The list of edges created with ``edge_cls``.
+        """
+        edges = []
+        for rel, node in files.items():
+            if node.kind != "code":
+                continue
+            try:
+                content = node.absolute.read_text(encoding="utf-8", errors="ignore")
+            except (OSError, ValueError):
+                # Best effort: an unreadable file contributes no env keys.
+                continue
+            for key in self.extract_env_keys(content):
+                indexes.env_key_to_files.setdefault(key, []).append(node)
+
+        for key, nodes in indexes.env_key_to_files.items():
+            for i, n1 in enumerate(nodes):
+                for j, n2 in enumerate(nodes):
+                    if i == j:
+                        continue
+                    edges.append(edge_cls(
+                        source=str(n1.relative) if is_native else n1.relative,
+                        target=str(n2.relative) if is_native else n2.relative,
+                        kind="env_key",
+                        weight=0.4,
+                        confidence=0.9,
+                        evidence=f"Shared env key: {key}",
+                        line=None,
+                        analyzer="env:indexed"
+                    ))
+        return edges
+
+    def extract_env_keys(self, content: str) -> set[str]:
+        """Return the env variable names referenced with a literal key in ``content``.
+
+        Recognised forms: ``os.environ["K"]``, ``os.environ.get("K"...)``,
+        ``os.getenv("K"...)`` (with or without a default argument),
+        ``process.env["K"]`` and ``process.env.K``.
+        """
+        keys = set()
+        for match in re.finditer(r'os\.environ(?:\[|\.get\()[\'"]([A-Za-z0-9_]+)[\'"]', content):
+            keys.add(match.group(1))
+        # No closing parenthesis is required after the key, so calls that pass
+        # a default value, e.g. os.getenv("K", "fallback"), are matched too.
+        for match in re.finditer(r'os\.getenv\([\'"]([A-Za-z0-9_]+)[\'"]', content):
+            keys.add(match.group(1))
+        for match in re.finditer(r'process\.env(?:\[[\'"]([A-Za-z0-9_]+)[\'"]\]|\.([A-Za-z0-9_]+))', content):
+            keys.add(match.group(1) or match.group(2))
+        return keys
diff --git a/src/scriber/graph/analyzers/package.py b/src/scriber/graph/analyzers/package.py
new file mode 100644
index 0000000..7626b6e
--- /dev/null
+++ b/src/scriber/graph/analyzers/package.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+from typing import Iterable, Any
+from pathlib import Path
+from scriber.core.models import FileNode, ScriberConfig
+from scriber.graph.indexes import GraphIndexes
+
+class PackageAnalyzer:
+    """Connect code files that live in the same directory with ``same_package`` edges."""
+
+    name = "package"
+
+    def analyze(self, files: dict[Path, FileNode], indexes: GraphIndexes, config: ScriberConfig | None, edge_cls: Any, is_native: bool) -> Iterable:
+        """Emit edges from each code file to at most eight other code files of its directory.
+
+        Directories come from ``indexes.by_dir``; siblings are taken in the
+        order they are listed there. ``files`` and ``config`` are not used.
+        """
+        per_file_limit = 8
+        result = []
+        for members in indexes.by_dir.values():
+            code_members = [member for member in members if member.kind == "code"]
+            for origin in code_members:
+                peers = [other for other in code_members if not (origin == other)]
+                for peer in peers[:per_file_limit]:
+                    src = origin.relative
+                    dst = peer.relative
+                    if is_native:
+                        src, dst = str(src), str(dst)
+                    result.append(edge_cls(
+                        source=src,
+                        target=dst,
+                        kind="same_package",
+                        weight=0.5,
+                        confidence=1.0,
+                        evidence=None,
+                        line=None,
+                        analyzer="package:indexed"
+                    ))
+        return result
diff --git a/src/scriber/graph/analyzers/tests.py b/src/scriber/graph/analyzers/tests.py
new file mode 100644
index 0000000..409f63f
--- /dev/null
+++ b/src/scriber/graph/analyzers/tests.py
@@ -0,0 +1,36 @@
+from __future__ import annotations
+from typing import Iterable, Any
+from pathlib import Path
+from scriber.core.models import FileNode, ScriberConfig
+from scriber.graph.indexes import GraphIndexes
+
+class TestsAnalyzer:
+    """Link test files to the code files they appear to cover, based on file names."""
+
+    name = "tests"
+
+    @staticmethod
+    def _is_test_name(name: str) -> bool:
+        """Return True when a lower-cased file name follows a test naming pattern."""
+        return name.startswith("test_") or name.endswith("_test.py") or ".test." in name
+
+    def analyze(self, files: dict[Path, FileNode], indexes: GraphIndexes, config: ScriberConfig | None, edge_cls: Any, is_native: bool) -> Iterable:
+        """Emit ``test_of`` edges from each test file to same-named non-test files.
+
+        The test markers (``test_``, ``_test``, ``.test``) are removed from the
+        lower-cased stem and the remainder is reduced to ASCII letters and
+        digits, which is the same normalisation ``GraphIndexes.by_clean_stem``
+        applies to its keys. Without it ``test_my_module.py`` would look up
+        ``my_module`` and never find the ``mymodule`` key of ``my_module.py``.
+
+        Args:
+            files: Project-relative path -> file node.
+            indexes: Provides ``by_clean_stem`` for the name lookup.
+            config: Unused by this analyzer.
+            edge_cls: Callable used to build each edge.
+            is_native: When true, source/target are passed as strings.
+
+        Returns:
+            The list of edges created with ``edge_cls``.
+        """
+        edges = []
+        for rel, node in files.items():
+            if node.kind != "code":
+                continue
+            if not self._is_test_name(rel.name.lower()):
+                continue
+
+            stripped = rel.stem.lower().replace("test_", "").replace("_test", "").replace(".test", "")
+            lookup_key = "".join(ch for ch in stripped if ch.isascii() and ch.isalnum())
+            if not lookup_key:
+                continue
+
+            for target_node in indexes.by_clean_stem.get(lookup_key, []):
+                if target_node.relative == rel:
+                    continue
+                # Tests are linked to production files only, never to other tests.
+                if self._is_test_name(target_node.relative.name.lower()):
+                    continue
+                edges.append(edge_cls(
+                    source=str(rel) if is_native else rel,
+                    target=str(target_node.relative) if is_native else target_node.relative,
+                    kind="test_of",
+                    weight=0.85,
+                    confidence=0.9,
+                    evidence=f"test filename {rel.name} matches {target_node.relative.name}",
+                    line=None,
+                    analyzer="tests:indexed"
+                ))
+        return edges
diff --git a/src/scriber/graph/builder.py b/src/scriber/graph/builder.py
index a181441..17afd2a 100644
--- a/src/scriber/graph/builder.py
+++ b/src/scriber/graph/builder.py
@@ -2,12 +2,13 @@
from pathlib import Path
-from scriber.core.models import FileNode, ModuleGraph, ScriberConfig
+from scriber.core.models import FileNode, ScriberConfig
+from scriber.graph.model import ModuleGraph, RelationEdge
from scriber.graph.languages.python import build_module_map, parse_python_imports, resolve_import_record
from scriber.scanner.files import read_text_lossy
-def build_graph(files: dict[Path, FileNode], config: ScriberConfig) -> ModuleGraph:
+def build_graph(files: dict[Path, FileNode], config: ScriberConfig, cache: ScriberCache | None = None) -> ModuleGraph:
graph = ModuleGraph()
if not files:
return graph
@@ -24,8 +25,9 @@ def build_graph(files: dict[Path, FileNode], config: ScriberConfig) -> ModuleGra
sample = next(iter(files.values()))
root = Path(sample.absolute.as_posix()[:len(sample.absolute.as_posix()) - len(sample.relative.as_posix())]).resolve()
- from scriber.cache import ScriberCache
- cache = ScriberCache(config, root)
+ if cache is None:
+ from scriber.cache import ScriberCache
+ cache = ScriberCache(config, root)
module_to_path, path_to_module = build_module_map(files, config.python)
@@ -124,9 +126,17 @@ def build_graph(files: dict[Path, FileNode], config: ScriberConfig) -> ModuleGra
resolved_set.add(target)
+ from scriber.core.models import RelationEdge
+
for target in resolved_set:
- graph.imports.setdefault(rel, set()).add(target)
- graph.imported_by.setdefault(target, set()).add(rel)
+ graph.add_edge(RelationEdge(
+ source=rel,
+ target=target,
+ kind="import",
+ weight=1.0,
+ confidence=0.98,
+ analyzer=f"imports:{file.language}",
+ ))
cache.set_imports(rel, resolved_set)
diff --git a/src/scriber/graph/indexes.py b/src/scriber/graph/indexes.py
new file mode 100644
index 0000000..aa61518
--- /dev/null
+++ b/src/scriber/graph/indexes.py
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+
+from scriber.core.models import FileNode
+
+
+@dataclass(slots=True)
+class GraphIndexes:
+    """Lookup tables over the scanned files, shared by the relation analyzers."""
+
+    by_dir: dict[Path, list[FileNode]] = field(default_factory=dict)
+    by_stem: dict[str, list[FileNode]] = field(default_factory=dict)
+    by_clean_stem: dict[str, list[FileNode]] = field(default_factory=dict)
+    by_language: dict[str, list[FileNode]] = field(default_factory=dict)
+    env_key_to_files: dict[str, list[FileNode]] = field(default_factory=dict)
+    config_tokens: dict[Path, set[str]] = field(default_factory=dict)
+    doc_tokens: dict[Path, set[str]] = field(default_factory=dict)
+
+    @classmethod
+    def build(cls, files: dict[Path, FileNode]) -> GraphIndexes:
+        """Create the directory, stem, clean-stem and language tables from ``files``.
+
+        The clean stem is the file stem with every character outside
+        ``[a-zA-Z0-9]`` removed, lower-cased; files whose clean stem is empty
+        are left out of ``by_clean_stem``. The env/config/doc tables start
+        empty and are filled by the analyzers that need them.
+        """
+        built = cls()
+
+        def _push(table: dict, key, entry) -> None:
+            table.setdefault(key, []).append(entry)
+
+        for rel_path, file_node in files.items():
+            stem = rel_path.stem
+            _push(built.by_dir, rel_path.parent, file_node)
+            _push(built.by_stem, stem, file_node)
+
+            normalized = re.sub(r'[^a-zA-Z0-9]', '', stem).lower()
+            if normalized:
+                _push(built.by_clean_stem, normalized, file_node)
+
+            _push(built.by_language, file_node.language, file_node)
+
+        return built
diff --git a/src/scriber/graph/languages/extractor.py b/src/scriber/graph/languages/extractor.py
new file mode 100644
index 0000000..333e74e
--- /dev/null
+++ b/src/scriber/graph/languages/extractor.py
@@ -0,0 +1,74 @@
+import ast
+from pathlib import Path
+from typing import Any
+from scriber.core.symbols import SymbolNode, SymbolIndex
+
+class PythonSymbolVisitor(ast.NodeVisitor):
+    """AST visitor that records every class and function definition of one file in a ``SymbolIndex``."""
+
+    def __init__(self, file_path: Path, index: SymbolIndex):
+        self.file_path = file_path
+        self.index = index
+        # Name of the definition currently being walked; None at module level.
+        self.current_parent: str | None = None
+
+    def _record_and_descend(self, node: Any, kind: str) -> None:
+        """Register ``node`` as a symbol of ``kind``, then visit its children with it as the parent."""
+        first_line = node.lineno
+        self.index.add_symbol(
+            self.file_path,
+            SymbolNode(
+                name=node.name,
+                kind=kind,
+                line_start=first_line,
+                # Nodes without end_lineno are treated as single-line spans.
+                line_end=getattr(node, "end_lineno", first_line),
+                parent_name=self.current_parent,
+            ),
+        )
+
+        enclosing = self.current_parent
+        self.current_parent = node.name
+        self.generic_visit(node)
+        self.current_parent = enclosing
+
+    def visit_ClassDef(self, node: ast.ClassDef) -> Any:
+        self._record_and_descend(node, "class")
+
+    def visit_FunctionDef(self, node: ast.FunctionDef) -> Any:
+        self._record_and_descend(node, "function")
+
+    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> Any:
+        # Async functions are recorded with the same "function" kind.
+        self._record_and_descend(node, "function")
+
+
+def extract_python_symbols(file_path: Path, source_code: str, index: SymbolIndex) -> None:
+    """Parse ``source_code`` and add its class/function symbols to ``index`` under ``file_path``.
+
+    Any exception raised while parsing or visiting is swallowed, so a file that
+    cannot be processed contributes whatever was recorded before the failure.
+    """
+    try:
+        module_ast = ast.parse(source_code, filename=str(file_path))
+        PythonSymbolVisitor(file_path, index).visit(module_ast)
+    except Exception:
+        # Gracefully handle syntactically invalid or unparseable files
+        return
diff --git a/src/scriber/graph/model.py b/src/scriber/graph/model.py
new file mode 100644
index 0000000..d87376c
--- /dev/null
+++ b/src/scriber/graph/model.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Literal
+
+# Closed set of edge kinds accepted by RelationEdge.kind.
+RelationKind = Literal[
+ "import",
+ "reexport",
+ "call",
+ "type_reference",
+ "inherits",
+ "implements",
+ "test_of",
+ "fixture_for",
+ "config_refs_code",
+ "env_key",
+ "doc_mentions_symbol",
+ "doc_mentions_code",
+ "same_package",
+ "same_dir",
+ "name_similarity",
+ "git_cochange",
+ "semantic_similarity",
+ "entrypoint_to_module",
+]
+
+@dataclass(frozen=True, slots=True)
+class RelationEdge:
+ """Immutable directed relation between two files, with a kind, weight, confidence and optional evidence."""
+ source: Path
+ target: Path
+ kind: RelationKind
+ weight: float = 1.0
+ confidence: float = 1.0
+ # Optional human-readable justification and source line for the relation.
+ evidence: str | None = None
+ line: int | None = None
+ # Identifier of the analyzer that produced the edge.
+ analyzer: str = "unknown"
+
+@dataclass(slots=True)
+class RelationGraph:
+    """Edge list plus per-node adjacency views, including import-only views."""
+
+    edges: list[RelationEdge] = field(default_factory=list)
+    outgoing: dict[Path, list[RelationEdge]] = field(default_factory=dict)
+    incoming: dict[Path, list[RelationEdge]] = field(default_factory=dict)
+    imports: dict[Path, set[Path]] = field(default_factory=dict)
+    imported_by: dict[Path, set[Path]] = field(default_factory=dict)
+
+    def add_edge(self, edge: RelationEdge) -> None:
+        """Register ``edge`` in the edge list and in both adjacency maps.
+
+        ``import`` and ``reexport`` edges are additionally mirrored into the
+        ``imports`` / ``imported_by`` path sets.
+        """
+        src, dst = edge.source, edge.target
+        self.edges.append(edge)
+        self.outgoing.setdefault(src, []).append(edge)
+        self.incoming.setdefault(dst, []).append(edge)
+
+        if edge.kind not in ("import", "reexport"):
+            return
+        self.imports.setdefault(src, set()).add(dst)
+        self.imported_by.setdefault(dst, set()).add(src)
+
+@dataclass(slots=True)
+class ModuleGraph(RelationGraph):
+ # Adds nothing to RelationGraph; the name is what core.models re-exports.
+ pass
diff --git a/src/scriber/outline/__init__.py b/src/scriber/outline/__init__.py
new file mode 100644
index 0000000..2e72db9
--- /dev/null
+++ b/src/scriber/outline/__init__.py
@@ -0,0 +1,13 @@
+from scriber.core.models import FileNode, FileOutline
+from scriber.outline.base import Outliner
+from scriber.outline.generic import GenericOutliner
+from scriber.outline.python import PythonOutliner
+
+# Outliner per FileNode.language; languages without an entry use _generic.
+_outliners: dict[str, Outliner] = {
+ "python": PythonOutliner(),
+}
+_generic = GenericOutliner()
+
+def generate_outline(file: FileNode, content: str) -> FileOutline:
+    """Build the outline of ``file`` with its language-specific outliner, or the generic one as fallback."""
+    language = file.language
+    chosen = _outliners[language] if language in _outliners else _generic
+    return chosen.outline(file, content)
diff --git a/src/scriber/outline/base.py b/src/scriber/outline/base.py
new file mode 100644
index 0000000..a79c6c0
--- /dev/null
+++ b/src/scriber/outline/base.py
@@ -0,0 +1,7 @@
+from __future__ import annotations
+from typing import Protocol
+from scriber.core.models import FileNode, FileOutline
+
+class Outliner(Protocol):
+ """Interface of an outliner: turns a file node and its text into a FileOutline."""
+ def outline(self, file: FileNode, content: str) -> FileOutline:
+ ...
diff --git a/src/scriber/outline/generic.py b/src/scriber/outline/generic.py
new file mode 100644
index 0000000..f0aac16
--- /dev/null
+++ b/src/scriber/outline/generic.py
@@ -0,0 +1,18 @@
+from __future__ import annotations
+from scriber.core.models import FileNode, FileOutline
+from scriber.outline.base import Outliner
+
+class GenericOutliner(Outliner):
+    """Fallback outliner that returns an empty outline carrying a single explanatory note."""
+
+    def outline(self, file: FileNode, content: str) -> FileOutline:
+        """Return a placeholder outline for ``file``; ``content`` is not inspected."""
+        placeholder_note = "Static outline not implemented for this language. Showing generic info."
+        # Each section gets its own fresh empty list.
+        empty_sections = {
+            section: []
+            for section in ("imports", "exports", "classes", "functions", "constants")
+        }
+        return FileOutline(
+            path=file.relative,
+            language=file.language,
+            purpose=None,
+            notes=[placeholder_note],
+            token_estimate=20,
+            **empty_sections,
+        )
diff --git a/src/scriber/outline/python.py b/src/scriber/outline/python.py
new file mode 100644
index 0000000..bd9c9c7
--- /dev/null
+++ b/src/scriber/outline/python.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+import ast
+from scriber.core.models import FileNode, FileOutline
+from scriber.outline.base import Outliner
+
+class PythonOutliner(Outliner):
+    """Outliner for Python sources based on the top-level statements of the parsed module."""
+
+    def outline(self, file: FileNode, content: str) -> FileOutline:
+        """Collect top-level classes, functions and imports of ``content``.
+
+        Only direct children of the module are inspected. ``from`` imports are
+        rendered as ``<dots><module>.<name>``, where the leading dots reflect
+        the relative-import level (``from . import x`` -> ``.x``,
+        ``from .a import b`` -> ``.a.b``). Source that cannot be parsed yields
+        an outline with empty sections instead of raising. At most the first
+        20 imports are kept in the outline, while ``token_estimate`` is
+        computed from all collected imports.
+        """
+        classes: list[str] = []
+        functions: list[str] = []
+        imports: list[str] = []
+
+        try:
+            tree = ast.parse(content)
+        except (SyntaxError, ValueError):
+            # ast.parse can also raise ValueError, for example on source
+            # containing null bytes on some Python versions.
+            tree = None
+
+        for node in (tree.body if tree is not None else []):
+            if isinstance(node, ast.ClassDef):
+                classes.append(node.name)
+            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                functions.append(node.name)
+            elif isinstance(node, ast.Import):
+                imports.extend(alias.name for alias in node.names)
+            elif isinstance(node, ast.ImportFrom):
+                prefix = "." * (node.level or 0) + (node.module or "")
+                # Avoid a doubled dot when there is no module part ("from . import x").
+                joiner = "." if node.module else ""
+                imports.extend(f"{prefix}{joiner}{alias.name}" for alias in node.names)
+
+        return FileOutline(
+            path=file.relative,
+            language="python",
+            purpose=None,
+            imports=imports[:20],
+            exports=[],
+            classes=classes,
+            functions=functions,
+            constants=[],
+            notes=[],
+            token_estimate=len(classes)*5 + len(functions)*3 + len(imports)*2
+        )
diff --git a/src/scriber/packer/pack.py b/src/scriber/packer/pack.py
index 2e7011c..801f93f 100644
--- a/src/scriber/packer/pack.py
+++ b/src/scriber/packer/pack.py
@@ -13,6 +13,7 @@
from scriber.scanner.files import classify_file, is_text_readable, read_text_lossy
from scriber.tokens import estimate_tokens
from scriber.scanner.scan import scan_project
+from scriber.core.models import LlmPack
def _resolve_input(path_value: str, root: Path, allow_external: bool, path_base: str = "cwd") -> Path:
@@ -151,28 +152,7 @@ def _apply_content_policy(pack: ScriberPack, config) -> None:
pack.total_tokens = total
-def build_pack(
- paths: list[str] | None = None,
- *,
- config_path: str | None = None,
- output: str | None = None,
- output_format: str | None = None,
- only_tree: bool | None = None,
- modules: bool | None = None,
- support: bool | None = None,
- max_files: int | None = None,
- max_tokens: int | None = None,
- min_score: int | None = None,
- support_content: str | None = None,
- progress_callback: Callable[[str], None] | None = None,
- project: bool | None = None,
- path_base: str = "project",
-) -> ScriberPack:
- from time import perf_counter
- timings = {}
-
- t_start = perf_counter()
- paths = paths or ["."]
+def _load_and_apply_config(paths, config_path, output, output_format, only_tree, modules, support, max_files, max_tokens, min_score, support_content):
resolved_config = resolve_config_path(paths, config_path)
root = project_root_from_config(resolved_config)
config = load_config(resolved_config)
@@ -188,11 +168,11 @@ def build_pack(
min_score=min_score,
support_content=support_content,
)
- timings["config_load"] = perf_counter() - t_start
+ return resolved_config, root, config
- t_scan = perf_counter()
+def _scan_files(paths, root, config, path_base, progress_callback):
if progress_callback: progress_callback("Skanowanie plikow...")
- from scriber.native import require_native, is_native_available
+ from scriber.native import is_native_available
native_files = None
if is_native_available():
from scriber.scanner.scan import scan_project_with_native
@@ -201,21 +181,23 @@ def build_pack(
files = scan_project(root, config)
resolved_inputs = [_resolve_input(item, root, config.allow_external_paths, path_base) for item in paths]
seeds = [_expand_seed(path, root, files, config) for path in resolved_inputs]
- timings["scan"] = perf_counter() - t_scan
-
- # Detect mode
+
is_project_snapshot = False
- if project:
- is_project_snapshot = True
- else:
- for path in resolved_inputs:
- if path == root:
- is_project_snapshot = True
- break
- mode = "project_snapshot" if is_project_snapshot else "focused"
+ for path in resolved_inputs:
+ if path == root:
+ is_project_snapshot = True
+ break
+
+ return files, native_files, seeds, is_project_snapshot
- # Use native code pack builder if available
+
+def _build_graph_and_score(mode, files, seeds, native_files, root, config, progress_callback):
+ from time import perf_counter
+ timings = {}
+ stats = {}
+ from scriber.native import is_native_available
if is_native_available():
+ from scriber.native import require_native
native = require_native()
t_graph = perf_counter()
@@ -223,20 +205,45 @@ def build_pack(
assert native_files is not None
- edges = native.build_import_graph(
+ edges = native.build_relation_graph(
str(root),
native_files,
config.python.source_roots,
config.python.module_init_files
)
- from scriber.core.models import ModuleGraph
+ from scriber.graph.analyzers import generate_cheap_relations
+ edges.extend(generate_cheap_relations(files, native.NativeRelationEdge, is_native=True))
+
+ from scriber.cache import ScriberCache
+ cache = ScriberCache(config, root)
+
+ from scriber.core.models import ModuleGraph, RelationEdge
graph = ModuleGraph()
for edge in edges:
- from_path = Path(getattr(edge, "from"))
- to_path = Path(edge.to)
- graph.imports.setdefault(from_path, set()).add(to_path)
- graph.imported_by.setdefault(to_path, set()).add(from_path)
+ from_path = Path(getattr(edge, "source"))
+ to_path = Path(edge.target)
+ py_edge = RelationEdge(
+ source=from_path,
+ target=to_path,
+ kind=edge.kind,
+ weight=edge.weight,
+ confidence=edge.confidence,
+ evidence=edge.evidence,
+ line=edge.line,
+ analyzer=edge.analyzer
+ )
+ graph.add_edge(py_edge)
+ if py_edge.kind in {"import", "reexport"}:
+ cache.add_import_edge(from_path, to_path)
+
+ cache.save(set(files.keys()))
+
+ stats["graph_edges_built"] = len(edges)
+ stats["graph_source"] = "native"
+ stats["graph_cache_reads"] = cache.reads
+ stats["graph_cache_hits"] = cache.hits
+ stats["graph_cache_writes"] = cache.writes
timings["graph_build"] = perf_counter() - t_graph
@@ -300,7 +307,23 @@ def build_pack(
else:
t_graph = perf_counter()
if progress_callback: progress_callback("Budowanie grafu modulow...")
- graph = build_graph(files, config)
+ from scriber.cache import ScriberCache
+ cache = ScriberCache(config, root)
+ from scriber.graph.builder import build_graph
+ graph = build_graph(files, config, cache)
+
+ from scriber.graph.analyzers import generate_cheap_relations
+ from scriber.core.models import RelationEdge
+ cheap_edges = generate_cheap_relations(files, RelationEdge, is_native=False)
+ for edge in cheap_edges:
+ graph.add_edge(edge)
+
+ stats["graph_edges_built"] = len(graph.edges)
+ stats["graph_source"] = "python"
+ stats["graph_cache_reads"] = cache.reads
+ stats["graph_cache_hits"] = cache.hits
+ stats["graph_cache_writes"] = cache.writes
+
timings["graph_build"] = perf_counter() - t_graph
t_score = perf_counter()
@@ -308,6 +331,119 @@ def build_pack(
candidates = score_candidates(files=files, seeds=seeds, graph=graph, config=config, mode=mode)
timings["scoring"] = perf_counter() - t_score
+ return candidates, graph, timings, stats
+
+def build_pack(
+ paths: list[str] | None = None,
+ *,
+ config_path: str | None = None,
+ profile: str | None = None,
+ output: str | None = None,
+ output_format: str | None = None,
+ only_tree: bool | None = None,
+ modules: bool | None = None,
+ support: bool | None = None,
+ max_files: int | None = None,
+ max_tokens: int | None = None,
+ min_score: int | None = None,
+ support_content: str | None = None,
+ progress_callback: Callable[[str], None] | None = None,
+ project: bool | None = None,
+ path_base: str = "project",
+) -> ScriberPack | LlmPack:
+ from time import perf_counter
+
+ t_start = perf_counter()
+ paths = paths or ["."]
+ resolved_config, root, config = _load_and_apply_config(
+ paths, config_path, output, output_format, only_tree, modules, support, max_files, max_tokens, min_score, support_content
+ )
+ t_config_load = perf_counter() - t_start
+
+ t_scan = perf_counter()
+ files, native_files, seeds, is_project_snapshot = _scan_files(paths, root, config, path_base, progress_callback)
+ t_scan_time = perf_counter() - t_scan
+
+ mode = "project_snapshot" if (project or is_project_snapshot) else "focused"
+
+ if profile == "full":
+ mode = "project_snapshot"
+ elif profile == "focused-gpt":
+ mode = "focused"
+
+ candidates, graph, sub_timings, stats = _build_graph_and_score(
+ mode, files, seeds, native_files, root, config, progress_callback
+ )
+
+ if profile in {"gpt", "focused-gpt", "full"}:
+ from scriber.engine.ranker import rank_context
+ from scriber.budget.allocator import allocate_budget, BudgetPolicy
+ from time import perf_counter
+
+ t_rank = perf_counter()
+ if progress_callback: progress_callback("Rankowanie kontekstu...")
+ seed_paths = [seed for p in seeds for seed in p.expanded_files]
+ new_candidates = rank_context(files, graph, seed_paths, config, mode)
+ sub_timings["rank_context"] = perf_counter() - t_rank
+
+ t_budget = perf_counter()
+ if progress_callback: progress_callback("Alokacja budzetu...")
+ policy = BudgetPolicy(
+ target_tokens=config.max_tokens if config.max_tokens > 0 else 30000,
+ hard_limit_tokens=config.max_tokens if config.max_tokens > 0 else 100000,
+ mode=mode
+ )
+ if mode == "focused":
+ explicit_seeds = {seed for p in seeds for seed in p.expanded_files}
+ else:
+ explicit_seeds = {seed for p in seeds if not p.is_dir for seed in p.expanded_files}
+
+ items = allocate_budget(new_candidates, policy, explicit_seeds)
+ sub_timings["budget_allocation"] = perf_counter() - t_budget
+
+ t_content = perf_counter()
+ if progress_callback: progress_callback("Czytanie i outline...")
+ from scriber.outline import generate_outline
+
+ actual_tokens = 0
+ for item in items:
+ if item.content_mode == "full":
+ try:
+ item.content = item.file.read_text()
+ actual_tokens += item.token_estimate
+ except Exception:
+ item.content_mode = "tree"
+ elif item.content_mode in ("outline", "excerpt"):
+ try:
+ content = item.file.read_text()
+ item.outline = generate_outline(item.file, content)
+ actual_tokens += item.outline.token_estimate
+ except Exception:
+ item.content_mode = "tree"
+
+ sub_timings["content_read"] = perf_counter() - t_content
+
+ stats["input_paths"] = paths
+ pack = LlmPack(
+ project_root=root,
+ config_path=resolved_config,
+ profile=profile,
+ mode=mode,
+ goal=None,
+ budget_target=policy.target_tokens,
+ budget_actual=actual_tokens,
+ items=items,
+ graph=graph,
+ stats=stats,
+ warnings=[]
+ )
+ pack.timings = {
+ "config_load": t_config_load,
+ "scan": t_scan_time,
+ **sub_timings
+ }
+ return pack
+
pack = ScriberPack(
project_root=root,
config_path=resolved_config,
@@ -317,18 +453,24 @@ def build_pack(
only_tree=config.only_tree,
output_format=config.format,
mode=mode,
+ stats=stats,
)
t_content = perf_counter()
if progress_callback: progress_callback("Aplikowanie regul zawartosci...")
_apply_content_policy(pack, config)
- timings["content_read"] = perf_counter() - t_content
+ t_content_time = perf_counter() - t_content
- pack.timings = timings
+ pack.timings = {
+ "config_load": t_config_load,
+ "scan": t_scan_time,
+ "content_read": t_content_time,
+ **sub_timings
+ }
return pack
-def build_and_write_pack(paths: list[str] | None = None, **kwargs) -> tuple[Path | None, ScriberPack]:
+def build_and_write_pack(paths: list[str] | None = None, **kwargs) -> tuple[Path | None, ScriberPack | LlmPack]:
explain_selection = kwargs.pop("explain_selection", False)
pack = build_pack(paths, **kwargs)
config_path = resolve_config_path(paths or ["."], kwargs.get("config_path"))
@@ -347,7 +489,16 @@ def build_and_write_pack(paths: list[str] | None = None, **kwargs) -> tuple[Path
)
progress = kwargs.get("progress_callback")
if progress: progress("Renderowanie Markdown...")
- rendered = render_pack(pack, explain_selection=explain_selection)
+
+ if isinstance(pack, LlmPack):
+ from scriber.renderer.llm_report import render_llm_report
+ import io
+ buf = io.StringIO()
+ render_llm_report(pack, buf)
+ rendered = buf.getvalue()
+ else:
+ rendered = render_pack(pack, explain_selection=explain_selection)
+
output = config.output
if str(output) == "-":
import sys
@@ -360,6 +511,13 @@ def build_and_write_pack(paths: list[str] | None = None, **kwargs) -> tuple[Path
if not output.is_absolute():
output = pack.project_root / output
output.parent.mkdir(parents=True, exist_ok=True)
- from scriber.native import require_native
- require_native().write_text(str(output), rendered)
+ try:
+ from scriber.native import is_native_available, require_native
+ if is_native_available():
+ require_native().write_text(str(output), rendered)
+ else:
+ output.write_text(rendered, encoding="utf-8")
+ except Exception:
+ output.write_text(rendered, encoding="utf-8")
+
return output, pack
diff --git a/src/scriber/renderer/llm_report.py b/src/scriber/renderer/llm_report.py
new file mode 100644
index 0000000..027b182
--- /dev/null
+++ b/src/scriber/renderer/llm_report.py
@@ -0,0 +1,216 @@
+from __future__ import annotations
+from typing import TextIO
+from pathlib import Path
+from collections import defaultdict
+import json
+
+from scriber.core.models import LlmPack, PackItem, FileOutline
+from scriber.graph.model import RelationEdge
+
+# Write the complete report for `pack` to `out`, in this order: title, reader
+# instructions, a YAML-like project/read-order/files listing, the architecture
+# tree, the relation graph, optional quality warnings, and one content section
+# per included file.
+# NOTE(review): several writes below emit only "\n" / "\n\n" or text with gaps
+# (e.g. "Prefer facts from , , and blocks."). It looks like tag-style markup was
+# lost from these string literals in this copy of the patch - confirm against the
+# repository before relying on the exact output format.
+def render_llm_report(pack: LlmPack, out: TextIO) -> None:
+ out.write("# Scriber Pack v3\n\n")
+
+ # Fixed guidance text addressed to the model reading the pack.
+ out.write("\n")
+ out.write("You are reading a generated codebase context pack.\n")
+ out.write("Prefer facts from , , and blocks.\n")
+ out.write("If a file is tree_only or omitted, do not infer its contents.\n")
+ out.write("When proposing patches, cite file IDs and line ranges.\n")
+ out.write("\n\n")
+
+ # Project summary; a falsy goal is printed as the literal text "null".
+ out.write("\n")
+ out.write("project:\n")
+ out.write(f" mode: {pack.mode}\n")
+ out.write(f" goal: {pack.goal or 'null'}\n")
+ out.write(f" target_tokens: {pack.budget_target}\n")
+ out.write(f" actual_tokens: {pack.budget_actual}\n")
+
+ # The analyzed targets come from pack.stats and are listed only when non-empty.
+ input_paths = pack.stats.get("input_paths", [])
+ if input_paths:
+ out.write(" analyzed_targets:\n")
+ for p in input_paths:
+ out.write(f" - {p}\n")
+ out.write("\n")
+
+ # Read order: every item except those in "tree" or "omit" mode, in pack order.
+ out.write("read_order:\n")
+ for item in pack.items:
+ if item.content_mode not in ("tree", "omit"):
+ out.write(f" - {item.item_id} # {item.file.relative.as_posix()}\n")
+
+ # Per-file metadata: only "omit" items are skipped here, so "tree" items are listed.
+ out.write("\nfiles:\n")
+ for item in pack.items:
+ if item.content_mode in ("omit",):
+ continue
+ out.write(f" {item.item_id}:\n")
+ out.write(f" path: {item.file.relative.as_posix()}\n")
+ out.write(f" role: {item.role}\n")
+ out.write(f" mode: {item.content_mode}\n")
+ out.write(f" score: {item.score}\n")
+ out.write(f" utility: {item.utility:.2f}\n")
+ out.write(f" tokens: {item.token_estimate}\n")
+ if item.outline and item.outline.purpose:
+ out.write(f" purpose: {item.outline.purpose}\n")
+ out.write("\n\n")
+
+ # Directory tree of all items, inside a plain fenced block.
+ out.write("## Architecture map\n")
+ out.write("```\n")
+ _render_tree(pack.items, out)
+ out.write("```\n\n")
+
+ # Relation edges between included files.
+ out.write("\n")
+ _render_graph(pack, out)
+ out.write("\n\n")
+
+ # The warnings heading is written only when there is at least one warning.
+ warnings = _generate_warnings(pack)
+ if warnings:
+ out.write("## Pack quality warnings\n\n")
+ for w in warnings:
+ out.write(f"- {w}\n")
+ out.write("\n")
+
+ out.write("## Files Content\n\n")
+
+ # One section per item; "tree" and "omit" items have no content section.
+ for item in pack.items:
+ if item.content_mode in ("tree", "omit"):
+ continue
+
+ # NOTE(review): an f-string with no placeholders; presumably the per-file opening
+ # marker lost its content here - confirm.
+ out.write(f'\n')
+
+ if item.outline and item.outline.purpose:
+ out.write("\n")
+ out.write(f"{item.outline.purpose}\n")
+ out.write("\n\n")
+
+ if item.outline:
+ _render_symbols_manifest(item.outline, out)
+
+ # Full mode: the whole file, line-numbered, in a fence tagged with its language.
+ # NOTE(review): _add_line_numbers returns text without a trailing newline, and a
+ # newline is added below only when the content itself lacks one. For content that
+ # ends with "\n" the closing fence therefore appears to land on the last numbered
+ # line - verify.
+ if item.content_mode == "full" and item.content:
+ out.write(f"```{item.file.language} linenums=\"1\"\n")
+ out.write(_add_line_numbers(item.content, item.file.relative.as_posix(), item.file.language))
+ if not item.content.endswith("\n"):
+ out.write("\n")
+ out.write("```\n")
+
+ # Excerpt mode: each excerpt in its own fence; without excerpts fall back to the
+ # outline, and without an outline to a fixed "unavailable" line.
+ elif item.content_mode == "excerpt":
+ if item.excerpts:
+ for excerpt in item.excerpts:
+ out.write(f"```{item.file.language}\n")
+ out.write(excerpt)
+ out.write("\n```\n\n")
+ elif item.outline:
+ _render_outline_fallback(item, out)
+ else:
+ out.write("_Excerpt unavailable; falling back to metadata only._\n\n")
+
+ # Outline mode renders only when an outline is present; otherwise nothing is written.
+ elif item.content_mode == "outline" and item.outline:
+ _render_outline_fallback(item, out)
+
+ out.write("\n\n")
+
+import re
+
+# Return `content` as numbered text. The result starts with two header lines
+# ("# file: <path>" and "# lines: 1-<line count>") followed by every source line
+# prefixed with its 1-based line number, zero-padded to four digits. Lines are
+# joined with "\n" and no trailing newline is appended.
+def _add_line_numbers(content: str, path: str, language: str) -> str:
+ lines = content.splitlines()
+ out = []
+ out.append(f"# file: {path}")
+ out.append(f"# lines: 1-{len(lines)}")
+ for i, line in enumerate(lines, 1):
+ # Python sources only: a line that starts a class, def or async def gets an
+ # extra comment line emitted just before it, with the same number and indent.
+ if language in ("python", "py"):
+ m = re.match(r'^(\s*)(class|def|async def)\s+([a-zA-Z0-9_]+)', line)
+ if m:
+ indent, _, name = m.groups()
+ # NOTE(review): `name` is captured but does not appear in the string below
+ # as shown here; the literal presumably carried a symbol marker that was
+ # lost in this copy of the patch - confirm against the repository.
+ out.append(f"{i:04d} {indent}# ")
+ out.append(f"{i:04d} {line}")
+ return "\n".join(out)
+
+# Write a bullet list of the outline's symbols to `out`: all class names first,
+# then all function names, one "- name" line each. Writes nothing at all when
+# the outline has neither classes nor functions.
+def _render_symbols_manifest(outline: FileOutline, out: TextIO) -> None:
+ symbols = []
+ if outline.classes:
+ symbols.extend(outline.classes)
+ if outline.functions:
+ symbols.extend(outline.functions)
+ if not symbols:
+ return
+
+ # NOTE(review): the writes before and after the list emit only newlines here;
+ # they presumably wrapped the list in markup that was lost - confirm.
+ out.write("\n")
+ for sym in symbols:
+ out.write(f"- {sym}\n")
+ out.write("\n\n")
+
+def _render_outline_fallback(item: PackItem, out: TextIO) -> None:
+    """Write a short fenced summary of ``item.outline`` to ``out``.
+
+    The block holds a header naming the file, then one line per non-empty
+    category (classes, functions, imports) with the names comma-separated.
+    The caller is expected to pass an item whose ``outline`` is set.
+    """
+    out.write("```python\n")
+    out.write(f"# Outline for {item.file.relative.name}\n")
+    # Categories are looked up one at a time, in the order they are printed.
+    for label, attribute in (
+        ("Classes", "classes"),
+        ("Functions", "functions"),
+        ("Imports", "imports"),
+    ):
+        names = getattr(item.outline, attribute)
+        if names:
+            out.write(label + ": " + ", ".join(names) + "\n")
+    out.write("```\n\n")
+
+def _generate_warnings(pack: LlmPack) -> list[str]:
+    """Return human-readable quality warnings for ``pack``.
+
+    Two conditions are reported, in this order: items in excerpt mode that
+    carry no excerpts, and items whose role is ``"unknown"``. A condition that
+    matches no item produces no message.
+    """
+    excerptless = [
+        entry for entry in pack.items
+        if entry.content_mode == "excerpt" and not entry.excerpts
+    ]
+    roleless = [entry for entry in pack.items if entry.role == "unknown"]
+
+    messages = []
+    if excerptless:
+        messages.append(
+            f"{len(excerptless)} files are marked excerpt but have no excerpts (falling back to outline)."
+        )
+    if roleless:
+        messages.append(f"{len(roleless)} files have role=unknown.")
+    return messages
+
+def _render_tree(items: list[PackItem], out: TextIO) -> None:
+    """Write a directory tree of ``items`` to ``out``.
+
+    The tree starts with a ``.`` line. Entries at each level are sorted by
+    name. A path that matches a pack item is printed with a badge holding the
+    item's id, role, content mode and score, placed after a 50-column name
+    field; any other path is treated as a directory, printed with a trailing
+    ``/`` and descended into.
+    """
+    tree = {}
+    item_map = {item.file.relative.as_posix(): item for item in items}
+
+    # Nested dict keyed by path component; files end up as empty dicts.
+    for item in items:
+        curr = tree
+        for part in item.file.relative.parts:
+            curr = curr.setdefault(part, {})
+
+    def print_node(path_parts, current_dict, prefix=""):
+        keys = sorted(current_dict.keys())
+        for i, k in enumerate(keys):
+            is_last = i == len(keys) - 1
+            # Box-drawing glyphs restored from mis-decoded text: the last child
+            # uses an elbow and leaves a blank rail below it, other children
+            # use a tee and continue the vertical rail. Both pieces are four
+            # columns wide so nested levels line up.
+            child_prefix = prefix + ("    " if is_last else "│   ")
+            connector = "└── " if is_last else "├── "
+
+            full_path = "/".join(path_parts + (k,))
+            item = item_map.get(full_path)
+
+            if item:
+                badge = f"[{item.item_id} {item.role} {item.content_mode} score={item.score}]"
+                name_str = f"{prefix}{connector}{k}"
+                out.write(f"{name_str:<50} {badge}\n")
+            else:
+                out.write(f"{prefix}{connector}{k}/\n")
+                print_node(path_parts + (k,), current_dict[k], child_prefix)
+
+    out.write(".\n")
+    print_node((), tree, "")
+
+def _render_graph(pack: LlmPack, out: TextIO) -> None:
+    """Write one line per distinct relation between two files in the pack.
+
+    Edges whose source or target is not a pack item are ignored. The rest are
+    grouped by (source, target, kind); each group is printed with its edge
+    count, the sorted set of analyzers that produced it, and the highest
+    confidence in the group. Groups are ordered by source path, then target
+    path.
+    """
+    ids_by_path = {entry.file.relative: entry.item_id for entry in pack.items}
+
+    # Bucket the edges; dict insertion order keeps ties in first-seen order.
+    buckets = {}
+    for edge in pack.graph.edges:
+        if edge.source not in ids_by_path or edge.target not in ids_by_path:
+            continue
+        buckets.setdefault((edge.source, edge.target, edge.kind), []).append(edge)
+
+    def by_endpoints(entry):
+        (src, dst, _kind), _members = entry
+        return (src.as_posix(), dst.as_posix())
+
+    for (source, target, kind), members in sorted(buckets.items(), key=by_endpoints):
+        best_conf = max(member.confidence for member in members)
+        analyzer_str = ",".join(sorted({member.analyzer for member in members}))
+        out.write(
+            f"{ids_by_path[source]} -> {ids_by_path[target]} [{kind}] x{len(members)} "
+            f"(analyzers=[{analyzer_str}], conf={best_conf:.2f})\n"
+        )
diff --git a/src/scriber/scanner/files.py b/src/scriber/scanner/files.py
index f203dde..d4e8c20 100644
--- a/src/scriber/scanner/files.py
+++ b/src/scriber/scanner/files.py
@@ -42,7 +42,11 @@ def is_probably_binary(path: Path) -> bool:
try:
return require_native().is_probably_binary(str(path))
except Exception:
- return True
+ try:
+ chunk = path.read_bytes()[:4096]
+ return b"\0" in chunk
+ except OSError:
+ return True
def language_for(path: Path) -> str:
diff --git a/src/scriber/scanner/scan.py b/src/scriber/scanner/scan.py
index e2fa8a4..922c44a 100644
--- a/src/scriber/scanner/scan.py
+++ b/src/scriber/scanner/scan.py
@@ -6,8 +6,16 @@
def scan_project(root: Path, config: ScriberConfig) -> dict[Path, FileNode]:
- files, _ = scan_project_with_native(root, config)
- return files
+ try:
+ from scriber.native import is_native_available
+ if is_native_available():
+ files, _ = scan_project_with_native(root, config)
+ return files
+ except Exception:
+ pass
+
+ from scriber.scanner.scan_py import scan_project as scan_project_py
+ return scan_project_py(root, config)
def scan_project_with_native(root: Path, config: ScriberConfig) -> tuple[dict[Path, FileNode], list]:
diff --git a/tests/test_cache.py b/tests/test_cache.py
index 5c141bf..94dc68e 100644
--- a/tests/test_cache.py
+++ b/tests/test_cache.py
@@ -35,7 +35,7 @@ def test_cache_functionality(tmp_path: Path) -> None:
# Check that cache files were created
assert (tmp_path / ".scriber/cache/files.json").exists()
- assert (tmp_path / ".scriber/cache/import_graph.json").exists()
+ assert (tmp_path / ".scriber/cache/imports_v2.json").exists()
# Reload cache and check if retrieved properly
new_cache = ScriberCache(config, tmp_path)
diff --git a/tests/test_native.py b/tests/test_native.py
index 643d795..77c3341 100644
--- a/tests/test_native.py
+++ b/tests/test_native.py
@@ -193,7 +193,7 @@ def test_native_graph_matches_python_graph_mixed_project(tmp_path: Path) -> None
config.support_content.default,
config.support
)
- edges = native.build_import_graph(
+ edges = native.build_relation_graph(
str(tmp_path),
native_files,
config.python.source_roots,
@@ -202,7 +202,8 @@ def test_native_graph_matches_python_graph_mixed_project(tmp_path: Path) -> None
rs_imports = {}
for edge in edges:
- rs_imports.setdefault(Path(getattr(edge, "from")), set()).add(Path(edge.to))
+ if edge.kind == "import" or edge.kind == "mod" or edge.kind == "use" or edge.kind == "include":
+ rs_imports.setdefault(Path(getattr(edge, "source")), set()).add(Path(edge.target))
for path, targets in py_graph.imports.items():
file = python_files[path]
@@ -242,7 +243,7 @@ def test_native_scoring_matches_python_for_focused_pack(tmp_path: Path) -> None:
config.support_content.default,
config.support
)
- edges = native.build_import_graph(
+ edges = native.build_relation_graph(
str(tmp_path),
native_files,
config.python.source_roots,
@@ -397,16 +398,17 @@ def test_native_import_complex_python(tmp_path: Path) -> None:
config.support_content.default,
config.support
)
- edges = native.build_import_graph(
+ edges = native.build_relation_graph(
str(tmp_path),
native_files,
config.python.source_roots,
config.python.module_init_files
)
- imports = {Path(getattr(edge, "from")): set() for edge in edges}
+ imports = {Path(getattr(edge, "source")): set() for edge in edges}
for edge in edges:
- imports[Path(getattr(edge, "from"))].add(Path(edge.to))
+ if edge.kind == "import":
+ imports[Path(getattr(edge, "source"))].add(Path(edge.target))
main_path = Path("src/main.py")
assert main_path in imports
diff --git a/tests/test_symbols.py b/tests/test_symbols.py
new file mode 100644
index 0000000..fd4a5e1
--- /dev/null
+++ b/tests/test_symbols.py
@@ -0,0 +1,43 @@
+from pathlib import Path
+from scriber.core.symbols import SymbolIndex, SymbolNode
+from scriber.graph.languages.extractor import extract_python_symbols
+
+# Checks that extract_python_symbols records one class, its two methods (one of
+# them async) and one module-level function, with the expected kind and parent.
+# NOTE(review): the fixture source below appears with flattened indentation in
+# this copy of the patch; the method bodies were presumably indented inside the
+# class - confirm against the repository.
+def test_extract_python_symbols() -> None:
+ code = """
+class MyClass:
+ def __init__(self):
+ pass
+
+ async def my_method(self):
+ pass
+
+def global_function():
+ pass
+"""
+ index = SymbolIndex()
+ file_path = Path("src/dummy.py")
+
+ # The extractor fills `index` in place; results are read back per file path.
+ extract_python_symbols(file_path, code, index)
+
+ # Four symbols expected: MyClass, __init__, my_method, global_function.
+ symbols = index.get_symbols(file_path)
+ assert len(symbols) == 4
+
+ # Check Class
+ class_sym = next(s for s in symbols if s.name == "MyClass")
+ assert class_sym.kind == "class"
+ assert class_sym.parent_name is None
+
+ # Check Constructor
+ init_sym = next(s for s in symbols if s.name == "__init__")
+ assert init_sym.kind == "function"
+ assert init_sym.parent_name == "MyClass"
+
+ # Check Async Method: it is reported with the same kind as a regular function.
+ method_sym = next(s for s in symbols if s.name == "my_method")
+ assert method_sym.kind == "function"
+ assert method_sym.parent_name == "MyClass"
+
+ # Check Global Function
+ func_sym = next(s for s in symbols if s.name == "global_function")
+ assert func_sym.kind == "function"
+ assert func_sym.parent_name is None
From 7ed7a9bd1b78f3cac69c6743103ae1c3fe8f2348 Mon Sep 17 00:00:00 2001
From: SunneV
Date: Sun, 31 May 2026 12:45:56 +0200
Subject: [PATCH 2/6] cleanup
---
rust/scriber_native/src/score.rs | 2 --
1 file changed, 2 deletions(-)
diff --git a/rust/scriber_native/src/score.rs b/rust/scriber_native/src/score.rs
index f7d5ef4..ff71eef 100644
--- a/rust/scriber_native/src/score.rs
+++ b/rust/scriber_native/src/score.rs
@@ -460,8 +460,6 @@ fn walk_weighted_neighbors(
max_strength
}
-
-
fn support_base_score(file: &NativeFileInfo, options: &NativePackOptions) -> i32 {
let cat = file.support_category.as_deref().unwrap_or("support file");
match cat {
From ee9e4cad6c228fb8a5ebd1834f83891fd8545603 Mon Sep 17 00:00:00 2001
From: SunneV
Date: Sun, 31 May 2026 13:03:33 +0200
Subject: [PATCH 3/6] more improvements
---
.github/workflows/ci.yml | 53 +++-
.github/workflows/release.yml | 2 +-
.gitignore | 57 ++++
.pre-commit-config.yaml | 30 ++
CHANGELOG.md | 27 +-
Cargo.toml | 3 +-
README.md | 96 ++++--
assets/scriber_name.svg | 2 +-
pyproject.toml | 1 +
rust/scriber_native/src/import.rs | 14 +-
rust/scriber_native/src/score.rs | 38 ++-
scripts/bench_scan.py | 35 +++
scripts/sync_readme.py | 143 +++++++++
src/scriber/__init__.py | 1 -
src/scriber/budget/allocator.py | 37 ++-
src/scriber/cache.py | 73 +++--
src/scriber/cli/main.py | 341 ++++++++++++++++-----
src/scriber/core/config.py | 202 +++++++++---
src/scriber/core/init_config.py | 24 +-
src/scriber/core/models.py | 25 +-
src/scriber/core/profiles.py | 46 +++
src/scriber/core/root.py | 8 +-
src/scriber/core/symbols.py | 1 +
src/scriber/engine/ranker.py | 60 ++--
src/scriber/engine/roles.py | 31 +-
src/scriber/engine/scorer.py | 331 ++++++++++++++++----
src/scriber/graph/analyzers/__init__.py | 13 +-
src/scriber/graph/analyzers/base.py | 5 +-
src/scriber/graph/analyzers/config_refs.py | 46 ++-
src/scriber/graph/analyzers/docs.py | 43 ++-
src/scriber/graph/analyzers/env.py | 48 ++-
src/scriber/graph/analyzers/package.py | 38 ++-
src/scriber/graph/analyzers/tests.py | 58 +++-
src/scriber/graph/builder.py | 89 ++++--
src/scriber/graph/indexes.py | 14 +-
src/scriber/graph/languages/cpp.py | 8 +-
src/scriber/graph/languages/extractor.py | 23 +-
src/scriber/graph/languages/go.py | 15 +-
src/scriber/graph/languages/javascript.py | 29 +-
src/scriber/graph/languages/python.py | 28 +-
src/scriber/graph/languages/rust.py | 19 +-
src/scriber/graph/model.py | 3 +
src/scriber/native.py | 5 +-
src/scriber/outline/__init__.py | 1 +
src/scriber/outline/base.py | 4 +-
src/scriber/outline/generic.py | 7 +-
src/scriber/outline/python.py | 5 +-
src/scriber/packer/pack.py | 265 +++++++++++-----
src/scriber/renderer/llm_report.py | 99 +++---
src/scriber/rendering/renderer.py | 82 +++--
src/scriber/scanner/files.py | 71 ++++-
src/scriber/scanner/scan.py | 38 ++-
src/scriber/scanner/scan_py.py | 48 ++-
tests/test_cache.py | 45 ++-
tests/test_config_schema.py | 19 +-
tests/test_fixes.py | 166 ++++++++++
tests/test_init_config.py | 25 +-
tests/test_languages.py | 80 ++---
tests/test_native.py | 128 ++++----
tests/test_scriber.py | 41 ++-
tests/test_symbols.py | 15 +-
tests/test_tokens.py | 7 +-
62 files changed, 2520 insertions(+), 791 deletions(-)
create mode 100644 .gitignore
create mode 100644 .pre-commit-config.yaml
create mode 100644 scripts/bench_scan.py
create mode 100644 scripts/sync_readme.py
create mode 100644 src/scriber/core/profiles.py
create mode 100644 tests/test_fixes.py
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d0ffdb8..98d3576 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -6,8 +6,53 @@ on:
pull_request:
jobs:
+ lint:
+ name: Lint & Format
+ runs-on: ubuntu-latest
+ permissions:
+ contents: write # Needed for auto-commit
+
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ ref: ${{ github.head_ref }}
+
+ - name: Set up Rust
+ uses: dtolnay/rust-toolchain@stable
+
+ - name: Install uv
+ uses: astral-sh/setup-uv@v5
+ with:
+ cache-dependency-glob: "pyproject.toml"
+
+ - name: Set up Python
+ run: uv python install 3.12
+
+ - name: Sync dependencies
+ run: uv sync --all-extras
+
+ - name: Run pre-commit hooks
+ id: precommit
+ run: uv run pre-commit run --all-files
+
+ - name: Check README sync
+ run: uv run python scripts/sync_readme.py --check
+
+ - name: Auto-commit formatting fixes
+ if: failure()
+ uses: stefanzweifel/git-auto-commit-action@v5
+ with:
+ commit_message: "style: auto-fix formatting and linting issues"
+ commit_options: "--no-verify"
+ # Fail the job even if we commit so the user knows they need to pull
+
+ - name: Fail if pre-commit failed
+ if: steps.precommit.outcome == 'failure'
+ run: exit 1
+
test:
name: ${{ matrix.os }} / py${{ matrix.python-version }}
+ needs: lint
runs-on: ${{ matrix.os }}
strategy:
@@ -41,12 +86,6 @@ jobs:
- name: Check native import
run: uv run python -c "import scriber._native; print('native ok')"
- - name: Rust format check
- run: cargo fmt --check
-
- - name: Rust clippy
- run: cargo clippy --all-targets -- -D warnings
-
- name: Rust tests
run: cargo test
@@ -54,4 +93,4 @@ jobs:
run: uv run pytest
- name: CLI smoke
- run: uv run scriber . --only-tree --output -
\ No newline at end of file
+ run: uv run scriber . --only-tree --output -
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index da08b96..fa5abc8 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -82,4 +82,4 @@ jobs:
- name: Publish
uses: pypa/gh-action-pypi-publish@release/v1
with:
- packages-dir: dist-artifacts
\ No newline at end of file
+ packages-dir: dist-artifacts
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3ec7f23
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,57 @@
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Rust
+target/
+**/*.rs.bk
+
+# Caches and tooling
+.pytest_cache/
+.ruff_cache/
+.mypy_cache/
+.coverage
+htmlcov/
+.tox/
+.nox/
+
+# Scriber specific
+.scriber/
+scriber_pack.md
+*.scriber_pack.md
+context.md
+
+# IDEs and Editors
+.idea/
+.vscode/
+*.swp
+*.swo
+*~
+.DS_Store
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..12a0146
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,30 @@
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.5.0
+ hooks:
+ - id: trailing-whitespace
+ - id: end-of-file-fixer
+ - id: check-yaml
+ - id: check-toml
+
+- repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.3.5
+ hooks:
+ - id: ruff
+ args: [ --fix ]
+ - id: ruff-format
+
+- repo: local
+ hooks:
+ - id: cargo-fmt
+ name: cargo fmt
+ entry: cargo fmt --manifest-path Cargo.toml --
+ language: system
+ types: [rust]
+ pass_filenames: true
+ - id: cargo-clippy
+ name: cargo clippy
+ entry: cargo clippy --manifest-path Cargo.toml -- -D warnings
+ language: system
+ types: [rust]
+ pass_filenames: false
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e5e4f20..17c3b71 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,22 +8,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [2.1.0] - 2026-05-31
### Added
-- **π― AI-Native Navigation (P1)**: Implemented line-numbered code fences and symbol-level XML anchors (``) for classes and functions in full mode, allowing AI to navigate and apply Search & Replace diffs flawlessly.
-- **π Ultra-Focused Mode Optimization**: Focused mode (`scriber `) now acts as a precise surgical tool, cutting out unnecessary contextual noise.
-- **π‘οΈ Support Files Pruning**: Support files (`pyproject.toml`, `README.md`, Dockerfiles) are no longer granted automatic `full` mode immunity when running focused scans. They now decay to tree mode unless explicitly targeted.
-- **π§ͺ Test File Quarantine**: Test modules are heavily penalized in focused mode, dropping out of full/excerpt context to keep the generated pack laser-focused on actual implementation logic.
+- **Frontend Graph Tracking**: Added dependency parsing support for modern frontend frameworks (`.vue`, `.svelte`, `.astro`), HTML templates, and CSS stylesheets within JS/TS graph construction.
+- **Packaging Profiles (`--profile`)**: Added `default`, `audit`, `debug`, `refactor`, and `docs` profiles to quickly bias the file scoring and inclusion criteria without manually tweaking config options.
+- **CLI Introspection**: Added `--explain` as an alias for `--explain-selection`. Enhanced `--why` output to show the estimated token cost, content mode, and omission reasons for any target file.
+- **Automated README Sync**: Added `scripts/sync_readme.py` tool to automatically sync CLI arguments, profiles documentation, and version tags across the `README.md`.
+- **AI-Native Navigation & Optimization**: Implemented XML anchors for symbols, aggressive test file quarantine, and support file pruning to keep focused mode clean and strictly token-capped.
+- **Version Alignment**: Synchronized Python and Rust crate versions. `scriber --version` now reports both Python and native API versions.
### Fixed
-- **π Excerpt Fallback Bug**: Fixed a critical bug where `excerpt` files failed to render and completely dropped their token estimates, resulting in `_Excerpt unavailable_` placeholders. They now correctly fall back to outline AST structures and compute tokens accurately.
-- **βοΈ Graph Token Hard-Capping**: Re-engineered token budgeting with rigid distance-based hard caps in `ranker.py` (Max scores: 100/79/74/44 for Dist 0/1/2/3+ respectively). Focused mode is now reliably ~45% of the full project token size, completely eliminating distant `full` mode leaks.
+- **Cache Stability**: Fixed graph warm-cache edge generation and stale import cache validation (now strictly validating `mtime` and `size`).
+- **Resilience & Scanners**: Added pure-Python fallback for `read_text_lossy`, optimized scanner ordering (whitelist before binary check), and corrected the test role classifier to prevent false positives on files naturally named `tests.py`.
+- **Excerpt Fallback Bug**: Fixed rendering and token estimations for empty excerpt files; they now correctly fall back to outline AST structures or full content if budget allows.
## [2.0.0] - 2026-05-30
### Added
-- **β‘ Native Rust Acceleration (`scriber._native`)**: Full transition of filesystem scanning, high-performance file reading/writing, and binary classification to a compiled Rust extension built using Maturin and PyO3.
-- **π³ Fast Parallel Scanner**: Re-engineered directory scanning utilizing the `WalkBuilder` from the `ignore` crate, fully respecting `.gitignore` rules with blazing fast native execution.
-- **π§ͺ Rigorous Verification & Equivalence Testing**: Comprehensive suite of regression and equivalence tests validating 100% exact matching behavior between Rust and Python scanner modules.
-- **π¦ Multi-Platform Binary Wheels**: CI/CD integration using `PyO3/maturin-action` to compile and distribute native wheels across Linux, macOS, and Windows.
+- **Native Rust Acceleration (`scriber._native`)**: Full transition of filesystem scanning, high-performance file reading/writing, and binary classification to a compiled Rust extension built using Maturin and PyO3.
+- **Fast Parallel Scanner**: Re-engineered directory scanning utilizing the `WalkBuilder` from the `ignore` crate, fully respecting `.gitignore` rules with blazing fast native execution.
+- **Rigorous Verification & Equivalence Testing**: Comprehensive suite of regression and equivalence tests validating 100% exact matching behavior between Rust and Python scanner modules.
+- **Multi-Platform Binary Wheels**: CI/CD integration using `PyO3/maturin-action` to compile and distribute native wheels across Linux, macOS, and Windows.
## [1.1.2] - 2025-09-30
@@ -65,7 +68,7 @@ The CLI now falls back to simple text-based output if `rich` is not installed.
### Added
- Configured a GitHub Actions pipeline for automated testing and releases.
-- `-v` and `--version` to scriber app
+- `-v` and `--version` to scriber app
- The `--config` flag now accepts a path to a `pyproject.toml` file, providing more flexibility for monorepo configurations.
### Fixed
@@ -80,4 +83,4 @@ The CLI now falls back to simple text-based output if `rich` is not installed.
- **Clipboard Integration**: Enabled copying the generated project structure to the clipboard.
- **Command-Line Interface**: Created a command-line tool with a configurable `init` command for saving settings to `pyproject.toml`.
- **Configuration**: Introduced `pyproject.toml` as the single source of truth for project metadata and configuration.
-- **Testing**: Added a test suite using `pytest` to ensure core functionality and CLI commands work as expected.
\ No newline at end of file
+- **Testing**: Added a test suite using `pytest` to ensure core functionality and CLI commands work as expected.
diff --git a/Cargo.toml b/Cargo.toml
index 91e0426..a57d2f4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "project-scriber-native"
-version = "2.0.0"
+version = "2.1.0"
edition = "2021"
[lib]
@@ -16,4 +16,3 @@ serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
memchr = "2.7"
regex = "1.10"
-
diff --git a/README.md b/README.md
index 7dc7d1a..e711a7b 100644
--- a/README.md
+++ b/README.md
@@ -9,13 +9,13 @@
-An intelligent tool to map, analyze, and compile project source code into a single, context-optimized text file for Large Language Models (LLMs). **Version 2.0** brings advanced dependency graph analysis, strict whitelist-based file inclusion, zero-dependency lightweight execution, and progress tracking!
+An intelligent tool to map, analyze, and compile project source code into a single, context-optimized text file for Large Language Models (LLMs). **Version 2** brings advanced dependency graph analysis, strict whitelist-based file inclusion, zero-dependency lightweight execution, and progress tracking!
-----
## π Table of Contents
-- [π€ Why ProjectScriber 2.0?](#-why-projectscriber-20)
+- [🤔 Why ProjectScriber?](#-why-projectscriber)
- [β¨ Key Features](#-key-features)
- [π Quick Start](#-quick-start)
- [πΎ Installation](#-installation)
@@ -25,14 +25,14 @@ An intelligent tool to map, analyze, and compile project source code into a sing
-----
-## π€ Why ProjectScriber 2.0?
+## 🤔 Why ProjectScriber?
When working with Large Language Models, providing the full context of a codebase is crucial for getting accurate analysis, documentation, or refactoring suggestions. However, blindly pasting an entire project wastes tokens and introduces noise.
-**ProjectScriber 2.0** automates context building using a **Whitelist-First** philosophy and an **Intelligent Scoring Engine**. It analyzes your codebase's dependency graph (e.g., Python imports), determines which files are most relevant to the code you're working on, and bundles them into a single, clean markdown file, strictly respecting your token budgets and file-type configurations.
+**ProjectScriber** automates context building using a **Whitelist-First** philosophy and an **Intelligent Scoring Engine**. It analyzes your codebase's dependency graph (e.g., Python imports), determines which files are most relevant to the code you're working on, and bundles them into a single, clean markdown file, strictly respecting your token budgets and file-type configurations.
- π Your Codebase β π¦ ProjectScriber 2.0 β π LLM-Ready Context
+ 📁 Your Codebase → 📦 ProjectScriber → 📄 LLM-Ready Context
-----
@@ -123,32 +123,78 @@ uv pip install project-scriber
### CLI Options
+
| Option | Description |
|:---|:---|
-| `paths` | Project file/folder paths used as seeds. Defaults to current directory `.`. |
-| `--config [path]` | Path to `pyproject.toml`. Its parent directory becomes the project root. |
-| `--path-base [base]`| Base for relative paths: `project` (default) or `cwd`. |
-| `--format [md, txt]` | Output format. Defaults to `md` (Markdown). |
-| `--output [file]` | Output file path. Use `-` for stdout. |
-| `--dry-run` | Show pack summary without writing the output file. |
-| `--open` | Open the generated file in the default editor. |
-| `--validate-config`| Validate the `[tool.scriber]` configuration and exit. |
-| `--only-tree` | Render only the scored tree/map, without any file contents. |
-| `--[no-]modules` | Enable/Disable automatic related module selection (dependency graph scanning). |
-| `--[no-]support` | Enable/Disable support files (like `.env.example`, `.github/workflows`). |
-| `--support-content` | Override support file content policy (`full`, `auto`, `tree_only`). |
+| `paths` | Project file/folder paths used as seeds. Defaults to current directory. |
+| `--profile` | Preset configuration profile. |
+| `--config` | Path to pyproject.toml. Its parent directory becomes the project root. |
+| `--path-base` | Base directory for relative paths when --config is used. |
+| `--format` | Output format. |
+| `--output` | Output file path, relative to project root unless absolute. Use '-' for stdout. |
+| `--only-tree` | Render only scored tree/map, without file contents. |
+| `--modules` | Enable automatic related module selection. |
+| `--no-modules` | Disable automatic related module selection. |
+| `--support` | Enable support files. |
+| `--no-support` | Disable support files. |
+| `--support-content` | Override default support file content policy. |
| `--max-files` | Maximum number of files in the pack. |
-| `--max-tokens` | Approximate token budget using char-based estimation. `0` disables budget. |
-| `--min-score` | Minimum relevance score (0-100) for non-seed files to be included. |
-| `--init` | Append a default `[tool.scriber]` config to `pyproject.toml` and exit. |
-| `--force` | Force overwrite of the config block when used with `--init`. |
-| `--version` | Show program's version number and exit. |
+| `--max-tokens` | Approximate token budget for included file contents. 0 disables budget. |
+| `--min-score` | Minimum score for non-seed files. |
+| `--init` | Append a default [tool.scriber] config to pyproject.toml and exit. |
+| `--force` | Allow --init to append even if [tool.scriber] already exists. |
+| `--project` | Force project snapshot mode. |
+| `--explain, --explain-selection` | Explain reason for file selection in detail. |
+| `--explain-graph` | Print relation graph statistics and relations. |
+| `--why` | Print exactly which rules/edges pulled the specified file into the pack. |
+| `--graph-json` | Export the RelationGraph as a JSON file to the specified path. |
+| `--validate-config` | Validate pyproject.toml scriber config. |
+| `--dry-run` | Perform a dry run without saving the pack file. |
+| `--open` | Open the output file automatically after creation. |
+| `--timings` | Show execution timings for each phase. |
+| `--version` | Show version information and exit. |
+
+
+
+### Profiles
+
+ProjectScriber comes with several preset profiles to quickly bias the file scoring and inclusion criteria:
+
+| Profile | Description |
+|:---|:---|
+| `default` | Standard scoring behavior. |
+| `audit` | Boosts tests, config files, CI environments, and dependency files. Assumes full support content inclusion. |
+| `debug` | Boosts direct/reverse dependencies, tests, runtime support, and files close to the seed path. |
+| `refactor` | Boosts files within the same package, related tests, and direct dependencies. |
+| `docs` | Heavily boosts documentation files while suppressing test and code file scores. Assumes tree_only support content by default. |
+
+
+-----
+
+## 🛠️ IDE Integrations
+
+### PyCharm / IntelliJ IDEA (External Tools)
+
+You can integrate ProjectScriber directly into PyCharm's right-click context menu to quickly generate LLM context packs for any selected file or folder!
+
+1. Open **Settings / Preferences** → **Tools** → **External Tools**.
+2. Click the **`+`** button to add a new tool.
+3. Configure it as follows:
+
+* **Name:** `Scriber`
+* **Group:** `External Tools`
+* **Description:** `Runs ProjectScriber on the selected directory and copies output to clipboard`
+* **Program:** `scriber` *(or the absolute path to your `scriber.exe` e.g., `C:\Tools\Python\Python313\Scripts\scriber.exe`)*
+* **Arguments:** `"$FilePath$" --config "$ProjectFileDir$/pyproject.toml"`
+* **Working directory:** `$ProjectFileDir$`
+
+Now, you can simply right-click any file or directory in your Project tree, select **External Tools** → **Scriber**, and the context pack will be generated instantly based on your project configuration!
-----
## βοΈ Configuration
-ProjectScriber 2.0 configures itself through the standard `pyproject.toml` using the `[tool.scriber]` table.
+ProjectScriber 2.1.0 configures itself through the standard `pyproject.toml` using the `[tool.scriber]` table.
Generate the default block using:
```shell
@@ -217,7 +263,7 @@ patterns = [
```
### Whitelist Policy
-ProjectScriber 2.0 uses a strict **whitelist** approach:
+ProjectScriber 2.1.0 uses a strict **whitelist** approach:
1. Files must match either a `code_pattern` or a `support_pattern` to be considered.
2. Unrecognized extensions and binary files are automatically excluded, keeping your LLM context safe from binary garbage.
3. Lock files are included in the tree by default, but their contents are omitted to save tokens.
@@ -246,4 +292,4 @@ Contributions are welcome!
3. **Run Tests**:
```shell
uv run pytest
- ```
\ No newline at end of file
+ ```
diff --git a/assets/scriber_name.svg b/assets/scriber_name.svg
index 2c84186..b7cbe31 100644
--- a/assets/scriber_name.svg
+++ b/assets/scriber_name.svg
@@ -2,4 +2,4 @@
ProjectScriber
-
\ No newline at end of file
+
diff --git a/pyproject.toml b/pyproject.toml
index 010424e..837c8d1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,7 @@ dependencies = [
dev = [
"pytest>=8",
"maturin>=1.7,<2",
+ "pre-commit",
]
[project.scripts]
diff --git a/rust/scriber_native/src/import.rs b/rust/scriber_native/src/import.rs
index e2c2327..66b751c 100644
--- a/rust/scriber_native/src/import.rs
+++ b/rust/scriber_native/src/import.rs
@@ -355,7 +355,9 @@ pub fn build_import_graph(
let base_normalized = normalize_posix_path(&raw_base);
let mut resolved = false;
- let extensions = vec!["", ".ts", ".tsx", ".js", ".jsx", ".d.ts"];
+ let extensions = vec![
+ "", ".ts", ".tsx", ".js", ".jsx", ".d.ts", ".vue", ".svelte", ".astro", ".json",
+ ];
for ext in extensions {
let cand = if ext.is_empty() {
base_normalized.clone()
@@ -376,7 +378,15 @@ pub fn build_import_graph(
}
if !resolved {
- let index_names = vec!["index.ts", "index.tsx", "index.js", "index.jsx"];
+ let index_names = vec![
+ "index.ts",
+ "index.tsx",
+ "index.js",
+ "index.jsx",
+ "index.vue",
+ "index.svelte",
+ "index.astro",
+ ];
for idx in index_names {
let cand = format!("{}/{}", base_normalized, idx);
if let Some(target) = absolute_to_file.get(&cand) {
diff --git a/rust/scriber_native/src/score.rs b/rust/scriber_native/src/score.rs
index ff71eef..551c032 100644
--- a/rust/scriber_native/src/score.rs
+++ b/rust/scriber_native/src/score.rs
@@ -64,6 +64,14 @@ pub struct NativePackOptions {
pub documentation_score: i32,
#[pyo3(get, set)]
pub shared_dependency_bonus: i32,
+ #[pyo3(get, set)]
+ pub entrypoint_file_score: i32,
+ #[pyo3(get, set)]
+ pub code_file_score: i32,
+ #[pyo3(get, set)]
+ pub test_file_score: i32,
+ #[pyo3(get, set)]
+ pub other_file_score: i32,
// Module flags
#[pyo3(get, set)]
@@ -116,6 +124,10 @@ impl NativePackOptions {
runtime_support_score = 50,
documentation_score = 45,
shared_dependency_bonus = 10,
+ entrypoint_file_score = 90,
+ code_file_score = 80,
+ test_file_score = 60,
+ other_file_score = 40,
modules_enabled = true,
include_direct_dependencies = true,
include_reverse_dependencies = true,
@@ -148,6 +160,10 @@ impl NativePackOptions {
runtime_support_score: i32,
documentation_score: i32,
shared_dependency_bonus: i32,
+ entrypoint_file_score: i32,
+ code_file_score: i32,
+ test_file_score: i32,
+ other_file_score: i32,
modules_enabled: bool,
include_direct_dependencies: bool,
include_reverse_dependencies: bool,
@@ -179,6 +195,10 @@ impl NativePackOptions {
runtime_support_score,
documentation_score,
shared_dependency_bonus,
+ entrypoint_file_score,
+ code_file_score,
+ test_file_score,
+ other_file_score,
modules_enabled,
include_direct_dependencies,
include_reverse_dependencies,
@@ -311,9 +331,15 @@ fn is_test_file(rel: &str, test_roots: &[String]) -> bool {
.unwrap_or(std::ffi::OsStr::new(""))
.to_string_lossy()
.to_lowercase();
- for part in p.components().filter_map(|c| c.as_os_str().to_str()) {
- if test_roots.contains(&part.to_string()) {
- return true;
+ let components: Vec<_> = p
+ .components()
+ .filter_map(|c| c.as_os_str().to_str())
+ .collect();
+ if components.len() > 1 {
+ for part in &components[0..components.len() - 1] {
+ if test_roots.contains(&part.to_string()) {
+ return true;
+ }
}
}
name.starts_with("test_") || name.ends_with("_test.py") || name.ends_with(".test.py")
@@ -543,13 +569,13 @@ pub fn score_candidates_native(
for (rel, c) in &mut mapped_files {
if c.info.kind == "code" {
if matches_entrypoint(rel, &options.entrypoint_patterns) {
- c.score = 90;
+ c.score = options.entrypoint_file_score;
add_reason(c, "entrypoint", "entrypoint file", None);
} else if is_test_file(rel, &options.test_roots) {
- c.score = 60;
+ c.score = options.test_file_score;
add_reason(c, "test_file", "test file", None);
} else {
- c.score = 80;
+ c.score = options.code_file_score;
add_reason(c, "code_file", "code file", None);
}
} else if c.info.kind == "support" && options.support_enabled {
diff --git a/scripts/bench_scan.py b/scripts/bench_scan.py
new file mode 100644
index 0000000..25226c3
--- /dev/null
+++ b/scripts/bench_scan.py
@@ -0,0 +1,35 @@
+from __future__ import annotations
+
+import statistics
+import time
+from pathlib import Path
+
+from scriber.core.config import load_config
+from scriber.scanner.scan import scan_project as scan_rust
+from scriber.scanner.scan_py import scan_project as scan_python
+
+
+def bench(name, fn, rounds=10):
+ times = []
+ for _ in range(rounds):
+ start = time.perf_counter()
+ result = fn()
+ times.append(time.perf_counter() - start)
+
+ print(f"{name}:")
+ print(f" files: {len(result)}")
+ print(f" min: {min(times):.4f}s")
+ print(f" avg: {statistics.mean(times):.4f}s")
+ print(f" p95: {sorted(times)[int(len(times) * 0.95) - 1]:.4f}s")
+
+
+def main():
+ root = Path.cwd()
+ config = load_config(root / "pyproject.toml")
+
+ bench("python scan", lambda: scan_python(root, config))
+ bench("rust scan", lambda: scan_rust(root, config))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/sync_readme.py b/scripts/sync_readme.py
new file mode 100644
index 0000000..152b4f9
--- /dev/null
+++ b/scripts/sync_readme.py
@@ -0,0 +1,143 @@
+import argparse
+import sys
+import re
+from pathlib import Path
+
+# Must be run from project root, or specify path
+try:
+ import tomli
+except ImportError:
+ import tomllib as tomli
+
+
+def get_version(root: Path) -> str:
+ with open(root / "pyproject.toml", "rb") as f:
+ data = tomli.load(f)
+ return data["project"]["version"]
+
+
+def generate_cli_options() -> str:
+ # We must import scriber to build the parser
+ # Assume we run it inside the environment
+ from scriber.cli.main import build_parser
+
+ parser = build_parser()
+
+ lines = ["| Option | Description |", "|:---|:---|"]
+ for action in parser._actions:
+ if action.dest == "help":
+ continue
+
+ flags = ", ".join(action.option_strings)
+ if not flags:
+ flags = action.dest
+
+ help_text = action.help or ""
+ lines.append(f"| `{flags}` | {help_text} |")
+
+ return "\n".join(lines)
+
+
+def generate_profiles() -> str:
+ from scriber.core.profiles import PROFILE_CHOICES
+
+ lines = [
+ "### Profiles",
+ "",
+ "ProjectScriber comes with several preset profiles to quickly bias the file scoring and inclusion criteria:",
+ "",
+ "| Profile | Description |",
+ "|:---|:---|",
+ ]
+
+ descriptions = {
+ "default": "Standard scoring behavior.",
+ "audit": "Boosts tests, config files, CI environments, and dependency files. Assumes full support content inclusion.",
+ "debug": "Boosts direct/reverse dependencies, tests, runtime support, and files close to the seed path.",
+ "refactor": "Boosts files within the same package, related tests, and direct dependencies.",
+ "docs": "Heavily boosts documentation files while suppressing test and code file scores. Assumes tree_only support content by default.",
+ }
+
+ for p in PROFILE_CHOICES:
+ lines.append(f"| `{p}` | {descriptions.get(p, '')} |")
+
+ return "\n".join(lines)
+
+
+def sync_readme(root: Path, write: bool = False) -> bool:
+ readme_path = root / "README.md"
+ content = readme_path.read_text(encoding="utf-8")
+ original_content = content
+
+ version = get_version(root)
+
+ # 1. Update Version tags
+ version_pattern = re.compile(
+ r".*?", re.DOTALL
+ )
+ content = version_pattern.sub(
+ f"{version}", content
+ )
+
+ # 2. Update CLI Options
+ cli_options = generate_cli_options()
+ cli_pattern = re.compile(
+ r".*?",
+ re.DOTALL,
+ )
+ content = cli_pattern.sub(
+ f"\n{cli_options}\n",
+ content,
+ )
+
+ # 3. Update Profiles
+ profiles = generate_profiles()
+ profiles_pattern = re.compile(
+ r".*?", re.DOTALL
+ )
+ content = profiles_pattern.sub(
+ f"\n{profiles}\n",
+ content,
+ )
+
+ # Also enforce 2.x references
+ content = re.sub(
+ r"\*\*Version 2\.\d+(\.\d+)?\*\*", f"**Version {version}**", content
+ )
+ content = re.sub(
+ r"ProjectScriber 2\.\d+(\.\d+)?", f"ProjectScriber {version}", content
+ )
+ content = re.sub(r"Scriber 2\.\d+(\.\d+)?", f"Scriber {version}", content)
+
+ if content == original_content:
+ print("README.md is up to date.")
+ return True
+
+ if write:
+ readme_path.write_text(content, encoding="utf-8")
+ print("README.md has been updated.")
+ return True
+ else:
+ print(
+ "Error: README.md is stale. Run 'python scripts/sync_readme.py --write' to update."
+ )
+ return False
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--write", action="store_true", help="Write changes to README.md"
+ )
+ parser.add_argument(
+ "--check", action="store_true", help="Check if README.md is up to date"
+ )
+ args = parser.parse_args()
+
+ root = Path(__file__).parent.parent
+ if args.write:
+ sys.exit(0 if sync_readme(root, write=True) else 1)
+ elif args.check:
+ sys.exit(0 if sync_readme(root, write=False) else 1)
+ else:
+ parser.print_help()
diff --git a/src/scriber/__init__.py b/src/scriber/__init__.py
index b32d113..16060c4 100644
--- a/src/scriber/__init__.py
+++ b/src/scriber/__init__.py
@@ -6,4 +6,3 @@
__all__ = ["build_pack", "build_and_write_pack", "ScriberPack"]
__version__ = "2.1.0"
-
diff --git a/src/scriber/budget/allocator.py b/src/scriber/budget/allocator.py
index 80b945b..f501b13 100644
--- a/src/scriber/budget/allocator.py
+++ b/src/scriber/budget/allocator.py
@@ -1,7 +1,7 @@
from __future__ import annotations
from dataclasses import dataclass
-from typing import Any
-from scriber.core.models import Candidate, ContentMode, PackItem, FileRole
+from scriber.core.models import Candidate, ContentMode, PackItem
+
@dataclass(slots=True)
class BudgetPolicy:
@@ -14,29 +14,36 @@ class BudgetPolicy:
outline_budget_ratio: float = 0.20
reserve_ratio: float = 0.05
-def allocate_budget(candidates: list[Candidate], policy: BudgetPolicy, explicit_seeds: set) -> list[PackItem]:
+
+def allocate_budget(
+ candidates: list[Candidate], policy: BudgetPolicy, explicit_seeds: set
+) -> list[PackItem]:
items = []
-
+
current_tokens = 0
full_budget = int(policy.target_tokens * policy.full_code_budget_ratio)
-
+
for i, c in enumerate(candidates):
- item_id = f"F{i+1:03d}"
+ item_id = f"F{i + 1:03d}"
role = getattr(c, "role", "unknown")
-
+
mode: ContentMode = "tree"
-
+
is_seed = c.file.relative in explicit_seeds
-
+
if is_seed:
mode = "full"
elif c.file.content_policy == "tree_only":
mode = "tree"
elif c.file.content_policy == "full" and policy.mode != "focused":
mode = "full"
- elif c.token_estimate <= 1200 and c.score >= 80 and current_tokens < full_budget:
+ elif (
+ c.token_estimate <= 1200 and c.score >= 80 and current_tokens < full_budget
+ ):
mode = "full"
- elif c.score >= 85 and c.token_estimate <= 2400 and current_tokens < full_budget:
+ elif (
+ c.score >= 85 and c.token_estimate <= 2400 and current_tokens < full_budget
+ ):
mode = "full"
elif c.score >= 75:
mode = "excerpt"
@@ -44,10 +51,10 @@ def allocate_budget(candidates: list[Candidate], policy: BudgetPolicy, explicit_
mode = "outline"
else:
mode = "tree"
-
+
if mode == "full":
current_tokens += c.token_estimate
-
+
item = PackItem(
file=c.file,
score=c.score,
@@ -59,8 +66,8 @@ def allocate_budget(candidates: list[Candidate], policy: BudgetPolicy, explicit_
token_estimate=c.token_estimate,
utility=c.utility,
raw_score=c.raw_score,
- item_id=item_id
+ item_id=item_id,
)
items.append(item)
-
+
return items
diff --git a/src/scriber/cache.py b/src/scriber/cache.py
index d899aa8..cb97650 100644
--- a/src/scriber/cache.py
+++ b/src/scriber/cache.py
@@ -1,6 +1,5 @@
from __future__ import annotations
-import os
import sys
import json
import hashlib
@@ -13,6 +12,7 @@
def get_config_hash(config: ScriberConfig) -> str:
from scriber import __version__
+
data = {
"code_patterns": config.code_patterns,
"support_patterns": config.support_patterns,
@@ -35,17 +35,18 @@ def get_config_hash(config: ScriberConfig) -> str:
class ScriberCache:
def __init__(self, config: ScriberConfig, project_root: Path):
self.enabled = config.cache.enabled
- self.cache_dir = project_root / config.cache.dir
+ self.project_root = project_root.resolve()
+ self.cache_dir = self.project_root / config.cache.dir
self.files_cache_path = self.cache_dir / "files.json"
self.imports_cache_path = self.cache_dir / "imports_v2.json"
self.relations_cache_path = self.cache_dir / "relations_v1.jsonl"
self.config_hash = get_config_hash(config)
self.python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
-
+
self.reads = 0
self.hits = 0
self.writes = 0
-
+
self.files_data: dict[str, dict[str, Any]] = {}
self.imports_data: dict[str, dict[str, Any]] = {}
self._load()
@@ -53,7 +54,7 @@ def __init__(self, config: ScriberConfig, project_root: Path):
def _load(self) -> None:
if not self.enabled:
return
-
+
try:
if self.files_cache_path.exists():
with self.files_cache_path.open("r", encoding="utf-8") as f:
@@ -67,23 +68,29 @@ def _load(self) -> None:
self.files_data = {}
self.imports_data = {}
- def get_file(self, rel_path: Path, mtime_ns: int, size: int) -> dict[str, Any] | None:
+ def get_file(
+ self, rel_path: Path, mtime_ns: int, size: int
+ ) -> dict[str, Any] | None:
if not self.enabled:
return None
-
+
key = rel_path.as_posix()
entry = self.files_data.get(key)
if entry is None:
return None
-
- if (entry.get("mtime_ns") == mtime_ns and
- entry.get("size") == size and
- entry.get("python_version") == self.python_version and
- entry.get("config_hash") == self.config_hash):
+
+ if (
+ entry.get("mtime_ns") == mtime_ns
+ and entry.get("size") == size
+ and entry.get("python_version") == self.python_version
+ and entry.get("config_hash") == self.config_hash
+ ):
return entry.get("data")
return None
- def set_file(self, rel_path: Path, mtime_ns: int, size: int, data: dict[str, Any]) -> None:
+ def set_file(
+ self, rel_path: Path, mtime_ns: int, size: int, data: dict[str, Any]
+ ) -> None:
if not self.enabled:
return
key = rel_path.as_posix()
@@ -92,18 +99,23 @@ def set_file(self, rel_path: Path, mtime_ns: int, size: int, data: dict[str, Any
"size": size,
"python_version": self.python_version,
"config_hash": self.config_hash,
- "data": data
+ "data": data,
}
- def get_imports(self, rel_path: Path) -> set[Path] | None:
+ def get_imports(self, rel_path: Path, mtime_ns: int, size: int) -> set[Path] | None:
if not self.enabled:
return None
self.reads += 1
key = rel_path.as_posix()
imports = self.imports_data.get(key)
if imports is not None:
- self.hits += 1
- return {Path(p) for p in imports.get("targets", [])}
+ if (
+ imports.get("mtime_ns") == mtime_ns
+ and imports.get("size") == size
+ and imports.get("config_hash") == self.config_hash
+ ):
+ self.hits += 1
+ return {Path(p) for p in imports.get("targets", [])}
return None
def set_imports(self, rel_path: Path, imports: set[Path]) -> None:
@@ -112,7 +124,7 @@ def set_imports(self, rel_path: Path, imports: set[Path]) -> None:
self.writes += 1
key = rel_path.as_posix()
try:
- stat = (self.cache_dir.parent.parent / rel_path).stat()
+ stat = (self.project_root / rel_path).stat()
mtime_ns = stat.st_mtime_ns
size = stat.st_size
except OSError:
@@ -122,7 +134,7 @@ def set_imports(self, rel_path: Path, imports: set[Path]) -> None:
"mtime_ns": mtime_ns,
"size": size,
"config_hash": self.config_hash,
- "targets": [p.as_posix() for p in sorted(imports)]
+ "targets": [p.as_posix() for p in sorted(imports)],
}
def add_import_edge(self, source: Path, target: Path) -> None:
@@ -133,7 +145,7 @@ def add_import_edge(self, source: Path, target: Path) -> None:
target_str = target.as_posix()
if key not in self.imports_data:
try:
- stat = (self.cache_dir.parent.parent / source).stat()
+ stat = (self.project_root / source).stat()
mtime_ns = stat.st_mtime_ns
size = stat.st_size
except OSError:
@@ -143,7 +155,7 @@ def add_import_edge(self, source: Path, target: Path) -> None:
"mtime_ns": mtime_ns,
"size": size,
"config_hash": self.config_hash,
- "targets": [target_str]
+ "targets": [target_str],
}
else:
if target_str not in self.imports_data[key].get("targets", []):
@@ -153,22 +165,29 @@ def add_import_edge(self, source: Path, target: Path) -> None:
def save(self, active_files: set[Path] | None = None) -> None:
if not self.enabled:
return
-
+
try:
self.cache_dir.mkdir(parents=True, exist_ok=True)
-
+
# Simple cleanup mechanism:
# 1. Prune stale cache entries (entries for files no longer in active_files)
if active_files is not None:
active_keys = {p.as_posix() for p in active_files}
- self.files_data = {k: v for k, v in self.files_data.items() if k in active_keys}
- self.imports_data = {k: v for k, v in self.imports_data.items() if k in active_keys}
+ self.files_data = {
+ k: v for k, v in self.files_data.items() if k in active_keys
+ }
+ self.imports_data = {
+ k: v for k, v in self.imports_data.items() if k in active_keys
+ }
# 2. Enforce absolute limit of max 1000 entries to prevent infinite growth
if len(self.files_data) > 1000:
# Remove oldest keys
- sorted_keys = sorted(self.files_data.keys(), key=lambda k: self.files_data[k].get("mtime_ns", 0))
- to_remove = sorted_keys[:len(sorted_keys) - 1000]
+ sorted_keys = sorted(
+ self.files_data.keys(),
+ key=lambda k: self.files_data[k].get("mtime_ns", 0),
+ )
+ to_remove = sorted_keys[: len(sorted_keys) - 1000]
for k in to_remove:
self.files_data.pop(k, None)
self.imports_data.pop(k, None)
diff --git a/src/scriber/cli/main.py b/src/scriber/cli/main.py
index 7071f7f..1b69a25 100644
--- a/src/scriber/cli/main.py
+++ b/src/scriber/cli/main.py
@@ -5,30 +5,38 @@
from pathlib import Path
from typing import Sequence
-from scriber.core.config import DEFAULT_CONFIG_BLOCK, load_raw_pyproject, load_config, validate_config, validate_raw_config
+from scriber.core.config import (
+ load_raw_pyproject,
+ load_config,
+ validate_config,
+ validate_raw_config,
+)
from scriber.core.errors import ScriberError
from scriber.core.init_config import init_project
from scriber.core.root import resolve_config_path
from scriber.packer.pack import build_and_write_pack
+
def handle_introspection(args, pack) -> None:
import json
-
+
# 1. Export Graph JSON if requested
if args.graph_json:
edges_data = []
for edge in pack.graph.edges:
- edges_data.append({
- "source": str(edge.source),
- "target": str(edge.target),
- "kind": edge.kind,
- "weight": edge.weight,
- "confidence": edge.confidence,
- "evidence": edge.evidence,
- "line": edge.line,
- "analyzer": edge.analyzer
- })
-
+ edges_data.append(
+ {
+ "source": str(edge.source),
+ "target": str(edge.target),
+ "kind": edge.kind,
+ "weight": edge.weight,
+ "confidence": edge.confidence,
+ "evidence": edge.evidence,
+ "line": edge.line,
+ "analyzer": edge.analyzer,
+ }
+ )
+
graph_data = {"edges": edges_data}
json_path = Path(args.graph_json)
try:
@@ -42,12 +50,12 @@ def handle_introspection(args, pack) -> None:
if args.explain_graph:
edges = pack.graph.edges
total_edges = len(edges)
-
+
# Group by kind
kind_counts = {}
for edge in edges:
kind_counts[edge.kind] = kind_counts.get(edge.kind, 0) + 1
-
+
# Get unique nodes
nodes = set()
for edge in edges:
@@ -55,13 +63,15 @@ def handle_introspection(args, pack) -> None:
nodes.add(edge.target)
unique_nodes = len(nodes)
avg_degree = (total_edges * 2.0 / unique_nodes) if unique_nodes > 0 else 0.0
-
+
print("\n========================================", file=sys.stderr)
print("SCRIBER RELATION GRAPH EXPLANATION", file=sys.stderr)
print("========================================", file=sys.stderr)
print(f"Total Edges: {total_edges}", file=sys.stderr)
print("Edges by Kind:", file=sys.stderr)
- for kind, count in sorted(kind_counts.items(), key=lambda x: x[1], reverse=True):
+ for kind, count in sorted(
+ kind_counts.items(), key=lambda x: x[1], reverse=True
+ ):
print(f" - {kind.ljust(20)}: {count}", file=sys.stderr)
print(f"Unique Nodes: {unique_nodes}", file=sys.stderr)
print(f"Average Degree: {avg_degree:.2f}", file=sys.stderr)
@@ -71,7 +81,7 @@ def handle_introspection(args, pack) -> None:
if args.why:
why_target = args.why.replace("\\", "/").lower()
target_c = None
-
+
candidates_or_items = getattr(pack, "candidates", getattr(pack, "items", []))
for c in candidates_or_items:
rel_str = c.file.relative.as_posix().lower()
@@ -79,9 +89,12 @@ def handle_introspection(args, pack) -> None:
if why_target in rel_str or why_target in abs_str:
target_c = c
break
-
+
if not target_c:
- print(f"\nCould not find file matching '{args.why}' in the analyzed candidates.", file=sys.stderr)
+ print(
+ f"\nCould not find file matching '{args.why}' in the analyzed candidates.",
+ file=sys.stderr,
+ )
return
print("\n========================================", file=sys.stderr)
@@ -90,34 +103,42 @@ def handle_introspection(args, pack) -> None:
print(f"Score: {target_c.score}", file=sys.stderr)
if hasattr(target_c, "role"):
print(f"Role: {target_c.role}", file=sys.stderr)
-
+ if hasattr(target_c, "token_estimate"):
+ print(f"Token Cost: {target_c.token_estimate}", file=sys.stderr)
+ if hasattr(target_c, "content_mode"):
+ print(f"Content Mode: {target_c.content_mode}", file=sys.stderr)
+ if hasattr(target_c, "omitted_reason") and target_c.omitted_reason:
+ print(f"Omitted Reason: {target_c.omitted_reason}", file=sys.stderr)
+
reasons = getattr(target_c, "reasons", [])
if reasons:
print("Selection Reasons:", file=sys.stderr)
for r in reasons:
print(f" - {r}", file=sys.stderr)
else:
- reason_summary = getattr(target_c, "reason_summary", getattr(target_c, "reason", "None"))
+ reason_summary = getattr(
+ target_c, "reason_summary", getattr(target_c, "reason", "None")
+ )
print(f"Selection Reasons: {reason_summary}", file=sys.stderr)
-
+
incoming = []
for edge in pack.graph.edges:
if edge.target == target_c.file.relative:
incoming.append(edge)
-
+
if incoming:
print("\nIncoming Relation Edges:", file=sys.stderr)
for edge in sorted(incoming, key=lambda e: (e.kind, str(e.source))):
ev = f" ({edge.evidence})" if edge.evidence else ""
- print(f" - {edge.source} -> [this file] (kind: {edge.kind}, weight: {edge.weight}, confidence: {edge.confidence}){ev}", file=sys.stderr)
+ print(
+ f" - {edge.source} -> [this file] (kind: {edge.kind}, weight: {edge.weight}, confidence: {edge.confidence}){ev}",
+ file=sys.stderr,
+ )
else:
print("\nNo incoming relation edges found in graph.", file=sys.stderr)
print("========================================\n", file=sys.stderr)
-
-
-
def _progress(msg: str) -> None:
# Use carriage return and padding to avoid external dependencies like rich
sys.stderr.write(f"\r[Scriber] {msg}".ljust(80))
@@ -129,35 +150,132 @@ def build_parser() -> argparse.ArgumentParser:
prog="scriber",
description="Scriber 2.0: build an intelligent code pack from one or more project paths.",
)
- parser.add_argument("paths", nargs="*", help="Project file/folder paths used as seeds. Defaults to current directory.")
- parser.add_argument("--profile", choices=["gpt", "focused-gpt", "full"], default="gpt", help="Preset configuration profile (gpt, focused-gpt, full).")
- parser.add_argument("--config", help="Path to pyproject.toml. Its parent directory becomes the project root.")
- parser.add_argument("--path-base", choices=["project", "cwd"], default="project", help="Base directory for relative paths when --config is used.")
- parser.add_argument("--format", choices=["md", "txt"], dest="output_format", help="Output format.")
- parser.add_argument("--output", help="Output file path, relative to project root unless absolute. Use '-' for stdout.")
- parser.add_argument("--only-tree", action="store_true", help="Render only scored tree/map, without file contents.")
- parser.add_argument("--modules", dest="modules", action="store_true", help="Enable automatic related module selection.")
- parser.add_argument("--no-modules", dest="modules", action="store_false", help="Disable automatic related module selection.")
+ parser.add_argument(
+ "paths",
+ nargs="*",
+ help="Project file/folder paths used as seeds. Defaults to current directory.",
+ )
+ parser.add_argument(
+ "--profile",
+ choices=["default", "audit", "debug", "refactor", "docs"],
+ default="default",
+ help="Preset configuration profile.",
+ )
+ parser.add_argument(
+ "--config",
+ help="Path to pyproject.toml. Its parent directory becomes the project root.",
+ )
+ parser.add_argument(
+ "--path-base",
+ choices=["project", "cwd"],
+ default="project",
+ help="Base directory for relative paths when --config is used.",
+ )
+ parser.add_argument(
+ "--format", choices=["md", "txt"], dest="output_format", help="Output format."
+ )
+ parser.add_argument(
+ "--output",
+ help="Output file path, relative to project root unless absolute. Use '-' for stdout.",
+ )
+ parser.add_argument(
+ "--only-tree",
+ action="store_true",
+ help="Render only scored tree/map, without file contents.",
+ )
+ parser.add_argument(
+ "--modules",
+ dest="modules",
+ action="store_true",
+ help="Enable automatic related module selection.",
+ )
+ parser.add_argument(
+ "--no-modules",
+ dest="modules",
+ action="store_false",
+ help="Disable automatic related module selection.",
+ )
parser.set_defaults(modules=None)
- parser.add_argument("--support", dest="support", action="store_true", help="Enable support files.")
- parser.add_argument("--no-support", dest="support", action="store_false", help="Disable support files.")
+ parser.add_argument(
+ "--support", dest="support", action="store_true", help="Enable support files."
+ )
+ parser.add_argument(
+ "--no-support",
+ dest="support",
+ action="store_false",
+ help="Disable support files.",
+ )
parser.set_defaults(support=None)
- parser.add_argument("--support-content", choices=["full", "auto", "tree_only"], help="Override default support file content policy.")
- parser.add_argument("--max-files", type=int, help="Maximum number of files in the pack.")
- parser.add_argument("--max-tokens", type=int, help="Approximate token budget for included file contents. 0 disables budget.")
- parser.add_argument("--min-score", type=int, help="Minimum score for non-seed files.")
- parser.add_argument("--init", action="store_true", help="Append a default [tool.scriber] config to pyproject.toml and exit.")
- parser.add_argument("--force", action="store_true", help="Allow --init to append even if [tool.scriber] already exists.")
- parser.add_argument("--project", action="store_true", help="Force project snapshot mode.")
- parser.add_argument("--explain-selection", action="store_true", help="Explain reason for file selection in detail.")
- parser.add_argument("--explain-graph", action="store_true", help="Print relation graph statistics and relations.")
- parser.add_argument("--why", help="Print exactly which rules/edges pulled the specified file into the pack.")
- parser.add_argument("--graph-json", help="Export the RelationGraph as a JSON file to the specified path.")
- parser.add_argument("--validate-config", action="store_true", help="Validate pyproject.toml scriber config.")
- parser.add_argument("--dry-run", action="store_true", help="Perform a dry run without saving the pack file.")
- parser.add_argument("--open", action="store_true", help="Open the output file automatically after creation.")
- parser.add_argument("--timings", action="store_true", help="Show execution timings for each phase.")
- parser.add_argument("--version", action="store_true", help="Show version information and exit.")
+ parser.add_argument(
+ "--support-content",
+ choices=["full", "auto", "tree_only"],
+ help="Override default support file content policy.",
+ )
+ parser.add_argument(
+ "--max-files", type=int, help="Maximum number of files in the pack."
+ )
+ parser.add_argument(
+ "--max-tokens",
+ type=int,
+ help="Approximate token budget for included file contents. 0 disables budget.",
+ )
+ parser.add_argument(
+ "--min-score", type=int, help="Minimum score for non-seed files."
+ )
+ parser.add_argument(
+ "--init",
+ action="store_true",
+ help="Append a default [tool.scriber] config to pyproject.toml and exit.",
+ )
+ parser.add_argument(
+ "--force",
+ action="store_true",
+ help="Allow --init to append even if [tool.scriber] already exists.",
+ )
+ parser.add_argument(
+ "--project", action="store_true", help="Force project snapshot mode."
+ )
+ parser.add_argument(
+ "--explain",
+ "--explain-selection",
+ dest="explain_selection",
+ action="store_true",
+ help="Explain reason for file selection in detail.",
+ )
+ parser.add_argument(
+ "--explain-graph",
+ action="store_true",
+ help="Print relation graph statistics and relations.",
+ )
+ parser.add_argument(
+ "--why",
+ help="Print exactly which rules/edges pulled the specified file into the pack.",
+ )
+ parser.add_argument(
+ "--graph-json",
+ help="Export the RelationGraph as a JSON file to the specified path.",
+ )
+ parser.add_argument(
+ "--validate-config",
+ action="store_true",
+ help="Validate pyproject.toml scriber config.",
+ )
+ parser.add_argument(
+ "--dry-run",
+ action="store_true",
+ help="Perform a dry run without saving the pack file.",
+ )
+ parser.add_argument(
+ "--open",
+ action="store_true",
+ help="Open the output file automatically after creation.",
+ )
+ parser.add_argument(
+ "--timings", action="store_true", help="Show execution timings for each phase."
+ )
+ parser.add_argument(
+ "--version", action="store_true", help="Show version information and exit."
+ )
return parser
@@ -168,12 +286,19 @@ def main(argv: Sequence[str] | None = None) -> int:
try:
if args.version:
from scriber import __version__
+
print(f"scriber {__version__}")
from scriber.native import is_native_available, require_native
+
if is_native_available():
native = require_native()
if hasattr(native, "build_info"):
- print(f"native {native.build_info()}")
+ api_ver = (
+ native.native_api_version()
+ if hasattr(native, "native_api_version")
+ else "unknown"
+ )
+ print(f"native {native.build_info()} (API v{api_ver})")
return 0
if args.validate_config:
@@ -189,7 +314,7 @@ def main(argv: Sequence[str] | None = None) -> int:
else:
config = load_config(config_path)
issues = validate_config(config, raw_data, config_path)
-
+
if not issues:
print("Scriber config is valid.", file=sys.stderr)
return 0
@@ -203,7 +328,10 @@ def main(argv: Sequence[str] | None = None) -> int:
else:
warnings += 1
print(f"[{severity}] {issue.message}", file=sys.stderr)
- print(f"\nValidation completed: {errors} error(s), {warnings} warning(s)", file=sys.stderr)
+ print(
+ f"\nValidation completed: {errors} error(s), {warnings} warning(s)",
+ file=sys.stderr,
+ )
return 1 if errors > 0 else 0
except Exception as exc:
print(f"Error: Failed to parse pyproject.toml: {exc}", file=sys.stderr)
@@ -217,6 +345,7 @@ def main(argv: Sequence[str] | None = None) -> int:
if args.dry_run:
from scriber.packer.pack import build_pack
from scriber.core.config import apply_overrides
+
pack = build_pack(
args.paths or ["."],
config_path=args.config,
@@ -240,12 +369,28 @@ def main(argv: Sequence[str] | None = None) -> int:
is_llm_pack = hasattr(pack, "items")
items = getattr(pack, "items", getattr(pack, "candidates", []))
if is_llm_pack:
- code_count = len([c for c in items if c.file.kind == "code" and c.content_mode != "tree"])
- support_count = len([c for c in items if c.file.kind == "support" and c.content_mode != "tree"])
+ code_count = len(
+ [
+ c
+ for c in items
+ if c.file.kind == "code" and c.content_mode != "tree"
+ ]
+ )
+ support_count = len(
+ [
+ c
+ for c in items
+ if c.file.kind == "support" and c.content_mode != "tree"
+ ]
+ )
total_count = len([c for c in items if c.content_mode != "tree"])
else:
- code_count = len([c for c in items if c.file.kind == "code" and c.include_content])
- support_count = len([c for c in items if c.file.kind == "support" and c.include_content])
+ code_count = len(
+ [c for c in items if c.file.kind == "code" and c.include_content]
+ )
+ support_count = len(
+ [c for c in items if c.file.kind == "support" and c.include_content]
+ )
total_count = len([c for c in items if c.include_content])
print("Scriber dry-run completed.", file=sys.stderr)
@@ -254,24 +399,47 @@ def main(argv: Sequence[str] | None = None) -> int:
print(f" Code files selected: {code_count}", file=sys.stderr)
print(f" Support files selected: {support_count}", file=sys.stderr)
print(f" Total files in pack: {total_count}", file=sys.stderr)
- total_tokens = getattr(pack, "budget_actual", getattr(pack, "total_tokens", 0))
+ total_tokens = getattr(
+ pack, "budget_actual", getattr(pack, "total_tokens", 0)
+ )
print(f" Estimated tokens: {total_tokens}", file=sys.stderr)
if args.timings:
if pack.stats:
print("----------------------------------------", file=sys.stderr)
print("Stats:", file=sys.stderr)
if "graph_edges_built" in pack.stats:
- print(f" Graph edges built: {pack.stats['graph_edges_built']}", file=sys.stderr)
- print(f" Graph cache reads: {pack.stats['graph_cache_reads']}", file=sys.stderr)
- print(f" Graph cache hits: {pack.stats['graph_cache_hits']}", file=sys.stderr)
- print(f" Graph cache writes: {pack.stats['graph_cache_writes']}", file=sys.stderr)
- print(f" Graph source: {pack.stats['graph_source']}", file=sys.stderr)
+ print(
+ f" Graph edges built: {pack.stats['graph_edges_built']}",
+ file=sys.stderr,
+ )
+ print(
+ f" Graph cache reads: {pack.stats['graph_cache_reads']}",
+ file=sys.stderr,
+ )
+ print(
+ f" Graph cache hits: {pack.stats['graph_cache_hits']}",
+ file=sys.stderr,
+ )
+ print(
+ f" Graph cache writes: {pack.stats['graph_cache_writes']}",
+ file=sys.stderr,
+ )
+ print(
+ f" Graph source: {pack.stats['graph_source']}",
+ file=sys.stderr,
+ )
if pack.timings:
print("----------------------------------------", file=sys.stderr)
print("Timings:", file=sys.stderr)
for phase, duration in pack.timings.items():
- print(f" {phase.replace('_', ' ').ljust(15)}: {duration:.4f}s", file=sys.stderr)
- print(f" total: {sum(pack.timings.values()):.4f}s", file=sys.stderr)
+ print(
+ f" {phase.replace('_', ' ').ljust(15)}: {duration:.4f}s",
+ file=sys.stderr,
+ )
+ print(
+ f" total: {sum(pack.timings.values()):.4f}s",
+ file=sys.stderr,
+ )
config = load_config(pack.config_path)
config = apply_overrides(config, output=args.output)
@@ -308,11 +476,11 @@ def main(argv: Sequence[str] | None = None) -> int:
is_llm_pack = hasattr(pack, "items")
items = getattr(pack, "items", getattr(pack, "candidates", []))
-
+
code_count = 0
support_count = 0
omitted_count = 0
-
+
for cand in items:
if is_llm_pack:
if cand.content_mode != "tree":
@@ -343,17 +511,31 @@ def main(argv: Sequence[str] | None = None) -> int:
if pack.stats:
sys.stderr.write("Stats:\n")
if "graph_edges_built" in pack.stats:
- sys.stderr.write(f" - Graph edges built: {pack.stats['graph_edges_built']}\n")
- sys.stderr.write(f" - Graph cache reads: {pack.stats['graph_cache_reads']}\n")
- sys.stderr.write(f" - Graph cache hits: {pack.stats['graph_cache_hits']}\n")
- sys.stderr.write(f" - Graph cache writes: {pack.stats['graph_cache_writes']}\n")
- sys.stderr.write(f" - Graph source: {pack.stats['graph_source']}\n")
+ sys.stderr.write(
+ f" - Graph edges built: {pack.stats['graph_edges_built']}\n"
+ )
+ sys.stderr.write(
+ f" - Graph cache reads: {pack.stats['graph_cache_reads']}\n"
+ )
+ sys.stderr.write(
+ f" - Graph cache hits: {pack.stats['graph_cache_hits']}\n"
+ )
+ sys.stderr.write(
+ f" - Graph cache writes: {pack.stats['graph_cache_writes']}\n"
+ )
+ sys.stderr.write(
+ f" - Graph source: {pack.stats['graph_source']}\n"
+ )
sys.stderr.write("----------------------------------------\n")
if pack.timings:
sys.stderr.write("Timings:\n")
for phase, duration in pack.timings.items():
- sys.stderr.write(f" - {phase.replace('_', ' ').ljust(15)}: {duration:.4f}s\n")
- sys.stderr.write(f" - total: {sum(pack.timings.values()):.4f}s\n")
+ sys.stderr.write(
+ f" - {phase.replace('_', ' ').ljust(15)}: {duration:.4f}s\n"
+ )
+ sys.stderr.write(
+ f" - total: {sum(pack.timings.values()):.4f}s\n"
+ )
sys.stderr.write("----------------------------------------\n")
if args.explain_graph or args.why or args.graph_json:
@@ -363,6 +545,7 @@ def main(argv: Sequence[str] | None = None) -> int:
print(f"Scriber pack written to: {output}")
if args.open:
from scriber.core.open_file import open_path
+
open_path(output)
return 0
except ScriberError as exc:
diff --git a/src/scriber/core/config.py b/src/scriber/core/config.py
index 73895fc..69c7491 100644
--- a/src/scriber/core/config.py
+++ b/src/scriber/core/config.py
@@ -9,7 +9,14 @@
except ModuleNotFoundError: # pragma: no cover
import tomli as tomllib # type: ignore[no-redef]
-from .models import CacheConfig, ModuleConfig, PythonConfig, ScriberConfig, SupportContentConfig, TokenConfig
+from .models import (
+ CacheConfig,
+ ModuleConfig,
+ PythonConfig,
+ ScriberConfig,
+ SupportContentConfig,
+ TokenConfig,
+)
DEFAULT_CODE_PATTERNS = [
"**/*.py",
@@ -26,6 +33,15 @@
"**/*.cpp",
"**/*.h",
"**/*.hpp",
+ "**/*.html",
+ "**/*.htm",
+ "**/*.vue",
+ "**/*.svelte",
+ "**/*.astro",
+ "**/*.css",
+ "**/*.scss",
+ "**/*.sass",
+ "**/*.less",
]
DEFAULT_SUPPORT_PATTERNS = [
@@ -81,6 +97,7 @@
"pnpm-lock.yaml",
"yarn.lock",
"**/*.svg",
+ "**/*.json",
]
DEFAULT_SUPPORT_FULL = [
@@ -104,6 +121,7 @@
"README.md",
"Cargo.toml",
"go.mod",
+ "**/*.json",
]
DEFAULT_SUPPORT_TREE_ONLY = [
@@ -228,7 +246,9 @@ def load_config(config_path: Path) -> ScriberConfig:
config.max_tokens = int(data.get("max_tokens", config.max_tokens))
config.min_score = int(data.get("min_score", config.min_score))
config.path_style = str(data.get("path_style", config.path_style))
- config.allow_external_paths = bool(data.get("allow_external_paths", config.allow_external_paths))
+ config.allow_external_paths = bool(
+ data.get("allow_external_paths", config.allow_external_paths)
+ )
code_files = data.get("code_files", {})
if isinstance(code_files, dict) and isinstance(code_files.get("patterns"), list):
@@ -243,9 +263,19 @@ def load_config(config_path: Path) -> ScriberConfig:
if isinstance(content, dict):
config.support_content = SupportContentConfig(
default=content.get("default", config.support_content.default),
- full=[str(item) for item in content.get("full", config.support_content.full)],
- tree_only=[str(item) for item in content.get("tree_only", config.support_content.tree_only)],
- auto_max_bytes=int(content.get("auto_max_bytes", config.support_content.auto_max_bytes)),
+ full=[
+ str(item)
+ for item in content.get("full", config.support_content.full)
+ ],
+ tree_only=[
+ str(item)
+ for item in content.get(
+ "tree_only", config.support_content.tree_only
+ )
+ ],
+ auto_max_bytes=int(
+ content.get("auto_max_bytes", config.support_content.auto_max_bytes)
+ ),
)
if not config.support_content.full:
config.support_content.full = list(DEFAULT_SUPPORT_FULL)
@@ -265,14 +295,46 @@ def load_config(config_path: Path) -> ScriberConfig:
config.modules_config = ModuleConfig(
enabled=bool(modules.get("enabled", config.modules_config.enabled)),
depth=int(modules.get("depth", config.modules_config.depth)),
- include_direct_dependencies=bool(modules.get("include_direct_dependencies", config.modules_config.include_direct_dependencies)),
- include_reverse_dependencies=bool(modules.get("include_reverse_dependencies", config.modules_config.include_reverse_dependencies)),
- include_tests=bool(modules.get("include_tests", config.modules_config.include_tests)),
- include_same_package=bool(modules.get("include_same_package", config.modules_config.include_same_package)),
- include_parent_entrypoints=bool(modules.get("include_parent_entrypoints", config.modules_config.include_parent_entrypoints)),
- include_project_configs=bool(modules.get("include_project_configs", config.modules_config.include_project_configs)),
- content_min_score=int(modules.get("content_min_score", config.modules_config.content_min_score)),
- tree_min_score=int(modules.get("tree_min_score", config.modules_config.tree_min_score)),
+ include_direct_dependencies=bool(
+ modules.get(
+ "include_direct_dependencies",
+ config.modules_config.include_direct_dependencies,
+ )
+ ),
+ include_reverse_dependencies=bool(
+ modules.get(
+ "include_reverse_dependencies",
+ config.modules_config.include_reverse_dependencies,
+ )
+ ),
+ include_tests=bool(
+ modules.get("include_tests", config.modules_config.include_tests)
+ ),
+ include_same_package=bool(
+ modules.get(
+ "include_same_package", config.modules_config.include_same_package
+ )
+ ),
+ include_parent_entrypoints=bool(
+ modules.get(
+ "include_parent_entrypoints",
+ config.modules_config.include_parent_entrypoints,
+ )
+ ),
+ include_project_configs=bool(
+ modules.get(
+ "include_project_configs",
+ config.modules_config.include_project_configs,
+ )
+ ),
+ content_min_score=int(
+ modules.get(
+ "content_min_score", config.modules_config.content_min_score
+ )
+ ),
+ tree_min_score=int(
+ modules.get("tree_min_score", config.modules_config.tree_min_score)
+ ),
scoring=scoring,
)
config.modules = config.modules_config.enabled
@@ -280,17 +342,34 @@ def load_config(config_path: Path) -> ScriberConfig:
python = data.get("python", {})
if isinstance(python, dict):
config.python = PythonConfig(
- source_roots=[str(item) for item in python.get("source_roots", config.python.source_roots)],
- test_roots=[str(item) for item in python.get("test_roots", config.python.test_roots)],
- module_init_files=[str(item) for item in python.get("module_init_files", config.python.module_init_files)],
- entrypoint_patterns=[str(item) for item in python.get("entrypoint_patterns", config.python.entrypoint_patterns)],
+ source_roots=[
+ str(item)
+ for item in python.get("source_roots", config.python.source_roots)
+ ],
+ test_roots=[
+ str(item) for item in python.get("test_roots", config.python.test_roots)
+ ],
+ module_init_files=[
+ str(item)
+ for item in python.get(
+ "module_init_files", config.python.module_init_files
+ )
+ ],
+ entrypoint_patterns=[
+ str(item)
+ for item in python.get(
+ "entrypoint_patterns", config.python.entrypoint_patterns
+ )
+ ],
)
tokens = data.get("tokens", {})
if isinstance(tokens, dict):
config.tokens = TokenConfig(
estimator=str(tokens.get("estimator", config.tokens.estimator)),
- chars_per_token=int(tokens.get("chars_per_token", config.tokens.chars_per_token)),
+ chars_per_token=int(
+ tokens.get("chars_per_token", config.tokens.chars_per_token)
+ ),
)
cache = data.get("cache", {})
@@ -348,25 +427,31 @@ class ConfigIssue:
def validate_raw_config(raw_data: dict[str, Any]) -> list[ConfigIssue]:
issues: list[ConfigIssue] = []
-
+
# 1. check if raw_data contains tool.scriber
tool = raw_data.get("tool", {}) if isinstance(raw_data, dict) else {}
if not isinstance(tool, dict):
issues.append(ConfigIssue("error", "[tool] in pyproject.toml must be a table."))
return issues
-
+
data = tool.get("scriber", {}) if isinstance(tool, dict) else {}
if not data:
- issues.append(ConfigIssue("warning", "[tool.scriber] section is missing or empty."))
+ issues.append(
+ ConfigIssue("warning", "[tool.scriber] section is missing or empty.")
+ )
return issues
-
+
if not isinstance(data, dict):
issues.append(ConfigIssue("error", "[tool.scriber] must be a table."))
return issues
# 2. check output format
if "format" in data and data["format"] not in {"md", "txt"}:
- issues.append(ConfigIssue("error", f"Invalid format: '{data['format']}'. Must be 'md' or 'txt'."))
+ issues.append(
+ ConfigIssue(
+ "error", f"Invalid format: '{data['format']}'. Must be 'md' or 'txt'."
+ )
+ )
# 4. check support_content default
support_files = data.get("support_files", {})
@@ -375,7 +460,12 @@ def validate_raw_config(raw_data: dict[str, Any]) -> list[ConfigIssue]:
if isinstance(content, dict) and "default" in content:
val = content["default"]
if val not in {"full", "auto", "tree_only"}:
- issues.append(ConfigIssue("error", f"Invalid support_files.content.default: '{val}'. Must be 'full', 'auto', or 'tree_only'."))
+ issues.append(
+ ConfigIssue(
+ "error",
+ f"Invalid support_files.content.default: '{val}'. Must be 'full', 'auto', or 'tree_only'.",
+ )
+ )
# 5. check numeric values >= 0
for field in ["max_files", "max_tokens", "min_score"]:
@@ -383,20 +473,37 @@ def validate_raw_config(raw_data: dict[str, Any]) -> list[ConfigIssue]:
try:
val = int(data[field])
if val < 0:
- issues.append(ConfigIssue("error", f"{field} must be a number >= 0. Got: {val}"))
+ issues.append(
+ ConfigIssue(
+ "error", f"{field} must be a number >= 0. Got: {val}"
+ )
+ )
except (ValueError, TypeError):
- issues.append(ConfigIssue("error", f"{field} must be an integer. Got: {data[field]}"))
+ issues.append(
+ ConfigIssue(
+ "error", f"{field} must be an integer. Got: {data[field]}"
+ )
+ )
# 6. check patterns are list of strings
def check_pattern_list(parent_dict: dict[str, Any], path_name: str) -> None:
if "patterns" in parent_dict:
patterns = parent_dict["patterns"]
if not isinstance(patterns, list):
- issues.append(ConfigIssue("error", f"{path_name}.patterns must be a list of strings."))
+ issues.append(
+ ConfigIssue(
+ "error", f"{path_name}.patterns must be a list of strings."
+ )
+ )
else:
for item in patterns:
if not isinstance(item, str):
- issues.append(ConfigIssue("error", f"Pattern in {path_name}.patterns must be a string. Got: {item}"))
+ issues.append(
+ ConfigIssue(
+ "error",
+ f"Pattern in {path_name}.patterns must be a string. Got: {item}",
+ )
+ )
code_files = data.get("code_files", {})
if isinstance(code_files, dict):
@@ -406,7 +513,7 @@ def check_pattern_list(parent_dict: dict[str, Any], path_name: str) -> None:
if isinstance(support_files, dict):
check_pattern_list(support_files, "support_files")
-
+
# Check support_files.content full and tree_only patterns
content = support_files.get("content", {})
if isinstance(content, dict):
@@ -414,11 +521,21 @@ def check_pattern_list(parent_dict: dict[str, Any], path_name: str) -> None:
if field in content:
patterns = content[field]
if not isinstance(patterns, list):
- issues.append(ConfigIssue("error", f"support_files.content.{field} must be a list of strings."))
+ issues.append(
+ ConfigIssue(
+ "error",
+ f"support_files.content.{field} must be a list of strings.",
+ )
+ )
else:
for item in patterns:
if not isinstance(item, str):
- issues.append(ConfigIssue("error", f"Pattern in support_files.content.{field} must be a string. Got: {item}"))
+ issues.append(
+ ConfigIssue(
+ "error",
+ f"Pattern in support_files.content.{field} must be a string. Got: {item}",
+ )
+ )
elif "support_files" in data:
issues.append(ConfigIssue("error", "support_files must be a table."))
@@ -430,18 +547,29 @@ def check_pattern_list(parent_dict: dict[str, Any], path_name: str) -> None:
return issues
-def validate_config(config: ScriberConfig, raw_data: dict[str, Any], config_path: Path | None = None) -> list[ConfigIssue]:
+
+def validate_config(
+ config: ScriberConfig, raw_data: dict[str, Any], config_path: Path | None = None
+) -> list[ConfigIssue]:
issues = validate_raw_config(raw_data)
-
+
# Check output path is not a directory
output_path = config.output
if not output_path.is_absolute() and config_path:
output_path = config_path.parent / output_path
-
+
if output_path.suffix == "" and not str(output_path).endswith("-"):
- issues.append(ConfigIssue("warning", f"Output path '{output_path}' has no extension. Is it a directory?"))
+ issues.append(
+ ConfigIssue(
+ "warning",
+ f"Output path '{output_path}' has no extension. Is it a directory?",
+ )
+ )
if output_path.exists() and output_path.is_dir():
- issues.append(ConfigIssue("error", f"Output path '{output_path}' points to an existing directory."))
+ issues.append(
+ ConfigIssue(
+ "error", f"Output path '{output_path}' points to an existing directory."
+ )
+ )
return issues
-
diff --git a/src/scriber/core/init_config.py b/src/scriber/core/init_config.py
index f345fe8..0809771 100644
--- a/src/scriber/core/init_config.py
+++ b/src/scriber/core/init_config.py
@@ -9,7 +9,7 @@ def replace_existing_tool_scriber_block(content: str, default_block: str) -> str
lines = content.splitlines()
new_lines = []
in_scriber = False
-
+
for line in lines:
stripped = line.strip()
if stripped.startswith("[") and stripped.endswith("]"):
@@ -19,10 +19,10 @@ def replace_existing_tool_scriber_block(content: str, default_block: str) -> str
continue
else:
in_scriber = False
-
+
if not in_scriber:
new_lines.append(line)
-
+
cleaned = "\n".join(new_lines).strip()
if cleaned:
return cleaned + "\n\n" + default_block + "\n"
@@ -35,23 +35,27 @@ def init_project(config_path: str | None = None, force: bool = False) -> Path:
path = path / "pyproject.toml"
if not path.is_absolute():
path = Path.cwd() / path
-
+
if path.exists():
content = path.read_text(encoding="utf-8")
has_scriber = "[tool.scriber]" in content
-
+
if has_scriber and not force:
- raise ScriberError(f"Scriber config already exists. Use --force to replace it.")
-
+ raise ScriberError(
+ "Scriber config already exists. Use --force to replace it."
+ )
+
if has_scriber:
- new_content = replace_existing_tool_scriber_block(content, DEFAULT_CONFIG_BLOCK)
+ new_content = replace_existing_tool_scriber_block(
+ content, DEFAULT_CONFIG_BLOCK
+ )
else:
if content and not content.endswith("\n"):
content += "\n"
new_content = content + "\n" + DEFAULT_CONFIG_BLOCK + "\n"
-
+
path.write_text(new_content, encoding="utf-8")
else:
path.write_text(DEFAULT_CONFIG_BLOCK + "\n", encoding="utf-8")
-
+
return path
diff --git a/src/scriber/core/models.py b/src/scriber/core/models.py
index c093382..757a462 100644
--- a/src/scriber/core/models.py
+++ b/src/scriber/core/models.py
@@ -10,7 +10,6 @@
PackMode = Literal["focused", "project_snapshot"]
-
DEFAULT_SCORING: dict[str, int] = {
"seed_file": 100,
"seed_folder_file": 100,
@@ -26,6 +25,10 @@
"documentation": 45,
"name_similarity": 45,
"shared_dependency_bonus": 10,
+ "entrypoint_file": 90,
+ "code_file": 80,
+ "test_file": 60,
+ "other_file": 40,
}
@@ -50,7 +53,14 @@ class PythonConfig:
test_roots: list[str] = field(default_factory=lambda: ["tests", "test"])
module_init_files: list[str] = field(default_factory=lambda: ["__init__.py"])
entrypoint_patterns: list[str] = field(
- default_factory=lambda: ["main.py", "app.py", "asgi.py", "wsgi.py", "routes.py", "router.py"]
+ default_factory=lambda: [
+ "main.py",
+ "app.py",
+ "asgi.py",
+ "wsgi.py",
+ "routes.py",
+ "router.py",
+ ]
)
@@ -108,7 +118,9 @@ class FileNode:
is_binary: bool = False
support_category: str | None = None
content_policy: ContentPolicy = "auto"
- _cached_text: str | None = field(default=None, init=False, repr=False, compare=False, hash=False)
+ _cached_text: str | None = field(
+ default=None, init=False, repr=False, compare=False, hash=False
+ )
def read_text(self) -> str:
if self._cached_text is not None:
@@ -116,6 +128,7 @@ def read_text(self) -> str:
try:
from scriber.native import is_native_available, require_native
+
if is_native_available():
text = require_native().read_text(str(self.absolute))
else:
@@ -154,7 +167,7 @@ class Candidate:
role: str = "unknown"
-from scriber.graph.model import RelationKind, RelationEdge, RelationGraph, ModuleGraph
+from scriber.graph.model import RelationEdge, RelationGraph, ModuleGraph # noqa: E402
@dataclass(slots=True)
@@ -196,6 +209,7 @@ def included_paths(self) -> list[Path]:
"unknown",
]
+
@dataclass(frozen=True, slots=True)
class FileRef:
path: Path
@@ -205,6 +219,7 @@ class FileRef:
token_estimate: int
role: FileRole = "unknown"
+
@dataclass(frozen=True, slots=True)
class FileOutline:
path: Path
@@ -218,6 +233,7 @@ class FileOutline:
notes: list[str]
token_estimate: int
+
@dataclass(slots=True)
class PackItem:
file: FileNode
@@ -235,6 +251,7 @@ class PackItem:
utility: float = 0.0
raw_score: float = 0.0
+
@dataclass(slots=True)
class LlmPack:
project_root: Path
diff --git a/src/scriber/core/profiles.py b/src/scriber/core/profiles.py
new file mode 100644
index 0000000..c8de117
--- /dev/null
+++ b/src/scriber/core/profiles.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+from copy import deepcopy
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ from scriber.core.models import ScriberConfig
+
+PROFILE_CHOICES = ("default", "audit", "debug", "refactor", "docs")
+
+
+def apply_profile(config: ScriberConfig, profile: str) -> ScriberConfig:
+ if profile == "default" or not profile:
+ return config
+
+ cfg = deepcopy(config)
+ scoring = cfg.modules_config.scoring
+
+ if profile == "audit":
+ scoring["test_file"] = 80
+ scoring["project_config"] = 90
+ scoring["dependency_file"] = 90
+ scoring["runtime_support"] = 85
+ scoring["documentation"] = 70
+
+ elif profile == "debug":
+ scoring["direct_dependency"] = 90
+ scoring["reverse_dependency"] = 80
+ scoring["test_file"] = 70
+ scoring["runtime_support"] = 80
+ scoring["support_near_seed"] = 80
+
+ elif profile == "refactor":
+ scoring["same_package"] = 80
+ scoring["related_test"] = 90
+ scoring["test_file"] = 75
+ scoring["direct_dependency"] = 60
+
+ elif profile == "docs":
+ scoring["documentation"] = 95
+ scoring["project_config"] = 50
+ scoring["dependency_file"] = 30
+ scoring["test_file"] = 10
+ scoring["code_file"] = 30
+ cfg.support_content.default = "tree_only"
+
+ return cfg
diff --git a/src/scriber/core/root.py b/src/scriber/core/root.py
index b8042b5..93c6743 100644
--- a/src/scriber/core/root.py
+++ b/src/scriber/core/root.py
@@ -16,7 +16,9 @@ def resolve_config_path(paths: list[str], explicit_config: str | None = None) ->
if not config.exists():
raise ScriberError(f"Config not found: {config}")
if config.name != "pyproject.toml":
- raise ScriberError("Scriber 2.0 expects --config to point to pyproject.toml")
+ raise ScriberError(
+ "Scriber 2.0 expects --config to point to pyproject.toml"
+ )
return config
starts: list[Path] = []
@@ -44,7 +46,9 @@ def resolve_config_path(paths: list[str], explicit_config: str | None = None) ->
if candidate.exists():
return candidate.resolve()
- raise ScriberError("No pyproject.toml found. Run `scriber init` or pass `--config /path/to/pyproject.toml`.")
+ raise ScriberError(
+ "No pyproject.toml found. Run `scriber init` or pass `--config /path/to/pyproject.toml`."
+ )
def project_root_from_config(config_path: Path) -> Path:
diff --git a/src/scriber/core/symbols.py b/src/scriber/core/symbols.py
index fa127b0..8877930 100644
--- a/src/scriber/core/symbols.py
+++ b/src/scriber/core/symbols.py
@@ -1,6 +1,7 @@
from dataclasses import dataclass, field
from pathlib import Path
+
@dataclass(slots=True)
class SymbolNode:
name: str
diff --git a/src/scriber/engine/ranker.py b/src/scriber/engine/ranker.py
index cdc8474..623f880 100644
--- a/src/scriber/engine/ranker.py
+++ b/src/scriber/engine/ranker.py
@@ -20,9 +20,16 @@
"semantic_similarity": 15,
}
-def rank_context(files: dict[Path, FileNode], graph: RelationGraph, seeds: list[Path], config: ScriberConfig, mode: str) -> list[Candidate]:
+
+def rank_context(
+ files: dict[Path, FileNode],
+ graph: RelationGraph,
+ seeds: list[Path],
+ config: ScriberConfig,
+ mode: str,
+) -> list[Candidate]:
candidates = []
-
+
explicit_seeds = {s for s in seeds}
distances = {}
@@ -32,19 +39,19 @@ def rank_context(files: dict[Path, FileNode], graph: RelationGraph, seeds: list[
for edge in graph.edges:
adj_out[edge.source].append(edge.target)
adj_in[edge.target].append(edge.source)
-
+
q_out = deque()
q_in = deque()
dist_out = {}
dist_in = {}
-
+
for s in explicit_seeds:
if s in files:
dist_out[s] = 0
dist_in[s] = 0
q_out.append(s)
q_in.append(s)
-
+
while q_out:
curr = q_out.popleft()
d = dist_out[curr]
@@ -52,7 +59,7 @@ def rank_context(files: dict[Path, FileNode], graph: RelationGraph, seeds: list[
if nbr not in dist_out:
dist_out[nbr] = d + 1
q_out.append(nbr)
-
+
while q_in:
curr = q_in.popleft()
d = dist_in[curr]
@@ -60,7 +67,7 @@ def rank_context(files: dict[Path, FileNode], graph: RelationGraph, seeds: list[
if nbr not in dist_in:
dist_in[nbr] = d + 1
q_in.append(nbr)
-
+
for rel in files.keys():
d_out = dist_out.get(rel, 999)
d_in = dist_in.get(rel, 999)
@@ -69,17 +76,17 @@ def rank_context(files: dict[Path, FileNode], graph: RelationGraph, seeds: list[
for rel, node in files.items():
role = classify_file_role(node, graph)
role_score = ROLE_SCORE.get(role, 20)
-
+
relation_score = 0.0
incoming = graph.incoming.get(rel, [])
for edge in incoming:
weight = RELATION_WEIGHT.get(edge.kind, 10) * edge.weight * edge.confidence
relation_score += weight
-
+
centrality_bonus = 0
evidence_bonus = len(incoming) * 2
noise_penalty = 0
-
+
if node.language in {"json", "lock", "svg"}:
noise_penalty += 50
@@ -105,30 +112,41 @@ def rank_context(files: dict[Path, FileNode], graph: RelationGraph, seeds: list[
decay = 1.0
seed_bonus = 100 if rel in explicit_seeds else 0
max_score = 100
-
+
if mode == "focused" and role == "test" and rel not in explicit_seeds:
noise_penalty += 80
- max_score = min(max_score, 44) # Force test files to tree mode unless specifically targeted
-
- raw_score = (role_score + relation_score + seed_bonus + centrality_bonus + evidence_bonus - noise_penalty) * decay
-
+ max_score = min(
+ max_score, 44
+ ) # Force test files to tree mode unless specifically targeted
+
+ raw_score = (
+ role_score
+ + relation_score
+ + seed_bonus
+ + centrality_bonus
+ + evidence_bonus
+ - noise_penalty
+ ) * decay
+
token_estimate = node.size_bytes // 4
utility = raw_score / math.sqrt(token_estimate + 200)
-
+
c = Candidate(
file=node,
- score=int(min(max_score, max(0, raw_score))), # clamp to distance-based max_score
+ score=int(
+ min(max_score, max(0, raw_score))
+ ), # clamp to distance-based max_score
reasons=[f"Role {role}: {role_score}", f"Relations: {relation_score:.1f}"],
include_content=False,
- token_estimate=token_estimate
+ token_estimate=token_estimate,
)
-
+
object.__setattr__(c, "utility", utility)
object.__setattr__(c, "raw_score", raw_score)
object.__setattr__(c, "role", role)
-
+
candidates.append(c)
-
+
# Primary sort by utility, then score
candidates.sort(key=lambda c: (getattr(c, "utility", 0), c.score), reverse=True)
return candidates
diff --git a/src/scriber/engine/roles.py b/src/scriber/engine/roles.py
index 2f319c8..8607b56 100644
--- a/src/scriber/engine/roles.py
+++ b/src/scriber/engine/roles.py
@@ -1,5 +1,4 @@
from __future__ import annotations
-from pathlib import Path
from scriber.core.models import FileNode, FileRole, RelationGraph
ROLE_SCORE: dict[FileRole, int] = {
@@ -20,9 +19,22 @@
"unknown": 20,
}
+
+def _is_test_path(rel: str, test_roots: set[str]) -> bool:
+ parts = rel.split("/")
+ name = parts[-1]
+ parent_parts = parts[:-1]
+ return (
+ any(part in test_roots for part in parent_parts)
+ or name.startswith("test_")
+ or name.endswith("_test.py")
+ or name.endswith(".test.py")
+ )
+
+
def classify_file_role(file: FileNode, graph: RelationGraph) -> FileRole:
rel = file.relative.as_posix().lower()
-
+
if rel in {"cli/main.py", "src/scriber/cli/main.py", "src/main.py", "main.py"}:
return "entrypoint"
if "orchestrator" in rel or "pack.py" in rel or "build.py" in rel:
@@ -31,7 +43,7 @@ def classify_file_role(file: FileNode, graph: RelationGraph) -> FileRole:
return "model"
if "core/config.py" in rel or "config.py" in rel:
return "config"
- if "test" in rel and file.kind == "code":
+ if file.kind == "code" and _is_test_path(rel, {"tests", "test"}):
return "test"
if "languages/" in rel:
return "language_adapter"
@@ -43,11 +55,18 @@ def classify_file_role(file: FileNode, graph: RelationGraph) -> FileRole:
return "renderer"
if "scanner/" in rel:
return "scanner"
- if rel.endswith("native.py") or "rust/scriber_native/" in rel or ("native" in rel and file.language == "rust"):
+ if (
+ rel.endswith("native.py")
+ or "rust/scriber_native/" in rel
+ or ("native" in rel and file.language == "rust")
+ ):
return "native_adapter"
if "readme" in rel or rel.startswith("docs"):
return "docs"
- if rel in {"pyproject.toml", "package.json", "cargo.toml"} or file.kind == "support":
+ if (
+ rel in {"pyproject.toml", "package.json", "cargo.toml"}
+ or file.kind == "support"
+ ):
return "support"
-
+
return "unknown"
diff --git a/src/scriber/engine/scorer.py b/src/scriber/engine/scorer.py
index dee0cca..3193d5a 100644
--- a/src/scriber/engine/scorer.py
+++ b/src/scriber/engine/scorer.py
@@ -3,14 +3,23 @@
from pathlib import Path
from scriber.core.matchers import match_pattern
-from scriber.core.models import Candidate, FileNode, ModuleGraph, ScriberConfig, SeedPath, RelationEdge
+from scriber.core.models import (
+ Candidate,
+ FileNode,
+ ModuleGraph,
+ ScriberConfig,
+ SeedPath,
+ RelationEdge,
+)
def _score(config: ScriberConfig, key: str) -> int:
return int(config.modules_config.scoring.get(key, 0))
-def _add_reason(candidate: Candidate, kind: str, label: str, example: Path | None = None) -> None:
+def _add_reason(
+ candidate: Candidate, kind: str, label: str, example: Path | None = None
+) -> None:
candidate.reason_counts[kind] = candidate.reason_counts.get(kind, 0) + 1
if example is not None:
if kind not in candidate.reason_examples:
@@ -89,18 +98,22 @@ def _add(
candidates[rel] = existing
else:
existing.score = max(existing.score, score)
-
+
_add_reason(existing, kind, label, example=seed)
if seed is not None:
existing.seed_sources.add(seed)
def _is_test_file(rel: Path, config: ScriberConfig) -> bool:
- parts = rel.parts
+ parts = rel.parts[:-1] if len(rel.parts) > 1 else ()
name = rel.name.lower()
if any(part in set(config.python.test_roots) for part in parts):
return True
- return name.startswith("test_") or name.endswith("_test.py") or name.endswith(".test.py")
+ return (
+ name.startswith("test_")
+ or name.endswith("_test.py")
+ or name.endswith(".test.py")
+ )
def _name_related(a: Path, b: Path) -> bool:
@@ -112,56 +125,55 @@ def _name_related(a: Path, b: Path) -> bool:
def _walk_weighted_neighbors(
- edges: list[RelationEdge],
- start: Path,
- depth_limit: int,
- reverse: bool = False
+ edges: list[RelationEdge], start: Path, depth_limit: int, reverse: bool = False
) -> dict[Path, float]:
import heapq
-
+
adj: dict[Path, list[tuple[Path, RelationEdge]]] = {}
for edge in edges:
u = edge.target if reverse else edge.source
v = edge.source if reverse else edge.target
adj.setdefault(u, []).append((v, edge))
-
+
queue = [(-1.0, 0, start)]
max_strength: dict[Path, float] = {start: 1.0}
best_at_state: dict[tuple[Path, int], float] = {(start, 0): 1.0}
-
+
while queue:
neg_str, depth, u = heapq.heappop(queue)
u_str = -neg_str
-
+
if u_str < best_at_state.get((u, depth), 0.0):
continue
-
+
if depth >= depth_limit:
continue
-
+
for neighbor, edge in adj.get(u, []):
if edge.kind in {"import", "reexport"}:
edge_str = 1.0 if depth == 0 else 0.88
else:
edge_str = edge.weight * edge.confidence
-
+
next_str = u_str * edge_str
next_depth = depth + 1
-
+
if next_str > max_strength.get(neighbor, 0.0):
max_strength[neighbor] = next_str
-
+
if next_str > best_at_state.get((neighbor, next_depth), 0.0):
best_at_state[(neighbor, next_depth)] = next_str
heapq.heappush(queue, (-next_str, next_depth, neighbor))
-
+
if start in max_strength:
del max_strength[start]
-
+
return max_strength
-def _walk_neighbors(edges: dict[Path, set[Path]], start: Path, depth: int) -> dict[Path, int]:
+def _walk_neighbors(
+ edges: dict[Path, set[Path]], start: Path, depth: int
+) -> dict[Path, int]:
found: dict[Path, int] = {}
frontier = {start}
visited = {start}
@@ -186,7 +198,12 @@ def _support_base_score(file: FileNode, config: ScriberConfig) -> int:
return _score(config, "project_config")
if category == "dependency file":
return _score(config, "dependency_file")
- if category in {"runtime support", "runtime config", "ci support", "tooling config"}:
+ if category in {
+ "runtime support",
+ "runtime config",
+ "ci support",
+ "tooling config",
+ }:
return _score(config, "runtime_support")
if category == "documentation":
return _score(config, "documentation")
@@ -197,11 +214,18 @@ def _is_near_seed(support_file: Path, seed: Path) -> bool:
if support_file.parent == Path("."):
return True
seed_parent = seed.parent
- return support_file.parent == seed_parent or support_file.parent in seed_parent.parents or seed_parent in support_file.parent.parents
+ return (
+ support_file.parent == seed_parent
+ or support_file.parent in seed_parent.parents
+ or seed_parent in support_file.parent.parents
+ )
def _matches_entrypoint(rel: Path, config: ScriberConfig) -> bool:
- return any(match_pattern(rel.name, pattern) for pattern in config.python.entrypoint_patterns)
+ return any(
+ match_pattern(rel.name, pattern)
+ for pattern in config.python.entrypoint_patterns
+ )
def score_candidates_project_snapshot(
@@ -215,17 +239,45 @@ def score_candidates_project_snapshot(
for rel, file in files.items():
if file.kind == "code":
if _matches_entrypoint(rel, config):
- _add(candidates, files, rel, 90, "entrypoint", "entrypoint file")
+ _add(
+ candidates,
+ files,
+ rel,
+ _score(config, "entrypoint_file"),
+ "entrypoint",
+ "entrypoint file",
+ )
elif _is_test_file(rel, config):
- _add(candidates, files, rel, 60, "test_file", "test file")
+ _add(
+ candidates,
+ files,
+ rel,
+ _score(config, "test_file"),
+ "test_file",
+ "test file",
+ )
else:
- _add(candidates, files, rel, 80, "code_file", "code file")
+ _add(
+ candidates,
+ files,
+ rel,
+ _score(config, "code_file"),
+ "code_file",
+ "code file",
+ )
elif file.kind == "support" and config.support:
base = _support_base_score(file, config)
category = file.support_category or "support file"
_add(candidates, files, rel, base, "project_support", category)
elif file.kind == "other":
- _add(candidates, files, rel, 40, "other_file", "other file")
+ _add(
+ candidates,
+ files,
+ rel,
+ _score(config, "other_file"),
+ "other_file",
+ "other file",
+ )
for candidate in candidates.values():
candidate.reason_summary = _build_reason_summary(candidate)
@@ -233,16 +285,37 @@ def score_candidates_project_snapshot(
filtered = [
candidate
for rel, candidate in candidates.items()
- if candidate.score >= config.min_score or candidate.score >= config.modules_config.tree_min_score
+ if candidate.score >= config.min_score
+ or candidate.score >= config.modules_config.tree_min_score
]
- filtered.sort(key=lambda item: (-item.score, item.file.kind != "code", item.file.relative.as_posix()))
+ filtered.sort(
+ key=lambda item: (
+ -item.score,
+ item.file.kind != "code",
+ item.file.relative.as_posix(),
+ )
+ )
if config.max_files > 0 and len(filtered) > config.max_files:
- pinned = [c for c in filtered if c.file.relative.name in {"pyproject.toml", "README.md"}]
- rest = [c for c in filtered if c.file.relative.name not in {"pyproject.toml", "README.md"}]
+ pinned = [
+ c
+ for c in filtered
+ if c.file.relative.name in {"pyproject.toml", "README.md"}
+ ]
+ rest = [
+ c
+ for c in filtered
+ if c.file.relative.name not in {"pyproject.toml", "README.md"}
+ ]
remaining = max(0, config.max_files - len(pinned))
filtered = pinned + rest[:remaining]
- filtered.sort(key=lambda item: (-item.score, item.file.kind != "code", item.file.relative.as_posix()))
+ filtered.sort(
+ key=lambda item: (
+ -item.score,
+ item.file.kind != "code",
+ item.file.relative.as_posix(),
+ )
+ )
return filtered
@@ -256,7 +329,9 @@ def score_candidates(
mode: str = "focused",
) -> list[Candidate]:
if mode == "project_snapshot":
- return score_candidates_project_snapshot(files=files, graph=graph, config=config)
+ return score_candidates_project_snapshot(
+ files=files, graph=graph, config=config
+ )
candidates: dict[Path, Candidate] = {}
scoring = config.modules_config
@@ -266,43 +341,127 @@ def score_candidates(
for seed in seeds:
for rel in seed.expanded_files:
key = "seed_folder_file" if seed.is_dir else "seed_file"
- reason = f"file inside seed folder `{seed.relative.as_posix()}`" if seed.is_dir else "seed file"
- _add(candidates, files, rel, _score(config, key), "seed_folder_file" if seed.is_dir else "seed_file", reason, seed=rel)
+ reason = (
+ f"file inside seed folder `{seed.relative.as_posix()}`"
+ if seed.is_dir
+ else "seed file"
+ )
+ _add(
+ candidates,
+ files,
+ rel,
+ _score(config, key),
+ "seed_folder_file" if seed.is_dir else "seed_file",
+ reason,
+ seed=rel,
+ )
if config.modules and scoring.enabled:
for seed_rel in seed_files:
if scoring.include_direct_dependencies:
- for dep, strength in _walk_weighted_neighbors(graph.edges, seed_rel, scoring.depth, reverse=False).items():
- score = max(scoring.tree_min_score, int(_score(config, "direct_dependency") * strength))
- _add(candidates, files, dep, score, "direct_dependency", f"direct dependency of `{seed_rel.as_posix()}`", seed=seed_rel)
+ for dep, strength in _walk_weighted_neighbors(
+ graph.edges, seed_rel, scoring.depth, reverse=False
+ ).items():
+ score = max(
+ scoring.tree_min_score,
+ int(_score(config, "direct_dependency") * strength),
+ )
+ _add(
+ candidates,
+ files,
+ dep,
+ score,
+ "direct_dependency",
+ f"direct dependency of `{seed_rel.as_posix()}`",
+ seed=seed_rel,
+ )
if scoring.include_reverse_dependencies:
- for dep, strength in _walk_weighted_neighbors(graph.edges, seed_rel, scoring.depth, reverse=True).items():
- score = max(scoring.tree_min_score, int(_score(config, "reverse_dependency") * strength))
- _add(candidates, files, dep, score, "reverse_dependency", f"imports seed `{seed_rel.as_posix()}`", seed=seed_rel)
+ for dep, strength in _walk_weighted_neighbors(
+ graph.edges, seed_rel, scoring.depth, reverse=True
+ ).items():
+ score = max(
+ scoring.tree_min_score,
+ int(_score(config, "reverse_dependency") * strength),
+ )
+ _add(
+ candidates,
+ files,
+ dep,
+ score,
+ "reverse_dependency",
+ f"imports seed `{seed_rel.as_posix()}`",
+ seed=seed_rel,
+ )
if scoring.include_same_package:
seed_parent = seed_rel.parent
for rel, file in files.items():
- if file.kind == "code" and rel.parent == seed_parent and rel not in seed_set:
- _add(candidates, files, rel, _score(config, "same_package"), "same_package", f"same package as `{seed_rel.as_posix()}`", seed=seed_rel)
+ if (
+ file.kind == "code"
+ and rel.parent == seed_parent
+ and rel not in seed_set
+ ):
+ _add(
+ candidates,
+ files,
+ rel,
+ _score(config, "same_package"),
+ "same_package",
+ f"same package as `{seed_rel.as_posix()}`",
+ seed=seed_rel,
+ )
if scoring.include_parent_entrypoints:
for rel, file in files.items():
if file.kind == "code" and _matches_entrypoint(rel, config):
- if rel.parent == Path(".") or rel.parent in seed_rel.parents or seed_rel.parent in rel.parents:
- _add(candidates, files, rel, _score(config, "parent_entrypoint"), "parent_entrypoint", f"parent/entrypoint near `{seed_rel.as_posix()}`", seed=seed_rel)
+ if (
+ rel.parent == Path(".")
+ or rel.parent in seed_rel.parents
+ or seed_rel.parent in rel.parents
+ ):
+ _add(
+ candidates,
+ files,
+ rel,
+ _score(config, "parent_entrypoint"),
+ "parent_entrypoint",
+ f"parent/entrypoint near `{seed_rel.as_posix()}`",
+ seed=seed_rel,
+ )
if scoring.include_tests:
for rel, file in files.items():
if file.kind != "code" or not _is_test_file(rel, config):
continue
- if _name_related(rel, seed_rel) or seed_rel in graph.imports.get(rel, set()):
- _add(candidates, files, rel, _score(config, "related_test"), "related_test", f"related test for `{seed_rel.as_posix()}`", seed=seed_rel)
+ if _name_related(rel, seed_rel) or seed_rel in graph.imports.get(
+ rel, set()
+ ):
+ _add(
+ candidates,
+ files,
+ rel,
+ _score(config, "related_test"),
+ "related_test",
+ f"related test for `{seed_rel.as_posix()}`",
+ seed=seed_rel,
+ )
for rel, file in files.items():
- if file.kind == "code" and rel not in seed_set and _name_related(rel, seed_rel):
- _add(candidates, files, rel, _score(config, "name_similarity"), "name_similarity", f"name similarity with `{seed_rel.as_posix()}`", seed=seed_rel)
+ if (
+ file.kind == "code"
+ and rel not in seed_set
+ and _name_related(rel, seed_rel)
+ ):
+ _add(
+ candidates,
+ files,
+ rel,
+ _score(config, "name_similarity"),
+ "name_similarity",
+ f"name similarity with `{seed_rel.as_posix()}`",
+ seed=seed_rel,
+ )
if config.support:
for rel, file in files.items():
@@ -311,24 +470,52 @@ def score_candidates(
base = _support_base_score(file, config)
reason = file.support_category or "support file"
if rel.name == "pyproject.toml":
- _add(candidates, files, rel, _score(config, "project_config"), "project_support", "project config/root file")
+ _add(
+ candidates,
+ files,
+ rel,
+ _score(config, "project_config"),
+ "project_support",
+ "project config/root file",
+ )
continue
added = False
for seed_rel in seed_files:
if _is_near_seed(rel, seed_rel):
- _add(candidates, files, rel, max(base, _score(config, "support_near_seed")), "support_near_seed", f"{reason} near `{seed_rel.as_posix()}`", seed=seed_rel)
+ _add(
+ candidates,
+ files,
+ rel,
+ max(base, _score(config, "support_near_seed")),
+ "support_near_seed",
+ f"{reason} near `{seed_rel.as_posix()}`",
+ seed=seed_rel,
+ )
added = True
- if not added and file.relative.parent == Path(".") and scoring.include_project_configs:
+ if (
+ not added
+ and file.relative.parent == Path(".")
+ and scoring.include_project_configs
+ ):
_add(candidates, files, rel, base, "project_support", reason)
else:
if config.support:
pyproject = files.get(Path("pyproject.toml"))
if pyproject:
- _add(candidates, files, Path("pyproject.toml"), _score(config, "project_config"), "project_support", "project config/root file")
+ _add(
+ candidates,
+ files,
+ Path("pyproject.toml"),
+ _score(config, "project_config"),
+ "project_support",
+ "project config/root file",
+ )
for candidate in candidates.values():
if len(candidate.seed_sources) > 1:
- candidate.score = min(100, candidate.score + _score(config, "shared_dependency_bonus"))
+ candidate.score = min(
+ 100, candidate.score + _score(config, "shared_dependency_bonus")
+ )
_add_reason(candidate, "shared_dependency", "shared by multiple seed paths")
for candidate in candidates.values():
@@ -338,15 +525,39 @@ def score_candidates(
filtered = [
candidate
for rel, candidate in candidates.items()
- if rel in required or candidate.score >= config.min_score or candidate.score >= config.modules_config.tree_min_score
+ if rel in required
+ or candidate.score >= config.min_score
+ or candidate.score >= config.modules_config.tree_min_score
]
- filtered.sort(key=lambda item: (-item.score, item.file.kind != "code", item.file.relative.as_posix()))
+ filtered.sort(
+ key=lambda item: (
+ -item.score,
+ item.file.kind != "code",
+ item.file.relative.as_posix(),
+ )
+ )
if config.max_files > 0 and len(filtered) > config.max_files:
- seeds_first = [candidate for candidate in filtered if candidate.file.relative in required or candidate.file.relative.name in {"pyproject.toml", "README.md"}]
- rest = [candidate for candidate in filtered if candidate.file.relative not in required and candidate.file.relative.name not in {"pyproject.toml", "README.md"}]
+ seeds_first = [
+ candidate
+ for candidate in filtered
+ if candidate.file.relative in required
+ or candidate.file.relative.name in {"pyproject.toml", "README.md"}
+ ]
+ rest = [
+ candidate
+ for candidate in filtered
+ if candidate.file.relative not in required
+ and candidate.file.relative.name not in {"pyproject.toml", "README.md"}
+ ]
remaining = max(0, config.max_files - len(seeds_first))
filtered = seeds_first + rest[:remaining]
- filtered.sort(key=lambda item: (-item.score, item.file.kind != "code", item.file.relative.as_posix()))
+ filtered.sort(
+ key=lambda item: (
+ -item.score,
+ item.file.kind != "code",
+ item.file.relative.as_posix(),
+ )
+ )
return filtered
diff --git a/src/scriber/graph/analyzers/__init__.py b/src/scriber/graph/analyzers/__init__.py
index 129c757..307bcf6 100644
--- a/src/scriber/graph/analyzers/__init__.py
+++ b/src/scriber/graph/analyzers/__init__.py
@@ -7,10 +7,13 @@
from scriber.graph.analyzers.config_refs import ConfigRefsAnalyzer
from scriber.graph.analyzers.docs import DocsAnalyzer
-def generate_cheap_relations(files: dict[Path, Any], edge_cls: Any, is_native: bool = False) -> list[Any]:
+
+def generate_cheap_relations(
+ files: dict[Path, Any], edge_cls: Any, is_native: bool = False
+) -> list[Any]:
indexes = GraphIndexes.build(files)
- config = None # Passed as None for these simple analyzers
-
+ config = None # Passed as None for these simple analyzers
+
analyzers = [
TestsAnalyzer(),
PackageAnalyzer(),
@@ -18,9 +21,9 @@ def generate_cheap_relations(files: dict[Path, Any], edge_cls: Any, is_native: b
ConfigRefsAnalyzer(),
DocsAnalyzer(),
]
-
+
edges = []
for analyzer in analyzers:
edges.extend(analyzer.analyze(files, indexes, config, edge_cls, is_native))
-
+
return edges
diff --git a/src/scriber/graph/analyzers/base.py b/src/scriber/graph/analyzers/base.py
index 9abe43f..827d388 100644
--- a/src/scriber/graph/analyzers/base.py
+++ b/src/scriber/graph/analyzers/base.py
@@ -11,5 +11,6 @@
class RelationAnalyzer(Protocol):
name: str
- def analyze(self, files: dict[Path, FileNode], indexes: GraphIndexes, config: ScriberConfig) -> Iterable[RelationEdge]:
- ...
+ def analyze(
+ self, files: dict[Path, FileNode], indexes: GraphIndexes, config: ScriberConfig
+ ) -> Iterable[RelationEdge]: ...
diff --git a/src/scriber/graph/analyzers/config_refs.py b/src/scriber/graph/analyzers/config_refs.py
index 481b213..e7e0f17 100644
--- a/src/scriber/graph/analyzers/config_refs.py
+++ b/src/scriber/graph/analyzers/config_refs.py
@@ -4,14 +4,28 @@
from scriber.core.models import FileNode, ScriberConfig
from scriber.graph.indexes import GraphIndexes
+
def is_config_file(f: FileNode) -> bool:
name = f.relative.name.lower()
- return name in {"pyproject.toml", "setup.py", "package.json", "dockerfile"} or f.relative.suffix.lower() in {".toml", ".yaml", ".yml", ".json"}
+ return name in {
+ "pyproject.toml",
+ "setup.py",
+ "package.json",
+ "dockerfile",
+ } or f.relative.suffix.lower() in {".toml", ".yaml", ".yml", ".json"}
+
class ConfigRefsAnalyzer:
name = "config_refs"
- def analyze(self, files: dict[Path, FileNode], indexes: GraphIndexes, config: ScriberConfig | None, edge_cls: Any, is_native: bool) -> Iterable:
+ def analyze(
+ self,
+ files: dict[Path, FileNode],
+ indexes: GraphIndexes,
+ config: ScriberConfig | None,
+ edge_cls: Any,
+ is_native: bool,
+ ) -> Iterable:
edges = []
for rel, node in files.items():
if is_config_file(node):
@@ -19,17 +33,23 @@ def analyze(self, files: dict[Path, FileNode], indexes: GraphIndexes, config: Sc
content = node.absolute.read_text(encoding="utf-8", errors="ignore")
for crel, cnode in files.items():
if cnode.kind == "code":
- if crel.as_posix() in content or (len(crel.name) > 4 and crel.name != "__init__.py" and crel.name in content):
- edges.append(edge_cls(
- source=str(rel) if is_native else rel,
- target=str(crel) if is_native else crel,
- kind="config_refs_code",
- weight=0.6,
- confidence=0.8,
- evidence=f"Config {rel.name} references {crel.name}",
- line=None,
- analyzer="config_refs:indexed"
- ))
+ if crel.as_posix() in content or (
+ len(crel.name) > 4
+ and crel.name != "__init__.py"
+ and crel.name in content
+ ):
+ edges.append(
+ edge_cls(
+ source=str(rel) if is_native else rel,
+ target=str(crel) if is_native else crel,
+ kind="config_refs_code",
+ weight=0.6,
+ confidence=0.8,
+ evidence=f"Config {rel.name} references {crel.name}",
+ line=None,
+ analyzer="config_refs:indexed",
+ )
+ )
except Exception:
pass
return edges
diff --git a/src/scriber/graph/analyzers/docs.py b/src/scriber/graph/analyzers/docs.py
index 6afc72e..ca623f7 100644
--- a/src/scriber/graph/analyzers/docs.py
+++ b/src/scriber/graph/analyzers/docs.py
@@ -4,29 +4,46 @@
from scriber.core.models import FileNode, ScriberConfig
from scriber.graph.indexes import GraphIndexes
+
class DocsAnalyzer:
name = "docs"
- def analyze(self, files: dict[Path, FileNode], indexes: GraphIndexes, config: ScriberConfig | None, edge_cls: Any, is_native: bool) -> Iterable:
+ def analyze(
+ self,
+ files: dict[Path, FileNode],
+ indexes: GraphIndexes,
+ config: ScriberConfig | None,
+ edge_cls: Any,
+ is_native: bool,
+ ) -> Iterable:
edges = []
for rel, node in files.items():
name_lower = node.relative.name.lower()
- if name_lower in {"readme.md", "readme.txt", "readme"} or "doc" in name_lower:
+ if (
+ name_lower in {"readme.md", "readme.txt", "readme"}
+ or "doc" in name_lower
+ ):
try:
content = node.absolute.read_text(encoding="utf-8", errors="ignore")
for crel, cnode in files.items():
if cnode.kind == "code":
- if crel.as_posix() in content or (len(crel.name) > 4 and crel.name != "__init__.py" and crel.name in content):
- edges.append(edge_cls(
- source=str(rel) if is_native else rel,
- target=str(crel) if is_native else crel,
- kind="doc_mentions_code",
- weight=0.42,
- confidence=0.8,
- evidence=f"{node.relative.name} mentions {crel.name}",
- line=None,
- analyzer="docs:indexed"
- ))
+ if crel.as_posix() in content or (
+ len(crel.name) > 4
+ and crel.name != "__init__.py"
+ and crel.name in content
+ ):
+ edges.append(
+ edge_cls(
+ source=str(rel) if is_native else rel,
+ target=str(crel) if is_native else crel,
+ kind="doc_mentions_code",
+ weight=0.42,
+ confidence=0.8,
+ evidence=f"{node.relative.name} mentions {crel.name}",
+ line=None,
+ analyzer="docs:indexed",
+ )
+ )
except Exception:
pass
return edges
diff --git a/src/scriber/graph/analyzers/env.py b/src/scriber/graph/analyzers/env.py
index f4eb938..566b679 100644
--- a/src/scriber/graph/analyzers/env.py
+++ b/src/scriber/graph/analyzers/env.py
@@ -5,14 +5,23 @@
from scriber.core.models import FileNode, ScriberConfig
from scriber.graph.indexes import GraphIndexes
+
class EnvAnalyzer:
name = "env"
- def analyze(self, files: dict[Path, FileNode], indexes: GraphIndexes, config: ScriberConfig | None, edge_cls: Any, is_native: bool) -> Iterable:
+ def analyze(
+ self,
+ files: dict[Path, FileNode],
+ indexes: GraphIndexes,
+ config: ScriberConfig | None,
+ edge_cls: Any,
+ is_native: bool,
+ ) -> Iterable:
edges = []
file_envs = {}
for rel, node in files.items():
- if node.kind != "code": continue
+ if node.kind != "code":
+ continue
try:
content = node.absolute.read_text(encoding="utf-8", errors="ignore")
keys = self.extract_env_keys(content)
@@ -22,29 +31,36 @@ def analyze(self, files: dict[Path, FileNode], indexes: GraphIndexes, config: Sc
indexes.env_key_to_files.setdefault(k, []).append(node)
except Exception:
pass
-
+
for key, nodes in indexes.env_key_to_files.items():
for i, n1 in enumerate(nodes):
for j, n2 in enumerate(nodes):
- if i == j: continue
- edges.append(edge_cls(
- source=str(n1.relative) if is_native else n1.relative,
- target=str(n2.relative) if is_native else n2.relative,
- kind="env_key",
- weight=0.4,
- confidence=0.9,
- evidence=f"Shared env key: {key}",
- line=None,
- analyzer="env:indexed"
- ))
+ if i == j:
+ continue
+ edges.append(
+ edge_cls(
+ source=str(n1.relative) if is_native else n1.relative,
+ target=str(n2.relative) if is_native else n2.relative,
+ kind="env_key",
+ weight=0.4,
+ confidence=0.9,
+ evidence=f"Shared env key: {key}",
+ line=None,
+ analyzer="env:indexed",
+ )
+ )
return edges
def extract_env_keys(self, content: str) -> set[str]:
keys = set()
- for match in re.finditer(r'os\.environ(?:\[|\.get\()[\'"]([A-Za-z0-9_]+)[\'"]', content):
+ for match in re.finditer(
+ r'os\.environ(?:\[|\.get\()[\'"]([A-Za-z0-9_]+)[\'"]', content
+ ):
keys.add(match.group(1))
for match in re.finditer(r'os\.getenv\([\'"]([A-Za-z0-9_]+)[\'"]\)', content):
keys.add(match.group(1))
- for match in re.finditer(r'process\.env(?:\[[\'"]([A-Za-z0-9_]+)[\'"]\]|\.([A-Za-z0-9_]+))', content):
+ for match in re.finditer(
+ r'process\.env(?:\[[\'"]([A-Za-z0-9_]+)[\'"]\]|\.([A-Za-z0-9_]+))', content
+ ):
keys.add(match.group(1) or match.group(2))
return keys
diff --git a/src/scriber/graph/analyzers/package.py b/src/scriber/graph/analyzers/package.py
index 7626b6e..b7f7c2f 100644
--- a/src/scriber/graph/analyzers/package.py
+++ b/src/scriber/graph/analyzers/package.py
@@ -4,27 +4,39 @@
from scriber.core.models import FileNode, ScriberConfig
from scriber.graph.indexes import GraphIndexes
+
class PackageAnalyzer:
name = "package"
- def analyze(self, files: dict[Path, FileNode], indexes: GraphIndexes, config: ScriberConfig | None, edge_cls: Any, is_native: bool) -> Iterable:
+ def analyze(
+ self,
+ files: dict[Path, FileNode],
+ indexes: GraphIndexes,
+ config: ScriberConfig | None,
+ edge_cls: Any,
+ is_native: bool,
+ ) -> Iterable:
edges = []
for d, siblings in indexes.by_dir.items():
code_siblings = [s for s in siblings if s.kind == "code"]
for s1 in code_siblings:
count = 0
for s2 in code_siblings:
- if s1 == s2: continue
+ if s1 == s2:
+ continue
count += 1
- if count > 8: break
- edges.append(edge_cls(
- source=str(s1.relative) if is_native else s1.relative,
- target=str(s2.relative) if is_native else s2.relative,
- kind="same_package",
- weight=0.5,
- confidence=1.0,
- evidence=None,
- line=None,
- analyzer="package:indexed"
- ))
+ if count > 8:
+ break
+ edges.append(
+ edge_cls(
+ source=str(s1.relative) if is_native else s1.relative,
+ target=str(s2.relative) if is_native else s2.relative,
+ kind="same_package",
+ weight=0.5,
+ confidence=1.0,
+ evidence=None,
+ line=None,
+ analyzer="package:indexed",
+ )
+ )
return edges
diff --git a/src/scriber/graph/analyzers/tests.py b/src/scriber/graph/analyzers/tests.py
index 409f63f..24d7f0f 100644
--- a/src/scriber/graph/analyzers/tests.py
+++ b/src/scriber/graph/analyzers/tests.py
@@ -4,33 +4,57 @@
from scriber.core.models import FileNode, ScriberConfig
from scriber.graph.indexes import GraphIndexes
+
class TestsAnalyzer:
name = "tests"
- def analyze(self, files: dict[Path, FileNode], indexes: GraphIndexes, config: ScriberConfig | None, edge_cls: Any, is_native: bool) -> Iterable:
+ def analyze(
+ self,
+ files: dict[Path, FileNode],
+ indexes: GraphIndexes,
+ config: ScriberConfig | None,
+ edge_cls: Any,
+ is_native: bool,
+ ) -> Iterable:
edges = []
for rel, node in files.items():
- if node.kind != "code": continue
+ if node.kind != "code":
+ continue
stem = rel.stem.lower()
name = rel.name.lower()
- clean_stem = stem.replace("test_", "").replace("_test", "").replace(".test", "")
- is_test = name.startswith("test_") or name.endswith("_test.py") or ".test." in name
-
+ clean_stem = (
+ stem.replace("test_", "").replace("_test", "").replace(".test", "")
+ )
+ is_test = (
+ name.startswith("test_")
+ or name.endswith("_test.py")
+ or ".test." in name
+ )
+
if is_test and clean_stem:
targets = indexes.by_clean_stem.get(clean_stem, [])
for target_node in targets:
- if target_node.relative == rel: continue
+ if target_node.relative == rel:
+ continue
target_name = target_node.relative.name.lower()
- target_is_test = target_name.startswith("test_") or target_name.endswith("_test.py") or ".test." in target_name
+ target_is_test = (
+ target_name.startswith("test_")
+ or target_name.endswith("_test.py")
+ or ".test." in target_name
+ )
if not target_is_test:
- edges.append(edge_cls(
- source=str(rel) if is_native else rel,
- target=str(target_node.relative) if is_native else target_node.relative,
- kind="test_of",
- weight=0.85,
- confidence=0.9,
- evidence=f"test filename {rel.name} matches {target_node.relative.name}",
- line=None,
- analyzer="tests:indexed"
- ))
+ edges.append(
+ edge_cls(
+ source=str(rel) if is_native else rel,
+ target=str(target_node.relative)
+ if is_native
+ else target_node.relative,
+ kind="test_of",
+ weight=0.85,
+ confidence=0.9,
+ evidence=f"test filename {rel.name} matches {target_node.relative.name}",
+ line=None,
+ analyzer="tests:indexed",
+ )
+ )
return edges
diff --git a/src/scriber/graph/builder.py b/src/scriber/graph/builder.py
index 17afd2a..33f2c98 100644
--- a/src/scriber/graph/builder.py
+++ b/src/scriber/graph/builder.py
@@ -1,14 +1,25 @@
from __future__ import annotations
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ from scriber.cache import ScriberCache
from pathlib import Path
from scriber.core.models import FileNode, ScriberConfig
from scriber.graph.model import ModuleGraph, RelationEdge
-from scriber.graph.languages.python import build_module_map, parse_python_imports, resolve_import_record
-from scriber.scanner.files import read_text_lossy
-
-
-def build_graph(files: dict[Path, FileNode], config: ScriberConfig, cache: ScriberCache | None = None) -> ModuleGraph:
+from scriber.graph.languages.python import (
+ build_module_map,
+ parse_python_imports,
+ resolve_import_record,
+)
+
+
+def build_graph(
+ files: dict[Path, FileNode],
+ config: ScriberConfig,
+ cache: ScriberCache | None = None,
+) -> ModuleGraph:
graph = ModuleGraph()
if not files:
return graph
@@ -23,16 +34,26 @@ def build_graph(files: dict[Path, FileNode], config: ScriberConfig, cache: Scrib
dir_to_files.setdefault(node.absolute.parent, []).append(node)
sample = next(iter(files.values()))
- root = Path(sample.absolute.as_posix()[:len(sample.absolute.as_posix()) - len(sample.relative.as_posix())]).resolve()
+ root = Path(
+ sample.absolute.as_posix()[
+ : len(sample.absolute.as_posix()) - len(sample.relative.as_posix())
+ ]
+ ).resolve()
if cache is None:
from scriber.cache import ScriberCache
+
cache = ScriberCache(config, root)
module_to_path, path_to_module = build_module_map(files, config.python)
for rel, file in files.items():
- if file.kind != "code" or file.is_binary or file.language not in {"python", "javascript", "typescript", "rust", "go", "c", "cpp"}:
+ if (
+ file.kind != "code"
+ or file.is_binary
+ or file.language
+ not in {"python", "javascript", "typescript", "rust", "go", "c", "cpp"}
+ ):
continue
try:
@@ -44,12 +65,20 @@ def build_graph(files: dict[Path, FileNode], config: ScriberConfig, cache: Scrib
cached_data = cache.get_file(rel, mtime_ns, size)
if cached_data is not None:
- cached_imports = cache.get_imports(rel)
+ cached_imports = cache.get_imports(rel, mtime_ns, size)
if cached_imports is not None:
for target in cached_imports:
if target in files:
- graph.imports.setdefault(rel, set()).add(target)
- graph.imported_by.setdefault(target, set()).add(rel)
+ graph.add_edge(
+ RelationEdge(
+ source=rel,
+ target=target,
+ kind="import",
+ weight=1.0,
+ confidence=0.98,
+ analyzer=f"imports:{file.language}",
+ )
+ )
continue
resolved_set = set()
@@ -74,7 +103,11 @@ def build_graph(files: dict[Path, FileNode], config: ScriberConfig, cache: Scrib
resolved_set.add(target)
elif file.language in {"javascript", "typescript", "react"}:
- from scriber.graph.languages.javascript import parse_javascript_imports, resolve_javascript_import
+ from scriber.graph.languages.javascript import (
+ parse_javascript_imports,
+ resolve_javascript_import,
+ )
+
try:
source = file.read_text()
except OSError:
@@ -87,7 +120,11 @@ def build_graph(files: dict[Path, FileNode], config: ScriberConfig, cache: Scrib
resolved_set.add(target)
elif file.language == "rust":
- from scriber.graph.languages.rust import parse_rust_imports, resolve_rust_import
+ from scriber.graph.languages.rust import (
+ parse_rust_imports,
+ resolve_rust_import,
+ )
+
try:
source = file.read_text()
except OSError:
@@ -101,6 +138,7 @@ def build_graph(files: dict[Path, FileNode], config: ScriberConfig, cache: Scrib
elif file.language == "go":
from scriber.graph.languages.go import parse_go_imports, resolve_go_import
+
try:
source = file.read_text()
except OSError:
@@ -113,7 +151,11 @@ def build_graph(files: dict[Path, FileNode], config: ScriberConfig, cache: Scrib
resolved_set.add(target)
elif file.language in {"c", "cpp"}:
- from scriber.graph.languages.cpp import parse_cpp_includes, resolve_cpp_include
+ from scriber.graph.languages.cpp import (
+ parse_cpp_includes,
+ resolve_cpp_include,
+ )
+
try:
source = file.read_text()
except OSError:
@@ -125,18 +167,17 @@ def build_graph(files: dict[Path, FileNode], config: ScriberConfig, cache: Scrib
continue
resolved_set.add(target)
-
- from scriber.core.models import RelationEdge
-
for target in resolved_set:
- graph.add_edge(RelationEdge(
- source=rel,
- target=target,
- kind="import",
- weight=1.0,
- confidence=0.98,
- analyzer=f"imports:{file.language}",
- ))
+ graph.add_edge(
+ RelationEdge(
+ source=rel,
+ target=target,
+ kind="import",
+ weight=1.0,
+ confidence=0.98,
+ analyzer=f"imports:{file.language}",
+ )
+ )
cache.set_imports(rel, resolved_set)
diff --git a/src/scriber/graph/indexes.py b/src/scriber/graph/indexes.py
index aa61518..905d952 100644
--- a/src/scriber/graph/indexes.py
+++ b/src/scriber/graph/indexes.py
@@ -20,18 +20,18 @@ class GraphIndexes:
@classmethod
def build(cls, files: dict[Path, FileNode]) -> GraphIndexes:
indexes = cls()
-
+
for rel, node in files.items():
indexes.by_dir.setdefault(rel.parent, []).append(node)
indexes.by_stem.setdefault(rel.stem, []).append(node)
-
- clean_stem = re.sub(r'[^a-zA-Z0-9]', '', rel.stem).lower()
+
+ clean_stem = re.sub(r"[^a-zA-Z0-9]", "", rel.stem).lower()
if clean_stem:
indexes.by_clean_stem.setdefault(clean_stem, []).append(node)
-
+
indexes.by_language.setdefault(node.language, []).append(node)
-
- # Simple indexing for .env and docs is done per analyzer as needed,
+
+ # Simple indexing for .env and docs is done per analyzer as needed,
# but we can initialize the dicts here.
-
+
return indexes
diff --git a/src/scriber/graph/languages/cpp.py b/src/scriber/graph/languages/cpp.py
index 5c19732..2de541a 100644
--- a/src/scriber/graph/languages/cpp.py
+++ b/src/scriber/graph/languages/cpp.py
@@ -19,20 +19,18 @@ def parse_cpp_includes(source: str) -> list[str]:
def resolve_cpp_include(
- include_spec: str,
- current_file: FileNode,
- absolute_to_file: dict[Path, FileNode]
+ include_spec: str, current_file: FileNode, absolute_to_file: dict[Path, FileNode]
) -> set[Path]:
"""Resolve a C/C++ include specifier to a project file path."""
resolved = set()
parent = current_file.absolute.parent
-
+
# 1. Try resolving relative to current file's directory
try:
candidate = (parent / include_spec).resolve(strict=False)
except Exception:
candidate = parent / include_spec
-
+
node = absolute_to_file.get(candidate)
if node and not node.is_binary:
resolved.add(node.relative)
diff --git a/src/scriber/graph/languages/extractor.py b/src/scriber/graph/languages/extractor.py
index 333e74e..2d7dc75 100644
--- a/src/scriber/graph/languages/extractor.py
+++ b/src/scriber/graph/languages/extractor.py
@@ -3,6 +3,7 @@
from typing import Any
from scriber.core.symbols import SymbolNode, SymbolIndex
+
class PythonSymbolVisitor(ast.NodeVisitor):
def __init__(self, file_path: Path, index: SymbolIndex):
self.file_path = file_path
@@ -12,16 +13,16 @@ def __init__(self, file_path: Path, index: SymbolIndex):
def visit_ClassDef(self, node: ast.ClassDef) -> Any:
start = node.lineno
end = getattr(node, "end_lineno", start)
-
+
symbol = SymbolNode(
name=node.name,
kind="class",
line_start=start,
line_end=end,
- parent_name=self.current_parent
+ parent_name=self.current_parent,
)
self.index.add_symbol(self.file_path, symbol)
-
+
old_parent = self.current_parent
self.current_parent = node.name
self.generic_visit(node)
@@ -30,16 +31,16 @@ def visit_ClassDef(self, node: ast.ClassDef) -> Any:
def visit_FunctionDef(self, node: ast.FunctionDef) -> Any:
start = node.lineno
end = getattr(node, "end_lineno", start)
-
+
symbol = SymbolNode(
name=node.name,
kind="function",
line_start=start,
line_end=end,
- parent_name=self.current_parent
+ parent_name=self.current_parent,
)
self.index.add_symbol(self.file_path, symbol)
-
+
old_parent = self.current_parent
self.current_parent = node.name
self.generic_visit(node)
@@ -48,23 +49,25 @@ def visit_FunctionDef(self, node: ast.FunctionDef) -> Any:
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> Any:
start = node.lineno
end = getattr(node, "end_lineno", start)
-
+
symbol = SymbolNode(
name=node.name,
kind="function",
line_start=start,
line_end=end,
- parent_name=self.current_parent
+ parent_name=self.current_parent,
)
self.index.add_symbol(self.file_path, symbol)
-
+
old_parent = self.current_parent
self.current_parent = node.name
self.generic_visit(node)
self.current_parent = old_parent
-def extract_python_symbols(file_path: Path, source_code: str, index: SymbolIndex) -> None:
+def extract_python_symbols(
+ file_path: Path, source_code: str, index: SymbolIndex
+) -> None:
try:
tree = ast.parse(source_code, filename=str(file_path))
visitor = PythonSymbolVisitor(file_path, index)
diff --git a/src/scriber/graph/languages/go.py b/src/scriber/graph/languages/go.py
index 25b5fab..49bd77f 100644
--- a/src/scriber/graph/languages/go.py
+++ b/src/scriber/graph/languages/go.py
@@ -6,7 +6,7 @@
IMPORT_SINGLE_RE = re.compile(r'\bimport\s+[\'"]([^\'"]+)[\'"]')
-IMPORT_BLOCK_RE = re.compile(r'\bimport\s*\(([^)]+)\)')
+IMPORT_BLOCK_RE = re.compile(r"\bimport\s*\(([^)]+)\)")
def parse_go_imports(source: str) -> list[str]:
@@ -25,24 +25,29 @@ def parse_go_imports(source: str) -> list[str]:
return imports
-def resolve_go_import(import_spec: str, current_file: FileNode, dir_to_files: dict[Path, list[FileNode]], project_root: Path) -> set[Path]:
+def resolve_go_import(
+ import_spec: str,
+ current_file: FileNode,
+ dir_to_files: dict[Path, list[FileNode]],
+ project_root: Path,
+) -> set[Path]:
resolved = set()
go_mod_path = project_root / "go.mod"
module_name = None
if go_mod_path.exists():
try:
content = go_mod_path.read_text(encoding="utf-8")
- m = re.search(r'^\s*module\s+(\S+)', content, re.MULTILINE)
+ m = re.search(r"^\s*module\s+(\S+)", content, re.MULTILINE)
if m:
module_name = m.group(1)
except Exception:
pass
if module_name and import_spec.startswith(module_name):
- rel_spec = import_spec[len(module_name):].lstrip("/")
+ rel_spec = import_spec[len(module_name) :].lstrip("/")
target_dir = (project_root / rel_spec).resolve()
for node in dir_to_files.get(target_dir, []):
if node.language == "go":
resolved.add(node.relative)
-
+
return resolved
diff --git a/src/scriber/graph/languages/javascript.py b/src/scriber/graph/languages/javascript.py
index 9ca43f7..385e918 100644
--- a/src/scriber/graph/languages/javascript.py
+++ b/src/scriber/graph/languages/javascript.py
@@ -21,7 +21,9 @@ def parse_javascript_imports(source: str) -> list[str]:
return imports
-def resolve_javascript_import(import_spec: str, current_file: FileNode, absolute_to_file: dict[Path, FileNode]) -> set[Path]:
+def resolve_javascript_import(
+ import_spec: str, current_file: FileNode, absolute_to_file: dict[Path, FileNode]
+) -> set[Path]:
resolved = set()
if not import_spec.startswith("."):
return resolved
@@ -31,8 +33,19 @@ def resolve_javascript_import(import_spec: str, current_file: FileNode, absolute
base_path = Path(os.path.abspath(parent / import_spec))
except Exception:
base_path = (parent / import_spec).resolve(strict=False)
-
- extensions = ["", ".ts", ".tsx", ".js", ".jsx", ".d.ts"]
+
+ extensions = [
+ "",
+ ".ts",
+ ".tsx",
+ ".js",
+ ".jsx",
+ ".d.ts",
+ ".vue",
+ ".svelte",
+ ".astro",
+ ".json",
+ ]
for ext in extensions:
candidate = base_path.with_name(base_path.name + ext) if ext else base_path
node = absolute_to_file.get(candidate)
@@ -41,7 +54,15 @@ def resolve_javascript_import(import_spec: str, current_file: FileNode, absolute
return resolved
# Try index files
- for index_name in ["index.ts", "index.tsx", "index.js", "index.jsx"]:
+ for index_name in [
+ "index.ts",
+ "index.tsx",
+ "index.js",
+ "index.jsx",
+ "index.vue",
+ "index.svelte",
+ "index.astro",
+ ]:
candidate = base_path / index_name
node = absolute_to_file.get(candidate)
if node and not node.is_binary:
diff --git a/src/scriber/graph/languages/python.py b/src/scriber/graph/languages/python.py
index 60af766..cc8fc1e 100644
--- a/src/scriber/graph/languages/python.py
+++ b/src/scriber/graph/languages/python.py
@@ -25,11 +25,15 @@ def parse_python_imports(path: Path, source: str) -> list[ImportRecord]:
for node in ast.walk(tree):
if isinstance(node, ast.Import):
for alias in node.names:
- imports.append(ImportRecord(kind="import", module=alias.name, names=(), level=0))
+ imports.append(
+ ImportRecord(kind="import", module=alias.name, names=(), level=0)
+ )
elif isinstance(node, ast.ImportFrom):
module = node.module or ""
names = tuple(alias.name for alias in node.names if alias.name != "*")
- imports.append(ImportRecord(kind="from", module=module, names=names, level=node.level))
+ imports.append(
+ ImportRecord(kind="from", module=module, names=names, level=node.level)
+ )
return imports
@@ -55,7 +59,11 @@ def module_name_for_file(file: FileNode, python: PythonConfig) -> str | None:
if file.language != "python":
return None
rel = file.relative
- roots = sorted(python.source_roots, key=lambda item: 0 if item == "." else len(item), reverse=True)
+ roots = sorted(
+ python.source_roots,
+ key=lambda item: 0 if item == "." else len(item),
+ reverse=True,
+ )
for source_root in roots:
if not _is_under(rel, source_root):
continue
@@ -73,7 +81,9 @@ def module_name_for_file(file: FileNode, python: PythonConfig) -> str | None:
return None
-def build_module_map(files: dict[Path, FileNode], python: PythonConfig) -> tuple[dict[str, Path], dict[Path, str]]:
+def build_module_map(
+ files: dict[Path, FileNode], python: PythonConfig
+) -> tuple[dict[str, Path], dict[Path, str]]:
module_to_path: dict[str, Path] = {}
path_to_module: dict[Path, str] = {}
for rel, file in files.items():
@@ -85,7 +95,9 @@ def build_module_map(files: dict[Path, FileNode], python: PythonConfig) -> tuple
return module_to_path, path_to_module
-def resolve_relative_module(current_module: str, current_is_init: bool, record: ImportRecord) -> str:
+def resolve_relative_module(
+ current_module: str, current_is_init: bool, record: ImportRecord
+) -> str:
if record.level <= 0:
return record.module
if current_is_init:
@@ -114,7 +126,11 @@ def resolve_import_record(
if record.kind == "import":
candidates.append(record.module)
else:
- base = resolve_relative_module(current_module, current_is_init, record) if record.level else record.module
+ base = (
+ resolve_relative_module(current_module, current_is_init, record)
+ if record.level
+ else record.module
+ )
for name in record.names:
if base:
candidates.append(f"{base}.{name}")
diff --git a/src/scriber/graph/languages/rust.py b/src/scriber/graph/languages/rust.py
index 14feecc..07a9f73 100644
--- a/src/scriber/graph/languages/rust.py
+++ b/src/scriber/graph/languages/rust.py
@@ -5,8 +5,8 @@
from scriber.core.models import FileNode
-MOD_RE = re.compile(r'\bmod\s+(\w+)\s*;')
-USE_RE = re.compile(r'\buse\s+([^;]+)\s*;')
+MOD_RE = re.compile(r"\bmod\s+(\w+)\s*;")
+USE_RE = re.compile(r"\buse\s+([^;]+)\s*;")
def parse_rust_imports(source: str) -> list[tuple[str, str]]:
@@ -28,15 +28,14 @@ def parse_rust_imports(source: str) -> list[tuple[str, str]]:
return imports
-def resolve_rust_import(kind: str, spec: str, current_file: FileNode, absolute_to_file: dict[Path, FileNode]) -> set[Path]:
+def resolve_rust_import(
+ kind: str, spec: str, current_file: FileNode, absolute_to_file: dict[Path, FileNode]
+) -> set[Path]:
resolved = set()
parent = current_file.absolute.parent
if kind == "mod":
- candidates = [
- parent / f"{spec}.rs",
- parent / spec / "mod.rs"
- ]
+ candidates = [parent / f"{spec}.rs", parent / spec / "mod.rs"]
for cand in candidates:
node = absolute_to_file.get(cand)
if node:
@@ -65,7 +64,7 @@ def resolve_rust_import(kind: str, spec: str, current_file: FileNode, absolute_t
module_path = crate_root / Path(*sub_parts[:end])
candidates = [
module_path.with_name(module_path.name + ".rs"),
- module_path / "mod.rs"
+ module_path / "mod.rs",
]
for cand in candidates:
node = absolute_to_file.get(cand)
@@ -80,7 +79,7 @@ def resolve_rust_import(kind: str, spec: str, current_file: FileNode, absolute_t
module_path = crate_root / Path(*sub_parts[:end])
candidates = [
module_path.with_name(module_path.name + ".rs"),
- module_path / "mod.rs"
+ module_path / "mod.rs",
]
for cand in candidates:
node = absolute_to_file.get(cand)
@@ -95,7 +94,7 @@ def resolve_rust_import(kind: str, spec: str, current_file: FileNode, absolute_t
module_path = crate_root / Path(*sub_parts[:end])
candidates = [
module_path.with_name(module_path.name + ".rs"),
- module_path / "mod.rs"
+ module_path / "mod.rs",
]
for cand in candidates:
node = absolute_to_file.get(cand)
diff --git a/src/scriber/graph/model.py b/src/scriber/graph/model.py
index d87376c..f0996e1 100644
--- a/src/scriber/graph/model.py
+++ b/src/scriber/graph/model.py
@@ -25,6 +25,7 @@
"entrypoint_to_module",
]
+
@dataclass(frozen=True, slots=True)
class RelationEdge:
source: Path
@@ -36,6 +37,7 @@ class RelationEdge:
line: int | None = None
analyzer: str = "unknown"
+
@dataclass(slots=True)
class RelationGraph:
edges: list[RelationEdge] = field(default_factory=list)
@@ -53,6 +55,7 @@ def add_edge(self, edge: RelationEdge) -> None:
self.imports.setdefault(edge.source, set()).add(edge.target)
self.imported_by.setdefault(edge.target, set()).add(edge.source)
+
@dataclass(slots=True)
class ModuleGraph(RelationGraph):
pass
diff --git a/src/scriber/native.py b/src/scriber/native.py
index 08b415c..64b494c 100644
--- a/src/scriber/native.py
+++ b/src/scriber/native.py
@@ -14,6 +14,7 @@ def _load_native() -> Any:
raise _IMPORT_ERROR
try:
from scriber import _native
+
_NATIVE_MODULE = _native
return _NATIVE_MODULE
except ImportError as e:
@@ -35,7 +36,9 @@ def require_native() -> Any:
try:
native = _load_native()
if hasattr(native, "native_api_version") and native.native_api_version() != 1:
- raise RuntimeError("Niezgodna wersja natywnego backendu Scriber (oczekiwano wersji 1).")
+ raise RuntimeError(
+ "Niezgodna wersja natywnego backendu Scriber (oczekiwano wersji 1)."
+ )
return native
except ImportError as e:
raise ImportError(
diff --git a/src/scriber/outline/__init__.py b/src/scriber/outline/__init__.py
index 2e72db9..b9c7888 100644
--- a/src/scriber/outline/__init__.py
+++ b/src/scriber/outline/__init__.py
@@ -8,6 +8,7 @@
}
_generic = GenericOutliner()
+
def generate_outline(file: FileNode, content: str) -> FileOutline:
outliner = _outliners.get(file.language, _generic)
return outliner.outline(file, content)
diff --git a/src/scriber/outline/base.py b/src/scriber/outline/base.py
index a79c6c0..32affb4 100644
--- a/src/scriber/outline/base.py
+++ b/src/scriber/outline/base.py
@@ -2,6 +2,6 @@
from typing import Protocol
from scriber.core.models import FileNode, FileOutline
+
class Outliner(Protocol):
- def outline(self, file: FileNode, content: str) -> FileOutline:
- ...
+ def outline(self, file: FileNode, content: str) -> FileOutline: ...
diff --git a/src/scriber/outline/generic.py b/src/scriber/outline/generic.py
index f0aac16..eaabd36 100644
--- a/src/scriber/outline/generic.py
+++ b/src/scriber/outline/generic.py
@@ -2,6 +2,7 @@
from scriber.core.models import FileNode, FileOutline
from scriber.outline.base import Outliner
+
class GenericOutliner(Outliner):
def outline(self, file: FileNode, content: str) -> FileOutline:
return FileOutline(
@@ -13,6 +14,8 @@ def outline(self, file: FileNode, content: str) -> FileOutline:
classes=[],
functions=[],
constants=[],
- notes=["Static outline not implemented for this language. Showing generic info."],
- token_estimate=20
+ notes=[
+ "Static outline not implemented for this language. Showing generic info."
+ ],
+ token_estimate=20,
)
diff --git a/src/scriber/outline/python.py b/src/scriber/outline/python.py
index bd9c9c7..4056ba9 100644
--- a/src/scriber/outline/python.py
+++ b/src/scriber/outline/python.py
@@ -3,6 +3,7 @@
from scriber.core.models import FileNode, FileOutline
from scriber.outline.base import Outliner
+
class PythonOutliner(Outliner):
def outline(self, file: FileNode, content: str) -> FileOutline:
classes = []
@@ -24,7 +25,7 @@ def outline(self, file: FileNode, content: str) -> FileOutline:
imports.append(f"{module}.{alias.name}")
except SyntaxError:
pass
-
+
return FileOutline(
path=file.relative,
language="python",
@@ -35,5 +36,5 @@ def outline(self, file: FileNode, content: str) -> FileOutline:
functions=functions,
constants=[],
notes=[],
- token_estimate=len(classes)*5 + len(functions)*3 + len(imports)*2
+ token_estimate=len(classes) * 5 + len(functions) * 3 + len(imports) * 2,
)
diff --git a/src/scriber/packer/pack.py b/src/scriber/packer/pack.py
index 801f93f..0faa162 100644
--- a/src/scriber/packer/pack.py
+++ b/src/scriber/packer/pack.py
@@ -6,17 +6,23 @@
from scriber.core.config import apply_overrides, load_config
from scriber.core.errors import ScriberError
from scriber.core.models import Candidate, FileNode, ScriberPack, SeedPath
-from scriber.core.root import ensure_inside_root, project_root_from_config, rel_to_root, resolve_config_path
+from scriber.core.root import (
+ ensure_inside_root,
+ project_root_from_config,
+ rel_to_root,
+ resolve_config_path,
+)
from scriber.engine.scorer import score_candidates
-from scriber.graph.builder import build_graph
from scriber.rendering.renderer import render_pack
-from scriber.scanner.files import classify_file, is_text_readable, read_text_lossy
+from scriber.scanner.files import classify_file, is_text_readable
from scriber.tokens import estimate_tokens
from scriber.scanner.scan import scan_project
from scriber.core.models import LlmPack
-def _resolve_input(path_value: str, root: Path, allow_external: bool, path_base: str = "cwd") -> Path:
+def _resolve_input(
+ path_value: str, root: Path, allow_external: bool, path_base: str = "cwd"
+) -> Path:
path = Path(path_value).expanduser()
if not path.is_absolute():
if path_base == "project":
@@ -36,7 +42,9 @@ def _resolve_input(path_value: str, root: Path, allow_external: bool, path_base:
return path.resolve()
-def _ensure_seed_file(path: Path, root: Path, files: dict[Path, FileNode], config) -> FileNode:
+def _ensure_seed_file(
+ path: Path, root: Path, files: dict[Path, FileNode], config
+) -> FileNode:
rel = rel_to_root(path, root)
existing = files.get(rel)
if existing is not None:
@@ -60,11 +68,19 @@ def _ensure_seed_file(path: Path, root: Path, files: dict[Path, FileNode], confi
return node
-def _expand_seed(path: Path, root: Path, files: dict[Path, FileNode], config) -> SeedPath:
+def _expand_seed(
+ path: Path, root: Path, files: dict[Path, FileNode], config
+) -> SeedPath:
rel = rel_to_root(path, root)
if path.is_file():
node = _ensure_seed_file(path, root, files, config)
- return SeedPath(original=Path(path), absolute=path, relative=rel, is_dir=False, expanded_files=[node.relative])
+ return SeedPath(
+ original=Path(path),
+ absolute=path,
+ relative=rel,
+ is_dir=False,
+ expanded_files=[node.relative],
+ )
expanded: list[Path] = []
for file_rel, node in files.items():
@@ -76,11 +92,26 @@ def _expand_seed(path: Path, root: Path, files: dict[Path, FileNode], config) ->
expanded.append(file_rel)
expanded.sort(key=lambda item: item.as_posix())
if not expanded:
- raise ScriberError(f"No readable project files found inside seed folder: {rel.as_posix()}")
- return SeedPath(original=Path(path), absolute=path, relative=rel, is_dir=True, expanded_files=expanded)
+ raise ScriberError(
+ f"No readable project files found inside seed folder: {rel.as_posix()}"
+ )
+ return SeedPath(
+ original=Path(path),
+ absolute=path,
+ relative=rel,
+ is_dir=True,
+ expanded_files=expanded,
+ )
-def _decide_content(candidate: Candidate, *, config, only_tree: bool, budget_left: int | None, is_seed: bool) -> tuple[bool, str | None, str | None, int]:
+def _decide_content(
+ candidate: Candidate,
+ *,
+ config,
+ only_tree: bool,
+ budget_left: int | None,
+ is_seed: bool,
+) -> tuple[bool, str | None, str | None, int]:
if only_tree:
return False, None, "only-tree mode", 0
file = candidate.file
@@ -127,9 +158,16 @@ def _decide_content(candidate: Candidate, *, config, only_tree: bool, budget_lef
def _apply_content_policy(pack: ScriberPack, config) -> None:
if pack.mode == "focused":
- explicit_seed_files = {rel for seed in pack.seed_paths for rel in seed.expanded_files}
+ explicit_seed_files = {
+ rel for seed in pack.seed_paths for rel in seed.expanded_files
+ }
else:
- explicit_seed_files = {rel for seed in pack.seed_paths if not seed.is_dir for rel in seed.expanded_files}
+ explicit_seed_files = {
+ rel
+ for seed in pack.seed_paths
+ if not seed.is_dir
+ for rel in seed.expanded_files
+ }
budget_left = config.max_tokens if config.max_tokens > 0 else None
total = 0
for candidate in pack.candidates:
@@ -152,10 +190,26 @@ def _apply_content_policy(pack: ScriberPack, config) -> None:
pack.total_tokens = total
-def _load_and_apply_config(paths, config_path, output, output_format, only_tree, modules, support, max_files, max_tokens, min_score, support_content):
+def _load_and_apply_config(
+ paths,
+ config_path,
+ profile,
+ output,
+ output_format,
+ only_tree,
+ modules,
+ support,
+ max_files,
+ max_tokens,
+ min_score,
+ support_content,
+):
resolved_config = resolve_config_path(paths, config_path)
root = project_root_from_config(resolved_config)
config = load_config(resolved_config)
+ from scriber.core.profiles import apply_profile
+
+ config = apply_profile(config, profile)
config = apply_overrides(
config,
output=output,
@@ -170,55 +224,73 @@ def _load_and_apply_config(paths, config_path, output, output_format, only_tree,
)
return resolved_config, root, config
+
def _scan_files(paths, root, config, path_base, progress_callback):
- if progress_callback: progress_callback("Skanowanie plikow...")
+ if progress_callback:
+ progress_callback("Skanowanie plikow...")
from scriber.native import is_native_available
+
native_files = None
if is_native_available():
from scriber.scanner.scan import scan_project_with_native
+
files, native_files = scan_project_with_native(root, config)
else:
files = scan_project(root, config)
- resolved_inputs = [_resolve_input(item, root, config.allow_external_paths, path_base) for item in paths]
+ resolved_inputs = [
+ _resolve_input(item, root, config.allow_external_paths, path_base)
+ for item in paths
+ ]
seeds = [_expand_seed(path, root, files, config) for path in resolved_inputs]
-
+
is_project_snapshot = False
for path in resolved_inputs:
if path == root:
is_project_snapshot = True
break
-
+
return files, native_files, seeds, is_project_snapshot
-def _build_graph_and_score(mode, files, seeds, native_files, root, config, progress_callback):
+def _build_graph_and_score(
+ mode, files, seeds, native_files, root, config, progress_callback
+):
from time import perf_counter
+
timings = {}
stats = {}
from scriber.native import is_native_available
+
if is_native_available():
from scriber.native import require_native
+
native = require_native()
-
+
t_graph = perf_counter()
- if progress_callback: progress_callback("Budowanie grafu modulow (natywnie)...")
-
+ if progress_callback:
+ progress_callback("Budowanie grafu modulow (natywnie)...")
+
assert native_files is not None
-
+
edges = native.build_relation_graph(
str(root),
native_files,
config.python.source_roots,
- config.python.module_init_files
+ config.python.module_init_files,
)
-
+
from scriber.graph.analyzers import generate_cheap_relations
- edges.extend(generate_cheap_relations(files, native.NativeRelationEdge, is_native=True))
-
+
+ edges.extend(
+ generate_cheap_relations(files, native.NativeRelationEdge, is_native=True)
+ )
+
from scriber.cache import ScriberCache
+
cache = ScriberCache(config, root)
from scriber.core.models import ModuleGraph, RelationEdge
+
graph = ModuleGraph()
for edge in edges:
from_path = Path(getattr(edge, "source"))
@@ -231,24 +303,25 @@ def _build_graph_and_score(mode, files, seeds, native_files, root, config, progr
confidence=edge.confidence,
evidence=edge.evidence,
line=edge.line,
- analyzer=edge.analyzer
+ analyzer=edge.analyzer,
)
graph.add_edge(py_edge)
if py_edge.kind in {"import", "reexport"}:
cache.add_import_edge(from_path, to_path)
-
+
cache.save(set(files.keys()))
-
+
stats["graph_edges_built"] = len(edges)
stats["graph_source"] = "native"
stats["graph_cache_reads"] = cache.reads
stats["graph_cache_hits"] = cache.hits
stats["graph_cache_writes"] = cache.writes
-
+
timings["graph_build"] = perf_counter() - t_graph
-
+
t_score = perf_counter()
- if progress_callback: progress_callback("Ocenianie zaleznosci (natywnie)...")
+ if progress_callback:
+ progress_callback("Ocenianie zaleznosci (natywnie)...")
scoring = config.modules_config.scoring
opts = native.NativePackOptions(
mode=mode,
@@ -269,6 +342,10 @@ def _build_graph_and_score(mode, files, seeds, native_files, root, config, progr
runtime_support_score=scoring.get("runtime_support", 50),
documentation_score=scoring.get("documentation", 45),
shared_dependency_bonus=scoring.get("shared_dependency_bonus", 10),
+ entrypoint_file_score=scoring.get("entrypoint_file", 90),
+ code_file_score=scoring.get("code_file", 80),
+ test_file_score=scoring.get("test_file", 60),
+ other_file_score=scoring.get("other_file", 40),
modules_enabled=config.modules,
include_direct_dependencies=config.modules_config.include_direct_dependencies,
include_reverse_dependencies=config.modules_config.include_reverse_dependencies,
@@ -281,14 +358,11 @@ def _build_graph_and_score(mode, files, seeds, native_files, root, config, progr
entrypoint_patterns=config.python.entrypoint_patterns,
test_roots=config.python.test_roots,
)
-
+
rs_candidates = native.score_candidates_native(
- native_files,
- [seed.relative.as_posix() for seed in seeds],
- edges,
- opts
+ native_files, [seed.relative.as_posix() for seed in seeds], edges, opts
)
-
+
candidates = []
for rc in rs_candidates:
rel = Path(rc.path)
@@ -306,33 +380,41 @@ def _build_graph_and_score(mode, files, seeds, native_files, root, config, progr
timings["scoring"] = perf_counter() - t_score
else:
t_graph = perf_counter()
- if progress_callback: progress_callback("Budowanie grafu modulow...")
+ if progress_callback:
+ progress_callback("Budowanie grafu modulow...")
from scriber.cache import ScriberCache
+
cache = ScriberCache(config, root)
from scriber.graph.builder import build_graph
+
graph = build_graph(files, config, cache)
-
+
from scriber.graph.analyzers import generate_cheap_relations
from scriber.core.models import RelationEdge
+
cheap_edges = generate_cheap_relations(files, RelationEdge, is_native=False)
for edge in cheap_edges:
graph.add_edge(edge)
-
+
stats["graph_edges_built"] = len(graph.edges)
stats["graph_source"] = "python"
stats["graph_cache_reads"] = cache.reads
stats["graph_cache_hits"] = cache.hits
stats["graph_cache_writes"] = cache.writes
-
+
timings["graph_build"] = perf_counter() - t_graph
-
+
t_score = perf_counter()
- if progress_callback: progress_callback("Ocenianie zaleznosci...")
- candidates = score_candidates(files=files, seeds=seeds, graph=graph, config=config, mode=mode)
+ if progress_callback:
+ progress_callback("Ocenianie zaleznosci...")
+ candidates = score_candidates(
+ files=files, seeds=seeds, graph=graph, config=config, mode=mode
+ )
timings["scoring"] = perf_counter() - t_score
return candidates, graph, timings, stats
+
def build_pack(
paths: list[str] | None = None,
*,
@@ -352,16 +434,29 @@ def build_pack(
path_base: str = "project",
) -> ScriberPack | LlmPack:
from time import perf_counter
-
+
t_start = perf_counter()
paths = paths or ["."]
resolved_config, root, config = _load_and_apply_config(
- paths, config_path, output, output_format, only_tree, modules, support, max_files, max_tokens, min_score, support_content
+ paths,
+ config_path,
+ profile,
+ output,
+ output_format,
+ only_tree,
+ modules,
+ support,
+ max_files,
+ max_tokens,
+ min_score,
+ support_content,
)
t_config_load = perf_counter() - t_start
t_scan = perf_counter()
- files, native_files, seeds, is_project_snapshot = _scan_files(paths, root, config, path_base, progress_callback)
+ files, native_files, seeds, is_project_snapshot = _scan_files(
+ paths, root, config, path_base, progress_callback
+ )
t_scan_time = perf_counter() - t_scan
mode = "project_snapshot" if (project or is_project_snapshot) else "focused"
@@ -379,32 +474,37 @@ def build_pack(
from scriber.engine.ranker import rank_context
from scriber.budget.allocator import allocate_budget, BudgetPolicy
from time import perf_counter
-
+
t_rank = perf_counter()
- if progress_callback: progress_callback("Rankowanie kontekstu...")
+ if progress_callback:
+ progress_callback("Rankowanie kontekstu...")
seed_paths = [seed for p in seeds for seed in p.expanded_files]
new_candidates = rank_context(files, graph, seed_paths, config, mode)
sub_timings["rank_context"] = perf_counter() - t_rank
-
+
t_budget = perf_counter()
- if progress_callback: progress_callback("Alokacja budzetu...")
+ if progress_callback:
+ progress_callback("Alokacja budzetu...")
policy = BudgetPolicy(
target_tokens=config.max_tokens if config.max_tokens > 0 else 30000,
hard_limit_tokens=config.max_tokens if config.max_tokens > 0 else 100000,
- mode=mode
+ mode=mode,
)
if mode == "focused":
explicit_seeds = {seed for p in seeds for seed in p.expanded_files}
else:
- explicit_seeds = {seed for p in seeds if not p.is_dir for seed in p.expanded_files}
-
+ explicit_seeds = {
+ seed for p in seeds if not p.is_dir for seed in p.expanded_files
+ }
+
items = allocate_budget(new_candidates, policy, explicit_seeds)
sub_timings["budget_allocation"] = perf_counter() - t_budget
-
+
t_content = perf_counter()
- if progress_callback: progress_callback("Czytanie i outline...")
+ if progress_callback:
+ progress_callback("Czytanie i outline...")
from scriber.outline import generate_outline
-
+
actual_tokens = 0
for item in items:
if item.content_mode == "full":
@@ -417,12 +517,26 @@ def build_pack(
try:
content = item.file.read_text()
item.outline = generate_outline(item.file, content)
- actual_tokens += item.outline.token_estimate
+ has_outline_symbols = bool(
+ item.outline.classes
+ or item.outline.functions
+ or item.outline.constants
+ or item.outline.imports
+ )
+ if item.content_mode == "excerpt" and not has_outline_symbols:
+ if actual_tokens + item.token_estimate <= policy.target_tokens:
+ item.content_mode = "full"
+ item.content = content
+ actual_tokens += item.token_estimate
+ else:
+ item.content_mode = "tree"
+ else:
+ actual_tokens += item.outline.token_estimate
except Exception:
item.content_mode = "tree"
-
+
sub_timings["content_read"] = perf_counter() - t_content
-
+
stats["input_paths"] = paths
pack = LlmPack(
project_root=root,
@@ -435,12 +549,12 @@ def build_pack(
items=items,
graph=graph,
stats=stats,
- warnings=[]
+ warnings=[],
)
pack.timings = {
"config_load": t_config_load,
"scan": t_scan_time,
- **sub_timings
+ **sub_timings,
}
return pack
@@ -455,22 +569,25 @@ def build_pack(
mode=mode,
stats=stats,
)
-
+
t_content = perf_counter()
- if progress_callback: progress_callback("Aplikowanie regul zawartosci...")
+ if progress_callback:
+ progress_callback("Aplikowanie regul zawartosci...")
_apply_content_policy(pack, config)
t_content_time = perf_counter() - t_content
-
+
pack.timings = {
"config_load": t_config_load,
"scan": t_scan_time,
"content_read": t_content_time,
- **sub_timings
+ **sub_timings,
}
return pack
-def build_and_write_pack(paths: list[str] | None = None, **kwargs) -> tuple[Path | None, ScriberPack | LlmPack]:
+def build_and_write_pack(
+ paths: list[str] | None = None, **kwargs
+) -> tuple[Path | None, ScriberPack | LlmPack]:
explain_selection = kwargs.pop("explain_selection", False)
pack = build_pack(paths, **kwargs)
config_path = resolve_config_path(paths or ["."], kwargs.get("config_path"))
@@ -488,20 +605,23 @@ def build_and_write_pack(paths: list[str] | None = None, **kwargs) -> tuple[Path
support_content=kwargs.get("support_content"),
)
progress = kwargs.get("progress_callback")
- if progress: progress("Renderowanie Markdown...")
-
+ if progress:
+ progress("Renderowanie Markdown...")
+
if isinstance(pack, LlmPack):
from scriber.renderer.llm_report import render_llm_report
import io
+
buf = io.StringIO()
render_llm_report(pack, buf)
rendered = buf.getvalue()
else:
rendered = render_pack(pack, explain_selection=explain_selection)
-
+
output = config.output
if str(output) == "-":
import sys
+
try:
sys.stdout.buffer.write(rendered.encode("utf-8"))
sys.stdout.flush()
@@ -513,11 +633,12 @@ def build_and_write_pack(paths: list[str] | None = None, **kwargs) -> tuple[Path
output.parent.mkdir(parents=True, exist_ok=True)
try:
from scriber.native import is_native_available, require_native
+
if is_native_available():
require_native().write_text(str(output), rendered)
else:
output.write_text(rendered, encoding="utf-8")
except Exception:
output.write_text(rendered, encoding="utf-8")
-
+
return output, pack
diff --git a/src/scriber/renderer/llm_report.py b/src/scriber/renderer/llm_report.py
index 027b182..b720af1 100644
--- a/src/scriber/renderer/llm_report.py
+++ b/src/scriber/renderer/llm_report.py
@@ -1,41 +1,40 @@
from __future__ import annotations
from typing import TextIO
-from pathlib import Path
from collections import defaultdict
-import json
+import re
from scriber.core.models import LlmPack, PackItem, FileOutline
-from scriber.graph.model import RelationEdge
+
def render_llm_report(pack: LlmPack, out: TextIO) -> None:
out.write("# Scriber Pack v3\n\n")
-
+
out.write("\n")
out.write("You are reading a generated codebase context pack.\n")
out.write("Prefer facts from , , and blocks.\n")
out.write("If a file is tree_only or omitted, do not infer its contents.\n")
out.write("When proposing patches, cite file IDs and line ranges.\n")
out.write("\n\n")
-
- out.write("\n")
+
+ out.write('\n')
out.write("project:\n")
out.write(f" mode: {pack.mode}\n")
out.write(f" goal: {pack.goal or 'null'}\n")
out.write(f" target_tokens: {pack.budget_target}\n")
out.write(f" actual_tokens: {pack.budget_actual}\n")
-
+
input_paths = pack.stats.get("input_paths", [])
if input_paths:
out.write(" analyzed_targets:\n")
for p in input_paths:
out.write(f" - {p}\n")
out.write("\n")
-
+
out.write("read_order:\n")
for item in pack.items:
if item.content_mode not in ("tree", "omit"):
out.write(f" - {item.item_id} # {item.file.relative.as_posix()}\n")
-
+
out.write("\nfiles:\n")
for item in pack.items:
if item.content_mode in ("omit",):
@@ -59,7 +58,7 @@ def render_llm_report(pack: LlmPack, out: TextIO) -> None:
out.write("\n")
_render_graph(pack, out)
out.write("\n\n")
-
+
warnings = _generate_warnings(pack)
if warnings:
out.write("## Pack quality warnings\n\n")
@@ -68,28 +67,34 @@ def render_llm_report(pack: LlmPack, out: TextIO) -> None:
out.write("\n")
out.write("## Files Content\n\n")
-
+
for item in pack.items:
if item.content_mode in ("tree", "omit"):
continue
-
- out.write(f'\n')
-
+
+ out.write(
+ f'\n'
+ )
+
if item.outline and item.outline.purpose:
out.write("\n")
out.write(f"{item.outline.purpose}\n")
out.write("\n\n")
-
+
if item.outline:
_render_symbols_manifest(item.outline, out)
if item.content_mode == "full" and item.content:
- out.write(f"```{item.file.language} linenums=\"1\"\n")
- out.write(_add_line_numbers(item.content, item.file.relative.as_posix(), item.file.language))
+ out.write(f'```{item.file.language} linenums="1"\n')
+ out.write(
+ _add_line_numbers(
+ item.content, item.file.relative.as_posix(), item.file.language
+ )
+ )
if not item.content.endswith("\n"):
out.write("\n")
out.write("```\n")
-
+
elif item.content_mode == "excerpt":
if item.excerpts:
for excerpt in item.excerpts:
@@ -100,13 +105,12 @@ def render_llm_report(pack: LlmPack, out: TextIO) -> None:
_render_outline_fallback(item, out)
else:
out.write("_Excerpt unavailable; falling back to metadata only._\n\n")
-
+
elif item.content_mode == "outline" and item.outline:
_render_outline_fallback(item, out)
-
+
out.write("\n\n")
-import re
def _add_line_numbers(content: str, path: str, language: str) -> str:
lines = content.splitlines()
@@ -115,13 +119,14 @@ def _add_line_numbers(content: str, path: str, language: str) -> str:
out.append(f"# lines: 1-{len(lines)}")
for i, line in enumerate(lines, 1):
if language in ("python", "py"):
- m = re.match(r'^(\s*)(class|def|async def)\s+([a-zA-Z0-9_]+)', line)
+ m = re.match(r"^(\s*)(class|def|async def)\s+([a-zA-Z0-9_]+)", line)
if m:
indent, _, name = m.groups()
- out.append(f"{i:04d} {indent}# ")
+ out.append(f'{i:04d} {indent}# ')
out.append(f"{i:04d} {line}")
return "\n".join(out)
+
def _render_symbols_manifest(outline: FileOutline, out: TextIO) -> None:
symbols = []
if outline.classes:
@@ -130,14 +135,15 @@ def _render_symbols_manifest(outline: FileOutline, out: TextIO) -> None:
symbols.extend(outline.functions)
if not symbols:
return
-
+
out.write("\n")
for sym in symbols:
out.write(f"- {sym}\n")
out.write("\n\n")
+
def _render_outline_fallback(item: PackItem, out: TextIO) -> None:
- out.write("```python\n")
+ out.write("```python\n")
out.write(f"# Outline for {item.file.relative.name}\n")
if item.outline.classes:
out.write("Classes: " + ", ".join(item.outline.classes) + "\n")
@@ -147,22 +153,28 @@ def _render_outline_fallback(item: PackItem, out: TextIO) -> None:
out.write("Imports: " + ", ".join(item.outline.imports) + "\n")
out.write("```\n\n")
+
def _generate_warnings(pack: LlmPack) -> list[str]:
warnings = []
- empty_excerpts = sum(1 for i in pack.items if i.content_mode == "excerpt" and not i.excerpts)
+ empty_excerpts = sum(
+ 1 for i in pack.items if i.content_mode == "excerpt" and not i.excerpts
+ )
if empty_excerpts > 0:
- warnings.append(f"{empty_excerpts} files are marked excerpt but have no excerpts (falling back to outline).")
-
+ warnings.append(
+ f"{empty_excerpts} files are marked excerpt but have no excerpts (falling back to outline)."
+ )
+
unknown_roles = sum(1 for i in pack.items if i.role == "unknown")
if unknown_roles > 0:
warnings.append(f"{unknown_roles} files have role=unknown.")
-
+
return warnings
+
def _render_tree(items: list[PackItem], out: TextIO) -> None:
tree = {}
item_map = {item.file.relative.as_posix(): item for item in items}
-
+
for item in items:
parts = item.file.relative.parts
curr = tree
@@ -170,17 +182,17 @@ def _render_tree(items: list[PackItem], out: TextIO) -> None:
if part not in curr:
curr[part] = {}
curr = curr[part]
-
+
def print_node(path_parts, current_dict, prefix=""):
keys = sorted(current_dict.keys())
for i, k in enumerate(keys):
is_last = i == len(keys) - 1
child_prefix = prefix + (" " if is_last else "β ")
connector = "βββ " if is_last else "βββ "
-
+
full_path = "/".join(path_parts + (k,))
item = item_map.get(full_path)
-
+
if item:
badge = f"[{item.item_id} {item.role} {item.content_mode} score={item.score}]"
name_str = f"{prefix}{connector}{k}"
@@ -188,29 +200,34 @@ def print_node(path_parts, current_dict, prefix=""):
else:
out.write(f"{prefix}{connector}{k}/\n")
print_node(path_parts + (k,), current_dict[k], child_prefix)
-
+
out.write(".\n")
print_node((), tree, "")
+
def _render_graph(pack: LlmPack, out: TextIO) -> None:
included_paths = {item.file.relative for item in pack.items}
item_id_map = {item.file.relative: item.item_id for item in pack.items}
-
+
groups = defaultdict(list)
for edge in pack.graph.edges:
if edge.source in included_paths and edge.target in included_paths:
key = (edge.source, edge.target, edge.kind)
groups[key].append(edge)
-
- sorted_groups = sorted(groups.items(), key=lambda x: (x[0][0].as_posix(), x[0][1].as_posix()))
-
+
+ sorted_groups = sorted(
+ groups.items(), key=lambda x: (x[0][0].as_posix(), x[0][1].as_posix())
+ )
+
for (source, target, kind), edges in sorted_groups:
count = len(edges)
max_conf = max(e.confidence for e in edges)
analyzers = sorted({e.analyzer for e in edges})
-
+
s_id = item_id_map[source]
t_id = item_id_map[target]
-
+
analyzer_str = ",".join(analyzers)
- out.write(f"{s_id} -> {t_id} [{kind}] x{count} (analyzers=[{analyzer_str}], conf={max_conf:.2f})\n")
+ out.write(
+ f"{s_id} -> {t_id} [{kind}] x{count} (analyzers=[{analyzer_str}], conf={max_conf:.2f})\n"
+ )
diff --git a/src/scriber/rendering/renderer.py b/src/scriber/rendering/renderer.py
index 657cd55..77b6e96 100644
--- a/src/scriber/rendering/renderer.py
+++ b/src/scriber/rendering/renderer.py
@@ -26,7 +26,11 @@ def _table(candidates: list[Candidate], explain_selection: bool = False) -> str:
return "_None._\n"
lines = ["| Score | Content | Path | Reason |", "|---:|---|---|---|"]
for candidate in candidates:
- reason = "; ".join(candidate.reasons) if explain_selection else candidate.reason_summary
+ reason = (
+ "; ".join(candidate.reasons)
+ if explain_selection
+ else candidate.reason_summary
+ )
lines.append(
f"| {candidate.score} | {_escape_table(_content_flag(candidate))} | `{_escape_table(_path(candidate.file.relative))}` | {_escape_table(reason)} |"
)
@@ -65,31 +69,39 @@ def render_module_graph(pack: ScriberPack) -> str:
imports = len(pack.graph.imports.get(path, set()) & included)
if imports > 0:
import_counts.append((path, imports))
-
+
imported_by = len(pack.graph.imported_by.get(path, set()) & included)
if imported_by > 0:
imported_by_counts.append((path, imported_by))
-
+
import_counts.sort(key=lambda x: (-x[1], x[0].as_posix()))
imported_by_counts.sort(key=lambda x: (-x[1], x[0].as_posix()))
-
+
lines.append("Top 5 files with most dependencies:")
for path, count in import_counts[:5]:
lines.append(f"- `{_path(path)}`: imports {count} included files")
-
+
lines.append("")
lines.append("Top 5 most imported files:")
for path, count in imported_by_counts[:5]:
lines.append(f"- `{_path(path)}`: imported by {count} included files")
-
+
return "\n".join(lines).strip() or "No module graph available."
for seed in pack.seed_paths:
for seed_file in seed.expanded_files:
lines.append(_path(seed_file))
- imports = sorted(pack.graph.imports.get(seed_file, set()) & included, key=lambda item: item.as_posix())
- imported_by = sorted(pack.graph.imported_by.get(seed_file, set()) & included, key=lambda item: item.as_posix())
- edges = [("imports", item) for item in imports] + [("imported by", item) for item in imported_by]
+ imports = sorted(
+ pack.graph.imports.get(seed_file, set()) & included,
+ key=lambda item: item.as_posix(),
+ )
+ imported_by = sorted(
+ pack.graph.imported_by.get(seed_file, set()) & included,
+ key=lambda item: item.as_posix(),
+ )
+ edges = [("imports", item) for item in imports] + [
+ ("imported by", item) for item in imported_by
+ ]
for index, (kind, target) in enumerate(edges):
branch = "βββ" if index == len(edges) - 1 else "βββ"
lines.append(f"{branch} {kind} {_path(target)}")
@@ -100,7 +112,23 @@ def render_module_graph(pack: ScriberPack) -> str:
def _language_fence(language: str) -> str:
- if language in {"python", "rust", "javascript", "typescript", "go", "java", "kotlin", "c", "cpp", "toml", "yaml", "json", "markdown", "dockerfile", "ini"}:
+ if language in {
+ "python",
+ "rust",
+ "javascript",
+ "typescript",
+ "go",
+ "java",
+ "kotlin",
+ "c",
+ "cpp",
+ "toml",
+ "yaml",
+ "json",
+ "markdown",
+ "dockerfile",
+ "ini",
+ }:
return language
return "text"
@@ -133,7 +161,7 @@ def render_summary(pack: ScriberPack) -> str:
f"- Content files: `{content_count}`",
f"- Tree-only files: `{tree_only_count}`",
f"- Estimated tokens: `{pack.total_tokens}`",
- ""
+ "",
]
return "\n".join(lines)
@@ -154,15 +182,19 @@ def render_summary_text(pack: ScriberPack) -> str:
f"Content files: {content_count}",
f"Tree-only files: {tree_only_count}",
f"Estimated tokens: {pack.total_tokens}",
- ""
+ "",
]
return "\n".join(lines)
def render_markdown(pack: ScriberPack, explain_selection: bool = False) -> str:
code = [candidate for candidate in pack.candidates if candidate.file.kind == "code"]
- support = [candidate for candidate in pack.candidates if candidate.file.kind == "support"]
- other = [candidate for candidate in pack.candidates if candidate.file.kind == "other"]
+ support = [
+ candidate for candidate in pack.candidates if candidate.file.kind == "support"
+ ]
+ other = [
+ candidate for candidate in pack.candidates if candidate.file.kind == "other"
+ ]
lines: list[str] = []
lines.append("# Scriber 2.0 Pack")
@@ -172,7 +204,9 @@ def render_markdown(pack: ScriberPack, explain_selection: bool = False) -> str:
lines.append("## Project")
lines.append("")
lines.append(f"Root: `{pack.project_root}`")
- lines.append(f"Config: `{pack.config_path.relative_to(pack.project_root).as_posix()}`")
+ lines.append(
+ f"Config: `{pack.config_path.relative_to(pack.project_root).as_posix()}`"
+ )
lines.append(f"Format: `{pack.output_format}`")
lines.append(f"Only tree: `{str(pack.only_tree).lower()}`")
lines.append("")
@@ -214,7 +248,9 @@ def render_markdown(pack: ScriberPack, explain_selection: bool = False) -> str:
lines.append(f"### `{_path(candidate.file.relative)}`")
lines.append("")
if not candidate.include_content:
- lines.append(f"_Content omitted: {candidate.omitted_reason or 'not selected for content'}._")
+ lines.append(
+ f"_Content omitted: {candidate.omitted_reason or 'not selected for content'}._"
+ )
continue
content = candidate.content or ""
fence = _fence_for(content)
@@ -235,7 +271,9 @@ def render_text(pack: ScriberPack, explain_selection: bool = False) -> str:
lines.append(render_summary_text(pack).rstrip())
lines.append("")
lines.append(f"PROJECT ROOT: {pack.project_root}")
- lines.append(f"CONFIG: {pack.config_path.relative_to(pack.project_root).as_posix()}")
+ lines.append(
+ f"CONFIG: {pack.config_path.relative_to(pack.project_root).as_posix()}"
+ )
lines.append(f"FORMAT: {pack.output_format}")
lines.append(f"ONLY TREE: {str(pack.only_tree).lower()}")
lines.append("")
@@ -245,7 +283,11 @@ def render_text(pack: ScriberPack, explain_selection: bool = False) -> str:
lines.append("")
lines.append("INCLUDED FILES")
for candidate in pack.candidates:
- reason = "; ".join(candidate.reasons) if explain_selection else candidate.reason_summary
+ reason = (
+ "; ".join(candidate.reasons)
+ if explain_selection
+ else candidate.reason_summary
+ )
lines.append(f"[{candidate.score:03d}] {_path(candidate.file.relative)}")
lines.append(f" kind: {candidate.file.kind}")
lines.append(f" content: {_content_flag(candidate)}")
@@ -265,7 +307,9 @@ def render_text(pack: ScriberPack, explain_selection: bool = False) -> str:
lines.append("")
lines.append(f"--- FILE: {_path(candidate.file.relative)} ---")
if not candidate.include_content:
- lines.append(f"[content omitted: {candidate.omitted_reason or 'not selected for content'}]")
+ lines.append(
+ f"[content omitted: {candidate.omitted_reason or 'not selected for content'}]"
+ )
continue
lines.append(candidate.content or "")
lines.append("")
diff --git a/src/scriber/scanner/files.py b/src/scriber/scanner/files.py
index d4e8c20..701d093 100644
--- a/src/scriber/scanner/files.py
+++ b/src/scriber/scanner/files.py
@@ -2,7 +2,7 @@
from pathlib import Path
-from scriber.core.matchers import match_pattern, matches_any
+from scriber.core.matchers import matches_any
from scriber.core.models import ContentPolicy, FileKind, FileNode, ScriberConfig
LANGUAGE_BY_SUFFIX = {
@@ -39,6 +39,7 @@
def is_probably_binary(path: Path) -> bool:
from scriber.native import require_native
+
try:
return require_native().is_probably_binary(str(path))
except Exception:
@@ -58,19 +59,63 @@ def language_for(path: Path) -> str:
def support_category(rel: Path) -> str:
s = rel.as_posix()
name = rel.name
- if name == "pyproject.toml" or name.endswith(".toml") or name in {"setup.py", "setup.cfg", "tox.ini", "pytest.ini", "mypy.ini", "ruff.toml", ".ruff.toml"}:
+ if (
+ name == "pyproject.toml"
+ or name.endswith(".toml")
+ or name
+ in {
+ "setup.py",
+ "setup.cfg",
+ "tox.ini",
+ "pytest.ini",
+ "mypy.ini",
+ "ruff.toml",
+ ".ruff.toml",
+ }
+ ):
return "project config"
- if name.endswith(".lock") or name in {"requirements.txt", "poetry.lock", "uv.lock", "Pipfile", "Pipfile.lock", "package.json", "package-lock.json", "pnpm-lock.yaml", "yarn.lock", "Cargo.toml", "Cargo.lock", "go.mod", "go.sum"} or s.startswith("requirements/"):
+ if (
+ name.endswith(".lock")
+ or name
+ in {
+ "requirements.txt",
+ "poetry.lock",
+ "uv.lock",
+ "Pipfile",
+ "Pipfile.lock",
+ "package.json",
+ "package-lock.json",
+ "pnpm-lock.yaml",
+ "yarn.lock",
+ "Cargo.toml",
+ "Cargo.lock",
+ "go.mod",
+ "go.sum",
+ }
+ or s.startswith("requirements/")
+ ):
return "dependency file"
- if name.startswith("README") or name in {"CHANGELOG.md", "CONTRIBUTING.md"} or s.startswith("docs/"):
+ if (
+ name.startswith("README")
+ or name in {"CHANGELOG.md", "CONTRIBUTING.md"}
+ or s.startswith("docs/")
+ ):
return "documentation"
- if name.startswith("Dockerfile") or name.startswith("docker-compose") or name.startswith("compose"):
+ if (
+ name.startswith("Dockerfile")
+ or name.startswith("docker-compose")
+ or name.startswith("compose")
+ ):
return "runtime support"
if s.startswith(".github/workflows/") or name == ".gitlab-ci.yml":
return "ci support"
if name.startswith(".env") or s.startswith("config/") or s.startswith("settings/"):
return "runtime config"
- if name in {".pre-commit-config.yaml", "tsconfig.json"} or name.startswith("vite.config") or name.startswith("webpack.config"):
+ if (
+ name in {".pre-commit-config.yaml", "tsconfig.json"}
+ or name.startswith("vite.config")
+ or name.startswith("webpack.config")
+ ):
return "tooling config"
return "support file"
@@ -91,7 +136,6 @@ def classify_file(path: Path, root: Path, config: ScriberConfig) -> FileNode | N
if matches_any(rel_s, config.hard_ignore_patterns):
return None
- binary = is_probably_binary(path)
kind: FileKind = "other"
category = None
policy: ContentPolicy = "auto"
@@ -105,6 +149,8 @@ def classify_file(path: Path, root: Path, config: ScriberConfig) -> FileNode | N
else:
return None
+ binary = is_probably_binary(path)
+
try:
size = path.stat().st_size
except OSError:
@@ -139,8 +185,11 @@ def is_text_readable(path: Path) -> bool:
def read_text_lossy(path: Path) -> str:
- from scriber.native import require_native
- return require_native().read_text(str(path))
-
-
+ try:
+ from scriber.native import is_native_available, require_native
+ if is_native_available():
+ return require_native().read_text(str(path))
+ except Exception:
+ pass
+ return path.read_text(encoding="utf-8", errors="replace")
diff --git a/src/scriber/scanner/scan.py b/src/scriber/scanner/scan.py
index 922c44a..7b51c1b 100644
--- a/src/scriber/scanner/scan.py
+++ b/src/scriber/scanner/scan.py
@@ -8,6 +8,7 @@
def scan_project(root: Path, config: ScriberConfig) -> dict[Path, FileNode]:
try:
from scriber.native import is_native_available
+
if is_native_available():
files, _ = scan_project_with_native(root, config)
return files
@@ -15,10 +16,13 @@ def scan_project(root: Path, config: ScriberConfig) -> dict[Path, FileNode]:
pass
from scriber.scanner.scan_py import scan_project as scan_project_py
+
return scan_project_py(root, config)
-def scan_project_with_native(root: Path, config: ScriberConfig) -> tuple[dict[Path, FileNode], list]:
+def scan_project_with_native(
+ root: Path, config: ScriberConfig
+) -> tuple[dict[Path, FileNode], list]:
root = root.resolve()
native = require_native()
@@ -31,12 +35,13 @@ def scan_project_with_native(root: Path, config: ScriberConfig) -> tuple[dict[Pa
config.support_content.full,
config.support_content.tree_only,
config.support_content.default,
- config.support
+ config.support,
)
files: dict[Path, FileNode] = {}
-
+
from scriber.cache import ScriberCache
+
cache = ScriberCache(config, root)
active_files: set[Path] = set()
@@ -54,7 +59,7 @@ def scan_project_with_native(root: Path, config: ScriberConfig) -> tuple[dict[Pa
size_bytes=cached_data["size_bytes"],
is_binary=cached_data["is_binary"],
support_category=cached_data["support_category"],
- content_policy=cached_data["content_policy"]
+ content_policy=cached_data["content_policy"],
)
files[node.relative] = node
else:
@@ -66,18 +71,23 @@ def scan_project_with_native(root: Path, config: ScriberConfig) -> tuple[dict[Pa
size_bytes=item.size_bytes,
is_binary=item.is_binary,
support_category=item.support_category,
- content_policy=item.content_policy
+ content_policy=item.content_policy,
)
files[node.relative] = node
- cache.set_file(rel, item.mtime_ns, item.size_bytes, {
- "relative": node.relative.as_posix(),
- "kind": node.kind,
- "language": node.language,
- "size_bytes": node.size_bytes,
- "is_binary": node.is_binary,
- "support_category": node.support_category,
- "content_policy": node.content_policy
- })
+ cache.set_file(
+ rel,
+ item.mtime_ns,
+ item.size_bytes,
+ {
+ "relative": node.relative.as_posix(),
+ "kind": node.kind,
+ "language": node.language,
+ "size_bytes": node.size_bytes,
+ "is_binary": node.is_binary,
+ "support_category": node.support_category,
+ "content_policy": node.content_policy,
+ },
+ )
cache.save(active_files)
return files, native_files
diff --git a/src/scriber/scanner/scan_py.py b/src/scriber/scanner/scan_py.py
index 2c0ebae..e4a5818 100644
--- a/src/scriber/scanner/scan_py.py
+++ b/src/scriber/scanner/scan_py.py
@@ -10,10 +10,15 @@
def scan_project(root: Path, config: ScriberConfig) -> dict[Path, FileNode]:
root = root.resolve()
- gitignore = SimpleGitIgnore.from_file(root / ".gitignore") if config.use_gitignore else SimpleGitIgnore([])
+ gitignore = (
+ SimpleGitIgnore.from_file(root / ".gitignore")
+ if config.use_gitignore
+ else SimpleGitIgnore([])
+ )
files: dict[Path, FileNode] = {}
-
+
from scriber.cache import ScriberCache
+
cache = ScriberCache(config, root)
active_files: set[Path] = set()
@@ -23,10 +28,14 @@ def scan_project(root: Path, config: ScriberConfig) -> dict[Path, FileNode]:
kept_dirs: list[str] = []
for dirname in dirnames:
- child_rel = (rel_dir / dirname) if rel_dir.as_posix() != "." else Path(dirname)
+ child_rel = (
+ (rel_dir / dirname) if rel_dir.as_posix() != "." else Path(dirname)
+ )
if should_hard_ignore(child_rel, config):
continue
- if config.use_gitignore and gitignore.ignores(child_rel.as_posix(), is_dir=True):
+ if config.use_gitignore and gitignore.ignores(
+ child_rel.as_posix(), is_dir=True
+ ):
continue
kept_dirs.append(dirname)
dirnames[:] = kept_dirs
@@ -47,33 +56,40 @@ def scan_project(root: Path, config: ScriberConfig) -> dict[Path, FileNode]:
continue
active_files.add(rel)
-
+
cached_data = cache.get_file(rel, mtime_ns, size)
if cached_data is not None:
node = FileNode(
- absolute=(root / Path(cached_data["relative"])).resolve(strict=False),
+ absolute=(root / Path(cached_data["relative"])).resolve(
+ strict=False
+ ),
relative=Path(cached_data["relative"]),
kind=cached_data["kind"],
language=cached_data["language"],
size_bytes=cached_data["size_bytes"],
is_binary=cached_data["is_binary"],
support_category=cached_data["support_category"],
- content_policy=cached_data["content_policy"]
+ content_policy=cached_data["content_policy"],
)
files[node.relative] = node
else:
node = classify_file(path, root, config)
if node is not None:
files[node.relative] = node
- cache.set_file(rel, mtime_ns, size, {
- "relative": node.relative.as_posix(),
- "kind": node.kind,
- "language": node.language,
- "size_bytes": node.size_bytes,
- "is_binary": node.is_binary,
- "support_category": node.support_category,
- "content_policy": node.content_policy
- })
+ cache.set_file(
+ rel,
+ mtime_ns,
+ size,
+ {
+ "relative": node.relative.as_posix(),
+ "kind": node.kind,
+ "language": node.language,
+ "size_bytes": node.size_bytes,
+ "is_binary": node.is_binary,
+ "support_category": node.support_category,
+ "content_policy": node.content_policy,
+ },
+ )
cache.save(active_files)
return files
diff --git a/tests/test_cache.py b/tests/test_cache.py
index 94dc68e..1b87a51 100644
--- a/tests/test_cache.py
+++ b/tests/test_cache.py
@@ -1,9 +1,8 @@
from __future__ import annotations
-import json
from pathlib import Path
from scriber.core.models import ScriberConfig
-from scriber.cache import ScriberCache, get_config_hash
+from scriber.cache import ScriberCache
def test_cache_functionality(tmp_path: Path) -> None:
@@ -11,33 +10,47 @@ def test_cache_functionality(tmp_path: Path) -> None:
# Ensure cache is enabled
config.cache.enabled = True
config.cache.dir = ".scriber/cache"
-
+
cache = ScriberCache(config, tmp_path)
-
+
rel_path = Path("src/main.py")
- mtime = 123456789
- size = 1000
- data = {"kind": "code", "language": "python", "size_bytes": 1000, "is_binary": False, "support_category": None, "content_policy": "auto", "absolute": "src/main.py", "relative": "src/main.py"}
-
+ (tmp_path / "src").mkdir(exist_ok=True)
+ (tmp_path / rel_path).write_text("print('hello')")
+
+ stat = (tmp_path / rel_path).stat()
+ mtime = stat.st_mtime_ns
+ size = stat.st_size
+
+ data = {
+ "kind": "code",
+ "language": "python",
+ "size_bytes": 1000,
+ "is_binary": False,
+ "support_category": None,
+ "content_policy": "auto",
+ "absolute": "src/main.py",
+ "relative": "src/main.py",
+ }
+
assert cache.get_file(rel_path, mtime, size) is None
-
+
cache.set_file(rel_path, mtime, size, data)
assert cache.get_file(rel_path, mtime, size) == data
-
+
# Check imports cache
imports = {Path("src/auth.py"), Path("src/db.py")}
- assert cache.get_imports(rel_path) is None
+ assert cache.get_imports(rel_path, mtime, size) is None
cache.set_imports(rel_path, imports)
- assert cache.get_imports(rel_path) == imports
-
+ assert cache.get_imports(rel_path, mtime, size) == imports
+
# Save cache
cache.save(active_files={rel_path})
-
+
# Check that cache files were created
assert (tmp_path / ".scriber/cache/files.json").exists()
assert (tmp_path / ".scriber/cache/imports_v2.json").exists()
-
+
# Reload cache and check if retrieved properly
new_cache = ScriberCache(config, tmp_path)
assert new_cache.get_file(rel_path, mtime, size) == data
- assert new_cache.get_imports(rel_path) == imports
+ assert new_cache.get_imports(rel_path, mtime, size) == imports
diff --git a/tests/test_config_schema.py b/tests/test_config_schema.py
index ab377de..a071d34 100644
--- a/tests/test_config_schema.py
+++ b/tests/test_config_schema.py
@@ -6,7 +6,8 @@
def test_config_schema_parsing(tmp_path: Path) -> None:
config_file = tmp_path / "pyproject.toml"
- config_file.write_text("""
+ config_file.write_text(
+ """
[tool.scriber]
format = "txt"
max_tokens = 50000
@@ -33,30 +34,32 @@ def test_config_schema_parsing(tmp_path: Path) -> None:
[tool.scriber.hard_ignore]
patterns = [".git/**", "node_modules/**"]
-""".strip(), encoding="utf-8")
+""".strip(),
+ encoding="utf-8",
+ )
config = load_config(config_file)
-
+
assert config.format == "txt"
assert config.max_tokens == 50000
assert config.max_files == 30
assert config.only_tree is True
assert config.allow_external_paths is True
-
+
assert config.modules is False
assert config.modules_config.enabled is False
assert config.modules_config.content_min_score == 40
-
+
assert config.code_patterns == ["**/*.py", "**/*.rs"]
-
+
assert config.support is True
assert config.support_patterns == ["pyproject.toml", "Dockerfile"]
-
+
assert config.support_content.default == "tree_only"
assert config.support_content.auto_max_bytes == 20000
assert config.support_content.full == ["pyproject.toml"]
assert config.support_content.tree_only == ["Dockerfile"]
-
+
assert config.hard_ignore_patterns == [".git/**", "node_modules/**"]
diff --git a/tests/test_fixes.py b/tests/test_fixes.py
new file mode 100644
index 0000000..176e127
--- /dev/null
+++ b/tests/test_fixes.py
@@ -0,0 +1,166 @@
+from pathlib import Path
+from unittest.mock import patch
+
+from scriber.core.config import ScriberConfig
+from scriber.core.models import FileNode, ModuleGraph
+from scriber.engine.roles import classify_file_role
+from scriber.engine.scorer import _is_test_file
+from scriber.scanner.files import classify_file, read_text_lossy
+
+
+def test_role_classifier_does_not_mark_production_tests_analyzer_as_test():
+    """A production module named tests.py is not a test; files under tests/ are."""
+    cfg = ScriberConfig()
+    cfg.python.test_roots = ["tests", "test"]
+    production_module = Path("src/scriber/graph/analyzers/tests.py")
+    real_test = Path("tests/test_something.py")
+    assert not _is_test_file(production_module, cfg)
+    assert _is_test_file(real_test, cfg)
+
+
+def test_classify_file_does_not_binary_check_unmatched_files():
+    """classify_file returns None for an unmatched path without a binary probe."""
+    cfg = ScriberConfig()
+    cfg.code_patterns = ["**/*.py"]
+    cfg.support = False
+    root, unmatched = Path("/fake"), Path("/fake/file.unknown")
+
+    with patch("scriber.scanner.files.is_probably_binary") as binary_probe:
+        assert classify_file(unmatched, root, cfg) is None
+        binary_probe.assert_not_called()
+
+
+def test_read_text_lossy_without_native(tmp_path):
+    """Without native, an invalid UTF-8 byte is replaced or dropped, not raised."""
+    p, raw = tmp_path / "test.txt", b"hello \xff world"  # 0xff is never valid UTF-8
+    p.write_bytes(raw)
+    with patch("scriber.native.is_native_available", return_value=False):
+        content = read_text_lossy(p)
+    assert any(raw.decode("utf-8", mode) in content for mode in ("replace", "ignore"))
+
+
+def test_classify_file_role_does_not_mark_graph_analyzers_tests_py_as_test():
+    """A production module that is merely named tests.py must not get role "test"."""
+    node = FileNode(
+        absolute=Path("/src/scriber/graph/analyzers/tests.py"),
+        relative=Path("src/scriber/graph/analyzers/tests.py"),
+        kind="code",
+        language="python",
+        size_bytes=100,
+    )
+    role = classify_file_role(node, ModuleGraph())
+    assert role != "test"
+
+
+def test_read_text_lossy_falls_back_when_native_read_raises(tmp_path):
+    """read_text_lossy still returns the text when the native reader raises."""
+    target = tmp_path / "test.txt"
+    target.write_bytes(b"hello")
+    failure = Exception("Native read failed")
+    # Native is reported as available, but its read_text always raises.
+    with patch("scriber.native.is_native_available", return_value=True), patch(
+        "scriber.native.require_native"
+    ) as native_factory:
+        native_factory.return_value.read_text.side_effect = failure
+        assert read_text_lossy(target) == "hello"
+
+
+def test_import_cache_works_with_custom_cache_dir(tmp_path):
+    """Import edges can be recorded when cache.dir points at a nested directory."""
+    from scriber.cache import ScriberCache
+
+    cfg = ScriberConfig()
+    cfg.cache.dir = "custom/cache/dir"
+    store = ScriberCache(cfg, tmp_path)
+
+    # The configured directory is resolved against the root given to ScriberCache.
+    assert store.cache_dir == tmp_path.joinpath("custom", "cache", "dir")
+
+    # Two real files on disk: a.py imports b.py.
+    for name, body in (("a.py", "import b"), ("b.py", "")):
+        (tmp_path / name).write_text(body)
+
+    store.set_imports(Path("a.py"), {Path("b.py")})
+    assert store.imports_data["a.py"]["targets"] == ["b.py"]
+
+
+def test_project_snapshot_docs_profile_changes_code_and_test_scores():
+    """Snapshot scoring gives each file the score configured for its role.
+
+    Three Python files of equal size are scored under the "docs" profile with
+    both score thresholds set to zero, and each is looked up by file name and
+    compared against the matching entry of ``modules_config.scoring``.
+    """
+    from scriber.core.profiles import apply_profile
+    from scriber.engine.scorer import score_candidates_project_snapshot
+
+    cfg = apply_profile(ScriberConfig(), "docs")
+    # Zero both thresholds so that no file is dropped because of its score.
+    cfg.min_score = 0
+    cfg.modules_config.tree_min_score = 0
+
+    # The three nodes differ only in their file name.
+    def node(name):
+        return FileNode(Path("/" + name), Path(name), "code", "python", 100)
+
+    names = ("app.py", "test_app.py", "utils.py")
+    scored = score_candidates_project_snapshot(
+        files={Path(name): node(name) for name in names},
+        graph=ModuleGraph(),
+        config=cfg,
+    )
+
+    table = cfg.modules_config.scoring
+    # (file name, scoring key, fallback when the key is absent from the table)
+    expected = (
+        ("app.py", "entrypoint_file", 90),
+        ("test_app.py", "test_file", 60),
+        ("utils.py", "code_file", 80),
+    )
+    for name, key, fallback in expected:
+        hit = next(c for c in scored if c.file.relative.name == name)
+        assert hit.score == table.get(key, fallback)
+
+
+def test_native_project_snapshot_uses_profile_code_and_test_scores():
+    """NativePackOptions reports the per-role scores it was constructed with."""
+    import pytest
+    from scriber.core.profiles import apply_profile
+    from scriber.native import is_native_available, require_native
+
+    config = apply_profile(ScriberConfig(), "docs")
+    if not is_native_available():
+        # A bare return would be reported as a pass; make the skip visible.
+        pytest.skip("native extension is not built")
+
+    scoring = config.modules_config.scoring
+    opts = require_native().NativePackOptions(
+        mode="project_snapshot",
+        max_files=10,
+        min_score=0,
+        tree_min_score=0,
+        entrypoint_patterns=config.python.entrypoint_patterns,
+        test_roots=config.python.test_roots,
+        entrypoint_file_score=scoring.get("entrypoint_file", 90),
+        code_file_score=scoring.get("code_file", 80),
+        test_file_score=scoring.get("test_file", 60),
+        other_file_score=scoring.get("other_file", 40),
+    )
+    assert opts.entrypoint_file_score == scoring.get("entrypoint_file", 90)
+    assert opts.test_file_score == scoring.get("test_file", 60)
+    assert opts.code_file_score == scoring.get("code_file", 80)
+    assert opts.other_file_score == scoring.get("other_file", 40)
+
+
+def test_llm_pack_gpt_profile_does_not_access_missing_outline_symbols(tmp_path):
+    """Smoke test: packing one file with the "gpt" profile completes."""
+    from scriber.packer.pack import build_pack
+
+    # An empty pyproject.toml next to a one-function module.
+    (tmp_path / "pyproject.toml").write_text("")
+    module = tmp_path / "test.py"
+    module.write_text("def my_func(): pass")
+
+    # No assertion on content: reaching the end without an exception is the check.
+    result = build_pack(paths=[str(module)], profile="gpt", path_base="cwd")
+    assert result is not None
diff --git a/tests/test_init_config.py b/tests/test_init_config.py
index 59d0e85..1b98815 100644
--- a/tests/test_init_config.py
+++ b/tests/test_init_config.py
@@ -26,7 +26,8 @@ def test_replace_existing_block() -> None:
version = "2"
""".strip()
- expected = """
+ expected = (
+ """
[build-system]
requires = ["setuptools>=61"]
@@ -35,7 +36,9 @@ def test_replace_existing_block() -> None:
[tool.scriber]
version = "2"
-""".strip() + "\n"
+""".strip()
+ + "\n"
+ )
res = replace_existing_tool_scriber_block(content, default_block)
assert res == expected
@@ -44,7 +47,7 @@ def test_replace_existing_block() -> None:
def test_init_project_file_missing(tmp_path: Path) -> None:
config_path = tmp_path / "pyproject.toml"
assert not config_path.exists()
-
+
path = init_project(str(config_path))
assert path == config_path.resolve()
assert config_path.exists()
@@ -54,7 +57,7 @@ def test_init_project_file_missing(tmp_path: Path) -> None:
def test_init_project_exists_no_scriber(tmp_path: Path) -> None:
config_path = tmp_path / "pyproject.toml"
config_path.write_text("[build-system]\n", encoding="utf-8")
-
+
init_project(str(config_path))
content = config_path.read_text(encoding="utf-8")
assert "[build-system]" in content
@@ -64,26 +67,30 @@ def test_init_project_exists_no_scriber(tmp_path: Path) -> None:
def test_init_project_exists_with_scriber_raises(tmp_path: Path) -> None:
config_path = tmp_path / "pyproject.toml"
config_path.write_text("[tool.scriber]\nversion = '1'\n", encoding="utf-8")
-
+
with pytest.raises(ScriberError, match="Scriber config already exists"):
init_project(str(config_path))
def test_init_project_exists_with_scriber_force(tmp_path: Path) -> None:
config_path = tmp_path / "pyproject.toml"
- config_path.write_text("""
+ config_path.write_text(
+ """
[build-system]
requires = ["setuptools>=61"]
[tool.scriber]
version = '1'
-""".strip() + "\n", encoding="utf-8")
-
+""".strip()
+ + "\n",
+ encoding="utf-8",
+ )
+
init_project(str(config_path), force=True)
content = config_path.read_text(encoding="utf-8")
assert "[build-system]" in content
assert "[tool.scriber]" in content
assert "version = '1'" not in content # must be replaced with the default block
-
+
# Ensure there is exactly one [tool.scriber] header in pyproject.toml
assert content.count("[tool.scriber]") == 1
diff --git a/tests/test_languages.py b/tests/test_languages.py
index 5f53f23..fc4f737 100644
--- a/tests/test_languages.py
+++ b/tests/test_languages.py
@@ -7,31 +7,33 @@
def test_javascript_typescript_graph(tmp_path: Path) -> None:
config = ScriberConfig()
-
+
auth_path = tmp_path / "src/auth.ts"
auth_path.parent.mkdir(parents=True, exist_ok=True)
auth_path.write_text("export class Auth {}", encoding="utf-8")
-
+
main_path = tmp_path / "src/main.ts"
- main_path.write_text("import { Auth } from './auth';\nimport 'lodash';", encoding="utf-8")
-
+ main_path.write_text(
+ "import { Auth } from './auth';\nimport 'lodash';", encoding="utf-8"
+ )
+
files = {
Path("src/auth.ts"): FileNode(
absolute=auth_path.resolve(),
relative=Path("src/auth.ts"),
kind="code",
language="typescript",
- size_bytes=auth_path.stat().st_size
+ size_bytes=auth_path.stat().st_size,
),
Path("src/main.ts"): FileNode(
absolute=main_path.resolve(),
relative=Path("src/main.ts"),
kind="code",
language="typescript",
- size_bytes=main_path.stat().st_size
- )
+ size_bytes=main_path.stat().st_size,
+ ),
}
-
+
graph = build_graph(files, config)
assert Path("src/auth.ts") in graph.imports[Path("src/main.ts")]
assert Path("src/main.ts") in graph.imported_by[Path("src/auth.ts")]
@@ -39,34 +41,36 @@ def test_javascript_typescript_graph(tmp_path: Path) -> None:
def test_rust_graph(tmp_path: Path) -> None:
config = ScriberConfig()
-
+
cargo_toml = tmp_path / "Cargo.toml"
cargo_toml.write_text("[package]\nname = 'test'", encoding="utf-8")
-
+
auth_path = tmp_path / "src/auth.rs"
auth_path.parent.mkdir(parents=True, exist_ok=True)
auth_path.write_text("pub struct Auth;", encoding="utf-8")
-
+
main_path = tmp_path / "src/main.rs"
- main_path.write_text("mod auth;\nuse crate::auth::Auth;\nuse super::unrelated;", encoding="utf-8")
-
+ main_path.write_text(
+ "mod auth;\nuse crate::auth::Auth;\nuse super::unrelated;", encoding="utf-8"
+ )
+
files = {
Path("src/auth.rs"): FileNode(
absolute=auth_path.resolve(),
relative=Path("src/auth.rs"),
kind="code",
language="rust",
- size_bytes=auth_path.stat().st_size
+ size_bytes=auth_path.stat().st_size,
),
Path("src/main.rs"): FileNode(
absolute=main_path.resolve(),
relative=Path("src/main.rs"),
kind="code",
language="rust",
- size_bytes=main_path.stat().st_size
- )
+ size_bytes=main_path.stat().st_size,
+ ),
}
-
+
graph = build_graph(files, config)
assert Path("src/auth.rs") in graph.imports[Path("src/main.rs")]
assert Path("src/main.rs") in graph.imported_by[Path("src/auth.rs")]
@@ -74,35 +78,37 @@ def test_rust_graph(tmp_path: Path) -> None:
def test_go_graph(tmp_path: Path) -> None:
config = ScriberConfig()
-
+
go_mod = tmp_path / "go.mod"
go_mod.write_text("module github.com/user/project\n", encoding="utf-8")
-
+
db_path = tmp_path / "pkg/db/db.go"
db_path.parent.mkdir(parents=True, exist_ok=True)
db_path.write_text("package db\n", encoding="utf-8")
-
+
main_path = tmp_path / "cmd/main.go"
main_path.parent.mkdir(parents=True, exist_ok=True)
- main_path.write_text('package main\nimport "github.com/user/project/pkg/db"\n', encoding="utf-8")
-
+ main_path.write_text(
+ 'package main\nimport "github.com/user/project/pkg/db"\n', encoding="utf-8"
+ )
+
files = {
Path("pkg/db/db.go"): FileNode(
absolute=db_path.resolve(),
relative=Path("pkg/db/db.go"),
kind="code",
language="go",
- size_bytes=db_path.stat().st_size
+ size_bytes=db_path.stat().st_size,
),
Path("cmd/main.go"): FileNode(
absolute=main_path.resolve(),
relative=Path("cmd/main.go"),
kind="code",
language="go",
- size_bytes=main_path.stat().st_size
- )
+ size_bytes=main_path.stat().st_size,
+ ),
}
-
+
graph = build_graph(files, config)
assert Path("pkg/db/db.go") in graph.imports[Path("cmd/main.go")]
assert Path("cmd/main.go") in graph.imported_by[Path("pkg/db/db.go")]
@@ -110,14 +116,17 @@ def test_go_graph(tmp_path: Path) -> None:
def test_cpp_graph(tmp_path: Path) -> None:
config = ScriberConfig()
-
+
header_path = tmp_path / "src/auth.h"
header_path.parent.mkdir(parents=True, exist_ok=True)
header_path.write_text("class Auth {};", encoding="utf-8")
-
+
main_path = tmp_path / "src/main.cpp"
- main_path.write_text('#include "auth.h"\n#include \n#include "utils/helper.hpp"', encoding="utf-8")
-
+ main_path.write_text(
+ '#include "auth.h"\n#include \n#include "utils/helper.hpp"',
+ encoding="utf-8",
+ )
+
helper_path = tmp_path / "src/utils/helper.hpp"
helper_path.parent.mkdir(parents=True, exist_ok=True)
helper_path.write_text("void helper();", encoding="utf-8")
@@ -128,27 +137,26 @@ def test_cpp_graph(tmp_path: Path) -> None:
relative=Path("src/auth.h"),
kind="code",
language="c",
- size_bytes=header_path.stat().st_size
+ size_bytes=header_path.stat().st_size,
),
Path("src/main.cpp"): FileNode(
absolute=main_path.resolve(),
relative=Path("src/main.cpp"),
kind="code",
language="cpp",
- size_bytes=main_path.stat().st_size
+ size_bytes=main_path.stat().st_size,
),
Path("src/utils/helper.hpp"): FileNode(
absolute=helper_path.resolve(),
relative=Path("src/utils/helper.hpp"),
kind="code",
language="cpp",
- size_bytes=helper_path.stat().st_size
- )
+ size_bytes=helper_path.stat().st_size,
+ ),
}
-
+
graph = build_graph(files, config)
assert Path("src/auth.h") in graph.imports[Path("src/main.cpp")]
assert Path("src/main.cpp") in graph.imported_by[Path("src/auth.h")]
assert Path("src/utils/helper.hpp") in graph.imports[Path("src/main.cpp")]
assert Path("src/main.cpp") in graph.imported_by[Path("src/utils/helper.hpp")]
-
diff --git a/tests/test_native.py b/tests/test_native.py
index 77c3341..39d55c5 100644
--- a/tests/test_native.py
+++ b/tests/test_native.py
@@ -18,22 +18,22 @@ def test_native_read_write(tmp_path: Path) -> None:
native = require_native()
test_file = tmp_path / "test.txt"
content = "Hello, native Rust world!\nWith some special characters: ΕΓ³Δ
dΕΊΕ\n"
-
+
native.write_text(str(test_file), content)
assert test_file.exists()
-
+
read_back = native.read_text(str(test_file))
assert read_back == content
def test_native_binary_check(tmp_path: Path) -> None:
native = require_native()
-
+
# Test text file
txt_file = tmp_path / "normal.txt"
txt_file.write_text("Hello world", encoding="utf-8")
assert not native.is_probably_binary(str(txt_file))
-
+
# Test binary file
bin_file = tmp_path / "binary.bin"
bin_file.write_bytes(b"Hello\x00world")
@@ -47,19 +47,21 @@ def test_native_scan_matches_python_scan(tmp_path: Path) -> None:
(tmp_path / "src" / "helper.py").write_text("import sys", encoding="utf-8")
(tmp_path / "src" / "binary.dat").write_bytes(b"\x00\x01\x02")
(tmp_path / "README.md").write_text("# Test Project", encoding="utf-8")
- (tmp_path / "pyproject.toml").write_text("[tool.scriber]\nversion='2'", encoding="utf-8")
-
+ (tmp_path / "pyproject.toml").write_text(
+ "[tool.scriber]\nversion='2'", encoding="utf-8"
+ )
+
# Hidden dir and ignored patterns
(tmp_path / ".git").mkdir()
(tmp_path / ".git" / "config").write_text("git config", encoding="utf-8")
-
+
config = ScriberConfig(
use_gitignore=True,
code_patterns=["**/*.py"],
support_patterns=["pyproject.toml", "README.md", "requirements.txt"],
hard_ignore_patterns=[".git/**", "**/binary.dat"],
)
-
+
# Create gitignore
(tmp_path / ".gitignore").write_text("*.pyc\n", encoding="utf-8")
@@ -71,7 +73,7 @@ def test_native_scan_matches_python_scan(tmp_path: Path) -> None:
for path, rust_node in rust_result.items():
py_node = python_result[path]
-
+
# Verify fields match exactly
assert rust_node.relative == py_node.relative
assert rust_node.kind == py_node.kind
@@ -86,7 +88,9 @@ def test_native_no_support(tmp_path: Path) -> None:
(tmp_path / "src").mkdir()
(tmp_path / "src" / "main.py").write_text("print('hello')", encoding="utf-8")
(tmp_path / "README.md").write_text("# Test Project", encoding="utf-8")
- (tmp_path / "pyproject.toml").write_text("[tool.scriber]\nversion='2'", encoding="utf-8")
+ (tmp_path / "pyproject.toml").write_text(
+ "[tool.scriber]\nversion='2'", encoding="utf-8"
+ )
config = ScriberConfig(
support=False,
@@ -177,10 +181,11 @@ def test_native_graph_matches_python_graph_mixed_project(tmp_path: Path) -> None
config = make_config()
python_files = scan_python(tmp_path, config)
-
+
from scriber.graph.builder import build_graph as build_python_graph
+
py_graph = build_python_graph(python_files, config)
-
+
native = require_native()
native_files = native.scan_project(
str(tmp_path),
@@ -191,23 +196,38 @@ def test_native_graph_matches_python_graph_mixed_project(tmp_path: Path) -> None
config.support_content.full,
config.support_content.tree_only,
config.support_content.default,
- config.support
+ config.support,
)
edges = native.build_relation_graph(
str(tmp_path),
native_files,
config.python.source_roots,
- config.python.module_init_files
+ config.python.module_init_files,
)
rs_imports = {}
for edge in edges:
- if edge.kind == "import" or edge.kind == "mod" or edge.kind == "use" or edge.kind == "include":
- rs_imports.setdefault(Path(getattr(edge, "source")), set()).add(Path(edge.target))
-
+ if (
+ edge.kind == "import"
+ or edge.kind == "mod"
+ or edge.kind == "use"
+ or edge.kind == "include"
+ ):
+ rs_imports.setdefault(Path(getattr(edge, "source")), set()).add(
+ Path(edge.target)
+ )
+
for path, targets in py_graph.imports.items():
file = python_files[path]
- if file.language in {"python", "javascript", "typescript", "rust", "go", "c", "cpp"}:
+ if file.language in {
+ "python",
+ "javascript",
+ "typescript",
+ "rust",
+ "go",
+ "c",
+ "cpp",
+ }:
rs_targets = rs_imports.get(path, set())
assert rs_targets == targets
@@ -215,22 +235,26 @@ def test_native_graph_matches_python_graph_mixed_project(tmp_path: Path) -> None
def test_native_scoring_matches_python_for_focused_pack(tmp_path: Path) -> None:
make_mixed_project(tmp_path)
config = make_config()
-
+
python_files = scan_python(tmp_path, config)
from scriber.graph.builder import build_graph as build_python_graph
+
py_graph = build_python_graph(python_files, config)
-
+
from scriber.engine.scorer import score_candidates as score_python
from scriber.core.models import SeedPath
+
seed = SeedPath(
original=Path("src/main.py"),
absolute=(tmp_path / "src/main.py").resolve(),
relative=Path("src/main.py"),
is_dir=False,
- expanded_files=[Path("src/main.py")]
+ expanded_files=[Path("src/main.py")],
)
- py_candidates = score_python(files=python_files, seeds=[seed], graph=py_graph, config=config, mode="focused")
-
+ py_candidates = score_python(
+ files=python_files, seeds=[seed], graph=py_graph, config=config, mode="focused"
+ )
+
native = require_native()
native_files = native.scan_project(
str(tmp_path),
@@ -241,15 +265,15 @@ def test_native_scoring_matches_python_for_focused_pack(tmp_path: Path) -> None:
config.support_content.full,
config.support_content.tree_only,
config.support_content.default,
- config.support
+ config.support,
)
edges = native.build_relation_graph(
str(tmp_path),
native_files,
config.python.source_roots,
- config.python.module_init_files
+ config.python.module_init_files,
)
-
+
scoring = config.modules_config.scoring
opts = native.NativePackOptions(
mode="focused",
@@ -282,17 +306,14 @@ def test_native_scoring_matches_python_for_focused_pack(tmp_path: Path) -> None:
entrypoint_patterns=config.python.entrypoint_patterns,
test_roots=config.python.test_roots,
)
-
+
rs_candidates = native.score_candidates_native(
- native_files,
- ["src/main.py"],
- edges,
- opts
+ native_files, ["src/main.py"], edges, opts
)
-
+
py_map = {c.file.relative.as_posix(): c for c in py_candidates}
rs_map = {c.path: c for c in rs_candidates}
-
+
assert set(py_map.keys()) == set(rs_map.keys())
for path, py_c in py_map.items():
rs_c = rs_map[path]
@@ -309,12 +330,13 @@ def test_native_render_tree_matches_python() -> None:
"pyproject.toml",
"README.md",
]
-
+
from scriber.rendering.renderer import render_tree as render_python_tree
+
py_tree = render_python_tree([Path(p) for p in paths])
-
+
rs_tree = native.render_tree(paths)
-
+
assert rs_tree.strip() == py_tree.strip()
@@ -325,14 +347,16 @@ def test_default_toml_and_lock_support(tmp_path: Path) -> None:
# Create dummy files
(tmp_path / "src").mkdir()
(tmp_path / "src" / "main.py").write_text("print('hello')", encoding="utf-8")
- (tmp_path / "pyproject.toml").write_text("[tool.scriber]\nversion='2'", encoding="utf-8")
+ (tmp_path / "pyproject.toml").write_text(
+ "[tool.scriber]\nversion='2'", encoding="utf-8"
+ )
(tmp_path / "some_random_config.toml").write_text("a = 1", encoding="utf-8")
(tmp_path / "some_random_lockfile.lock").write_text("lock", encoding="utf-8")
# Load default config
config = load_config(tmp_path / "pyproject.toml")
config.use_gitignore = False
-
+
# Assert that **/*.toml and **/*.lock are in support patterns
assert "**/*.toml" in config.support_patterns
assert "**/*.toml" in config.support_content.full
@@ -363,7 +387,7 @@ def test_native_import_complex_python(tmp_path: Path) -> None:
(tmp_path / "src" / "b.py").write_text("class B: pass", encoding="utf-8")
(tmp_path / "src" / "c.py").write_text("class C: pass", encoding="utf-8")
(tmp_path / "src" / "d.py").write_text("class D: pass", encoding="utf-8")
-
+
import_test_content = """
import os, sys
import math as m, json
@@ -375,7 +399,9 @@ def test_native_import_complex_python(tmp_path: Path) -> None:
from .c import D
"""
(tmp_path / "src" / "main.py").write_text(import_test_content, encoding="utf-8")
- (tmp_path / "pyproject.toml").write_text("[tool.scriber]\nversion='2'", encoding="utf-8")
+ (tmp_path / "pyproject.toml").write_text(
+ "[tool.scriber]\nversion='2'", encoding="utf-8"
+ )
config = ScriberConfig(
use_gitignore=False,
@@ -384,8 +410,9 @@ def test_native_import_complex_python(tmp_path: Path) -> None:
)
from scriber.scanner.scan import scan_project
- files = scan_project(tmp_path, config)
-
+
+ scan_project(tmp_path, config)
+
native = require_native()
native_files = native.scan_project(
str(tmp_path),
@@ -396,29 +423,22 @@ def test_native_import_complex_python(tmp_path: Path) -> None:
config.support_content.full,
config.support_content.tree_only,
config.support_content.default,
- config.support
+ config.support,
)
edges = native.build_relation_graph(
str(tmp_path),
native_files,
config.python.source_roots,
- config.python.module_init_files
+ config.python.module_init_files,
)
imports = {Path(getattr(edge, "source")): set() for edge in edges}
for edge in edges:
if edge.kind == "import":
imports[Path(getattr(edge, "source"))].add(Path(edge.target))
-
+
main_path = Path("src/main.py")
assert main_path in imports
-
- expected_imports = {
- Path("src/a.py"),
- Path("src/b.py"),
- Path("src/c.py")
- }
- assert imports[main_path] == expected_imports
-
-
+ expected_imports = {Path("src/a.py"), Path("src/b.py"), Path("src/c.py")}
+ assert imports[main_path] == expected_imports
diff --git a/tests/test_scriber.py b/tests/test_scriber.py
index 8ddf870..abd832a 100644
--- a/tests/test_scriber.py
+++ b/tests/test_scriber.py
@@ -63,17 +63,25 @@ def make_project(tmp_path: Path) -> Path:
write(tmp_path / "poetry.lock", "very large lock in real life\n")
write(tmp_path / "Dockerfile", "FROM python:3.12\n")
write(tmp_path / "src/app/__init__.py", "")
- write(tmp_path / "src/app/auth.py", "from .session import Session\nfrom .config import SETTINGS\n\nclass Auth: pass\n")
+ write(
+ tmp_path / "src/app/auth.py",
+ "from .session import Session\nfrom .config import SETTINGS\n\nclass Auth: pass\n",
+ )
write(tmp_path / "src/app/session.py", "class Session: pass\n")
write(tmp_path / "src/app/config.py", "SETTINGS = {}\n")
write(tmp_path / "src/app/main.py", "from app.auth import Auth\n")
write(tmp_path / "src/api/routes.py", "from app.auth import Auth\n")
- write(tmp_path / "tests/test_auth.py", "from app.auth import Auth\n\ndef test_auth():\n assert Auth\n")
+ write(
+ tmp_path / "tests/test_auth.py",
+ "from app.auth import Auth\n\ndef test_auth():\n assert Auth\n",
+ )
write(tmp_path / "src/app/unrelated.py", "VALUE = 1\n")
return tmp_path
-def test_build_pack_includes_seed_dependencies_reverse_tests_and_support(tmp_path: Path, monkeypatch) -> None:
+def test_build_pack_includes_seed_dependencies_reverse_tests_and_support(
+ tmp_path: Path, monkeypatch
+) -> None:
project = make_project(tmp_path)
monkeypatch.chdir(project)
@@ -90,7 +98,9 @@ def test_build_pack_includes_seed_dependencies_reverse_tests_and_support(tmp_pat
assert "requirements.txt" in paths
assert "poetry.lock" in paths
- by_path = {candidate.file.relative.as_posix(): candidate for candidate in pack.candidates}
+ by_path = {
+ candidate.file.relative.as_posix(): candidate for candidate in pack.candidates
+ }
assert by_path["src/app/auth.py"].score == 100
assert by_path["src/app/session.py"].score >= 80
assert by_path["src/api/routes.py"].score >= 80
@@ -119,11 +129,18 @@ def test_multiple_paths_promote_shared_dependency(tmp_path: Path, monkeypatch) -
write(tmp_path / "src/app/billing.py", "from .config import SETTINGS\n")
monkeypatch.chdir(project)
- pack = build_pack(["src/app/auth.py", "src/app/billing.py"], config_path="pyproject.toml")
- by_path = {candidate.file.relative.as_posix(): candidate for candidate in pack.candidates}
+ pack = build_pack(
+ ["src/app/auth.py", "src/app/billing.py"], config_path="pyproject.toml"
+ )
+ by_path = {
+ candidate.file.relative.as_posix(): candidate for candidate in pack.candidates
+ }
assert "src/app/config.py" in by_path
assert by_path["src/app/config.py"].score == 100
- assert any("shared by multiple seed paths" in reason for reason in by_path["src/app/config.py"].reasons)
+ assert any(
+ "shared by multiple seed paths" in reason
+ for reason in by_path["src/app/config.py"].reasons
+ )
def test_no_modules_keeps_seed_and_pyproject(tmp_path: Path, monkeypatch) -> None:
@@ -155,8 +172,10 @@ def test_project_snapshot_mode(tmp_path: Path, monkeypatch) -> None:
pack = build_pack(["."], config_path="pyproject.toml")
assert pack.mode == "project_snapshot"
- by_path = {candidate.file.relative.as_posix(): candidate for candidate in pack.candidates}
-
+ by_path = {
+ candidate.file.relative.as_posix(): candidate for candidate in pack.candidates
+ }
+
# Entrypoint (e.g., src/app/main.py matches main.py pattern)
assert by_path["src/app/main.py"].score == 90
assert by_path["src/app/main.py"].reason_summary == "entrypoint file"
@@ -172,7 +191,7 @@ def test_project_snapshot_mode(tmp_path: Path, monkeypatch) -> None:
# Support files
assert by_path["README.md"].score == 45
assert by_path["README.md"].reason_summary == "project support file"
-
+
# Ensure no near-seed duplication in project snapshot mode
assert "near" not in by_path["README.md"].reason_summary
assert "shared by multiple seed paths" not in by_path["README.md"].reasons
@@ -180,6 +199,7 @@ def test_project_snapshot_mode(tmp_path: Path, monkeypatch) -> None:
def test_dry_run_and_open_cli(tmp_path: Path, monkeypatch) -> None:
from scriber.cli.main import main
+
project = make_project(tmp_path)
monkeypatch.chdir(project)
@@ -219,4 +239,3 @@ def test_no_support_excludes_support_files_folder_seed(tmp_path: Path) -> None:
paths = {c.file.relative.as_posix() for c in pack.candidates}
assert "README.md" not in paths
assert "pyproject.toml" not in paths
-
diff --git a/tests/test_symbols.py b/tests/test_symbols.py
index fd4a5e1..324df0a 100644
--- a/tests/test_symbols.py
+++ b/tests/test_symbols.py
@@ -1,7 +1,8 @@
from pathlib import Path
-from scriber.core.symbols import SymbolIndex, SymbolNode
+from scriber.core.symbols import SymbolIndex
from scriber.graph.languages.extractor import extract_python_symbols
+
def test_extract_python_symbols() -> None:
code = """
class MyClass:
@@ -16,27 +17,27 @@ def global_function():
"""
index = SymbolIndex()
file_path = Path("src/dummy.py")
-
+
extract_python_symbols(file_path, code, index)
-
+
symbols = index.get_symbols(file_path)
assert len(symbols) == 4
-
+
# Check Class
class_sym = next(s for s in symbols if s.name == "MyClass")
assert class_sym.kind == "class"
assert class_sym.parent_name is None
-
+
# Check Constructor
init_sym = next(s for s in symbols if s.name == "__init__")
assert init_sym.kind == "function"
assert init_sym.parent_name == "MyClass"
-
+
# Check Async Method
method_sym = next(s for s in symbols if s.name == "my_method")
assert method_sym.kind == "function"
assert method_sym.parent_name == "MyClass"
-
+
# Check Global Function
func_sym = next(s for s in symbols if s.name == "global_function")
assert func_sym.kind == "function"
diff --git a/tests/test_tokens.py b/tests/test_tokens.py
index fe1e2b2..35c6339 100644
--- a/tests/test_tokens.py
+++ b/tests/test_tokens.py
@@ -21,11 +21,14 @@ def test_token_estimation_custom_config() -> None:
def test_token_estimation_parsing_from_config(tmp_path: Path) -> None:
config_file = tmp_path / "pyproject.toml"
- config_file.write_text("""
+ config_file.write_text(
+ """
[tool.scriber.tokens]
estimator = "chars"
chars_per_token = 5
-""".strip(), encoding="utf-8")
+""".strip(),
+ encoding="utf-8",
+ )
config = load_config(config_file)
assert config.tokens.estimator == "chars"
From 57267fef8c9eef1e6c436dcea4516cbc1e29ec0e Mon Sep 17 00:00:00 2001
From: SunneV
Date: Sun, 31 May 2026 13:49:39 +0200
Subject: [PATCH 4/6] update ci cd to use node24
---
.github/workflows/ci.yml | 3 +++
.github/workflows/release.yml | 3 +++
2 files changed, 6 insertions(+)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 98d3576..c867f6d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -5,6 +5,9 @@ on:
branches: [main, develop]
pull_request:
+env:
+ FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true # workflow-level: opts JavaScript-based actions in all CI jobs into the Node 24 runtime
+
jobs:
lint:
name: Lint & Format
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index fa5abc8..2378218 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -5,6 +5,9 @@ on:
tags:
- "v*"
+env:
+ FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true # workflow-level: opts JavaScript-based actions in all release jobs into the Node 24 runtime
+
jobs:
build:
name: Build ${{ matrix.os }}
From 80032d19a6494aca1cea58e6060db37c738a1d37 Mon Sep 17 00:00:00 2001
From: SunneV
Date: Sun, 31 May 2026 14:06:06 +0200
Subject: [PATCH 5/6] added top_dependencies configuration
---
CHANGELOG.md | 1 +
pyproject.toml | 1 +
rust/scriber_native/src/score.rs | 39 +++++++++++++++++++++++++++-----
src/scriber/core/config.py | 24 ++++++++++++++++++++
src/scriber/core/models.py | 1 +
src/scriber/engine/scorer.py | 22 +++++++++++++++---
src/scriber/packer/pack.py | 1 +
7 files changed, 80 insertions(+), 9 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 17c3b71..dd220be 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- **CLI Introspection**: Added `--explain` flag as an alias. Enhanced `--why` output to show estimated token cost, content mode, and omission reasons for any target file.
- **Automated README Sync**: Added `scripts/sync_readme.py` tool to automatically sync CLI arguments, profiles documentation, and version tags across the `README.md`.
- **AI-Native Navigation & Optimization**: Implemented XML anchors for symbols, aggressive test file quarantine, and support file pruning to keep focused mode clean and strictly token-capped.
+- **Dependency Limiting**: Introduced the `top_dependencies` setting (default `10`; `0` disables the limit) to cap the width of the graph traversal: each file follows only its strongest dependency edges, ranked by `weight * confidence`.
- **Version Alignment**: Synchronized Python and Rust crate versions. `scriber --version` now reports both Python and native API versions.
### Fixed
diff --git a/pyproject.toml b/pyproject.toml
index 837c8d1..fd78d83 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -94,6 +94,7 @@ include_tests = true
include_same_package = true
include_parent_entrypoints = true
include_project_configs = true
+top_dependencies = 10
content_min_score = 50
tree_min_score = 30
diff --git a/rust/scriber_native/src/score.rs b/rust/scriber_native/src/score.rs
index 551c032..e2c3c5a 100644
--- a/rust/scriber_native/src/score.rs
+++ b/rust/scriber_native/src/score.rs
@@ -90,6 +90,8 @@ pub struct NativePackOptions {
pub include_project_configs: bool,
#[pyo3(get, set)]
pub depth: usize,
+ #[pyo3(get, set)]
+ pub top_dependencies: usize,
// Support file scanning
#[pyo3(get, set)]
@@ -136,6 +138,7 @@ impl NativePackOptions {
include_tests = true,
include_project_configs = true,
depth = 2,
+ top_dependencies = 10,
support_enabled = true,
entrypoint_patterns = Vec::new(),
test_roots = Vec::new(),
@@ -172,6 +175,7 @@ impl NativePackOptions {
include_tests: bool,
include_project_configs: bool,
depth: usize,
+ top_dependencies: usize,
support_enabled: bool,
entrypoint_patterns: Vec,
test_roots: Vec,
@@ -207,6 +211,7 @@ impl NativePackOptions {
include_tests,
include_project_configs,
depth,
+ top_dependencies,
support_enabled,
entrypoint_patterns,
test_roots,
@@ -414,6 +419,7 @@ fn walk_weighted_neighbors(
edges: &[NativeRelationEdge],
start: &str,
depth: usize,
+ top_dependencies: usize,
reverse: bool,
) -> HashMap {
let mut adj: HashMap> = HashMap::new();
@@ -423,6 +429,19 @@ fn walk_weighted_neighbors(
adj.entry(u.clone()).or_default().push((v.clone(), edge));
}
+ if top_dependencies > 0 {
+ for edges_from_u in adj.values_mut() {
+ if edges_from_u.len() > top_dependencies {
+ edges_from_u.sort_by(|a, b| {
+ let str_a = a.1.weight * a.1.confidence;
+ let str_b = b.1.weight * b.1.confidence;
+ str_b.partial_cmp(&str_a).unwrap_or(Ordering::Equal)
+ });
+ edges_from_u.truncate(top_dependencies);
+ }
+ }
+ }
+
let mut max_strength: HashMap = HashMap::new();
max_strength.insert(start.to_string(), 1.0);
@@ -635,9 +654,13 @@ pub fn score_candidates_native(
for seed_rel in &seed_files {
// Direct dependencies
if options.include_direct_dependencies {
- for (dep, strength) in
- walk_weighted_neighbors(&edges, seed_rel, options.depth, false)
- {
+ for (dep, strength) in walk_weighted_neighbors(
+ &edges,
+ seed_rel,
+ options.depth,
+ options.top_dependencies,
+ false,
+ ) {
let score = std::cmp::max(
options.tree_min_score,
(options.direct_dependency_score as f64 * strength) as i32,
@@ -657,9 +680,13 @@ pub fn score_candidates_native(
// Reverse dependencies
if options.include_reverse_dependencies {
- for (dep, strength) in
- walk_weighted_neighbors(&edges, seed_rel, options.depth, true)
- {
+ for (dep, strength) in walk_weighted_neighbors(
+ &edges,
+ seed_rel,
+ options.depth,
+ options.top_dependencies,
+ true,
+ ) {
let score = std::cmp::max(
options.tree_min_score,
(options.reverse_dependency_score as f64 * strength) as i32,
diff --git a/src/scriber/core/config.py b/src/scriber/core/config.py
index 69c7491..f0e1fb0 100644
--- a/src/scriber/core/config.py
+++ b/src/scriber/core/config.py
@@ -204,6 +204,7 @@
include_same_package = true
include_parent_entrypoints = true
include_project_configs = true
+top_dependencies = 10
content_min_score = 50
tree_min_score = 30
@@ -327,6 +328,9 @@ def load_config(config_path: Path) -> ScriberConfig:
config.modules_config.include_project_configs,
)
),
+ top_dependencies=int(
+ modules.get("top_dependencies", config.modules_config.top_dependencies)
+ ),
content_min_score=int(
modules.get(
"content_min_score", config.modules_config.content_min_score
@@ -485,6 +489,26 @@ def validate_raw_config(raw_data: dict[str, Any]) -> list[ConfigIssue]:
)
)
+ modules = data.get("modules", {})
+ if isinstance(modules, dict):
+ if "top_dependencies" in modules:
+ try:
+ val = int(modules["top_dependencies"])
+ if val < 0:
+ issues.append(
+ ConfigIssue(
+ "error",
+ f"modules.top_dependencies must be a number >= 0. Got: {val}",
+ )
+ )
+ except (ValueError, TypeError):
+ issues.append(
+ ConfigIssue(
+ "error",
+ f"modules.top_dependencies must be an integer. Got: {modules['top_dependencies']}",
+ )
+ )
+
# 6. check patterns are list of strings
def check_pattern_list(parent_dict: dict[str, Any], path_name: str) -> None:
if "patterns" in parent_dict:
diff --git a/src/scriber/core/models.py b/src/scriber/core/models.py
index 757a462..ca40354 100644
--- a/src/scriber/core/models.py
+++ b/src/scriber/core/models.py
@@ -42,6 +42,7 @@ class ModuleConfig:
include_same_package: bool = True
include_parent_entrypoints: bool = True
include_project_configs: bool = True
+ top_dependencies: int = 10
content_min_score: int = 50
tree_min_score: int = 30
scoring: dict[str, int] = field(default_factory=lambda: dict(DEFAULT_SCORING))
diff --git a/src/scriber/engine/scorer.py b/src/scriber/engine/scorer.py
index 3193d5a..db018a8 100644
--- a/src/scriber/engine/scorer.py
+++ b/src/scriber/engine/scorer.py
@@ -125,7 +125,11 @@ def _name_related(a: Path, b: Path) -> bool:
def _walk_weighted_neighbors(
- edges: list[RelationEdge], start: Path, depth_limit: int, reverse: bool = False
+ edges: list[RelationEdge],
+ start: Path,
+ depth_limit: int,
+ top_dependencies: int,
+ reverse: bool = False,
) -> dict[Path, float]:
import heapq
@@ -135,6 +139,14 @@ def _walk_weighted_neighbors(
v = edge.source if reverse else edge.target
adj.setdefault(u, []).append((v, edge))
+ if top_dependencies > 0:
+ for u, edges_from_u in adj.items():
+ if len(edges_from_u) > top_dependencies:
+ edges_from_u.sort(
+ key=lambda item: item[1].weight * item[1].confidence, reverse=True
+ )
+ adj[u] = edges_from_u[:top_dependencies]
+
queue = [(-1.0, 0, start)]
max_strength: dict[Path, float] = {start: 1.0}
best_at_state: dict[tuple[Path, int], float] = {(start, 0): 1.0}
@@ -360,7 +372,7 @@ def score_candidates(
for seed_rel in seed_files:
if scoring.include_direct_dependencies:
for dep, strength in _walk_weighted_neighbors(
- graph.edges, seed_rel, scoring.depth, reverse=False
+ graph.edges, seed_rel, scoring.depth, scoring.top_dependencies
).items():
score = max(
scoring.tree_min_score,
@@ -378,7 +390,11 @@ def score_candidates(
if scoring.include_reverse_dependencies:
for dep, strength in _walk_weighted_neighbors(
- graph.edges, seed_rel, scoring.depth, reverse=True
+ graph.edges,
+ seed_rel,
+ scoring.depth,
+ scoring.top_dependencies,
+ reverse=True,
).items():
score = max(
scoring.tree_min_score,
diff --git a/src/scriber/packer/pack.py b/src/scriber/packer/pack.py
index 0faa162..7469bc5 100644
--- a/src/scriber/packer/pack.py
+++ b/src/scriber/packer/pack.py
@@ -354,6 +354,7 @@ def _build_graph_and_score(
include_tests=config.modules_config.include_tests,
include_project_configs=config.modules_config.include_project_configs,
depth=config.modules_config.depth,
+ top_dependencies=config.modules_config.top_dependencies,
support_enabled=config.support,
entrypoint_patterns=config.python.entrypoint_patterns,
test_roots=config.python.test_roots,
From 948fb12668297f4f5ec42e9f3a6297a1cb87e48b Mon Sep 17 00:00:00 2001
From: SunneV
Date: Sun, 31 May 2026 14:08:26 +0200
Subject: [PATCH 6/6] added test
---
tests/test_top_deps.py | 39 +++++++++++++++++++++++++++++++++++++++
1 file changed, 39 insertions(+)
create mode 100644 tests/test_top_deps.py
diff --git a/tests/test_top_deps.py b/tests/test_top_deps.py
new file mode 100644
index 0000000..5e9123c
--- /dev/null
+++ b/tests/test_top_deps.py
@@ -0,0 +1,39 @@
+def test_top_dependencies_limits_graph_traversal(): # top_dependencies must keep only the strongest outgoing edges of a node
+ from pathlib import Path
+ from scriber.core.models import RelationEdge
+ from scriber.engine.scorer import _walk_weighted_neighbors
+
+ start = Path("app.py")
+ edges = []
+ # Fan out 15 edges from `start`; weight is fixed at 1.0, so edge strength (weight * confidence) grows with i
+ for i in range(15):
+ target = Path(f"dep_{i}.py")
+ edges.append(
+ RelationEdge(
+ source=start,
+ target=target,
+ kind="import",
+ weight=1.0,
+ confidence=0.1 * i, # 0.0 for dep_0 up to ~1.4 for dep_14
+ evidence=[],
+ line=i,
+ analyzer="test",
+ )
+ )
+
+ # top_dependencies=0 means "no limit": all 15 targets are expected back
+ result_unlimited = _walk_weighted_neighbors(
+ edges, start, depth_limit=1, top_dependencies=0
+ )
+ assert len(result_unlimited) == 15
+
+ # A limit of 5 must leave exactly five neighbors of `start`
+ result_top5 = _walk_weighted_neighbors(
+ edges, start, depth_limit=1, top_dependencies=5
+ )
+ assert len(result_top5) == 5
+
+ # The survivors must be the five strongest edges, not just any five:
+ # confidence runs from 0.0 to ~1.4, so the top 5 are dep_10 .. dep_14 (confidence ~1.0 to ~1.4)
+ expected_deps = {Path(f"dep_{i}.py") for i in range(10, 15)}
+ assert set(result_top5.keys()) == expected_deps