From 25bd6fb499d4cb9e99dab87f71a6140ad682312d Mon Sep 17 00:00:00 2001 From: Mateo Torres Date: Fri, 22 May 2026 10:46:45 -0300 Subject: [PATCH 1/2] feat(indexing): respect .gitignore when indexing (closes #28) Aggregates patterns from every .gitignore under the indexed root and skips matching files. Hardcoded baseline (.git, node_modules, caches, binaries) still applies. New --include-ignored flag on libr add (and include_ignored arg on index_directory_to_library) opts out. The previously-duplicated _should_skip_file in cli.py and server.py now delegates to a shared helper in librarian/sources/ignore.py. --- librarian/cli.py | 121 +++++----------------- librarian/server.py | 111 +++----------------- librarian/sources/__init__.py | 1 + librarian/sources/ignore.py | 189 ++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + tests/test_cli.py | 4 +- tests/test_gitignore.py | 109 ++++++++++++++++++++ 7 files changed, 345 insertions(+), 191 deletions(-) create mode 100644 librarian/sources/__init__.py create mode 100644 librarian/sources/ignore.py create mode 100644 tests/test_gitignore.py diff --git a/librarian/cli.py b/librarian/cli.py index bf8f28a..58f8ca2 100644 --- a/librarian/cli.py +++ b/librarian/cli.py @@ -37,6 +37,8 @@ from rich.panel import Panel from rich.table import Table +from librarian.sources.ignore import GitignoreMatcher, should_skip_file + # Initialize Typer app app = typer.Typer( name="libr", @@ -216,100 +218,13 @@ def _get_config() -> dict[str, Any]: } -def _should_skip_file(file_path: Path, supported_extensions: set[str]) -> bool: - """ - Check if a file should be skipped during indexing. - - Args: - file_path: Path to the file. - supported_extensions: Set of supported extensions. - - Returns: - True if the file should be skipped. - """ - # Skip system/hidden directories - skip_dirs = { - "__pycache__", - ".git", - ".svn", - ".hg", - "node_modules", - ".venv", - "venv", - ".pytest_cache", - ".mypy_cache", - ".ruff_cache", - "__MACOSX", - ".DS_Store", - } - - # Check if file is in a skipped directory - for parent in file_path.parents: - if parent.name in skip_dirs: - return True - - # Skip hidden files (starting with .) - if file_path.name.startswith("."): - return True - - # Skip binary/system file extensions - skip_extensions = { - # Executables and binaries - ".exe", - ".bin", - ".dll", - ".so", - ".dylib", - ".a", - ".o", - # Disk images and archives - ".dmg", - ".iso", - ".img", - ".app", - ".pkg", - # Compressed archives - ".zip", - ".tar", - ".gz", - ".bz2", - ".xz", - ".7z", - ".rar", - # Python compiled - ".pyc", - ".pyo", - ".pyd", - # System files - ".lock", - ".log", - ".tmp", - ".temp", - ".cache", - # Media files (large binaries) - ".mp4", - ".mp3", - ".wav", - ".avi", - ".mov", - ".flac", - # Font files - ".ttf", - ".otf", - ".woff", - ".woff2", - } - - if file_path.suffix.lower() in skip_extensions: - return True - - # Skip files without extensions unless they're in supported list - # (e.g., README is supported, but random no-extension files aren't) - if not file_path.suffix: - return True - - # Skip if extension not in supported list - return file_path.suffix.lower() not in supported_extensions +def _should_skip_file( + file_path: Path, + supported_extensions: set[str], + gitignore_matcher: "GitignoreMatcher | None" = None, +) -> bool: + """Check if a file should be skipped during indexing.""" + return should_skip_file(file_path, supported_extensions, gitignore_matcher) def _find_source(name_or_path: str) -> dict | None: @@ -506,6 +421,13 @@ def add_source( verbose: Annotated[ bool, typer.Option("--verbose", "-v", help="Show files being indexed") ] = False, + include_ignored: Annotated[ + bool, + typer.Option( + "--include-ignored", + help="Index files even when matched by a .gitignore in the source tree", + ), + ] = False, ) -> None: """Add a file or directory as a source and index it recursively.""" cfg = _get_config() @@ -551,9 +473,13 @@ def add_source( else: files_to_index.extend(source_path.rglob(f"*{ext}")) - # Filter out system/binary files + gitignore_matcher = None if include_ignored else GitignoreMatcher(source_path) + + # Filter out system/binary files and .gitignore matches files_to_index = [ - f for f in files_to_index if not _should_skip_file(f, supported_extensions) + f + for f in files_to_index + if not _should_skip_file(f, supported_extensions, gitignore_matcher) ] # Apply pattern filter @@ -598,6 +524,7 @@ def add_source( "depth": depth, "pattern": pattern, "exclude": exclude, + "include_ignored": include_ignored, "added_at": datetime.now().isoformat(), } @@ -629,6 +556,7 @@ def add_source( server_ingest( context=None, # type: ignore[arg-type] directory=str(source_path), + include_ignored=include_ignored, ) ) @@ -909,6 +837,7 @@ def index_build( server_ingest( context=None, # type: ignore[arg-type] directory=str(src_path), + include_ignored=bool(src.get("include_ignored", False)), ) ) total_indexed += result.get("indexed", 0) + result.get("updated", 0) diff --git a/librarian/server.py b/librarian/server.py index 6cdd687..5ee6a0b 100644 --- a/librarian/server.py +++ b/librarian/server.py @@ -47,6 +47,7 @@ from librarian.processing.embed import get_embedder from librarian.processing.parsers.base import FileReadError, FileReadTimeoutError from librarian.retrieval.search import HybridSearcher +from librarian.sources.ignore import GitignoreMatcher, should_skip_file from librarian.storage.database import get_database from librarian.tool_outputs import ( AddOutput, @@ -96,99 +97,13 @@ def _process_and_index_file(file_path: Path) -> dict[str, Any]: return get_indexing_service().index_file(file_path) -def _should_skip_file(file_path: Path, supported_extensions: set[str]) -> bool: - """ - Check if a file should be skipped during indexing. - - Args: - file_path: Path to the file. - supported_extensions: Set of supported extensions. - - Returns: - True if the file should be skipped. - """ - # Skip system/hidden directories - skip_dirs = { - "__pycache__", - ".git", - ".svn", - ".hg", - "node_modules", - ".venv", - "venv", - ".pytest_cache", - ".mypy_cache", - ".ruff_cache", - "__MACOSX", - ".DS_Store", - } - - # Check if file is in a skipped directory - for parent in file_path.parents: - if parent.name in skip_dirs: - return True - - # Skip hidden files (starting with .) - if file_path.name.startswith("."): - return True - - # Skip binary/system file extensions - skip_extensions = { - # Executables and binaries - ".exe", - ".bin", - ".dll", - ".so", - ".dylib", - ".a", - ".o", - # Disk images and archives - ".dmg", - ".iso", - ".img", - ".app", - ".pkg", - # Compressed archives - ".zip", - ".tar", - ".gz", - ".bz2", - ".xz", - ".7z", - ".rar", - # Python compiled - ".pyc", - ".pyo", - ".pyd", - # System files - ".lock", - ".log", - ".tmp", - ".temp", - ".cache", - # Media files (large binaries) - ".mp4", - ".mp3", - ".wav", - ".avi", - ".mov", - ".flac", - # Font files - ".ttf", - ".otf", - ".woff", - ".woff2", - } - - if file_path.suffix.lower() in skip_extensions: - return True - - # Skip files without extensions - if not file_path.suffix: - return True - - # Skip if extension not in supported list - return file_path.suffix.lower() not in supported_extensions +def _should_skip_file( + file_path: Path, + supported_extensions: set[str], + gitignore_matcher: GitignoreMatcher | None = None, +) -> bool: + """Check if a file should be skipped during indexing.""" + return should_skip_file(file_path, supported_extensions, gitignore_matcher) def _resolve_path(raw_path: str, kind: str = "path") -> Path: @@ -243,6 +158,10 @@ def _resolve_path(raw_path: str, kind: str = "path") -> Path: async def index_directory_to_library( context: Context, directory: Annotated[str, "Absolute path to directory containing files to add to the library"], + include_ignored: Annotated[ + bool, + "If True, index files even when matched by a .gitignore under the directory.", + ] = False, ) -> Annotated[ IndexDirectoryOutput, "Per-directory index summary with counts and a per-file status list.", @@ -285,12 +204,16 @@ async def index_directory_to_library( registry = get_registry() supported_extensions = registry.get_supported_extensions() + gitignore_matcher = None if include_ignored else GitignoreMatcher(dir_path) + all_files: list[Path] = [] for ext in supported_extensions: pattern = f"**/*{ext}" all_files.extend(dir_path.glob(pattern)) - all_files = [f for f in all_files if not _should_skip_file(f, supported_extensions)] + all_files = [ + f for f in all_files if not _should_skip_file(f, supported_extensions, gitignore_matcher) + ] if not all_files: return IndexDirectoryOutput( diff --git a/librarian/sources/__init__.py b/librarian/sources/__init__.py new file mode 100644 index 0000000..2627986 --- /dev/null +++ b/librarian/sources/__init__.py @@ -0,0 +1 @@ +"""Source management for the librarian.""" diff --git a/librarian/sources/ignore.py b/librarian/sources/ignore.py new file mode 100644 index 0000000..e0627bf --- /dev/null +++ b/librarian/sources/ignore.py @@ -0,0 +1,189 @@ +"""File/directory skip logic for indexing, including .gitignore support.""" + +from __future__ import annotations + +from pathlib import Path + +from pathspec import GitIgnoreSpec + +# Directories that are always skipped, even when --include-ignored is set. +# These are caches, VCS metadata, and OS junk that should never enter the library. +ALWAYS_SKIP_DIRS: frozenset[str] = frozenset({ + "__pycache__", + ".git", + ".svn", + ".hg", + "node_modules", + ".venv", + "venv", + ".pytest_cache", + ".mypy_cache", + ".ruff_cache", + "__MACOSX", + ".DS_Store", +}) + +# Binary / system file extensions we never index. +SKIP_EXTENSIONS: frozenset[str] = frozenset({ + # Executables and binaries + ".exe", + ".bin", + ".dll", + ".so", + ".dylib", + ".a", + ".o", + # Disk images and archives + ".dmg", + ".iso", + ".img", + ".app", + ".pkg", + # Compressed archives + ".zip", + ".tar", + ".gz", + ".bz2", + ".xz", + ".7z", + ".rar", + # Python compiled + ".pyc", + ".pyo", + ".pyd", + # System files + ".lock", + ".log", + ".tmp", + ".temp", + ".cache", + # Media files (large binaries) + # TODO: revisit once audio/video parsers land — these will need to move + # out of the skip list and into the parser registry as new asset types. + ".mp4", + ".mp3", + ".wav", + ".avi", + ".mov", + ".flac", + # Font files + ".ttf", + ".otf", + ".woff", + ".woff2", +}) + + +class GitignoreMatcher: + """Tells whether a file is excluded by any `.gitignore` under a root. + + Aggregates patterns from every `.gitignore` found at or below the root. + Patterns from nested files are anchored to their containing directory, + mirroring git's own semantics for the common cases (anchored patterns, + floating patterns, directory-only patterns, and negations). + """ + + def __init__(self, root: Path) -> None: + self.root = root.resolve() + self._spec: GitIgnoreSpec | None = self._build_spec() + + def _build_spec(self) -> GitIgnoreSpec | None: + if not self.root.is_dir(): + return None + + lines: list[str] = [] + # Sort so outer .gitignore files come before inner ones; later lines + # win in GitIgnoreSpec, which matches git's "deeper file overrides". + gitignores = sorted( + self.root.rglob(".gitignore"), + key=lambda p: len(p.parts), + ) + for gitignore in gitignores: + try: + rel_dir = gitignore.parent.resolve().relative_to(self.root) + except ValueError: + continue + prefix = "" if rel_dir == Path(".") else f"{rel_dir.as_posix()}/" + try: + content = gitignore.read_text(encoding="utf-8", errors="ignore") + except OSError: + continue + for raw in content.splitlines(): + pattern = _prefix_pattern(raw, prefix) + if pattern is not None: + lines.append(pattern) + + if not lines: + return None + return GitIgnoreSpec.from_lines(lines) + + def is_ignored(self, file_path: Path) -> bool: + if self._spec is None: + return False + try: + rel = file_path.resolve().relative_to(self.root) + except ValueError: + return False + return self._spec.match_file(rel.as_posix()) + + +def _prefix_pattern(raw: str, prefix: str) -> str | None: + """Translate a single .gitignore line to be anchored under `prefix`. + + Returns None for blank lines and comments. `prefix` is the gitignore's + directory relative to the matcher root, with a trailing slash (or empty + string when the gitignore lives at the root). + """ + line = raw.rstrip() + if not line or line.startswith("#"): + return None + + negate = line.startswith("!") + if negate: + line = line[1:] + + if not prefix: + return ("!" + line) if negate else line + + # Determine whether the pattern is anchored to its gitignore's directory. + # Per git: a leading '/' or any '/' before the end of the pattern anchors + # it; otherwise the pattern matches at any depth under that directory. + stripped = line.rstrip("/") + if line.startswith("/"): + new = f"/{prefix}{line[1:]}" + elif "/" in stripped: + new = f"/{prefix}{line}" + else: + new = f"{prefix}**/{line}" + return ("!" + new) if negate else new + + +def should_skip_file( + file_path: Path, + supported_extensions: set[str], + gitignore_matcher: GitignoreMatcher | None = None, +) -> bool: + """Decide whether a file should be skipped during indexing. + + Hardcoded baseline (always applied): cache/VCS directories, binary + extensions, hidden files, and files lacking a supported extension. + When `gitignore_matcher` is provided, files it marks as ignored are + also skipped. + """ + for parent in file_path.parents: + if parent.name in ALWAYS_SKIP_DIRS: + return True + + if file_path.name.startswith("."): + return True + + if file_path.suffix.lower() in SKIP_EXTENSIONS: + return True + + if not file_path.suffix: + return True + + if file_path.suffix.lower() not in supported_extensions: + return True + + return gitignore_matcher is not None and gitignore_matcher.is_ignored(file_path) diff --git a/pyproject.toml b/pyproject.toml index f5bd35f..4d04056 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ dependencies = [ "numpy>=1.24.0", "typer>=0.9.0", "rich>=13.0.0", + "pathspec>=0.12.0", ] [project.scripts] diff --git a/tests/test_cli.py b/tests/test_cli.py index 807ceb6..27a536c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -53,7 +53,9 @@ def test_add_directory_exits_nonzero_when_indexing_errors( monkeypatch.setattr(cli, "_get_config", lambda: {"ensure_directories": lambda: None}) monkeypatch.setattr(cli, "console", Console(width=500, color_system=None)) - async def fake_server_ingest(context: Any, directory: str) -> dict[str, Any]: + async def fake_server_ingest( + context: Any, directory: str, include_ignored: bool = False + ) -> dict[str, Any]: return { "directory": directory, "total_files": 1, diff --git a/tests/test_gitignore.py b/tests/test_gitignore.py new file mode 100644 index 0000000..ff4ed87 --- /dev/null +++ b/tests/test_gitignore.py @@ -0,0 +1,109 @@ +"""Tests for .gitignore-aware skip logic.""" + +from __future__ import annotations + +from pathlib import Path + +from librarian.sources.ignore import ( + ALWAYS_SKIP_DIRS, + GitignoreMatcher, + should_skip_file, +) + +SUPPORTED = {".md", ".py", ".txt"} + + +def _write(path: Path, content: str = "") -> Path: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + return path + + +class TestGitignoreMatcher: + def test_no_gitignore_means_nothing_is_ignored(self, tmp_path: Path) -> None: + f = _write(tmp_path / "a.md", "hi") + matcher = GitignoreMatcher(tmp_path) + assert not matcher.is_ignored(f) + + def test_root_gitignore_excludes_listed_files(self, tmp_path: Path) -> None: + _write(tmp_path / ".gitignore", "build/\n*.log\n") + log = _write(tmp_path / "app.log", "x") + built = _write(tmp_path / "build" / "out.md", "x") + kept = _write(tmp_path / "keep.md", "x") + + matcher = GitignoreMatcher(tmp_path) + assert matcher.is_ignored(log) + assert matcher.is_ignored(built) + assert not matcher.is_ignored(kept) + + def test_floating_pattern_matches_at_any_depth(self, tmp_path: Path) -> None: + _write(tmp_path / ".gitignore", "node_modules\n") + nested = _write(tmp_path / "pkg" / "node_modules" / "lib.js", "x") + matcher = GitignoreMatcher(tmp_path) + assert matcher.is_ignored(nested) + + def test_anchored_pattern_only_matches_at_root(self, tmp_path: Path) -> None: + _write(tmp_path / ".gitignore", "/foo.md\n") + at_root = _write(tmp_path / "foo.md", "x") + nested = _write(tmp_path / "sub" / "foo.md", "x") + matcher = GitignoreMatcher(tmp_path) + assert matcher.is_ignored(at_root) + assert not matcher.is_ignored(nested) + + def test_nested_gitignore_is_scoped_to_its_directory(self, tmp_path: Path) -> None: + _write(tmp_path / "sub" / ".gitignore", "secret.md\n") + nested_secret = _write(tmp_path / "sub" / "secret.md", "x") + elsewhere_secret = _write(tmp_path / "other" / "secret.md", "x") + + matcher = GitignoreMatcher(tmp_path) + assert matcher.is_ignored(nested_secret) + assert not matcher.is_ignored(elsewhere_secret) + + def test_negation_unignores_a_specific_file(self, tmp_path: Path) -> None: + _write(tmp_path / ".gitignore", "*.md\n!keep.md\n") + ignored = _write(tmp_path / "drop.md", "x") + kept = _write(tmp_path / "keep.md", "x") + + matcher = GitignoreMatcher(tmp_path) + assert matcher.is_ignored(ignored) + assert not matcher.is_ignored(kept) + + def test_paths_outside_root_are_not_ignored(self, tmp_path: Path) -> None: + root = tmp_path / "root" + root.mkdir() + _write(root / ".gitignore", "*.md\n") + outside = _write(tmp_path / "outside.md", "x") + matcher = GitignoreMatcher(root) + assert not matcher.is_ignored(outside) + + +class TestShouldSkipFile: + def test_always_skips_node_modules_even_without_gitignore(self, tmp_path: Path) -> None: + f = _write(tmp_path / "proj" / "node_modules" / "lib" / "a.md", "x") + assert should_skip_file(f, SUPPORTED, gitignore_matcher=None) + + def test_always_skip_dirs_contains_expected(self) -> None: + assert "node_modules" in ALWAYS_SKIP_DIRS + assert ".git" in ALWAYS_SKIP_DIRS + assert "__pycache__" in ALWAYS_SKIP_DIRS + + def test_unsupported_extension_skipped(self, tmp_path: Path) -> None: + f = _write(tmp_path / "image.png", "") + assert should_skip_file(f, SUPPORTED, gitignore_matcher=None) + + def test_supported_file_kept_without_matcher(self, tmp_path: Path) -> None: + f = _write(tmp_path / "doc.md", "hi") + assert not should_skip_file(f, SUPPORTED, gitignore_matcher=None) + + def test_gitignore_match_causes_skip(self, tmp_path: Path) -> None: + _write(tmp_path / ".gitignore", "drafts/\n") + f = _write(tmp_path / "drafts" / "wip.md", "x") + matcher = GitignoreMatcher(tmp_path) + assert should_skip_file(f, SUPPORTED, gitignore_matcher=matcher) + + def test_include_ignored_means_no_matcher_passed(self, tmp_path: Path) -> None: + """When --include-ignored is set, callers pass matcher=None and the + file survives the skip check (assuming it is otherwise valid).""" + _write(tmp_path / ".gitignore", "drafts/\n") + f = _write(tmp_path / "drafts" / "wip.md", "x") + assert not should_skip_file(f, SUPPORTED, gitignore_matcher=None) From be01b8365cd26ab87d249a233ad0aa7b277c308f Mon Sep 17 00:00:00 2001 From: Mateo Torres Date: Mon, 25 May 2026 14:11:06 -0300 Subject: [PATCH 2/2] feat(indexing): add --force-include and .librariantrack overrides Lets users index files that the gitignore aggregator or the skip-dirs baseline would otherwise exclude (e.g. a specific package under node_modules), per PR #29 review feedback. - New --force-include flag on `libr add` and `force_include` arg on `index_directory_to_library`. Persisted on the source entry and honored on rebuild. - New `.librariantrack` file format (gitignore-syntax) opts specific patterns back in from any directory under the source root. - Force-include bypasses the skip-dirs baseline and any .gitignore match but does not rescue unsupported/binary/hidden files (the indexer can't parse them anyway). - The skip-dirs baseline and binary-extension list moved to `librarian/config.py` as INDEX_SKIP_DIRS / INDEX_SKIP_EXTENSIONS, env-overridable instead of hardcoded. --- librarian/cli.py | 43 ++++++- librarian/config.py | 34 ++++++ librarian/server.py | 38 +++++- librarian/sources/ignore.py | 235 ++++++++++++++++++++---------------- tests/test_cli.py | 5 +- tests/test_gitignore.py | 104 ++++++++++++++++ 6 files changed, 351 insertions(+), 108 deletions(-) diff --git a/librarian/cli.py b/librarian/cli.py index 58f8ca2..b4fa133 100644 --- a/librarian/cli.py +++ b/librarian/cli.py @@ -37,7 +37,12 @@ from rich.panel import Panel from rich.table import Table -from librarian.sources.ignore import GitignoreMatcher, should_skip_file +from librarian.sources.ignore import ( + GitignoreMatcher, + LibrarianTrackMatcher, + normalize_force_include, + should_skip_file, +) # Initialize Typer app app = typer.Typer( @@ -222,9 +227,17 @@ def _should_skip_file( file_path: Path, supported_extensions: set[str], gitignore_matcher: "GitignoreMatcher | None" = None, + force_include: "frozenset[Path] | None" = None, + track_matcher: "LibrarianTrackMatcher | None" = None, ) -> bool: """Check if a file should be skipped during indexing.""" - return should_skip_file(file_path, supported_extensions, gitignore_matcher) + return should_skip_file( + file_path, + supported_extensions, + gitignore_matcher, + force_include=force_include, + track_matcher=track_matcher, + ) def _find_source(name_or_path: str) -> dict | None: @@ -428,6 +441,19 @@ def add_source( help="Index files even when matched by a .gitignore in the source tree", ), ] = False, + force_include: Annotated[ + Optional[list[str]], + typer.Option( + "--force-include", + help=( + "Path to always index, even when matched by .gitignore or by the " + "skip-dirs baseline (node_modules, __pycache__, .venv, etc.). " + "Pointing at a directory force-includes everything beneath it. " + "Can be repeated. A .librariantrack file inside the source has " + "the same effect for patterns it lists." + ), + ), + ] = None, ) -> None: """Add a file or directory as a source and index it recursively.""" cfg = _get_config() @@ -474,12 +500,20 @@ def add_source( files_to_index.extend(source_path.rglob(f"*{ext}")) gitignore_matcher = None if include_ignored else GitignoreMatcher(source_path) + track_matcher = LibrarianTrackMatcher(source_path) + forced_paths = normalize_force_include(force_include) # Filter out system/binary files and .gitignore matches files_to_index = [ f for f in files_to_index - if not _should_skip_file(f, supported_extensions, gitignore_matcher) + if not _should_skip_file( + f, + supported_extensions, + gitignore_matcher, + force_include=forced_paths, + track_matcher=track_matcher, + ) ] # Apply pattern filter @@ -525,6 +559,7 @@ def add_source( "pattern": pattern, "exclude": exclude, "include_ignored": include_ignored, + "force_include": list(force_include) if force_include else [], "added_at": datetime.now().isoformat(), } @@ -557,6 +592,7 @@ def add_source( context=None, # type: ignore[arg-type] directory=str(source_path), include_ignored=include_ignored, + force_include=list(force_include) if force_include else None, ) ) @@ -838,6 +874,7 @@ def index_build( context=None, # type: ignore[arg-type] directory=str(src_path), include_ignored=bool(src.get("include_ignored", False)), + force_include=list(src.get("force_include") or []) or None, ) ) total_indexed += result.get("indexed", 0) + result.get("updated", 0) diff --git a/librarian/config.py b/librarian/config.py index 9061021..d15532d 100644 --- a/librarian/config.py +++ b/librarian/config.py @@ -44,6 +44,40 @@ def safe_bool(value: str | None, default: bool) -> bool: DOCUMENTS_PATH = os.path.abspath(os.path.expanduser(os.getenv("DOCUMENTS_PATH", "./documents"))) +# ============================================================================= +# Indexing Skip Defaults +# ============================================================================= + +# Directories that are skipped during indexing unless explicitly overridden +# (via --force-include or a .librariantrack entry). Override the default set +# with INDEX_SKIP_DIRS as a comma-separated list. +_DEFAULT_INDEX_SKIP_DIRS = ( + "__pycache__,.git,.svn,.hg,node_modules,.venv,venv," + ".pytest_cache,.mypy_cache,.ruff_cache,__MACOSX,.DS_Store" +) +INDEX_SKIP_DIRS: frozenset[str] = frozenset( + d.strip() + for d in os.getenv("INDEX_SKIP_DIRS", _DEFAULT_INDEX_SKIP_DIRS).split(",") + if d.strip() +) + +# File extensions that are skipped during indexing (binary / archive / media). +# Override with INDEX_SKIP_EXTENSIONS as a comma-separated list (include the dot). +_DEFAULT_INDEX_SKIP_EXTENSIONS = ( + ".exe,.bin,.dll,.so,.dylib,.a,.o," + ".dmg,.iso,.img,.app,.pkg," + ".zip,.tar,.gz,.bz2,.xz,.7z,.rar," + ".pyc,.pyo,.pyd," + ".lock,.log,.tmp,.temp,.cache," + ".mp4,.mp3,.wav,.avi,.mov,.flac," + ".ttf,.otf,.woff,.woff2" +) +INDEX_SKIP_EXTENSIONS: frozenset[str] = frozenset( + e.strip().lower() + for e in os.getenv("INDEX_SKIP_EXTENSIONS", _DEFAULT_INDEX_SKIP_EXTENSIONS).split(",") + if e.strip() +) + DATABASE_PATH = os.path.abspath( os.path.expanduser(os.getenv("DATABASE_PATH", "~/.librarian/index.db")) ) diff --git a/librarian/server.py b/librarian/server.py index 5ee6a0b..cec3947 100644 --- a/librarian/server.py +++ b/librarian/server.py @@ -47,7 +47,12 @@ from librarian.processing.embed import get_embedder from librarian.processing.parsers.base import FileReadError, FileReadTimeoutError from librarian.retrieval.search import HybridSearcher -from librarian.sources.ignore import GitignoreMatcher, should_skip_file +from librarian.sources.ignore import ( + GitignoreMatcher, + LibrarianTrackMatcher, + normalize_force_include, + should_skip_file, +) from librarian.storage.database import get_database from librarian.tool_outputs import ( AddOutput, @@ -101,9 +106,17 @@ def _should_skip_file( file_path: Path, supported_extensions: set[str], gitignore_matcher: GitignoreMatcher | None = None, + force_include: frozenset[Path] | None = None, + track_matcher: LibrarianTrackMatcher | None = None, ) -> bool: """Check if a file should be skipped during indexing.""" - return should_skip_file(file_path, supported_extensions, gitignore_matcher) + return should_skip_file( + file_path, + supported_extensions, + gitignore_matcher, + force_include=force_include, + track_matcher=track_matcher, + ) def _resolve_path(raw_path: str, kind: str = "path") -> Path: @@ -162,6 +175,15 @@ async def index_directory_to_library( bool, "If True, index files even when matched by a .gitignore under the directory.", ] = False, + force_include: Annotated[ + list[str] | None, + ( + "Files or directories to always index, even when matched by a .gitignore " + "or by the skip-dirs baseline (node_modules, __pycache__, etc.). " + "Pointing at a directory force-includes everything underneath. " + "Has no effect on unsupported or binary file types." + ), + ] = None, ) -> Annotated[ IndexDirectoryOutput, "Per-directory index summary with counts and a per-file status list.", @@ -205,6 +227,8 @@ async def index_directory_to_library( supported_extensions = registry.get_supported_extensions() gitignore_matcher = None if include_ignored else GitignoreMatcher(dir_path) + track_matcher = LibrarianTrackMatcher(dir_path) + forced_paths = normalize_force_include(force_include) all_files: list[Path] = [] for ext in supported_extensions: @@ -212,7 +236,15 @@ async def index_directory_to_library( all_files.extend(dir_path.glob(pattern)) all_files = [ - f for f in all_files if not _should_skip_file(f, supported_extensions, gitignore_matcher) + f + for f in all_files + if not _should_skip_file( + f, + supported_extensions, + gitignore_matcher, + force_include=forced_paths, + track_matcher=track_matcher, + ) ] if not all_files: diff --git a/librarian/sources/ignore.py b/librarian/sources/ignore.py index e0627bf..cd0967e 100644 --- a/librarian/sources/ignore.py +++ b/librarian/sources/ignore.py @@ -1,4 +1,14 @@ -"""File/directory skip logic for indexing, including .gitignore support.""" +"""File/directory skip logic for indexing. + +Three layers, in order of precedence (highest wins): +1. Force-include set (--force-include) and `.librariantrack` patterns — make a + file survive every skip rule below, including the skip-dirs baseline and any + `.gitignore` match. +2. `.gitignore` aggregation under the source root (`GitignoreMatcher`). +3. Hardcoded but config-overridable defaults: directory names from + `INDEX_SKIP_DIRS`, binary/archive extensions from `INDEX_SKIP_EXTENSIONS`, + hidden files, and unsupported extensions. +""" from __future__ import annotations @@ -6,72 +16,17 @@ from pathspec import GitIgnoreSpec -# Directories that are always skipped, even when --include-ignored is set. -# These are caches, VCS metadata, and OS junk that should never enter the library. -ALWAYS_SKIP_DIRS: frozenset[str] = frozenset({ - "__pycache__", - ".git", - ".svn", - ".hg", - "node_modules", - ".venv", - "venv", - ".pytest_cache", - ".mypy_cache", - ".ruff_cache", - "__MACOSX", - ".DS_Store", -}) - -# Binary / system file extensions we never index. -SKIP_EXTENSIONS: frozenset[str] = frozenset({ - # Executables and binaries - ".exe", - ".bin", - ".dll", - ".so", - ".dylib", - ".a", - ".o", - # Disk images and archives - ".dmg", - ".iso", - ".img", - ".app", - ".pkg", - # Compressed archives - ".zip", - ".tar", - ".gz", - ".bz2", - ".xz", - ".7z", - ".rar", - # Python compiled - ".pyc", - ".pyo", - ".pyd", - # System files - ".lock", - ".log", - ".tmp", - ".temp", - ".cache", - # Media files (large binaries) - # TODO: revisit once audio/video parsers land — these will need to move - # out of the skip list and into the parser registry as new asset types. - ".mp4", - ".mp3", - ".wav", - ".avi", - ".mov", - ".flac", - # Font files - ".ttf", - ".otf", - ".woff", - ".woff2", -}) +from librarian.config import INDEX_SKIP_DIRS, INDEX_SKIP_EXTENSIONS + +# Re-exported under the historical names so callers and tests that imported +# them from this module keep working. +ALWAYS_SKIP_DIRS = INDEX_SKIP_DIRS +SKIP_EXTENSIONS = INDEX_SKIP_EXTENSIONS + +# Filename used for per-directory force-include patterns. Patterns inside a +# `.librariantrack` file behave like inverse `.gitignore` patterns: anything +# matched is indexed even if it would otherwise be skipped. +LIBRARIANTRACK_FILENAME = ".librariantrack" class GitignoreMatcher: @@ -90,34 +45,43 @@ def __init__(self, root: Path) -> None: def _build_spec(self) -> GitIgnoreSpec | None: if not self.root.is_dir(): return None + lines = _collect_anchored_patterns(self.root, ".gitignore") + if not lines: + return None + return GitIgnoreSpec.from_lines(lines) - lines: list[str] = [] - # Sort so outer .gitignore files come before inner ones; later lines - # win in GitIgnoreSpec, which matches git's "deeper file overrides". - gitignores = sorted( - self.root.rglob(".gitignore"), - key=lambda p: len(p.parts), - ) - for gitignore in gitignores: - try: - rel_dir = gitignore.parent.resolve().relative_to(self.root) - except ValueError: - continue - prefix = "" if rel_dir == Path(".") else f"{rel_dir.as_posix()}/" - try: - content = gitignore.read_text(encoding="utf-8", errors="ignore") - except OSError: - continue - for raw in content.splitlines(): - pattern = _prefix_pattern(raw, prefix) - if pattern is not None: - lines.append(pattern) + def is_ignored(self, file_path: Path) -> bool: + if self._spec is None: + return False + try: + rel = file_path.resolve().relative_to(self.root) + except ValueError: + return False + return self._spec.match_file(rel.as_posix()) + +class LibrarianTrackMatcher: + """Tells whether a file is force-included by a `.librariantrack` file. + + `.librariantrack` patterns work like `.gitignore` patterns but inverted: + a match means "track this file no matter what other skip rules say". + Patterns are anchored to the containing directory the same way gitignore + patterns are. + """ + + def __init__(self, root: Path) -> None: + self.root = root.resolve() + self._spec: GitIgnoreSpec | None = self._build_spec() + + def _build_spec(self) -> GitIgnoreSpec | None: + if not self.root.is_dir(): + return None + lines = _collect_anchored_patterns(self.root, LIBRARIANTRACK_FILENAME) if not lines: return None return GitIgnoreSpec.from_lines(lines) - def is_ignored(self, file_path: Path) -> bool: + def is_tracked(self, file_path: Path) -> bool: if self._spec is None: return False try: @@ -127,12 +91,37 @@ def is_ignored(self, file_path: Path) -> bool: return self._spec.match_file(rel.as_posix()) +def _collect_anchored_patterns(root: Path, filename: str) -> list[str]: + """Aggregate patterns from every file named `filename` under `root`. + + Outer files come before inner ones so later (deeper) lines win, matching + git's "deeper file overrides" semantics. + """ + lines: list[str] = [] + files = sorted(root.rglob(filename), key=lambda p: len(p.parts)) + for f in files: + try: + rel_dir = f.parent.resolve().relative_to(root) + except ValueError: + continue + prefix = "" if rel_dir == Path(".") else f"{rel_dir.as_posix()}/" + try: + content = f.read_text(encoding="utf-8", errors="ignore") + except OSError: + continue + for raw in content.splitlines(): + pattern = _prefix_pattern(raw, prefix) + if pattern is not None: + lines.append(pattern) + return lines + + def _prefix_pattern(raw: str, prefix: str) -> str | None: - """Translate a single .gitignore line to be anchored under `prefix`. + """Translate a single gitignore-style line to be anchored under `prefix`. - Returns None for blank lines and comments. `prefix` is the gitignore's + Returns None for blank lines and comments. `prefix` is the file's directory relative to the matcher root, with a trailing slash (or empty - string when the gitignore lives at the root). + string when the file lives at the root). """ line = raw.rstrip() if not line or line.startswith("#"): @@ -145,7 +134,7 @@ def _prefix_pattern(raw: str, prefix: str) -> str | None: if not prefix: return ("!" + line) if negate else line - # Determine whether the pattern is anchored to its gitignore's directory. + # Determine whether the pattern is anchored to its directory. # Per git: a leading '/' or any '/' before the end of the pattern anchors # it; otherwise the pattern matches at any depth under that directory. stripped = line.rstrip("/") @@ -158,26 +147,67 @@ def _prefix_pattern(raw: str, prefix: str) -> str | None: return ("!" + new) if negate else new +def _is_force_included( + file_path: Path, + force_include: frozenset[Path] | None, + track_matcher: LibrarianTrackMatcher | None, +) -> bool: + """A force-include match overrides every skip rule except unparseable types.""" + if force_include: + resolved = file_path.resolve() + for forced in force_include: + try: + resolved.relative_to(forced) + except ValueError: + continue + return True + return track_matcher is not None and track_matcher.is_tracked(file_path) + + +def normalize_force_include(paths: list[str] | None) -> frozenset[Path]: + """Resolve a list of user-supplied force-include paths. + + Non-existent paths are dropped silently — they cannot match anything, and + surfacing them as errors would force callers to validate before us. + """ + if not paths: + return frozenset() + out: set[Path] = set() + for raw in paths: + try: + p = Path(raw).expanduser().resolve() + except OSError: + continue + if p.exists(): + out.add(p) + return frozenset(out) + + def should_skip_file( file_path: Path, supported_extensions: set[str], gitignore_matcher: GitignoreMatcher | None = None, + force_include: frozenset[Path] | None = None, + track_matcher: LibrarianTrackMatcher | None = None, ) -> bool: """Decide whether a file should be skipped during indexing. - Hardcoded baseline (always applied): cache/VCS directories, binary - extensions, hidden files, and files lacking a supported extension. - When `gitignore_matcher` is provided, files it marks as ignored are - also skipped. + Force-include (via `force_include` paths or a `.librariantrack` match) + bypasses the skip-dirs baseline and any `.gitignore` rule. It does not + bypass unparseable file types (unsupported or binary extensions, hidden + files, files without an extension) because the indexer can't process them. """ - for parent in file_path.parents: - if parent.name in ALWAYS_SKIP_DIRS: - return True + forced = _is_force_included(file_path, force_include, track_matcher) + + if not forced: + for parent in file_path.parents: + if parent.name in INDEX_SKIP_DIRS: + return True if file_path.name.startswith("."): return True - if file_path.suffix.lower() in SKIP_EXTENSIONS: + if file_path.suffix.lower() in INDEX_SKIP_EXTENSIONS: return True if not file_path.suffix: @@ -186,4 +216,7 @@ def should_skip_file( if file_path.suffix.lower() not in supported_extensions: return True + if forced: + return False + return gitignore_matcher is not None and gitignore_matcher.is_ignored(file_path) diff --git a/tests/test_cli.py b/tests/test_cli.py index 27a536c..157ccb7 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -54,7 +54,10 @@ def test_add_directory_exits_nonzero_when_indexing_errors( monkeypatch.setattr(cli, "console", Console(width=500, color_system=None)) async def fake_server_ingest( - context: Any, directory: str, include_ignored: bool = False + context: Any, + directory: str, + include_ignored: bool = False, + force_include: list[str] | None = None, ) -> dict[str, Any]: return { "directory": directory, diff --git a/tests/test_gitignore.py b/tests/test_gitignore.py index ff4ed87..704805b 100644 --- a/tests/test_gitignore.py +++ b/tests/test_gitignore.py @@ -7,6 +7,8 @@ from librarian.sources.ignore import ( ALWAYS_SKIP_DIRS, GitignoreMatcher, + LibrarianTrackMatcher, + normalize_force_include, should_skip_file, ) @@ -107,3 +109,105 @@ def test_include_ignored_means_no_matcher_passed(self, tmp_path: Path) -> None: _write(tmp_path / ".gitignore", "drafts/\n") f = _write(tmp_path / "drafts" / "wip.md", "x") assert not should_skip_file(f, SUPPORTED, gitignore_matcher=None) + + +class TestForceInclude: + def test_force_include_directory_overrides_gitignore(self, tmp_path: Path) -> None: + _write(tmp_path / ".gitignore", "drafts/\n") + f = _write(tmp_path / "drafts" / "wip.md", "x") + matcher = GitignoreMatcher(tmp_path) + forced = normalize_force_include([str(tmp_path / "drafts")]) + + assert should_skip_file(f, SUPPORTED, gitignore_matcher=matcher) + assert not should_skip_file(f, SUPPORTED, gitignore_matcher=matcher, force_include=forced) + + def test_force_include_file_overrides_gitignore(self, tmp_path: Path) -> None: + _write(tmp_path / ".gitignore", "*.md\n") + f = _write(tmp_path / "keep.md", "x") + matcher = GitignoreMatcher(tmp_path) + forced = normalize_force_include([str(f)]) + + assert should_skip_file(f, SUPPORTED, gitignore_matcher=matcher) + assert not should_skip_file(f, SUPPORTED, gitignore_matcher=matcher, force_include=forced) + + def test_force_include_overrides_always_skip_dirs(self, tmp_path: Path) -> None: + f = _write(tmp_path / "proj" / "node_modules" / "pkg" / "lib.py", "x") + forced = normalize_force_include([str(tmp_path / "proj" / "node_modules" / "pkg")]) + + assert should_skip_file(f, SUPPORTED, gitignore_matcher=None) + assert not should_skip_file(f, SUPPORTED, gitignore_matcher=None, force_include=forced) + + def test_force_include_does_not_rescue_unsupported_extension(self, tmp_path: Path) -> None: + """Force-include bypasses skip rules, but unparseable file types still + cannot enter the index — the parser registry has no parser for them.""" + f = _write(tmp_path / "binary.exe", "x") + forced = normalize_force_include([str(tmp_path)]) + assert should_skip_file(f, SUPPORTED, gitignore_matcher=None, force_include=forced) + + def test_force_include_does_not_rescue_hidden_files(self, tmp_path: Path) -> None: + f = _write(tmp_path / ".hidden.md", "x") + forced = normalize_force_include([str(tmp_path)]) + assert should_skip_file(f, SUPPORTED, gitignore_matcher=None, force_include=forced) + + def test_normalize_drops_nonexistent_paths(self, tmp_path: Path) -> None: + present = _write(tmp_path / "exists.md", "x") + forced = normalize_force_include([ + str(present), + str(tmp_path / "does_not_exist"), + ]) + assert present.resolve() in forced + assert len(forced) == 1 + + def test_normalize_handles_none_and_empty(self) -> None: + assert normalize_force_include(None) == frozenset() + assert normalize_force_include([]) == frozenset() + + +class TestLibrarianTrackMatcher: + def test_no_trackfile_means_nothing_is_tracked(self, tmp_path: Path) -> None: + f = _write(tmp_path / "a.md", "x") + matcher = LibrarianTrackMatcher(tmp_path) + assert not matcher.is_tracked(f) + + def test_root_trackfile_unignores_listed_files(self, tmp_path: Path) -> None: + _write(tmp_path / ".librariantrack", "drafts/\n") + kept = _write(tmp_path / "drafts" / "wip.md", "x") + other = _write(tmp_path / "other.md", "x") + matcher = LibrarianTrackMatcher(tmp_path) + assert matcher.is_tracked(kept) + assert not matcher.is_tracked(other) + + def test_track_overrides_gitignore_for_matched_files(self, tmp_path: Path) -> None: + _write(tmp_path / ".gitignore", "drafts/\n") + _write(tmp_path / ".librariantrack", "drafts/\n") + f = _write(tmp_path / "drafts" / "wip.md", "x") + + gitignore = GitignoreMatcher(tmp_path) + track = LibrarianTrackMatcher(tmp_path) + + assert should_skip_file(f, SUPPORTED, gitignore_matcher=gitignore) + assert not should_skip_file(f, SUPPORTED, gitignore_matcher=gitignore, track_matcher=track) + + def test_track_overrides_always_skip_dirs(self, tmp_path: Path) -> None: + _write(tmp_path / ".librariantrack", "node_modules/pkg/\n") + f = _write(tmp_path / "node_modules" / "pkg" / "lib.py", "x") + track = LibrarianTrackMatcher(tmp_path) + + assert should_skip_file(f, SUPPORTED, gitignore_matcher=None) + assert not should_skip_file(f, SUPPORTED, gitignore_matcher=None, track_matcher=track) + + def test_nested_trackfile_is_scoped_to_its_directory(self, tmp_path: Path) -> None: + _write(tmp_path / ".gitignore", "*.md\n") + _write(tmp_path / "kept" / ".librariantrack", "*.md\n") + kept = _write(tmp_path / "kept" / "doc.md", "x") + elsewhere = _write(tmp_path / "other" / "doc.md", "x") + + gitignore = GitignoreMatcher(tmp_path) + track = LibrarianTrackMatcher(tmp_path) + + assert not should_skip_file( + kept, SUPPORTED, gitignore_matcher=gitignore, track_matcher=track + ) + assert should_skip_file( + elsewhere, SUPPORTED, gitignore_matcher=gitignore, track_matcher=track + )