From d8520e1f51f715858cebc41540131dbc23d573d1 Mon Sep 17 00:00:00 2001 From: shivasurya Date: Thu, 21 May 2026 21:38:41 -0400 Subject: [PATCH 1/2] fix(cli): diff-aware mode must not fall back to full scan on empty diff Two paired bugs caused a delete-only PR to dump every full-repo finding instead of the expected zero. Surfaced as 207 findings on PR #693, which only deletes one YAML workflow file. The wire: pathfinder ci computes changedFiles via git diff, then filters scan results to that set when diff-aware is on. Both halves were broken: 1. diff/git_provider.go ran `git diff --diff-filter=ACMR`, excluding deletions. A delete-only PR therefore produced changedFiles=[]. 2. cmd/ci.go guarded the filter with `diffEnabled && len(changedFiles) > 0`. When the list was empty, the filter was skipped entirely and ALL full-scan findings were returned. The same guard was on the filesScanned count, so the report also claimed "scanned full repo." Fix: - ACMR -> ACMRD so deletions show up in changedFiles. A delete-only PR now returns the deleted path, distinguishing it from an empty PR. - Drop the `len(changedFiles) > 0` guard at both call sites. If diff-aware is on, honour it: an empty intersection returns zero findings, not "full repo." The two states (diff-aware on, nothing matched vs diff-aware off, scan everything) must stay separable. Updated the existing TestGitDiffProvider_DeletedFileExcluded test (now TestGitDiffProvider_DeletedFileIncluded) and added a deletion-only regression covering exactly the PR #693 shape. CLAUDE.md gains a "Diff-Aware Scanning" section documenting the new contract so future maintainers don't reintroduce the fallback. --- CLAUDE.md | 9 ++++++ sast-engine/cmd/ci.go | 16 +++++++--- sast-engine/diff/git_provider.go | 8 +++-- sast-engine/diff/git_provider_test.go | 42 +++++++++++++++++++++++---- 4 files changed, 63 insertions(+), 12 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index e5c0c176..559150ba 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -356,6 +356,15 @@ expr.Run(program, envMap) // Returns bool Methods are bound at runtime to actual node fields, enabling type-safe queries without reflection. +### Diff-Aware Scanning (`pathfinder ci --base`) +CI mode filters findings to files touched by the PR. The wire is: + +1. `diff.GetChangedFiles()` runs `git diff --name-only --diff-filter=ACMRD ..HEAD`. +2. The flag set is **ACMRD** (Added, Copied, Modified, Renamed, **Deleted**). Deletions matter because the downstream filter at `cmd/ci.go` uses the list as a sentinel — an empty list means "no source in the diff," which a deletion-only PR would otherwise look identical to. +3. `cmd/ci.go` applies `output.NewDiffFilter(changedFiles)` whenever `diffEnabled`, **without** a `len(changedFiles) > 0` guard. An empty diff intersection returns zero findings; it does NOT fall back to a full-repo scan. Falling back was the May 2026 regression that surfaced as 207 findings on a PR that only deleted a YAML workflow file. + +If you ever need to bring back the full-scan fallback (e.g., for `--no-diff`), do it by turning off `diffEnabled` rather than by smuggling logic into the filter guard. The two states must stay separable: "diff-aware on, nothing matched" vs "diff-aware off, scan everything." + ### SARIF Report Generation CI mode generates SARIF reports for GitHub Advanced Security: ```go diff --git a/sast-engine/cmd/ci.go b/sast-engine/cmd/ci.go index 98c62f4b..52f99ce5 100644 --- a/sast-engine/cmd/ci.go +++ b/sast-engine/cmd/ci.go @@ -366,8 +366,14 @@ Examples: // Merge container detections with code analysis detections. allEnriched = append(allEnriched, containerDetections...) - // Apply diff filter when diff-aware mode is active. - if diffEnabled && len(changedFiles) > 0 { + // Apply diff filter when diff-aware mode is active. We deliberately do + // NOT guard on len(changedFiles) > 0: an empty list means the diff + // genuinely covers no source files (e.g. a deletion-only PR, a docs-only + // PR, or an empty PR), and the right answer in that case is 0 findings, + // not "fall back to a full repo scan." Falling back was the May 2026 + // regression that surfaced as 207 findings on a PR that only deleted a + // YAML workflow file. + if diffEnabled { totalBefore := len(allEnriched) diffFilter := output.NewDiffFilter(changedFiles) allEnriched = diffFilter.Filter(allEnriched) @@ -377,9 +383,11 @@ Examples: // Total rules = code analysis rules loaded + container rules loaded. totalRules := len(rules) + containerRulesCount - // Count unique source files. When diff-aware, only count changed files. + // Count unique source files. When diff-aware, only count changed files + // (including 0 when the diff covers no source: see the diff-filter + // comment above for why we don't fall back to a full scan here either). var filesScanned int - if diffEnabled && len(changedFiles) > 0 { + if diffEnabled { filesScanned = len(changedFiles) } else { uniqueFiles := make(map[string]bool) diff --git a/sast-engine/diff/git_provider.go b/sast-engine/diff/git_provider.go index 1356d937..3d695075 100644 --- a/sast-engine/diff/git_provider.go +++ b/sast-engine/diff/git_provider.go @@ -59,13 +59,17 @@ func (p *GitDiffProvider) findMergeBase() (string, error) { } // diffFiles runs git diff --name-only to list changed files from merge-base to head. -// Uses --diff-filter=ACMR to include Added, Copied, Modified, and Renamed files only. +// Uses --diff-filter=ACMRD to include Added, Copied, Modified, Renamed, AND Deleted +// files. Deletions matter because callers use this list as a sentinel: an empty list +// signals "no files in the diff," and dropping deletions would make a delete-only PR +// look identical to an empty PR, which historically caused the diff filter at +// cmd/ci.go to fall back to a full-repo scan. func (p *GitDiffProvider) diffFiles(mergeBase string) ([]string, error) { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() diffRange := mergeBase + ".." + p.HeadRef - cmd := exec.CommandContext(ctx, "git", "diff", "--name-only", "--diff-filter=ACMR", diffRange) + cmd := exec.CommandContext(ctx, "git", "diff", "--name-only", "--diff-filter=ACMRD", diffRange) cmd.Dir = p.ProjectRoot output, err := cmd.Output() diff --git a/sast-engine/diff/git_provider_test.go b/sast-engine/diff/git_provider_test.go index 08bd767f..43ba5b77 100644 --- a/sast-engine/diff/git_provider_test.go +++ b/sast-engine/diff/git_provider_test.go @@ -109,18 +109,20 @@ func TestGitDiffProvider_BranchWithMergeBase(t *testing.T) { assert.Equal(t, []string{"feature.py"}, files) } -func TestGitDiffProvider_DeletedFileExcluded(t *testing.T) { - // Tests that deleted files are excluded (--diff-filter=ACMR does not include D). +func TestGitDiffProvider_DeletedFileIncluded(t *testing.T) { + // Deleted files MUST appear in GetChangedFiles. They drive a separate + // downstream signal: cmd/ci.go treats an empty list as "no source in the + // diff" and refuses to fall back to a full scan, so dropping deletions + // would make a delete-only PR look identical to an empty PR. See the + // comment on diffFiles for the full rationale (--diff-filter=ACMRD). dir := setupTestRepo(t) - // Add a file on main. writeFile(t, filepath.Join(dir, "to_delete.py"), "# will be deleted") runGit(t, dir, "add", ".") runGit(t, dir, "commit", "-m", "add file to delete") runGit(t, dir, "checkout", "-b", "feature") - // Delete the file and add a new one. require.NoError(t, os.Remove(filepath.Join(dir, "to_delete.py"))) writeFile(t, filepath.Join(dir, "new_file.py"), "# new") runGit(t, dir, "add", ".") @@ -134,8 +136,36 @@ func TestGitDiffProvider_DeletedFileExcluded(t *testing.T) { files, err := provider.GetChangedFiles() require.NoError(t, err) - // to_delete.py should NOT appear (deleted). Only new_file.py. - assert.Equal(t, []string{"new_file.py"}, files) + assert.ElementsMatch(t, []string{"to_delete.py", "new_file.py"}, files) +} + +func TestGitDiffProvider_DeletionOnlyPR(t *testing.T) { + // Regression for the May 2026 207-findings incident: a PR that only + // deletes a file (no additions or modifications) must surface that + // deletion in GetChangedFiles. Returning an empty list here looks + // identical to an empty PR and historically caused cmd/ci.go to skip + // the diff filter entirely, dumping every full-scan finding. + dir := setupTestRepo(t) + + writeFile(t, filepath.Join(dir, ".github", "workflows", "doomed.yml"), "name: doomed\n") + runGit(t, dir, "add", ".") + runGit(t, dir, "commit", "-m", "add workflow we'll later delete") + + runGit(t, dir, "checkout", "-b", "feature") + require.NoError(t, os.Remove(filepath.Join(dir, ".github", "workflows", "doomed.yml"))) + runGit(t, dir, "add", ".") + runGit(t, dir, "commit", "-m", "delete the workflow") + + provider := &GitDiffProvider{ + ProjectRoot: dir, + BaseRef: "main", + HeadRef: "HEAD", + } + + files, err := provider.GetChangedFiles() + require.NoError(t, err) + assert.Equal(t, []string{".github/workflows/doomed.yml"}, files, + "deletion-only diff must surface the deleted path so callers don't mistake it for an empty PR") } func TestGitDiffProvider_EmptyDiff(t *testing.T) { From 810f8d06aeea23fc1be01aaea1346a633ba5437f Mon Sep 17 00:00:00 2001 From: shivasurya Date: Thu, 21 May 2026 22:02:46 -0400 Subject: [PATCH 2/2] refactor(cli): extract applyDiffFilter helper, fix the same bug in output.DiffFilter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codecov caught that the two diff-filter call-site changes in PR #712 weren't directly unit-tested (33% patch coverage). Pulling the gate logic into a named helper makes both branches testable as pure functions and exposes a deeper instance of the same bug. Changes: - cmd/diff_filter.go: new applyDiffFilter + countScannedFiles helpers. applyDiffFilter is the single source of truth for "if diff-aware is on, run the filter; an empty changedFiles list is a valid input that yields zero detections, NOT a pass-through." - cmd/ci.go, cmd/scan.go: both call sites switched to applyDiffFilter. cmd/scan.go previously had the same `diffAware && len(changedFiles) > 0` guard as cmd/ci.go — same regression risk, fixed here too. - output/filter.go: removed the `if len(f.changedFiles) == 0 { return detections }` short-circuit in both Filter and FilteredCount. That was the deeper instance of the bug: even after fixing the cmd-layer guard, output.NewDiffFilter([]).Filter(d) would still pass d through, defeating the cmd-layer fix for genuinely empty diffs. Callers that want "no filtering" must skip Filter entirely (which applyDiffFilter now enforces via the diffEnabled gate). - output/filter_test.go: flipped two test cases that hard-coded the buggy pass-through behaviour to assert the new contract. - cmd/diff_filter_test.go: 10 tests covering every branch of both helpers (100% function coverage). go vet, golangci-lint: 0 issues. --- sast-engine/cmd/ci.go | 39 +++++------- sast-engine/cmd/diff_filter.go | 41 +++++++++++++ sast-engine/cmd/diff_filter_test.go | 93 +++++++++++++++++++++++++++++ sast-engine/cmd/scan.go | 13 ++-- sast-engine/output/filter.go | 15 ++--- sast-engine/output/filter_test.go | 16 +++-- 6 files changed, 176 insertions(+), 41 deletions(-) create mode 100644 sast-engine/cmd/diff_filter.go create mode 100644 sast-engine/cmd/diff_filter_test.go diff --git a/sast-engine/cmd/ci.go b/sast-engine/cmd/ci.go index 52f99ce5..772449e2 100644 --- a/sast-engine/cmd/ci.go +++ b/sast-engine/cmd/ci.go @@ -366,38 +366,29 @@ Examples: // Merge container detections with code analysis detections. allEnriched = append(allEnriched, containerDetections...) - // Apply diff filter when diff-aware mode is active. We deliberately do - // NOT guard on len(changedFiles) > 0: an empty list means the diff - // genuinely covers no source files (e.g. a deletion-only PR, a docs-only - // PR, or an empty PR), and the right answer in that case is 0 findings, - // not "fall back to a full repo scan." Falling back was the May 2026 - // regression that surfaced as 207 findings on a PR that only deleted a - // YAML workflow file. - if diffEnabled { - totalBefore := len(allEnriched) - diffFilter := output.NewDiffFilter(changedFiles) - allEnriched = diffFilter.Filter(allEnriched) + // Apply diff filter when diff-aware mode is active. See + // applyDiffFilter for the explicit "do not fall back to full scan + // on empty diff" contract this commit enforces. + totalBefore := len(allEnriched) + var filterApplied bool + allEnriched, filterApplied = applyDiffFilter(allEnriched, changedFiles, diffEnabled) + if filterApplied { logger.Progress("Diff filter: %d/%d findings in changed files", len(allEnriched), totalBefore) } // Total rules = code analysis rules loaded + container rules loaded. totalRules := len(rules) + containerRulesCount - // Count unique source files. When diff-aware, only count changed files - // (including 0 when the diff covers no source: see the diff-filter - // comment above for why we don't fall back to a full scan here either). - var filesScanned int - if diffEnabled { - filesScanned = len(changedFiles) - } else { - uniqueFiles := make(map[string]bool) - for _, node := range codeGraph.Nodes { - if node.File != "" { - uniqueFiles[node.File] = true - } + // Count unique source files for the report. countScannedFiles picks + // len(changedFiles) when diff-aware (including 0, by design), else + // the unique-file count derived from the code graph. + uniqueFiles := make(map[string]bool) + for _, node := range codeGraph.Nodes { + if node.File != "" { + uniqueFiles[node.File] = true } - filesScanned = len(uniqueFiles) } + filesScanned := countScannedFiles(diffEnabled, len(changedFiles), len(uniqueFiles)) logger.Statistic("Scan complete. Found %d vulnerabilities", len(allEnriched)) logger.Progress("Generating %s output...", outputFormat) diff --git a/sast-engine/cmd/diff_filter.go b/sast-engine/cmd/diff_filter.go new file mode 100644 index 00000000..eb81dfc0 --- /dev/null +++ b/sast-engine/cmd/diff_filter.go @@ -0,0 +1,41 @@ +package cmd + +import ( + "github.com/shivasurya/code-pathfinder/sast-engine/dsl" + "github.com/shivasurya/code-pathfinder/sast-engine/output" +) + +// applyDiffFilter intersects detections with the set of files changed in +// the diff when diff-aware mode is enabled. When diffEnabled is false it +// returns the input unchanged so callers can branch on the bool to decide +// whether to log "Diff filter: X/Y..." +// +// Critically, applyDiffFilter does NOT short-circuit when changedFiles is +// empty: an empty list means the diff genuinely covers no source files +// (delete-only PR, docs-only PR, empty PR), and the right answer is zero +// detections, not "fall back to a full scan." Falling back was the May +// 2026 regression that surfaced as 207 findings on a PR that only +// deleted a YAML workflow file (the public post-mortem in CLAUDE.md). +func applyDiffFilter( + detections []*dsl.EnrichedDetection, + changedFiles []string, + diffEnabled bool, +) (filtered []*dsl.EnrichedDetection, applied bool) { + if !diffEnabled { + return detections, false + } + return output.NewDiffFilter(changedFiles).Filter(detections), true +} + +// countScannedFiles picks the file-count number to report in scan output +// based on whether diff-aware mode was active. When diff-aware, the +// reported count is the number of files in the diff (which may be 0 by +// design; see applyDiffFilter for why we no longer fall back). When not +// diff-aware, the count is the number of unique files the file walk +// observed and the scanner actually parsed. +func countScannedFiles(diffEnabled bool, changedFilesCount, fallbackUniqueFilesCount int) int { + if diffEnabled { + return changedFilesCount + } + return fallbackUniqueFilesCount +} diff --git a/sast-engine/cmd/diff_filter_test.go b/sast-engine/cmd/diff_filter_test.go new file mode 100644 index 00000000..5d5235ce --- /dev/null +++ b/sast-engine/cmd/diff_filter_test.go @@ -0,0 +1,93 @@ +package cmd + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/shivasurya/code-pathfinder/sast-engine/dsl" +) + +// detection builds a minimal EnrichedDetection whose RelPath matches the +// given path. Sufficient for DiffFilter, which keys off RelPath / FilePath. +func detection(rel string) *dsl.EnrichedDetection { + return &dsl.EnrichedDetection{ + Location: dsl.LocationInfo{RelPath: rel, FilePath: rel}, + } +} + +// --- applyDiffFilter --- + +func TestApplyDiffFilter_DisabledReturnsInputUnchanged(t *testing.T) { + in := []*dsl.EnrichedDetection{detection("a.py"), detection("b.py")} + out, applied := applyDiffFilter(in, nil, false) + assert.False(t, applied, "applied flag must be false when diff is disabled") + assert.Equal(t, in, out, "input must be returned unchanged when diff is disabled") +} + +func TestApplyDiffFilter_DisabledIgnoresChangedFiles(t *testing.T) { + // changedFiles is non-nil but diffEnabled is false: the filter still must + // not run. Guards against accidental "always apply if list provided" + // regressions. + in := []*dsl.EnrichedDetection{detection("a.py")} + out, applied := applyDiffFilter(in, []string{"unrelated.py"}, false) + assert.False(t, applied) + assert.Equal(t, in, out) +} + +func TestApplyDiffFilter_EnabledWithMatchingFile(t *testing.T) { + in := []*dsl.EnrichedDetection{detection("a.py"), detection("b.py")} + out, applied := applyDiffFilter(in, []string{"a.py"}, true) + assert.True(t, applied, "applied flag must be true when diff is enabled") + assert.Len(t, out, 1, "only a.py finding should survive") + assert.Equal(t, "a.py", out[0].Location.RelPath) +} + +func TestApplyDiffFilter_EnabledWithNoMatches(t *testing.T) { + in := []*dsl.EnrichedDetection{detection("a.py"), detection("b.py")} + out, applied := applyDiffFilter(in, []string{"unrelated.py"}, true) + assert.True(t, applied) + assert.Empty(t, out) +} + +func TestApplyDiffFilter_EnabledWithEmptyChangedFiles(t *testing.T) { + // The regression case. Empty changedFiles + diffEnabled=true must produce + // zero detections, NOT a full pass-through. Falling back to a full scan + // here was the May 2026 207-findings bug. + in := []*dsl.EnrichedDetection{detection("a.py"), detection("b.py")} + out, applied := applyDiffFilter(in, []string{}, true) + assert.True(t, applied, "applied flag must remain true even with empty changedFiles") + assert.Empty(t, out, "empty diff must filter ALL findings to zero, not pass them through") +} + +func TestApplyDiffFilter_EnabledWithNilChangedFiles(t *testing.T) { + // nil slice is treated the same as an empty slice: nothing to match + // against, so everything is filtered out. Same regression guard. + in := []*dsl.EnrichedDetection{detection("a.py")} + out, applied := applyDiffFilter(in, nil, true) + assert.True(t, applied) + assert.Empty(t, out) +} + +// --- countScannedFiles --- + +func TestCountScannedFiles_DiffEnabledReturnsChangedCount(t *testing.T) { + assert.Equal(t, 5, countScannedFiles(true, 5, 999), + "diff-aware count must be the changed-files count, regardless of fallback") +} + +func TestCountScannedFiles_DiffEnabledWithZeroChangedReturnsZero(t *testing.T) { + // Regression guard mirroring applyDiffFilter's empty-diff behaviour: an + // empty diff must report 0 files scanned, NOT silently use the fallback + // count (which would mask the empty-diff state in the JSON output). + assert.Equal(t, 0, countScannedFiles(true, 0, 250)) +} + +func TestCountScannedFiles_DiffDisabledReturnsFallback(t *testing.T) { + assert.Equal(t, 42, countScannedFiles(false, 7, 42), + "non-diff scans must report the unique-file count from the graph") +} + +func TestCountScannedFiles_DiffDisabledWithZeroFallback(t *testing.T) { + assert.Equal(t, 0, countScannedFiles(false, 7, 0)) +} diff --git a/sast-engine/cmd/scan.go b/sast-engine/cmd/scan.go index 7c4899ba..03cb84bf 100644 --- a/sast-engine/cmd/scan.go +++ b/sast-engine/cmd/scan.go @@ -356,11 +356,14 @@ Examples: // Merge container detections with code analysis detections allEnriched = append(allEnriched, containerDetections...) - // Apply diff filter when diff-aware mode is active. - if diffAware && len(changedFiles) > 0 { - totalBefore := len(allEnriched) - diffFilter := output.NewDiffFilter(changedFiles) - allEnriched = diffFilter.Filter(allEnriched) + // Apply diff filter when diff-aware mode is active. Shared with + // cmd/ci.go via applyDiffFilter to keep the empty-diff contract + // (return zero findings, do NOT fall back to a full scan) honoured + // in both commands. See cmd/diff_filter.go for the rationale. + totalBefore := len(allEnriched) + var filterApplied bool + allEnriched, filterApplied = applyDiffFilter(allEnriched, changedFiles, diffAware) + if filterApplied { logger.Progress("Diff filter: %d/%d findings in changed files", len(allEnriched), totalBefore) } diff --git a/sast-engine/output/filter.go b/sast-engine/output/filter.go index a6dd2790..342ad149 100644 --- a/sast-engine/output/filter.go +++ b/sast-engine/output/filter.go @@ -20,11 +20,13 @@ func NewDiffFilter(changedFiles []string) *DiffFilter { } // Filter returns only detections whose RelPath is in the changed files set. -// If no changed files were provided (empty set), all detections are returned. +// An empty set filters everything out: a DiffFilter constructed with no +// changed files represents "nothing in the diff to match," and the right +// answer is zero detections, not pass-through. Callers that want "no +// filtering" must not call Filter at all (see cmd.applyDiffFilter which +// gates on the diffEnabled flag for exactly this reason). Falling back to +// pass-through was the May 2026 207-findings regression. func (f *DiffFilter) Filter(detections []*dsl.EnrichedDetection) []*dsl.EnrichedDetection { - if len(f.changedFiles) == 0 { - return detections - } filtered := make([]*dsl.EnrichedDetection, 0, len(detections)) for _, det := range detections { if f.changedFiles[det.Location.RelPath] { @@ -35,10 +37,9 @@ func (f *DiffFilter) Filter(detections []*dsl.EnrichedDetection) []*dsl.Enriched } // FilteredCount returns the number of detections that would be removed. +// With an empty changed-files set, every detection would be removed, so +// the count equals len(detections). See Filter for the rationale. func (f *DiffFilter) FilteredCount(detections []*dsl.EnrichedDetection) int { - if len(f.changedFiles) == 0 { - return 0 - } count := 0 for _, det := range detections { if !f.changedFiles[det.Location.RelPath] { diff --git a/sast-engine/output/filter_test.go b/sast-engine/output/filter_test.go index f00fdbc2..286fd17b 100644 --- a/sast-engine/output/filter_test.go +++ b/sast-engine/output/filter_test.go @@ -74,14 +74,17 @@ func TestDiffFilter_Filter(t *testing.T) { wantRelPaths: []string{"app/views.py", "app/auth.py"}, }, { - name: "empty changed files returns all detections", + // An empty changed-files set means "nothing in the diff to match"; + // the right answer is zero detections, not pass-through. Callers + // that want "no filtering" must skip Filter entirely. + name: "empty changed files filters everything out", changedFiles: []string{}, detections: []*dsl.EnrichedDetection{ makeDetection("app/views.py", "critical"), makeDetection("app/models.py", "high"), }, - wantCount: 2, - wantRelPaths: []string{"app/views.py", "app/models.py"}, + wantCount: 0, + wantRelPaths: nil, }, { name: "nil detections", @@ -257,12 +260,15 @@ func TestDiffFilter_FilteredCount(t *testing.T) { wantFiltered: 2, }, { - name: "empty changed files means no filtering", + // With an empty changed-files set, every detection would be + // removed by Filter (see Filter's contract): FilteredCount must + // report that as well, not 0. + name: "empty changed files reports all as removed", changedFiles: []string{}, detections: []*dsl.EnrichedDetection{ makeDetection("app/views.py", "critical"), }, - wantFiltered: 0, + wantFiltered: 1, }, { name: "nil detections",