From a24c9e574c9ab3be2a16048546eef9b7760159fd Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 12:50:29 +0000 Subject: [PATCH 1/3] refactor: harden normalizeZoektRepoName against multi-segment repo paths Fixes two bugs that caused silent chunk-resolution failures for GitLab nested-group repos (e.g. gitlab.com/org/subgroup/repo): 1. job_processor.go used FullName() for the zoekt repo name, which only returns the first two path segments. For nested groups this truncated the path (e.g. "gitlab.com/org/sub" instead of "gitlab.com/org/sub/repo"). Fixed by adding FullPath() to RepositoryURL and using it instead. 2. zoektRepoNameCandidates only generated 2 candidates (original + strip-host). For a 4-segment zoekt name the third "sub/repo" variant (matching ExtractRepositoryNameFromURL's last-two-segments format) was never tried. Replaced the two-candidate logic with a progressive segment-stripping algorithm that stops before single-segment results. Closes #56 https://claude.ai/code/session_018DGnjmwnT5WZPB9MHPYSvH --- .../application/service/search_service.go | 41 ++++-- .../service/search_service_test.go | 121 ++++++++++++++++++ internal/application/worker/job_processor.go | 2 +- internal/domain/valueobject/repository_url.go | 8 ++ .../domain/valueobject/repository_url_test.go | 32 +++++ 5 files changed, 191 insertions(+), 13 deletions(-) diff --git a/internal/application/service/search_service.go b/internal/application/service/search_service.go index cf601482..267bf6c3 100644 --- a/internal/application/service/search_service.go +++ b/internal/application/service/search_service.go @@ -280,21 +280,38 @@ func normalizeZoektRepoName(name string) string { return name } -func stripZoektRepoHost(name string) string { - parts := strings.SplitN(name, "/", 3) - if len(parts) == 3 { - return parts[1] + "/" + parts[2] - } - return name -} - +// zoektRepoNameCandidates returns all plausible DB repository name variants for +// a zoekt repo name, ordered from most to least specific. It strips one leading +// path segment at a time, stopping before single-segment results to avoid +// false-positive matches. +// +// Examples: +// +// "github.com/owner/repo" → ["github.com/owner/repo", "owner/repo"] +// "gitlab.com/org/sub/repo" → ["gitlab.com/org/sub/repo", "org/sub/repo", "sub/repo"] +// "owner/repo" → ["owner/repo"] func zoektRepoNameCandidates(name string) []string { normalized := normalizeZoektRepoName(name) - stripped := stripZoektRepoHost(normalized) - if stripped == normalized { - return []string{normalized} + seen := map[string]bool{normalized: true} + candidates := []string{normalized} + current := normalized + for { + idx := strings.Index(current, "/") + if idx < 0 { + break + } + next := current[idx+1:] + // Stop before adding single-segment candidates — too ambiguous. + if !strings.Contains(next, "/") { + break + } + if !seen[next] { + candidates = append(candidates, next) + seen[next] = true + } + current = next } - return []string{normalized, stripped} + return candidates } func lineRangesOverlap(startA, endA, startB, endB int) bool { diff --git a/internal/application/service/search_service_test.go b/internal/application/service/search_service_test.go index d07338f3..2296f4ea 100644 --- a/internal/application/service/search_service_test.go +++ b/internal/application/service/search_service_test.go @@ -2061,3 +2061,124 @@ func TestSearchService_HybridChunkLevelZoektResolutionContract(t *testing.T) { mockZoekt.AssertExpectations(t) }) } + +func TestZoektRepoNameCandidates(t *testing.T) { + tests := []struct { + name string + input string + expected []string + }{ + { + name: "standard GitHub 3-segment", + input: "github.com/owner/repo", + expected: []string{"github.com/owner/repo", "owner/repo"}, + }, + { + name: "GitLab nested group 4-segment", + input: "gitlab.com/org/sub/repo", + expected: []string{"gitlab.com/org/sub/repo", "org/sub/repo", "sub/repo"}, + }, + { + name: "no host 2-segment", + input: "owner/repo", + expected: []string{"owner/repo"}, + }, + { + name: "deeply nested 5-segment", + input: "gitlab.com/a/b/c/d", + expected: []string{"gitlab.com/a/b/c/d", "a/b/c/d", "b/c/d", "c/d"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := zoektRepoNameCandidates(tt.input) + assert.Equal(t, tt.expected, got) + }) + } +} + +func TestSearchService_HybridChunkResolution_GitLabNestedGroup(t *testing.T) { + silentLogger, err := logging.NewApplicationLogger(logging.Config{ + Level: "ERROR", + Format: "json", + Output: "buffer", + }) + require.NoError(t, err) + slogger.SetGlobalLogger(silentLogger) + defer slogger.SetGlobalLogger(nil) + + t.Run("Resolves_Chunk_Via_Third_Candidate_For_Nested_Group", func(t *testing.T) { + mockVectorRepo := new(MockVectorStorageRepository) + mockEmbeddingService := new(MockEmbeddingService) + mockChunkRepo := new(MockChunkRepository) + mockZoekt := new(MockZoektSearcher) + + svc := NewSearchService( + mockVectorRepo, mockEmbeddingService, mockChunkRepo, + new(MockRepositoryRepository), testConfig(), mockZoekt, nil, + ) + + ctx := context.Background() + req := dto.SearchRequestDTO{ + Query: "ProcessRequest", + Mode: dto.SearchModeHybrid, + Limit: 10, + } + queryVector := []float64{0.1, 0.2, 0.3} + mockEmbeddingService.On("GenerateEmbedding", ctx, req.Query, mock.AnythingOfType("outbound.EmbeddingOptions")). + Return(&outbound.EmbeddingResult{Vector: queryVector}, nil) + mockVectorRepo.On("VectorSimilaritySearch", ctx, queryVector, mock.AnythingOfType("outbound.SimilaritySearchOptions")). + Return([]outbound.VectorSimilarityResult{}, nil) + mockChunkRepo.On("FindChunksByIDs", ctx, []uuid.UUID{}). + Return([]ChunkInfo{}, nil) + + targetChunkID := uuid.MustParse("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa") + // DB stores name as "sub/repo" (ExtractRepositoryNameFromURL returns last-two URL segments). + // Candidates tried in order: "gitlab.com/org/sub/repo", "org/sub/repo", "sub/repo". + mockChunkRepo.On("FindChunksByRepositoryPath", ctx, "gitlab.com/org/sub/repo", "handler.go"). + Return([]ChunkInfo{}, nil) + mockChunkRepo.On("FindChunksByRepositoryPath", ctx, "org/sub/repo", "handler.go"). + Return([]ChunkInfo{}, nil) + mockChunkRepo.On("FindChunksByRepositoryPath", ctx, "sub/repo", "handler.go"). + Return([]ChunkInfo{ + { + ChunkID: targetChunkID, + Content: "func ProcessRequest(r *http.Request) {}", + FilePath: "handler.go", + Language: "Go", + StartLine: 10, + EndLine: 20, + EntityName: "ProcessRequest", + Repository: dto.RepositoryInfo{Name: "sub/repo"}, + }, + }, nil) + + mockZoekt.On("Search", ctx, req.Query, mock.AnythingOfType("outbound.ZoektSearchOptions")). + Return(&outbound.ZoektSearchResult{ + FileMatches: []outbound.ZoektFileMatch{ + { + Repository: "gitlab.com/org/sub/repo", + FileName: "handler.go", + Language: "Go", + Score: 30, + LineMatches: []outbound.ZoektLineMatch{ + {LineNumber: 12, LineContent: "func ProcessRequest(r *http.Request) {"}, + }, + }, + }, + TotalCount: 1, + }, nil) + + result, err := svc.Search(ctx, req) + + require.NoError(t, err) + require.NotNil(t, result) + require.Len(t, result.Results, 1) + assert.Equal(t, targetChunkID, result.Results[0].ChunkID) + assert.Equal(t, "zoekt", result.Results[0].SourceEngine) + + mockChunkRepo.AssertExpectations(t) + mockZoekt.AssertExpectations(t) + }) +} diff --git a/internal/application/worker/job_processor.go b/internal/application/worker/job_processor.go index 0c57d6a3..9a0ae8b5 100644 --- a/internal/application/worker/job_processor.go +++ b/internal/application/worker/job_processor.go @@ -353,7 +353,7 @@ func (p *DefaultJobProcessor) executeJobPipeline( commitHash = unknownCommitHash } repoURL := repo.URL() - zoektRepoName := repoURL.Host() + "/" + repoURL.FullName() + zoektRepoName := repoURL.Host() + "/" + repoURL.FullPath() result := p.runConcurrentIndexing( ctx, zoektRepoName, diff --git a/internal/domain/valueobject/repository_url.go b/internal/domain/valueobject/repository_url.go index 6877ff2a..cac3da7b 100644 --- a/internal/domain/valueobject/repository_url.go +++ b/internal/domain/valueobject/repository_url.go @@ -375,6 +375,14 @@ func (r RepositoryURL) Name() string { return "" } +// FullPath returns the full URL path without the host (e.g., "org/sub/repo"). +// Unlike FullName, this preserves all path segments, making it correct for +// nested GitLab groups and other hosting services with deep paths. +func (r RepositoryURL) FullPath() string { + parsedURL, _ := url.Parse(r.normalized) + return strings.Trim(parsedURL.Path, "/") +} + // FullName returns the full repository name in "owner/name" format (e.g., "golang/go"). // Returns empty string if either owner or name cannot be determined. func (r RepositoryURL) FullName() string { diff --git a/internal/domain/valueobject/repository_url_test.go b/internal/domain/valueobject/repository_url_test.go index df3e8a3b..e05666c3 100644 --- a/internal/domain/valueobject/repository_url_test.go +++ b/internal/domain/valueobject/repository_url_test.go @@ -261,6 +261,38 @@ func TestRepositoryURL_Methods(t *testing.T) { } } +func TestRepositoryURL_FullPath(t *testing.T) { + tests := []struct { + name string + rawURL string + expected string + }{ + { + name: "standard GitHub repo", + rawURL: "https://github.com/owner/repo", + expected: "owner/repo", + }, + { + name: "GitLab nested group", + rawURL: "https://gitlab.com/org/sub/repo", + expected: "org/sub/repo", + }, + { + name: "deeply nested group", + rawURL: "https://gitlab.com/a/b/c/repo", + expected: "a/b/c/repo", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + u, err := NewRepositoryURL(tt.rawURL) + require.NoError(t, err) + assert.Equal(t, tt.expected, u.FullPath()) + }) + } +} + func TestRepositoryURL_Equal(t *testing.T) { tests := []struct { name string From 76524fa4cc3a35a5f564552e4f28a121851ac300 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 13:04:05 +0000 Subject: [PATCH 2/3] fix: harden FullPath() to strip UI path suffixes for GitHub/Bitbucket GitHub and Bitbucket repos always have exactly owner/repo (2 path segments). FullPath() now truncates any extra segments (e.g. from a /tree/ UI URL) to return a clean repo identifier. GitLab and other hosts are left unchanged so nested group paths (org/sub/repo) are preserved correctly. Addresses Copilot review comment on PR #60. https://claude.ai/code/session_018DGnjmwnT5WZPB9MHPYSvH --- internal/domain/valueobject/repository_url.go | 27 ++++++++++++++++--- .../domain/valueobject/repository_url_test.go | 10 +++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/internal/domain/valueobject/repository_url.go b/internal/domain/valueobject/repository_url.go index cac3da7b..3fc777ad 100644 --- a/internal/domain/valueobject/repository_url.go +++ b/internal/domain/valueobject/repository_url.go @@ -375,12 +375,31 @@ func (r RepositoryURL) Name() string { return "" } -// FullPath returns the full URL path without the host (e.g., "org/sub/repo"). -// Unlike FullName, this preserves all path segments, making it correct for -// nested GitLab groups and other hosting services with deep paths. +// FullPath returns the repository root path without the host +// (e.g., "org/sub/repo" or "owner/repo"). +// For GitHub and Bitbucket (which use owner/repo with no nested groups) any +// extra path segments — e.g., from a UI URL like /tree/ — are stripped +// so callers always receive a clean repository identifier. +// For GitLab and other hosts the full path is preserved to support nested groups. func (r RepositoryURL) FullPath() string { parsedURL, _ := url.Parse(r.normalized) - return strings.Trim(parsedURL.Path, "/") + trimmedPath := strings.Trim(parsedURL.Path, "/") + if trimmedPath == "" { + return "" + } + + parts := strings.Split(trimmedPath, "/") + + // GitHub and Bitbucket repos are always owner/repo (exactly 2 path segments). + // Trim any extra segments that indicate a UI page rather than the repo root. + switch strings.ToLower(parsedURL.Hostname()) { + case "github.com", "www.github.com", "bitbucket.org", "www.bitbucket.org": + if len(parts) > MinPathPartsForRepoURL { + return strings.Join(parts[:MinPathPartsForRepoURL], "/") + } + } + + return trimmedPath } // FullName returns the full repository name in "owner/name" format (e.g., "golang/go"). diff --git a/internal/domain/valueobject/repository_url_test.go b/internal/domain/valueobject/repository_url_test.go index e05666c3..4935a018 100644 --- a/internal/domain/valueobject/repository_url_test.go +++ b/internal/domain/valueobject/repository_url_test.go @@ -282,6 +282,16 @@ func TestRepositoryURL_FullPath(t *testing.T) { rawURL: "https://gitlab.com/a/b/c/repo", expected: "a/b/c/repo", }, + { + name: "GitHub URL with UI tree suffix is truncated to repo root", + rawURL: "https://github.com/owner/repo", + expected: "owner/repo", + }, + { + name: "Bitbucket URL returns owner/repo", + rawURL: "https://bitbucket.org/owner/repo", + expected: "owner/repo", + }, } for _, tt := range tests { From 62cf26564d82b5eafe7ca3ca2b3444ff91f57b9a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 13:16:01 +0000 Subject: [PATCH 3/3] test: fix FullPath test URL and add GitLab UI sentinel test cases Addresses Copilot review comments on PR #60: - Fix 'GitHub URL with UI tree suffix' test case to actually use a URL with a /tree/main suffix (previously used same URL as standard case) - Add GitLab nested group test with direct /tree/main sentinel - Add GitLab nested group test with /-/ UI separator (normalised to -/) https://claude.ai/code/session_018DGnjmwnT5WZPB9MHPYSvH --- internal/domain/valueobject/repository_url.go | 44 +++++++++++++++++-- .../domain/valueobject/repository_url_test.go | 12 ++++- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/internal/domain/valueobject/repository_url.go b/internal/domain/valueobject/repository_url.go index 3fc777ad..e5de3c03 100644 --- a/internal/domain/valueobject/repository_url.go +++ b/internal/domain/valueobject/repository_url.go @@ -375,12 +375,30 @@ func (r RepositoryURL) Name() string { return "" } +// hostingUISentinels is the set of path segments that mark the start of a +// hosting-service UI route following the repository root. Any path segments +// from the first match onward are stripped by FullPath. +var hostingUISentinels = map[string]bool{ + "tree": true, "blob": true, "raw": true, "blame": true, + "commit": true, "commits": true, "compare": true, + "issues": true, "pull": true, "pulls": true, + "releases": true, "tags": true, "branches": true, + "wiki": true, "actions": true, "settings": true, + "graphs": true, "network": true, "security": true, +} + // FullPath returns the repository root path without the host // (e.g., "org/sub/repo" or "owner/repo"). -// For GitHub and Bitbucket (which use owner/repo with no nested groups) any -// extra path segments — e.g., from a UI URL like /tree/ — are stripped -// so callers always receive a clean repository identifier. -// For GitLab and other hosts the full path is preserved to support nested groups. +// +// For GitHub and Bitbucket (owner/repo only, no nested groups) any extra path +// segments — e.g., from a UI URL like /tree/ — are stripped by +// limiting to 2 segments. +// +// For GitLab and other hosts the path is kept in full to support nested groups, +// but known hosting-service UI sentinels (tree, blob, commit, …) truncate the +// result at the repository root. The URL normalizer rewrites "/-" to "-", so a +// GitLab UI separator "/-/" is also handled by stripping the resulting trailing +// dash from the last retained segment. func (r RepositoryURL) FullPath() string { parsedURL, _ := url.Parse(r.normalized) trimmedPath := strings.Trim(parsedURL.Path, "/") @@ -397,6 +415,24 @@ func (r RepositoryURL) FullPath() string { if len(parts) > MinPathPartsForRepoURL { return strings.Join(parts[:MinPathPartsForRepoURL], "/") } + return trimmedPath + } + + // For GitLab and other hosts: strip at the first known UI sentinel segment. + // Require at least MinPathPartsForRepoURL leading segments so that a group or + // repo legitimately named with a sentinel word is not accidentally truncated. + for i, part := range parts { + if i < MinPathPartsForRepoURL { + continue + } + if hostingUISentinels[strings.ToLower(part)] { + repoPath := make([]string, i) + copy(repoPath, parts[:i]) + // The normalizer transforms "/-" → "-", leaving a trailing dash on + // the segment that preceded the GitLab "/-/" route separator. + repoPath[len(repoPath)-1] = strings.TrimSuffix(repoPath[len(repoPath)-1], "-") + return strings.Join(repoPath, "/") + } } return trimmedPath diff --git a/internal/domain/valueobject/repository_url_test.go b/internal/domain/valueobject/repository_url_test.go index 4935a018..5626d8fe 100644 --- a/internal/domain/valueobject/repository_url_test.go +++ b/internal/domain/valueobject/repository_url_test.go @@ -284,7 +284,7 @@ func TestRepositoryURL_FullPath(t *testing.T) { }, { name: "GitHub URL with UI tree suffix is truncated to repo root", - rawURL: "https://github.com/owner/repo", + rawURL: "https://github.com/owner/repo/tree/main", expected: "owner/repo", }, { @@ -292,6 +292,16 @@ func TestRepositoryURL_FullPath(t *testing.T) { rawURL: "https://bitbucket.org/owner/repo", expected: "owner/repo", }, + { + name: "GitLab nested group with tree sentinel is truncated to repo root", + rawURL: "https://gitlab.com/org/sub/repo/tree/main", + expected: "org/sub/repo", + }, + { + name: "GitLab nested group with UI separator is truncated to repo root", + rawURL: "https://gitlab.com/org/sub/repo/-/tree/main", + expected: "org/sub/repo", + }, } for _, tt := range tests {