diff --git a/internal/application/service/search_service.go b/internal/application/service/search_service.go index cf60148..267bf6c 100644 --- a/internal/application/service/search_service.go +++ b/internal/application/service/search_service.go @@ -280,21 +280,38 @@ func normalizeZoektRepoName(name string) string { return name } -func stripZoektRepoHost(name string) string { - parts := strings.SplitN(name, "/", 3) - if len(parts) == 3 { - return parts[1] + "/" + parts[2] - } - return name -} - +// zoektRepoNameCandidates returns all plausible DB repository name variants for +// a zoekt repo name, ordered from most to least specific. It strips one leading +// path segment at a time, stopping before single-segment results to avoid +// false-positive matches. +// +// Examples: +// +// "github.com/owner/repo" → ["github.com/owner/repo", "owner/repo"] +// "gitlab.com/org/sub/repo" → ["gitlab.com/org/sub/repo", "org/sub/repo", "sub/repo"] +// "owner/repo" → ["owner/repo"] func zoektRepoNameCandidates(name string) []string { normalized := normalizeZoektRepoName(name) - stripped := stripZoektRepoHost(normalized) - if stripped == normalized { - return []string{normalized} + seen := map[string]bool{normalized: true} + candidates := []string{normalized} + current := normalized + for { + idx := strings.Index(current, "/") + if idx < 0 { + break + } + next := current[idx+1:] + // Stop before adding single-segment candidates — too ambiguous. + if !strings.Contains(next, "/") { + break + } + if !seen[next] { + candidates = append(candidates, next) + seen[next] = true + } + current = next } - return []string{normalized, stripped} + return candidates } func lineRangesOverlap(startA, endA, startB, endB int) bool { diff --git a/internal/application/service/search_service_test.go b/internal/application/service/search_service_test.go index d07338f..2296f4e 100644 --- a/internal/application/service/search_service_test.go +++ b/internal/application/service/search_service_test.go @@ -2061,3 +2061,124 @@ func TestSearchService_HybridChunkLevelZoektResolutionContract(t *testing.T) { mockZoekt.AssertExpectations(t) }) } + +func TestZoektRepoNameCandidates(t *testing.T) { + tests := []struct { + name string + input string + expected []string + }{ + { + name: "standard GitHub 3-segment", + input: "github.com/owner/repo", + expected: []string{"github.com/owner/repo", "owner/repo"}, + }, + { + name: "GitLab nested group 4-segment", + input: "gitlab.com/org/sub/repo", + expected: []string{"gitlab.com/org/sub/repo", "org/sub/repo", "sub/repo"}, + }, + { + name: "no host 2-segment", + input: "owner/repo", + expected: []string{"owner/repo"}, + }, + { + name: "deeply nested 5-segment", + input: "gitlab.com/a/b/c/d", + expected: []string{"gitlab.com/a/b/c/d", "a/b/c/d", "b/c/d", "c/d"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := zoektRepoNameCandidates(tt.input) + assert.Equal(t, tt.expected, got) + }) + } +} + +func TestSearchService_HybridChunkResolution_GitLabNestedGroup(t *testing.T) { + silentLogger, err := logging.NewApplicationLogger(logging.Config{ + Level: "ERROR", + Format: "json", + Output: "buffer", + }) + require.NoError(t, err) + slogger.SetGlobalLogger(silentLogger) + defer slogger.SetGlobalLogger(nil) + + t.Run("Resolves_Chunk_Via_Third_Candidate_For_Nested_Group", func(t *testing.T) { + mockVectorRepo := new(MockVectorStorageRepository) + mockEmbeddingService := new(MockEmbeddingService) + mockChunkRepo := new(MockChunkRepository) + mockZoekt := new(MockZoektSearcher) + + svc := NewSearchService( + mockVectorRepo, mockEmbeddingService, mockChunkRepo, + new(MockRepositoryRepository), testConfig(), mockZoekt, nil, + ) + + ctx := context.Background() + req := dto.SearchRequestDTO{ + Query: "ProcessRequest", + Mode: dto.SearchModeHybrid, + Limit: 10, + } + queryVector := []float64{0.1, 0.2, 0.3} + mockEmbeddingService.On("GenerateEmbedding", ctx, req.Query, mock.AnythingOfType("outbound.EmbeddingOptions")). + Return(&outbound.EmbeddingResult{Vector: queryVector}, nil) + mockVectorRepo.On("VectorSimilaritySearch", ctx, queryVector, mock.AnythingOfType("outbound.SimilaritySearchOptions")). + Return([]outbound.VectorSimilarityResult{}, nil) + mockChunkRepo.On("FindChunksByIDs", ctx, []uuid.UUID{}). + Return([]ChunkInfo{}, nil) + + targetChunkID := uuid.MustParse("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa") + // DB stores name as "sub/repo" (ExtractRepositoryNameFromURL returns last-two URL segments). + // Candidates tried in order: "gitlab.com/org/sub/repo", "org/sub/repo", "sub/repo". + mockChunkRepo.On("FindChunksByRepositoryPath", ctx, "gitlab.com/org/sub/repo", "handler.go"). + Return([]ChunkInfo{}, nil) + mockChunkRepo.On("FindChunksByRepositoryPath", ctx, "org/sub/repo", "handler.go"). + Return([]ChunkInfo{}, nil) + mockChunkRepo.On("FindChunksByRepositoryPath", ctx, "sub/repo", "handler.go"). + Return([]ChunkInfo{ + { + ChunkID: targetChunkID, + Content: "func ProcessRequest(r *http.Request) {}", + FilePath: "handler.go", + Language: "Go", + StartLine: 10, + EndLine: 20, + EntityName: "ProcessRequest", + Repository: dto.RepositoryInfo{Name: "sub/repo"}, + }, + }, nil) + + mockZoekt.On("Search", ctx, req.Query, mock.AnythingOfType("outbound.ZoektSearchOptions")). + Return(&outbound.ZoektSearchResult{ + FileMatches: []outbound.ZoektFileMatch{ + { + Repository: "gitlab.com/org/sub/repo", + FileName: "handler.go", + Language: "Go", + Score: 30, + LineMatches: []outbound.ZoektLineMatch{ + {LineNumber: 12, LineContent: "func ProcessRequest(r *http.Request) {"}, + }, + }, + }, + TotalCount: 1, + }, nil) + + result, err := svc.Search(ctx, req) + + require.NoError(t, err) + require.NotNil(t, result) + require.Len(t, result.Results, 1) + assert.Equal(t, targetChunkID, result.Results[0].ChunkID) + assert.Equal(t, "zoekt", result.Results[0].SourceEngine) + + mockChunkRepo.AssertExpectations(t) + mockZoekt.AssertExpectations(t) + }) +} diff --git a/internal/application/worker/job_processor.go b/internal/application/worker/job_processor.go index 0c57d6a..9a0ae8b 100644 --- a/internal/application/worker/job_processor.go +++ b/internal/application/worker/job_processor.go @@ -353,7 +353,7 @@ func (p *DefaultJobProcessor) executeJobPipeline( commitHash = unknownCommitHash } repoURL := repo.URL() - zoektRepoName := repoURL.Host() + "/" + repoURL.FullName() + zoektRepoName := repoURL.Host() + "/" + repoURL.FullPath() result := p.runConcurrentIndexing( ctx, zoektRepoName, diff --git a/internal/domain/valueobject/repository_url.go b/internal/domain/valueobject/repository_url.go index 6877ff2..e5de3c0 100644 --- a/internal/domain/valueobject/repository_url.go +++ b/internal/domain/valueobject/repository_url.go @@ -375,6 +375,69 @@ func (r RepositoryURL) Name() string { return "" } +// hostingUISentinels is the set of path segments that mark the start of a +// hosting-service UI route following the repository root. Any path segments +// from the first match onward are stripped by FullPath. +var hostingUISentinels = map[string]bool{ + "tree": true, "blob": true, "raw": true, "blame": true, + "commit": true, "commits": true, "compare": true, + "issues": true, "pull": true, "pulls": true, + "releases": true, "tags": true, "branches": true, + "wiki": true, "actions": true, "settings": true, + "graphs": true, "network": true, "security": true, +} + +// FullPath returns the repository root path without the host +// (e.g., "org/sub/repo" or "owner/repo"). +// +// For GitHub and Bitbucket (owner/repo only, no nested groups) any extra path +// segments — e.g., from a UI URL like /tree/ — are stripped by +// limiting to 2 segments. +// +// For GitLab and other hosts the path is kept in full to support nested groups, +// but known hosting-service UI sentinels (tree, blob, commit, …) truncate the +// result at the repository root. The URL normalizer rewrites "/-" to "-", so a +// GitLab UI separator "/-/" is also handled by stripping the resulting trailing +// dash from the last retained segment. +func (r RepositoryURL) FullPath() string { + parsedURL, _ := url.Parse(r.normalized) + trimmedPath := strings.Trim(parsedURL.Path, "/") + if trimmedPath == "" { + return "" + } + + parts := strings.Split(trimmedPath, "/") + + // GitHub and Bitbucket repos are always owner/repo (exactly 2 path segments). + // Trim any extra segments that indicate a UI page rather than the repo root. + switch strings.ToLower(parsedURL.Hostname()) { + case "github.com", "www.github.com", "bitbucket.org", "www.bitbucket.org": + if len(parts) > MinPathPartsForRepoURL { + return strings.Join(parts[:MinPathPartsForRepoURL], "/") + } + return trimmedPath + } + + // For GitLab and other hosts: strip at the first known UI sentinel segment. + // Require at least MinPathPartsForRepoURL leading segments so that a group or + // repo legitimately named with a sentinel word is not accidentally truncated. + for i, part := range parts { + if i < MinPathPartsForRepoURL { + continue + } + if hostingUISentinels[strings.ToLower(part)] { + repoPath := make([]string, i) + copy(repoPath, parts[:i]) + // The normalizer transforms "/-" → "-", leaving a trailing dash on + // the segment that preceded the GitLab "/-/" route separator. + repoPath[len(repoPath)-1] = strings.TrimSuffix(repoPath[len(repoPath)-1], "-") + return strings.Join(repoPath, "/") + } + } + + return trimmedPath +} + // FullName returns the full repository name in "owner/name" format (e.g., "golang/go"). // Returns empty string if either owner or name cannot be determined. func (r RepositoryURL) FullName() string { diff --git a/internal/domain/valueobject/repository_url_test.go b/internal/domain/valueobject/repository_url_test.go index df3e8a3..5626d8f 100644 --- a/internal/domain/valueobject/repository_url_test.go +++ b/internal/domain/valueobject/repository_url_test.go @@ -261,6 +261,58 @@ func TestRepositoryURL_Methods(t *testing.T) { } } +func TestRepositoryURL_FullPath(t *testing.T) { + tests := []struct { + name string + rawURL string + expected string + }{ + { + name: "standard GitHub repo", + rawURL: "https://github.com/owner/repo", + expected: "owner/repo", + }, + { + name: "GitLab nested group", + rawURL: "https://gitlab.com/org/sub/repo", + expected: "org/sub/repo", + }, + { + name: "deeply nested group", + rawURL: "https://gitlab.com/a/b/c/repo", + expected: "a/b/c/repo", + }, + { + name: "GitHub URL with UI tree suffix is truncated to repo root", + rawURL: "https://github.com/owner/repo/tree/main", + expected: "owner/repo", + }, + { + name: "Bitbucket URL returns owner/repo", + rawURL: "https://bitbucket.org/owner/repo", + expected: "owner/repo", + }, + { + name: "GitLab nested group with tree sentinel is truncated to repo root", + rawURL: "https://gitlab.com/org/sub/repo/tree/main", + expected: "org/sub/repo", + }, + { + name: "GitLab nested group with UI separator is truncated to repo root", + rawURL: "https://gitlab.com/org/sub/repo/-/tree/main", + expected: "org/sub/repo", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + u, err := NewRepositoryURL(tt.rawURL) + require.NoError(t, err) + assert.Equal(t, tt.expected, u.FullPath()) + }) + } +} + func TestRepositoryURL_Equal(t *testing.T) { tests := []struct { name string