Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 29 additions & 12 deletions internal/application/service/search_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,21 +280,38 @@ func normalizeZoektRepoName(name string) string {
return name
}

// stripZoektRepoHost removes the first "/"-separated segment of name when at
// least two separators are present, returning everything after the first
// separator. Names with fewer than two separators are returned unchanged.
func stripZoektRepoHost(name string) string {
	firstSlash := strings.IndexByte(name, '/')
	if firstSlash < 0 {
		return name
	}

	remainder := name[firstSlash+1:]
	// Only strip when something of the form "<a>/<b>" is left over.
	if strings.IndexByte(remainder, '/') < 0 {
		return name
	}
	return remainder
}

// zoektRepoNameCandidates returns all plausible DB repository name variants for
// a zoekt repo name, ordered from most to least specific. It strips one leading
// path segment at a time, stopping before single-segment results to avoid
// false-positive matches.
//
// Examples:
//
// "github.com/owner/repo" → ["github.com/owner/repo", "owner/repo"]
// "gitlab.com/org/sub/repo" → ["gitlab.com/org/sub/repo", "org/sub/repo", "sub/repo"]
// "owner/repo" → ["owner/repo"]
func zoektRepoNameCandidates(name string) []string {
normalized := normalizeZoektRepoName(name)
stripped := stripZoektRepoHost(normalized)
if stripped == normalized {
return []string{normalized}
seen := map[string]bool{normalized: true}
candidates := []string{normalized}
current := normalized
for {
idx := strings.Index(current, "/")
if idx < 0 {
break
}
next := current[idx+1:]
// Stop before adding single-segment candidates — too ambiguous.
if !strings.Contains(next, "/") {
break
}
if !seen[next] {
candidates = append(candidates, next)
seen[next] = true
}
current = next
}
return []string{normalized, stripped}
return candidates
}

func lineRangesOverlap(startA, endA, startB, endB int) bool {
Expand Down
121 changes: 121 additions & 0 deletions internal/application/service/search_service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2061,3 +2061,124 @@ func TestSearchService_HybridChunkLevelZoektResolutionContract(t *testing.T) {
mockZoekt.AssertExpectations(t)
})
}

func TestZoektRepoNameCandidates(t *testing.T) {
tests := []struct {
name string
input string
expected []string
}{
{
name: "standard GitHub 3-segment",
input: "github.com/owner/repo",
expected: []string{"github.com/owner/repo", "owner/repo"},
},
{
name: "GitLab nested group 4-segment",
input: "gitlab.com/org/sub/repo",
expected: []string{"gitlab.com/org/sub/repo", "org/sub/repo", "sub/repo"},
},
{
name: "no host 2-segment",
input: "owner/repo",
expected: []string{"owner/repo"},
},
{
name: "deeply nested 5-segment",
input: "gitlab.com/a/b/c/d",
expected: []string{"gitlab.com/a/b/c/d", "a/b/c/d", "b/c/d", "c/d"},
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := zoektRepoNameCandidates(tt.input)
assert.Equal(t, tt.expected, got)
})
}
}

// TestSearchService_HybridChunkResolution_GitLabNestedGroup exercises a hybrid
// search in which the zoekt match names a four-segment GitLab repository while
// the chunk repository only returns chunks for the shortest two-segment
// candidate. The first two candidate lookups return no chunks, so the result
// must be resolved through the third one.
func TestSearchService_HybridChunkResolution_GitLabNestedGroup(t *testing.T) {
	// Install a quiet logger for the duration of the test.
	quietLogger, err := logging.NewApplicationLogger(logging.Config{
		Level:  "ERROR",
		Format: "json",
		Output: "buffer",
	})
	require.NoError(t, err)
	slogger.SetGlobalLogger(quietLogger)
	defer slogger.SetGlobalLogger(nil)

	t.Run("Resolves_Chunk_Via_Third_Candidate_For_Nested_Group", func(t *testing.T) {
		const (
			zoektRepo  = "gitlab.com/org/sub/repo"
			storedRepo = "sub/repo"
			fileName   = "handler.go"
		)

		vectorRepo := new(MockVectorStorageRepository)
		embedder := new(MockEmbeddingService)
		chunkRepo := new(MockChunkRepository)
		zoekt := new(MockZoektSearcher)

		svc := NewSearchService(
			vectorRepo, embedder, chunkRepo,
			new(MockRepositoryRepository), testConfig(), zoekt, nil,
		)

		ctx := context.Background()
		req := dto.SearchRequestDTO{
			Query: "ProcessRequest",
			Mode:  dto.SearchModeHybrid,
			Limit: 10,
		}

		// Semantic side: the embedding succeeds but the vector search is empty.
		queryVector := []float64{0.1, 0.2, 0.3}
		embedder.On("GenerateEmbedding", ctx, req.Query, mock.AnythingOfType("outbound.EmbeddingOptions")).
			Return(&outbound.EmbeddingResult{Vector: queryVector}, nil)
		vectorRepo.On("VectorSimilaritySearch", ctx, queryVector, mock.AnythingOfType("outbound.SimilaritySearchOptions")).
			Return([]outbound.VectorSimilarityResult{}, nil)
		chunkRepo.On("FindChunksByIDs", ctx, []uuid.UUID{}).
			Return([]ChunkInfo{}, nil)

		// Keyword side: zoekt reports a single match in the nested-group repo.
		zoektMatch := outbound.ZoektFileMatch{
			Repository: zoektRepo,
			FileName:   fileName,
			Language:   "Go",
			Score:      30,
			LineMatches: []outbound.ZoektLineMatch{
				{LineNumber: 12, LineContent: "func ProcessRequest(r *http.Request) {"},
			},
		}
		zoekt.On("Search", ctx, req.Query, mock.AnythingOfType("outbound.ZoektSearchOptions")).
			Return(&outbound.ZoektSearchResult{
				FileMatches: []outbound.ZoektFileMatch{zoektMatch},
				TotalCount:  1,
			}, nil)

		// DB stores name as "sub/repo" (ExtractRepositoryNameFromURL returns last-two URL segments).
		// Candidates tried in order: "gitlab.com/org/sub/repo", "org/sub/repo", "sub/repo".
		for _, missing := range []string{zoektRepo, "org/sub/repo"} {
			chunkRepo.On("FindChunksByRepositoryPath", ctx, missing, fileName).
				Return([]ChunkInfo{}, nil)
		}

		wantChunkID := uuid.MustParse("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa")
		storedChunk := ChunkInfo{
			ChunkID:    wantChunkID,
			Content:    "func ProcessRequest(r *http.Request) {}",
			FilePath:   fileName,
			Language:   "Go",
			StartLine:  10,
			EndLine:    20,
			EntityName: "ProcessRequest",
			Repository: dto.RepositoryInfo{Name: storedRepo},
		}
		chunkRepo.On("FindChunksByRepositoryPath", ctx, storedRepo, fileName).
			Return([]ChunkInfo{storedChunk}, nil)

		result, err := svc.Search(ctx, req)

		require.NoError(t, err)
		require.NotNil(t, result)
		require.Len(t, result.Results, 1)

		hit := result.Results[0]
		assert.Equal(t, wantChunkID, hit.ChunkID)
		assert.Equal(t, "zoekt", hit.SourceEngine)

		chunkRepo.AssertExpectations(t)
		zoekt.AssertExpectations(t)
	})
}
2 changes: 1 addition & 1 deletion internal/application/worker/job_processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ func (p *DefaultJobProcessor) executeJobPipeline(
commitHash = unknownCommitHash
}
repoURL := repo.URL()
zoektRepoName := repoURL.Host() + "/" + repoURL.FullName()
zoektRepoName := repoURL.Host() + "/" + repoURL.FullPath()
Comment thread
Anthony-Bible marked this conversation as resolved.
result := p.runConcurrentIndexing(
ctx,
zoektRepoName,
Expand Down
63 changes: 63 additions & 0 deletions internal/domain/valueobject/repository_url.go
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,69 @@ func (r RepositoryURL) Name() string {
return ""
}

// hostingUISentinels lists the path segments that signal the beginning of a
// hosting-service UI route located after the repository root. FullPath drops
// the first matching segment and everything that follows it.
var hostingUISentinels = map[string]bool{
	// Source browsing.
	"tree":  true,
	"blob":  true,
	"raw":   true,
	"blame": true,

	// History and comparison.
	"commit":  true,
	"commits": true,
	"compare": true,

	// Collaboration.
	"issues": true,
	"pull":   true,
	"pulls":  true,

	// Refs and releases.
	"releases": true,
	"tags":     true,
	"branches": true,

	// Project pages.
	"wiki":     true,
	"actions":  true,
	"settings": true,
	"graphs":   true,
	"network":  true,
	"security": true,
}

// FullPath returns the repository root path without the host
// (e.g., "org/sub/repo" or "owner/repo").
//
// For GitHub and Bitbucket (owner/repo only, no nested groups) any extra path
// segments — e.g., from a UI URL like /tree/<branch> — are stripped by
// limiting to 2 segments.
//
// For GitLab and other hosts the path is kept in full to support nested groups,
// but known hosting-service UI sentinels (tree, blob, commit, …) truncate the
// result at the repository root. The URL normalizer rewrites "/-" to "-", so a
// GitLab UI separator "/-/" is also handled by stripping the resulting trailing
// dash from the last retained segment.
func (r RepositoryURL) FullPath() string {
parsedURL, _ := url.Parse(r.normalized)
trimmedPath := strings.Trim(parsedURL.Path, "/")
if trimmedPath == "" {
return ""
}

parts := strings.Split(trimmedPath, "/")

// GitHub and Bitbucket repos are always owner/repo (exactly 2 path segments).
// Trim any extra segments that indicate a UI page rather than the repo root.
switch strings.ToLower(parsedURL.Hostname()) {
case "github.com", "www.github.com", "bitbucket.org", "www.bitbucket.org":
if len(parts) > MinPathPartsForRepoURL {
return strings.Join(parts[:MinPathPartsForRepoURL], "/")
}
return trimmedPath
}

// For GitLab and other hosts: strip at the first known UI sentinel segment.
// Require at least MinPathPartsForRepoURL leading segments so that a group or
// repo legitimately named with a sentinel word is not accidentally truncated.
for i, part := range parts {
if i < MinPathPartsForRepoURL {
continue
}
if hostingUISentinels[strings.ToLower(part)] {
repoPath := make([]string, i)
copy(repoPath, parts[:i])
// The normalizer transforms "/-" → "-", leaving a trailing dash on
// the segment that preceded the GitLab "/-/" route separator.
repoPath[len(repoPath)-1] = strings.TrimSuffix(repoPath[len(repoPath)-1], "-")
return strings.Join(repoPath, "/")
}
}

return trimmedPath
Comment on lines +402 to +438
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FullPath() relies on the normalized URL path. For GitLab (and any host not in the GitHub/Bitbucket allowlist), it returns the entire path as-is, which will include common UI suffixes like /-/tree/<branch> (and note the normalizer currently rewrites /- to -, turning that into part of the repo path). Since FullPath() is now used to construct the Zoekt repo name, storing a non-repo-root GitLab URL can produce a zoektRepoName that will never match the actual indexed repository. Consider stripping GitLab UI suffixes (e.g., anything after a /-/ segment) and/or applying the same UI-suffix trimming logic to configured extra hosts that follow the owner/repo pattern (e.g., GitHub Enterprise).

Copilot uses AI. Check for mistakes.
}

// FullName returns the full repository name in "owner/name" format (e.g., "golang/go").
// Returns empty string if either owner or name cannot be determined.
func (r RepositoryURL) FullName() string {
Expand Down
52 changes: 52 additions & 0 deletions internal/domain/valueobject/repository_url_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,58 @@ func TestRepositoryURL_Methods(t *testing.T) {
}
}

// TestRepositoryURL_FullPath checks that FullPath yields the repository root
// path for plain repository URLs, nested GitLab groups, and URLs that carry
// hosting-service UI suffixes.
func TestRepositoryURL_FullPath(t *testing.T) {
	type fullPathCase struct {
		label string
		raw   string
		want  string
	}

	cases := []fullPathCase{
		{label: "standard GitHub repo", raw: "https://github.com/owner/repo", want: "owner/repo"},
		{label: "GitLab nested group", raw: "https://gitlab.com/org/sub/repo", want: "org/sub/repo"},
		{label: "deeply nested group", raw: "https://gitlab.com/a/b/c/repo", want: "a/b/c/repo"},
		{
			label: "GitHub URL with UI tree suffix is truncated to repo root",
			raw:   "https://github.com/owner/repo/tree/main",
			want:  "owner/repo",
		},
		{label: "Bitbucket URL returns owner/repo", raw: "https://bitbucket.org/owner/repo", want: "owner/repo"},
		{
			label: "GitLab nested group with tree sentinel is truncated to repo root",
			raw:   "https://gitlab.com/org/sub/repo/tree/main",
			want:  "org/sub/repo",
		},
		{
			label: "GitLab nested group with UI separator is truncated to repo root",
			raw:   "https://gitlab.com/org/sub/repo/-/tree/main",
			want:  "org/sub/repo",
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.label, func(t *testing.T) {
			repoURL, err := NewRepositoryURL(tc.raw)
			require.NoError(t, err)

			got := repoURL.FullPath()
			assert.Equal(t, tc.want, got)
		})
	}
}

func TestRepositoryURL_Equal(t *testing.T) {
tests := []struct {
name string
Expand Down