Skip to content

Commit 1b1ecbd

Browse files
authored
fix: software value failing for large repos [CM-1029] (#3947)
Signed-off-by: Mouad BANI <mouad-mb@outlook.com>
1 parent 8a05d36 commit 1b1ecbd

2 files changed

Lines changed: 66 additions & 22 deletions

File tree

services/apps/git_integration/src/crowdgit/services/software_value/main.go

Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package main
33
import (
44
"context"
55
"encoding/json"
6+
"flag"
67
"fmt"
78
"os"
89
"os/exec"
@@ -13,23 +14,27 @@ import (
1314
)
1415

1516
func main() {
16-
response := processRepository()
17+
noLarge := flag.Bool("no-large", false, "Skip files larger than 100MB to avoid OOM on large repos")
18+
flag.Parse()
19+
20+
response := processRepository(*noLarge)
1721
outputJSON(response)
1822

1923
// Always exit with code 0 - status details are in JSON response
2024
}
2125

2226
// processRepository handles the main logic and returns a StandardResponse
23-
func processRepository() StandardResponse {
27+
func processRepository(noLarge bool) StandardResponse {
2428
ctx := context.Background()
2529

26-
// Get target path from command line argument
30+
// Get target path from remaining non-flag arguments
31+
args := flag.Args()
2732
var targetPath string
28-
if len(os.Args) > 1 {
29-
targetPath = os.Args[1]
33+
if len(args) > 0 {
34+
targetPath = args[0]
3035
} else {
3136
errorCode := ErrorCodeInvalidArguments
32-
errorMessage := fmt.Sprintf("Usage: %s <target-path>", os.Args[0])
37+
errorMessage := fmt.Sprintf("Usage: %s [--no-large] <target-path>", os.Args[0])
3338
return StandardResponse{
3439
Status: StatusFailure,
3540
ErrorCode: &errorCode,
@@ -51,10 +56,10 @@ func processRepository() StandardResponse {
5156
// Process single repository (the target path argument)
5257
repoDir := config.TargetPath
5358

54-
insightsDb, err := NewInsightsDB(ctx, config.InsightsDatabase)
55-
if err != nil {
59+
insightsDb, dbErr := NewInsightsDB(ctx, config.InsightsDatabase)
60+
if dbErr != nil {
5661
errorCode := ErrorCodeDatabaseConnection
57-
errorMessage := fmt.Sprintf("Error connecting to insights database: %v", err)
62+
errorMessage := fmt.Sprintf("Error connecting to insights database: %v", dbErr)
5863
return StandardResponse{
5964
Status: StatusFailure,
6065
ErrorCode: &errorCode,
@@ -76,7 +81,7 @@ func processRepository() StandardResponse {
7681
}
7782

7883
// Process the repository with SCC
79-
report, err := getSCCReport(config.SCCPath, repoDir)
84+
report, err := getSCCReport(config.SCCPath, repoDir, noLarge)
8085
if err != nil {
8186
errorCode := getErrorCodeFromSCCError(err)
8287
errorMessage := fmt.Sprintf("Error processing repository '%s': %v", repoDir, err)
@@ -120,10 +125,10 @@ func processRepository() StandardResponse {
120125

121126

122127
// getSCCReport analyzes a directory with scc and returns a report containing the estimated cost and language statistics.
123-
func getSCCReport(sccPath, dirPath string) (SCCReport, error) {
124-
cost, err := getCost(sccPath, dirPath)
128+
func getSCCReport(sccPath, dirPath string, noLarge bool) (SCCReport, error) {
129+
cost, err := getCost(sccPath, dirPath, noLarge)
125130
if err != nil {
126-
return SCCReport{}, fmt.Errorf("error getting SCC report for '%s': %v\"", err)
131+
return SCCReport{}, fmt.Errorf("error getting SCC report for '%s': %v", dirPath, err)
127132
}
128133

129134
// Skip saving to database if cost is 0 - do we want to do this?
@@ -133,7 +138,7 @@ func getSCCReport(sccPath, dirPath string) (SCCReport, error) {
133138

134139
projectPath := filepath.Base(dirPath)
135140

136-
langStats, err := getLanguageStats(sccPath, dirPath)
141+
langStats, err := getLanguageStats(sccPath, dirPath, noLarge)
137142
if err != nil {
138143
return SCCReport{}, fmt.Errorf("error getting language stats for '%s': %v", dirPath, err)
139144
}
@@ -177,8 +182,8 @@ func getGitRepositoryURL(dirPath string) (string, error) {
177182
}
178183

179184
// getCost runs the scc command and parses the output to get the estimated cost.
180-
func getCost(sccPathPath, repoPath string) (float64, error) {
181-
output, err := runSCC(sccPathPath, "--format=short", repoPath)
185+
func getCost(sccPathPath, repoPath string, noLarge bool) (float64, error) {
186+
output, err := runSCC(sccPathPath, noLarge, "--format=short", repoPath)
182187
if err != nil {
183188
return 0, fmt.Errorf("failed to run scc command: %w", err)
184189
}
@@ -192,8 +197,8 @@ func getCost(sccPathPath, repoPath string) (float64, error) {
192197
}
193198

194199
// getLanguageStats runs the scc command and parses the output to get language statistics.
195-
func getLanguageStats(sccPathPath, repoPath string) ([]LanguageStats, error) {
196-
output, err := runSCC(sccPathPath, "--format=json", repoPath)
200+
func getLanguageStats(sccPathPath, repoPath string, noLarge bool) ([]LanguageStats, error) {
201+
output, err := runSCC(sccPathPath, noLarge, "--format=json", repoPath)
197202
if err != nil {
198203
return nil, fmt.Errorf("failed to run scc command: %w", err)
199204
}
@@ -207,8 +212,14 @@ func getLanguageStats(sccPathPath, repoPath string) ([]LanguageStats, error) {
207212
}
208213

209214
// runSCC executes the scc command with the given arguments and returns the output.
210-
func runSCC(sccPathPath string, args ...string) (string, error) {
211-
cmd := exec.Command(sccPathPath, args...)
215+
// When noLarge is true, files larger than 100MB are skipped to avoid OOM on large repos.
216+
func runSCC(sccPathPath string, noLarge bool, args ...string) (string, error) {
217+
var cmdArgs []string
218+
if noLarge {
219+
cmdArgs = append(cmdArgs, "--no-large", "--large-byte-count", "100000000")
220+
}
221+
cmdArgs = append(cmdArgs, args...)
222+
cmd := exec.Command(sccPathPath, cmdArgs...)
212223
output, err := cmd.Output()
213224
if err != nil {
214225
if exitErr, ok := err.(*exec.ExitError); ok {

services/apps/git_integration/src/crowdgit/services/software_value/software_value_service.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,21 @@
88
from crowdgit.services.base.base_service import BaseService
99
from crowdgit.services.utils import run_shell_command
1010

11+
_LARGE_REPO_THRESHOLD_BYTES = 10 * 1024 * 1024 * 1024 # 10 GB
12+
# Repos excluded from software value analysis.
13+
# f7f92577-f258-49f0-b5b4-ba07194ca040: data repo (not a code repo), produces misleading results.
14+
_SOFTWARE_VALUE_EXCLUDED_REPO_IDS = frozenset({"f7f92577-f258-49f0-b5b4-ba07194ca040"})
15+
16+
17+
async def _get_repo_size_bytes(repo_path: str) -> int:
18+
"""Return total disk usage of repo_path in bytes using du -sb."""
19+
try:
20+
output = await run_shell_command(["du", "-sb", repo_path], timeout=120)
21+
return int(output.split()[0])
22+
except Exception:
23+
pass
24+
return 0
25+
1126

1227
class SoftwareValueService(BaseService):
1328
"""Service for calculating software value metrics"""
@@ -20,16 +35,34 @@ def __init__(self):
2035
async def run(self, repo_id: str, repo_path: str) -> None:
2136
"""
2237
Triggers software value binary for given repo.
23-
Results are saved into insights database directly
38+
Results are saved into insights database directly.
39+
Repos in _SOFTWARE_VALUE_EXCLUDED_REPO_IDS are skipped entirely.
40+
For repos of 10 GB or more, scc is run with --no-large (skipping files >100MB) to avoid OOM.
2441
"""
42+
if repo_id in _SOFTWARE_VALUE_EXCLUDED_REPO_IDS:
43+
self.logger.info(f"Skipping software value for excluded repo {repo_id}")
44+
return
45+
2546
start_time = time.time()
2647
execution_status = ExecutionStatus.SUCCESS
2748
error_code = None
2849
error_message = None
2950

3051
try:
52+
cmd = [self.software_value_executable]
53+
54+
repo_size = await _get_repo_size_bytes(repo_path)
55+
if repo_size >= _LARGE_REPO_THRESHOLD_BYTES:
56+
self.logger.info(
57+
f"Repo size {repo_size / (1024**3):.1f} GB exceeds threshold — "
58+
"running scc with no-large (skipping files >100MB)"
59+
)
60+
cmd += ["--no-large"]
61+
62+
cmd.append(repo_path)
63+
3164
self.logger.info("Running software value...")
32-
output = await run_shell_command([self.software_value_executable, repo_path])
65+
output = await run_shell_command(cmd)
3366
self.logger.info(f"Software value output: {output}")
3467

3568
# Parse JSON output and extract fields from StandardResponse structure

0 commit comments

Comments
 (0)