Skip to content

Commit 8a89ae2

Browse files
authored
feat(git-integration): auto detect stuck commits and handle repos with force-pushes (CM-745) (#3658)
1 parent 861a96a commit 8a89ae2

5 files changed

Lines changed: 89 additions & 4 deletions

File tree

services/apps/git_integration/src/crowdgit/database/crud.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,11 @@ async def acquire_recurrent_repo() -> Repository | None:
140140
)
141141
RETURNING id, url, state, priority, "lastProcessedAt", "lastProcessedCommit", "lockedAt", "createdAt", "updatedAt", "segmentId", "integrationId", "maintainerFile", "lastMaintainerRunAt", "branch", "forkedFrom"
142142
"""
143-
states_to_exclude = (RepositoryState.PENDING, RepositoryState.PROCESSING)
143+
states_to_exclude = (
144+
RepositoryState.PENDING,
145+
RepositoryState.PROCESSING,
146+
RepositoryState.STUCK,
147+
)
144148
return await acquire_repository(
145149
recurrent_repo_sql_query,
146150
(RepositoryState.PROCESSING, states_to_exclude, REPOSITORY_UPDATE_INTERVAL_HOURS),

services/apps/git_integration/src/crowdgit/enums.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ class ErrorCode(str, Enum):
2020
MAINTAINER_INTERVAL_NOT_ELAPSED = "maintainer-interval-not-elapsed"
2121
CLEANUP_FAILED = "cleanup-failed"
2222
PARENT_REPO_INVALID = "parent-repo-invalid"
23+
REONBOARDING_REQUIRED = "reonboarding-required"
24+
STUCK_REPO = "stuck-repo"
2325

2426

2527
class RepositoryState(str, Enum):
@@ -30,6 +32,7 @@ class RepositoryState(str, Enum):
3032
COMPLETED = "completed"
3133
FAILED = "failed"
3234
REQUIRES_PARENT = "requires_parent" # fork repo without valid parent repo in our system
35+
STUCK = "stuck" # requires manual resolution
3336

3437

3538
class RepositoryPriority(int):

services/apps/git_integration/src/crowdgit/errors.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,3 +107,15 @@ class MaintainerIntervalNotElapsedError(CrowdGitError):
107107
class ParentRepoInvalidError(CrowdGitError):
108108
error_message: str = "Parent repository is not valid or not found"
109109
error_code: ErrorCode = ErrorCode.PARENT_REPO_INVALID
110+
111+
112+
@dataclass
class ReOnboardingRequiredError(CrowdGitError):
    """Signals that a repository cannot be processed as-is and must be re-onboarded."""

    # Annotated (as in ParentRepoInvalidError) so that @dataclass registers the
    # value as the field default; an unannotated class attribute is not a
    # dataclass field and is ignored by the generated __init__.
    error_message: str = "Repository cannot be processed and requires re-onboarding"
    error_code: ErrorCode = ErrorCode.REONBOARDING_REQUIRED
116+
117+
118+
@dataclass
class StuckRepoError(CrowdGitError):
    """Signals that a repository has stayed in processing state for too long."""

    # Annotated (as in ParentRepoInvalidError) so that @dataclass registers the
    # value as the field default; an unannotated class attribute is not a
    # dataclass field and is ignored by the generated __init__.
    error_message: str = "Repos stuck in processing state for a long time"
    error_code: ErrorCode = ErrorCode.STUCK_REPO

services/apps/git_integration/src/crowdgit/settings.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,9 @@ def load_env_var(key: str, required=True, default=None):
3838
WORKER_SHUTDOWN_TIMEOUT_SEC = int(load_env_var("WORKER_SHUTDOWN_TIMEOUT_SEC", default="3600"))
3939
MAX_CONCURRENT_ONBOARDINGS = int(load_env_var("MAX_CONCURRENT_ONBOARDINGS", default="3"))
4040
MAX_INTEGRATION_RESULTS = int(load_env_var("MAX_INTEGRATION_RESULTS", default="5000000"))
41+
# Hours a repository without a last processed commit (i.e. still onboarding)
# may stay locked before the worker considers it stuck. Overridable via the
# environment variable of the same name.
STUCK_ONBOARDING_REPO_TIMEOUT_HOURS = int(
    load_env_var("STUCK_ONBOARDING_REPO_TIMEOUT_HOURS", default="12")
)
# Hours a repository that already has a last processed commit (recurrent run)
# may stay locked before the worker considers it stuck. Overridable via the
# environment variable of the same name.
STUCK_RECURRENT_REPO_TIMEOUT_HOURS = int(
    load_env_var("STUCK_RECURRENT_REPO_TIMEOUT_HOURS", default="4")
)

services/apps/git_integration/src/crowdgit/worker/repository_worker.py

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import asyncio
2+
from datetime import datetime, timezone
23

34
from crowdgit.database.crud import (
45
acquire_repo_for_processing,
@@ -8,11 +9,16 @@
89
update_last_processed_commit,
910
)
1011
from crowdgit.enums import RepositoryState
11-
from crowdgit.errors import InternalError, ParentRepoInvalidError
12+
from crowdgit.errors import (
13+
InternalError,
14+
ParentRepoInvalidError,
15+
ReOnboardingRequiredError,
16+
StuckRepoError,
17+
)
1218

1319
# Import configured loguru logger from crowdgit.logger
1420
from crowdgit.logger import logger
15-
from crowdgit.models.repository import Repository
21+
from crowdgit.models import Repository
1622
from crowdgit.services import (
1723
CloneService,
1824
CommitService,
@@ -21,7 +27,12 @@
2127
SoftwareValueService,
2228
)
2329
from crowdgit.services.utils import get_default_branch, get_repo_name
24-
from crowdgit.settings import WORKER_ERROR_BACKOFF_SEC, WORKER_POLLING_INTERVAL_SEC
30+
from crowdgit.settings import (
31+
STUCK_ONBOARDING_REPO_TIMEOUT_HOURS,
32+
STUCK_RECURRENT_REPO_TIMEOUT_HOURS,
33+
WORKER_ERROR_BACKOFF_SEC,
34+
WORKER_POLLING_INTERVAL_SEC,
35+
)
2536

2637

2738
class RepositoryWorker:
@@ -78,6 +89,36 @@ async def shutdown(self):
7889

7990
logger.info("Worker services shutdown triggered")
8091

92+
async def _ensure_repo_not_stuck(self, repository: Repository):
    """
    Check if repo is stuck and raise the appropriate exception if so.

    Repos can get stuck in processing state for different reasons:
    - Worker crash or restart (e.g. pod eviction due to OOM, deployment after timeout, ...)
    - `last_processed_commit` is no longer valid due to force-push, dangling-commit, or so...
    - Race condition: remote is undergoing breaking changes at the same time we're processing it
    - Network issues breaking the clone/pull operation

    A repo counts as stuck when the time elapsed since `repository.locked_at`
    reaches STUCK_RECURRENT_REPO_TIMEOUT_HOURS (repo already has a
    `last_processed_commit`) or STUCK_ONBOARDING_REPO_TIMEOUT_HOURS
    (`last_processed_commit` is None, i.e. onboarding).

    Returns normally when the repo is not stuck.

    Raises:
        ReOnboardingRequiredError: the repo is stuck and its `forked_from`
            equals its own `url`.
        StuckRepoError: the repo is stuck in any other case.
    """
    # detection
    locked_at_utc = repository.locked_at.astimezone(timezone.utc)
    processing_duration_hours = (
        datetime.now(timezone.utc) - locked_at_utc
    ).total_seconds() / 3600
    # bool(): the and/or chain would otherwise yield None or a str rather than a real bool
    repo_stuck: bool = bool(
        (
            repository.last_processed_commit
            and processing_duration_hours >= STUCK_RECURRENT_REPO_TIMEOUT_HOURS
        )
        or (
            repository.last_processed_commit is None  # onboarding
            and processing_duration_hours >= STUCK_ONBOARDING_REPO_TIMEOUT_HOURS
        )
    )

    # Healthy repo: nothing to raise. Without this guard the StuckRepoError
    # below would fire for every repo that is not eligible for re-onboarding.
    if not repo_stuck:
        return

    # handling
    if repository.forked_from == repository.url:
        logger.warning(
            f"Repo {repository.url} is stuck due to force-push or dangling commit. Will be re-onboarded"
        )
        raise ReOnboardingRequiredError()

    raise StuckRepoError()
121+
81122
async def _process_repositories(self):
82123
"""
83124
Process repositories by priority - check acquire_repo_for_processing()
@@ -153,6 +194,10 @@ async def _validate_and_get_parent_repo(self, repository: Repository) -> Reposit
153194
if not repository.forked_from:
154195
return None
155196

197+
if repository.forked_from == repository.url:
198+
# EDGE CASE: not a fork, but the repo gets re-onboarded a lot, so we treat it as a "fork" to avoid producing tons of duplicate activities
199+
return repository.forked_from
200+
156201
logger.info(
157202
f"Repository {repository.url} is forked from {repository.forked_from}, validating parent repo..."
158203
)
@@ -200,9 +245,24 @@ async def _process_single_repository(self, repository: Repository):
200245
commit_hash=batch_info.latest_commit_in_repo,
201246
branch=await get_default_branch(batch_info.repo_path),
202247
)
248+
else:
249+
await self._ensure_repo_not_stuck(repository)
203250

204251
logger.info("Incremental processing completed successfully")
205252
processing_state = RepositoryState.COMPLETED
253+
except StuckRepoError:
254+
logger.error(
255+
f"Repo {repository.url} is stuck for unkown reason, marking it as stuck until manually resolved!"
256+
)
257+
processing_state = RepositoryState.STUCK
258+
except ReOnboardingRequiredError:
259+
logger.info(f"Resetting and queueing {repository.url} for re-onboarding")
260+
await update_last_processed_commit(
261+
repo_id=repository.id,
262+
commit_hash=None,
263+
branch=None,
264+
)
265+
processing_state = RepositoryState.PENDING
206266
except ParentRepoInvalidError as e:
207267
logger.error(f"Parent repo validation failed: {repr(e)}")
208268
processing_state = RepositoryState.REQUIRES_PARENT

0 commit comments

Comments
 (0)