Skip to content

Commit 4d3566f

Browse files
shantanu patil and claude
authored and committed
Add GCS storage backend with storage factory pattern (Phase 3)
Cloud Run is ephemeral — /tmp data is lost on cold starts. This adds a Google Cloud Storage backend alongside the existing local filesystem storage, controlled by the USE_LOCAL_STORAGE environment variable.

New: GCSStorage in storage/gcs.py; get_storage() factory in storage/factory.py; 21 tests in test_storage.py (LocalStorage, GCSStorage with mocked GCS client, factory).

Modified: DigestStorage ABC gains get_metadata() + get_digest_bytes() abstract methods. LocalStorage gets get_digest_bytes(). query_processor uses the factory (_store_digest replaces _store_digest_locally). Download endpoint refactored to use the storage abstraction instead of direct filesystem reads. Deploy config switches to USE_LOCAL_STORAGE=false with GCS_BUCKET_NAME=gitunderstand-digests.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 4523f41 commit 4d3566f

11 files changed

Lines changed: 527 additions & 35 deletions

File tree

.github/workflows/deploy.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ jobs:
9696
--min-instances 0 \
9797
--max-instances 10 \
9898
--timeout 300 \
99-
--set-env-vars "^@^GCP_PROJECT_ID=${{ env.PROJECT_ID }}@USE_LOCAL_STORAGE=true@ALLOWED_HOSTS=gitunderstand.com,gitunderstand-308289525742.us-central1.run.app,localhost,127.0.0.1" \
99+
--set-env-vars "^@^GCP_PROJECT_ID=${{ env.PROJECT_ID }}@USE_LOCAL_STORAGE=false@GCS_BUCKET_NAME=gitunderstand-digests@ALLOWED_HOSTS=gitunderstand.com,gitunderstand-308289525742.us-central1.run.app,localhost,127.0.0.1" \
100100
--project ${{ env.PROJECT_ID }}
101101
102102
- name: Show Cloud Run URL

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ dependencies = [
88
"click>=8.0.0",
99
"fastapi[standard]>=0.109.1",
1010
"gitpython>=3.1.0",
11+
"google-cloud-storage>=2.10.0",
1112
"httpx",
1213
"jinja2",
1314
"pathspec>=0.12.1",

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
click>=8.0.0
22
fastapi[standard]>=0.109.1
33
gitpython>=3.1.0
4+
google-cloud-storage>=2.10.0
45
httpx
56
jinja2
67
pathspec>=0.12.1

src/api/query_processor.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from core.parser import parse_remote_repo
1717
from core.utils.git_utils import validate_github_token
1818
from core.utils.pattern_utils import process_patterns
19-
from storage.local import LocalStorage
19+
from storage.factory import get_storage
2020

2121
logger = logging.getLogger(__name__)
2222

@@ -40,15 +40,18 @@ def _cleanup_repository(clone_config: CloneConfig) -> None:
4040
logger.exception("Could not delete repository at %s", clone_config.local_path)
4141

4242

43-
def _store_digest_locally(
43+
def _store_digest(
4444
query: IngestionQuery,
4545
clone_config: CloneConfig,
4646
digest_content: str,
4747
summary: str,
4848
tree: str,
4949
content: str,
5050
) -> str:
51-
"""Store digest content to local storage.
51+
"""Store digest content using the configured storage backend.
52+
53+
Uses the storage factory to select either local filesystem or GCS
54+
based on application settings.
5255
5356
Parameters
5457
----------
@@ -71,7 +74,7 @@ def _store_digest_locally(
7174
The download URL for the stored digest.
7275
7376
"""
74-
storage = LocalStorage(base_path=settings.local_storage_path)
77+
storage = get_storage()
7578
digest_id = str(query.id)
7679

7780
storage.store_digest(
@@ -149,7 +152,7 @@ async def process_query(
149152
try:
150153
summary, tree, content, token_counts = ingest_query(query)
151154
digest_content = tree + "\n" + content
152-
digest_url = _store_digest_locally(query, clone_config, digest_content, summary, tree, content)
155+
digest_url = _store_digest(query, clone_config, digest_content, summary, tree, content)
153156
except Exception as exc:
154157
logger.error(
155158
"Query processing failed for %s: %s",
@@ -273,7 +276,7 @@ async def process_query_streaming(
273276
reporter.report(ProgressStage.STORING, {"message": "Saving digest..."})
274277

275278
digest_content = tree + "\n" + content
276-
digest_url = _store_digest_locally(query, clone_config, digest_content, summary, tree, content)
279+
digest_url = _store_digest(query, clone_config, digest_content, summary, tree, content)
277280
except Exception as exc:
278281
logger.error("Query processing failed for %s: %s", query.url, exc)
279282
if reporter:

src/api/routers/ingest.py

Lines changed: 20 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,10 @@
55
import asyncio
66
import contextlib
77
import logging
8-
from pathlib import Path
98
from typing import TYPE_CHECKING, Any
109

11-
from fastapi import APIRouter, HTTPException, Request, status
12-
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, StreamingResponse
10+
from fastapi import APIRouter, HTTPException, Request, Response, status
11+
from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse
1312

1413
from api.config import get_settings
1514
from api.middleware import limiter
@@ -19,6 +18,7 @@
1918
from api.shared import templates
2019
from core.output_formats import OutputFormat
2120
from core.progress import ProgressStage
21+
from storage.factory import get_storage
2222

2323
if TYPE_CHECKING:
2424
from collections.abc import AsyncGenerator
@@ -299,47 +299,40 @@ async def api_ingest_get(
299299
@router.get("/api/download/file/{ingest_id}", response_model=None)
300300
async def download_ingest(
301301
ingest_id: UUID,
302-
) -> FileResponse | JSONResponse: # noqa: FA100
302+
) -> Response | JSONResponse: # noqa: FA100
303303
"""Download the text file produced for an ingest ID.
304304
305+
Uses the configured storage backend (local filesystem or GCS) to
306+
retrieve the digest content.
307+
305308
Parameters
306309
----------
307310
ingest_id : UUID
308311
Identifier that the ingest step emitted.
309312
310313
Returns
311314
-------
312-
FileResponse
315+
Response
313316
Streamed response with media type ``text/plain``.
314317
315318
Raises
316319
------
317320
HTTPException
318-
404 if digest directory is missing or contains no ``.txt`` file.
319-
403 if there is a permission error reading the file.
321+
404 if the digest does not exist in storage.
320322
321323
"""
322-
tmp_base = Path(settings.local_storage_path)
323-
directory = (tmp_base / str(ingest_id)).resolve()
324-
325-
if not str(directory).startswith(str(tmp_base.resolve())):
326-
raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=f"Invalid ingest ID: {ingest_id!r}")
324+
storage = get_storage()
325+
digest_id = str(ingest_id)
327326

328-
if not directory.is_dir():
327+
if not storage.digest_exists(digest_id):
329328
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Digest {ingest_id!r} not found")
330329

331-
try:
332-
first_txt_file = next(directory.glob("*.txt"))
333-
except StopIteration as exc:
334-
raise HTTPException(
335-
status_code=status.HTTP_404_NOT_FOUND,
336-
detail=f"No .txt file found for digest {ingest_id!r}",
337-
) from exc
330+
content_bytes = storage.get_digest_bytes(digest_id)
331+
if content_bytes is None:
332+
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"No content for digest {ingest_id!r}")
338333

339-
try:
340-
return FileResponse(path=first_txt_file, media_type="text/plain", filename=first_txt_file.name)
341-
except PermissionError as exc:
342-
raise HTTPException(
343-
status_code=status.HTTP_403_FORBIDDEN,
344-
detail=f"Permission denied for {first_txt_file}",
345-
) from exc
334+
return Response(
335+
content=content_bytes,
336+
media_type="text/plain",
337+
headers={"Content-Disposition": f'attachment; filename="digest-{ingest_id}.txt"'},
338+
)

src/storage/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Storage module for GitUnderstand digest persistence."""
22

33
from storage.base import DigestStorage
4+
from storage.gcs import GCSStorage
45
from storage.local import LocalStorage
56

6-
__all__ = ["DigestStorage", "LocalStorage"]
7+
__all__ = ["DigestStorage", "GCSStorage", "LocalStorage"]

src/storage/base.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,40 @@ def get_digest(self, digest_id: str) -> str | None:
4949
5050
"""
5151

52+
@abstractmethod
53+
def get_metadata(self, digest_id: str) -> dict[str, Any] | None:
54+
"""Retrieve metadata for a digest.
55+
56+
Parameters
57+
----------
58+
digest_id : str
59+
Unique identifier for the digest.
60+
61+
Returns
62+
-------
63+
dict[str, Any] | None
64+
The metadata dictionary, or ``None`` if not found.
65+
66+
"""
67+
68+
@abstractmethod
69+
def get_digest_bytes(self, digest_id: str) -> bytes | None:
70+
"""Retrieve the raw bytes of a digest.
71+
72+
Useful for streaming downloads where text decoding is not needed.
73+
74+
Parameters
75+
----------
76+
digest_id : str
77+
Unique identifier for the digest.
78+
79+
Returns
80+
-------
81+
bytes | None
82+
The raw bytes of the digest content, or ``None`` if not found.
83+
84+
"""
85+
5286
@abstractmethod
5387
def digest_exists(self, digest_id: str) -> bool:
5488
"""Check if a digest exists in storage.

src/storage/factory.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
"""Storage backend factory for GitUnderstand.
2+
3+
Returns the configured storage backend based on application settings.
4+
"""
5+
6+
from __future__ import annotations
7+
8+
from typing import TYPE_CHECKING
9+
10+
if TYPE_CHECKING:
11+
from storage.base import DigestStorage
12+
13+
14+
def get_storage() -> DigestStorage:
    """Build the storage backend selected by the application settings.

    The ``use_local_storage`` setting picks the implementation:

    - truthy → :class:`~storage.local.LocalStorage`, rooted at
      ``settings.local_storage_path``
    - falsy → :class:`~storage.gcs.GCSStorage`, built from
      ``settings.gcs_bucket_name`` and ``settings.gcp_project_id``

    Returns
    -------
    DigestStorage
        A backend instance constructed on each call.

    """
    # Imports are deferred to call time so that only the module of the
    # selected backend is imported by this function.
    from api.config import get_settings  # noqa: PLC0415

    config = get_settings()

    if not config.use_local_storage:
        from storage.gcs import GCSStorage  # noqa: PLC0415

        return GCSStorage(
            bucket_name=config.gcs_bucket_name,
            project_id=config.gcp_project_id,
        )

    from storage.local import LocalStorage  # noqa: PLC0415

    return LocalStorage(base_path=config.local_storage_path)

0 commit comments

Comments
 (0)