Skip to content

Commit 9849ac9

Browse files
authored
Merge pull request #40 from git-pkgs/mirror-feature
Add mirror command and API for selective package mirroring
2 parents 7985a28 + 7346008 commit 9849ac9

45 files changed

Lines changed: 3095 additions & 269 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

README.md

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,44 @@ proxy serve [flags]
460460
proxy [flags] # same as 'proxy serve'
461461
```
462462

463+
### mirror
464+
465+
Pre-populate the cache from PURLs or SBOM files. Useful for ensuring offline availability or warming the cache before deployments.
466+
467+
```bash
468+
# Mirror specific package versions
469+
proxy mirror pkg:npm/lodash@4.17.21 pkg:cargo/serde@1.0.0
470+
471+
# Mirror all versions of a package
472+
proxy mirror pkg:npm/lodash
473+
474+
# Mirror from a CycloneDX or SPDX SBOM
475+
proxy mirror --sbom sbom.cdx.json
476+
477+
# Preview what would be mirrored
478+
proxy mirror --dry-run pkg:npm/lodash
479+
480+
# Control parallelism
481+
proxy mirror --concurrency 8 pkg:npm/lodash@4.17.21
482+
```
483+
484+
The mirror command accepts the same storage and database flags as `serve`. Already-cached artifacts are skipped.
485+
486+
A mirror API is also available when the server is running:
487+
488+
```bash
489+
# Start a mirror job
490+
curl -X POST http://localhost:8080/api/mirror \
491+
-H "Content-Type: application/json" \
492+
-d '{"purls": ["pkg:npm/lodash@4.17.21"]}'
493+
494+
# Check job status
495+
curl http://localhost:8080/api/mirror/mirror-1
496+
497+
# Cancel a running job
498+
curl -X DELETE http://localhost:8080/api/mirror/mirror-1
499+
```
500+
463501
### stats
464502

465503
Show cache statistics without running the server.
@@ -534,6 +572,14 @@ Recently cached:
534572
| `GET /debian/*` | Debian/APT repository protocol |
535573
| `GET /rpm/*` | RPM/Yum repository protocol |
536574

575+
### Mirror API
576+
577+
| Endpoint | Description |
578+
|----------|-------------|
579+
| `POST /api/mirror` | Start a mirror job (JSON body with `purls`) |
580+
| `GET /api/mirror/{id}` | Get job status and progress |
581+
| `DELETE /api/mirror/{id}` | Cancel a running job |
582+
537583
### Enrichment API
538584

539585
The proxy provides REST endpoints for package metadata enrichment, vulnerability scanning, and outdated detection.

cmd/proxy/main.go

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
//
1717
// serve Start the proxy server (default if no command given)
1818
// stats Show cache statistics
19+
// mirror Pre-populate cache from PURLs, SBOMs, or registries
1920
//
2021
// Serve Flags:
2122
//
@@ -100,7 +101,11 @@ import (
100101

101102
"github.com/git-pkgs/proxy/internal/config"
102103
"github.com/git-pkgs/proxy/internal/database"
104+
"github.com/git-pkgs/proxy/internal/handler"
105+
"github.com/git-pkgs/proxy/internal/mirror"
103106
"github.com/git-pkgs/proxy/internal/server"
107+
"github.com/git-pkgs/proxy/internal/storage"
108+
"github.com/git-pkgs/registries/fetch"
104109
)
105110

106111
const defaultTopN = 10
@@ -124,6 +129,10 @@ func main() {
124129
os.Args = append(os.Args[:1], os.Args[2:]...)
125130
runStats()
126131
return
132+
case "mirror":
133+
os.Args = append(os.Args[:1], os.Args[2:]...)
134+
runMirror()
135+
return
127136
case "-version", "--version":
128137
fmt.Printf("proxy %s (%s)\n", Version, Commit)
129138
os.Exit(0)
@@ -145,6 +154,7 @@ Usage: proxy [command] [flags]
145154
Commands:
146155
serve Start the proxy server (default)
147156
stats Show cache statistics
157+
mirror Pre-populate cache from PURLs, SBOMs, or registries
148158
149159
Run 'proxy <command> -help' for more information on a command.
150160
@@ -340,6 +350,151 @@ func runStats() {
340350
}
341351
}
342352

353+
func runMirror() {
354+
fs := flag.NewFlagSet("mirror", flag.ExitOnError)
355+
configPath := fs.String("config", "", "Path to configuration file")
356+
storageURL := fs.String("storage-url", "", "Storage URL (file:// or s3://)")
357+
databaseDriver := fs.String("database-driver", "", "Database driver: sqlite or postgres")
358+
databasePath := fs.String("database-path", "", "Path to SQLite database file")
359+
databaseURL := fs.String("database-url", "", "PostgreSQL connection URL")
360+
sbomPath := fs.String("sbom", "", "Path to CycloneDX or SPDX SBOM file")
361+
concurrency := fs.Int("concurrency", 4, "Number of parallel downloads") //nolint:mnd // default concurrency
362+
dryRun := fs.Bool("dry-run", false, "Show what would be mirrored without downloading")
363+
364+
fs.Usage = func() {
365+
fmt.Fprintf(os.Stderr, "git-pkgs proxy - Pre-populate cache\n\n")
366+
fmt.Fprintf(os.Stderr, "Usage: proxy mirror [flags] [purl...]\n\n")
367+
fmt.Fprintf(os.Stderr, "Examples:\n")
368+
fmt.Fprintf(os.Stderr, " proxy mirror pkg:npm/lodash@4.17.21\n")
369+
fmt.Fprintf(os.Stderr, " proxy mirror --sbom sbom.cdx.json\n")
370+
fmt.Fprintf(os.Stderr, " proxy mirror pkg:npm/lodash # all versions\n\n")
371+
fmt.Fprintf(os.Stderr, "Flags:\n")
372+
fs.PrintDefaults()
373+
}
374+
375+
_ = fs.Parse(os.Args[1:])
376+
purls := fs.Args()
377+
378+
// Determine source
379+
var source mirror.Source
380+
switch {
381+
case *sbomPath != "":
382+
source = &mirror.SBOMSource{Path: *sbomPath}
383+
case len(purls) > 0:
384+
source = &mirror.PURLSource{PURLs: purls}
385+
default:
386+
fmt.Fprintf(os.Stderr, "error: provide PURLs or --sbom\n")
387+
fs.Usage()
388+
os.Exit(1)
389+
}
390+
391+
// Load config
392+
cfg, err := loadConfig(*configPath)
393+
if err != nil {
394+
fmt.Fprintf(os.Stderr, "error loading config: %v\n", err)
395+
os.Exit(1)
396+
}
397+
cfg.LoadFromEnv()
398+
399+
if *storageURL != "" {
400+
cfg.Storage.URL = *storageURL
401+
}
402+
if *databaseDriver != "" {
403+
cfg.Database.Driver = *databaseDriver
404+
}
405+
if *databasePath != "" {
406+
cfg.Database.Path = *databasePath
407+
}
408+
if *databaseURL != "" {
409+
cfg.Database.URL = *databaseURL
410+
}
411+
412+
if err := cfg.Validate(); err != nil {
413+
fmt.Fprintf(os.Stderr, "invalid configuration: %v\n", err)
414+
os.Exit(1)
415+
}
416+
417+
logger := setupLogger("info", "text")
418+
419+
// Open database
420+
var db *database.DB
421+
switch cfg.Database.Driver {
422+
case "postgres":
423+
db, err = database.OpenPostgresOrCreate(cfg.Database.URL)
424+
default:
425+
db, err = database.OpenOrCreate(cfg.Database.Path)
426+
}
427+
if err != nil {
428+
fmt.Fprintf(os.Stderr, "error opening database: %v\n", err)
429+
os.Exit(1)
430+
}
431+
defer func() { _ = db.Close() }()
432+
433+
if err := db.MigrateSchema(); err != nil {
434+
_ = db.Close()
435+
fmt.Fprintf(os.Stderr, "error migrating schema: %v\n", err)
436+
os.Exit(1) //nolint:gocritic // db closed above
437+
}
438+
439+
// Open storage
440+
sURL := cfg.Storage.URL
441+
if sURL == "" {
442+
sURL = "file://" + cfg.Storage.Path //nolint:staticcheck // backwards compat
443+
}
444+
store, err := storage.OpenBucket(context.Background(), sURL)
445+
if err != nil {
446+
_ = db.Close()
447+
fmt.Fprintf(os.Stderr, "error opening storage: %v\n", err)
448+
os.Exit(1) //nolint:gocritic // db closed above
449+
}
450+
451+
// Build proxy (reuses same pipeline as serve)
452+
fetcher := fetch.NewFetcher()
453+
resolver := fetch.NewResolver()
454+
proxy := handler.NewProxy(db, store, fetcher, resolver, logger)
455+
proxy.CacheMetadata = true // mirror always caches metadata
456+
proxy.MetadataTTL = cfg.ParseMetadataTTL()
457+
458+
m := mirror.New(proxy, db, store, logger, *concurrency)
459+
460+
ctx, cancel := context.WithCancel(context.Background())
461+
go func() {
462+
sigCh := make(chan os.Signal, 1)
463+
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
464+
<-sigCh
465+
cancel()
466+
}()
467+
468+
if *dryRun {
469+
items, err := m.RunDryRun(ctx, source)
470+
if err != nil {
471+
fmt.Fprintf(os.Stderr, "error: %v\n", err)
472+
os.Exit(1)
473+
}
474+
fmt.Printf("Would mirror %d package versions:\n", len(items))
475+
for _, item := range items {
476+
fmt.Printf(" %s\n", item)
477+
}
478+
return
479+
}
480+
481+
progress, err := m.Run(ctx, source)
482+
if err != nil {
483+
fmt.Fprintf(os.Stderr, "error: %v\n", err)
484+
os.Exit(1)
485+
}
486+
487+
fmt.Printf("Mirror complete: %d downloaded, %d skipped (cached), %d failed, %s total\n",
488+
progress.Completed, progress.Skipped, progress.Failed, formatSize(progress.Bytes))
489+
490+
if len(progress.Errors) > 0 {
491+
fmt.Fprintf(os.Stderr, "\nErrors:\n")
492+
for _, e := range progress.Errors {
493+
fmt.Fprintf(os.Stderr, " %s/%s@%s: %s\n", e.Ecosystem, e.Name, e.Version, e.Error)
494+
}
495+
}
496+
}
497+
343498
func printStats(db *database.DB, popular, recent int, asJSON bool) error {
344499
defer func() { _ = db.Close() }()
345500

docs/architecture.md

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,20 @@ vulnerabilities (
161161
updated_at DATETIME
162162
)
163163
-- indexes: (vuln_id, ecosystem, package_name) unique, (ecosystem, package_name)
164+
165+
metadata_cache (
166+
id INTEGER PRIMARY KEY,
167+
ecosystem TEXT NOT NULL,
168+
name TEXT NOT NULL,
169+
storage_path TEXT NOT NULL,
170+
etag TEXT,
171+
content_type TEXT,
172+
size INTEGER, -- BIGINT on Postgres
173+
fetched_at DATETIME,
174+
created_at DATETIME,
175+
updated_at DATETIME
176+
)
177+
-- indexes: (ecosystem, name) unique
164178
```
165179

166180
On PostgreSQL, `INTEGER PRIMARY KEY` becomes `SERIAL`, `DATETIME` becomes `TIMESTAMP`, `INTEGER DEFAULT 0` booleans become `BOOLEAN DEFAULT FALSE`, and size/count columns use `BIGINT`.
@@ -277,6 +291,12 @@ Version age filtering for supply chain attack mitigation. Configurable at global
277291

278292
Package metadata enrichment. Fetches license, description, homepage, repository URL, and vulnerability data from upstream registries. Powers the `/api/` endpoints and the web UI's package detail pages.
279293

294+
### `internal/mirror`
295+
296+
Selective package mirroring for pre-populating the proxy cache. Supports multiple input sources: individual PURLs (versioned or unversioned), CycloneDX/SPDX SBOM files, and full registry enumeration. Uses a bounded worker pool backed by `errgroup` to download artifacts in parallel, reusing `handler.Proxy.GetOrFetchArtifact()` for the actual fetch-and-cache work.
297+
298+
The package also provides a `MetadataCache` for storing raw upstream metadata blobs so the proxy can serve metadata responses offline. The `JobStore` manages async mirror jobs exposed via the `/api/mirror` endpoints.
299+
280300
### `internal/config`
281301

282302
Configuration loading.
@@ -326,10 +346,11 @@ Eviction can be implemented as:
326346
- Ensures clients fetch artifacts through proxy
327347
- Alternative: Let clients fetch directly, miss cache opportunity
328348

329-
**Why not cache metadata?**
349+
**Why not cache metadata (by default)?**
330350
- Simplicity - no invalidation logic needed
331351
- Fresh data - new versions visible immediately
332352
- Metadata is small, upstream fetch is fast
353+
- Set `cache_metadata: true` or use the mirror command to enable metadata caching for offline use via the `metadata_cache` table
333354

334355
**Why stream artifacts?**
335356
- Memory efficient - don't load large files into RAM

docs/configuration.md

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,65 @@ Currently supported for npm, PyPI, pub.dev, Composer, Cargo, NuGet, Conda, RubyG
213213

214214
Note: Hex cooldown requires disabling registry signature verification since the proxy re-encodes the protobuf payload without the original signature. Set `HEX_NO_VERIFY_REPO_ORIGIN=1` or configure your repo with `no_verify: true`.
215215

216+
## Metadata Caching
217+
218+
By default, the proxy fetches metadata fresh from upstream on every request. Enable `cache_metadata` to store metadata responses in the database and storage backend for offline fallback. When upstream is unreachable, the proxy serves the last cached copy. ETag-based revalidation avoids re-downloading unchanged metadata.
219+
220+
```yaml
221+
cache_metadata: true
222+
```
223+
224+
Or via environment variable: `PROXY_CACHE_METADATA=true`.
225+
226+
The `proxy mirror` command always enables metadata caching regardless of this setting.
227+
228+
### Metadata TTL
229+
230+
When metadata caching is enabled, `metadata_ttl` controls how long a cached response is considered fresh before revalidating with upstream. During the TTL window, cached metadata is served directly without contacting upstream, reducing latency and upstream load.
231+
232+
```yaml
233+
metadata_ttl: "5m" # default
234+
```
235+
236+
Or via environment variable: `PROXY_METADATA_TTL=10m`.
237+
238+
Set to `"0"` to always revalidate with upstream (ETag-based conditional requests still avoid re-downloading unchanged content).
239+
240+
When upstream is unreachable and the cached entry is past its TTL, the proxy serves the stale cached copy with a `Warning: 110 - "Response is Stale"` header so clients can tell the data may be outdated.
241+
242+
## Mirror API
243+
244+
The `/api/mirror` endpoints are disabled by default. Enable them to allow starting mirror jobs via HTTP:
245+
246+
```yaml
247+
mirror_api: true
248+
```
249+
250+
Or via environment variable: `PROXY_MIRROR_API=true`.
251+
252+
When disabled, the endpoints are not registered, so requests to them return 404.
253+
254+
## Mirror Command
255+
256+
The `proxy mirror` command pre-populates the cache from various sources. It accepts the same storage and database flags as `serve`.
257+
258+
| Flag | Default | Description |
259+
|------|---------|-------------|
260+
| `--sbom` | | Path to CycloneDX or SPDX SBOM file |
261+
| `--concurrency` | `4` | Number of parallel downloads |
262+
| `--dry-run` | `false` | Show what would be mirrored without downloading |
263+
| `--config` | | Path to configuration file |
264+
| `--storage-url` | | Storage URL |
265+
| `--database-driver` | | Database driver |
266+
| `--database-path` | | SQLite database file |
267+
| `--database-url` | | PostgreSQL connection URL |
268+
269+
Positional arguments are treated as PURLs:
270+
271+
```bash
272+
proxy mirror pkg:npm/lodash@4.17.21 pkg:cargo/serde@1.0.0
273+
```
274+
216275
## Docker
217276

218277
### SQLite with Local Storage

0 commit comments

Comments
 (0)