Skip to content

Commit d62c42b

Browse files
committed
Add mirror command and API for selective package mirroring
Add a `proxy mirror` CLI command and `/api/mirror` API endpoints that pre-populate the cache from various input sources: individual PURLs, SBOM files (CycloneDX and SPDX), or full registry enumeration. The mirror reuses the existing handler.Proxy.GetOrFetchArtifact() pipeline so cached artifacts are identical to those fetched on demand. A bounded worker pool controls download parallelism. Metadata caching is opt-in via `cache_metadata: true` in config (or PROXY_CACHE_METADATA=true). The mirror command always enables it. When enabled, upstream metadata responses are stored for offline fallback with ETag-based conditional revalidation. New internal/mirror package with Source interface, PURLSource, SBOMSource, RegistrySource, and async JobStore. New metadata_cache database table for offline metadata serving.
1 parent 7985a28 commit d62c42b

43 files changed

Lines changed: 2457 additions & 272 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

README.md

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,47 @@ proxy serve [flags]
460460
proxy [flags] # same as 'proxy serve'
461461
```
462462

463+
### mirror
464+
465+
Pre-populate the cache from PURLs, SBOM files, or entire registries. Useful for ensuring offline availability or warming the cache before deployments.
466+
467+
```bash
468+
# Mirror specific package versions
469+
proxy mirror pkg:npm/lodash@4.17.21 pkg:cargo/serde@1.0.0
470+
471+
# Mirror all versions of a package
472+
proxy mirror pkg:npm/lodash
473+
474+
# Mirror from a CycloneDX or SPDX SBOM
475+
proxy mirror --sbom sbom.cdx.json
476+
477+
# Full registry mirror (npm, pypi, cargo supported)
478+
proxy mirror --registry npm
479+
480+
# Preview what would be mirrored
481+
proxy mirror --dry-run pkg:npm/lodash
482+
483+
# Control parallelism
484+
proxy mirror --concurrency 8 pkg:npm/lodash@4.17.21
485+
```
486+
487+
The mirror command accepts the same storage and database flags as `serve`. Already-cached artifacts are skipped.
488+
489+
A mirror API is also available when the server is running:
490+
491+
```bash
492+
# Start a mirror job
493+
curl -X POST http://localhost:8080/api/mirror \
494+
-H "Content-Type: application/json" \
495+
-d '{"purls": ["pkg:npm/lodash@4.17.21"]}'
496+
497+
# Check job status
498+
curl http://localhost:8080/api/mirror/mirror-1
499+
500+
# Cancel a running job
501+
curl -X DELETE http://localhost:8080/api/mirror/mirror-1
502+
```
503+
463504
### stats
464505

465506
Show cache statistics without running the server.
@@ -534,6 +575,14 @@ Recently cached:
534575
| `GET /debian/*` | Debian/APT repository protocol |
535576
| `GET /rpm/*` | RPM/Yum repository protocol |
536577

578+
### Mirror API
579+
580+
| Endpoint | Description |
581+
|----------|-------------|
582+
| `POST /api/mirror` | Start a mirror job (JSON body with `purls` or `registry`) |
583+
| `GET /api/mirror/{id}` | Get job status and progress |
584+
| `DELETE /api/mirror/{id}` | Cancel a running job |
585+
537586
### Enrichment API
538587

539588
The proxy provides REST endpoints for package metadata enrichment, vulnerability scanning, and outdated detection.

cmd/proxy/main.go

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
//
1717
// serve Start the proxy server (default if no command given)
1818
// stats Show cache statistics
19+
// mirror Pre-populate cache from PURLs, SBOMs, or registries
1920
//
2021
// Serve Flags:
2122
//
@@ -100,7 +101,11 @@ import (
100101

101102
"github.com/git-pkgs/proxy/internal/config"
102103
"github.com/git-pkgs/proxy/internal/database"
104+
"github.com/git-pkgs/proxy/internal/handler"
105+
"github.com/git-pkgs/proxy/internal/mirror"
103106
"github.com/git-pkgs/proxy/internal/server"
107+
"github.com/git-pkgs/proxy/internal/storage"
108+
"github.com/git-pkgs/registries/fetch"
104109
)
105110

106111
const defaultTopN = 10
@@ -124,6 +129,10 @@ func main() {
124129
os.Args = append(os.Args[:1], os.Args[2:]...)
125130
runStats()
126131
return
132+
case "mirror":
133+
os.Args = append(os.Args[:1], os.Args[2:]...)
134+
runMirror()
135+
return
127136
case "-version", "--version":
128137
fmt.Printf("proxy %s (%s)\n", Version, Commit)
129138
os.Exit(0)
@@ -145,6 +154,7 @@ Usage: proxy [command] [flags]
145154
Commands:
146155
serve Start the proxy server (default)
147156
stats Show cache statistics
157+
mirror Pre-populate cache from PURLs, SBOMs, or registries
148158
149159
Run 'proxy <command> -help' for more information on a command.
150160
@@ -340,6 +350,158 @@ func runStats() {
340350
}
341351
}
342352

353+
func runMirror() {
354+
fs := flag.NewFlagSet("mirror", flag.ExitOnError)
355+
configPath := fs.String("config", "", "Path to configuration file")
356+
storageURL := fs.String("storage-url", "", "Storage URL (file:// or s3://)")
357+
databaseDriver := fs.String("database-driver", "", "Database driver: sqlite or postgres")
358+
databasePath := fs.String("database-path", "", "Path to SQLite database file")
359+
databaseURL := fs.String("database-url", "", "PostgreSQL connection URL")
360+
sbomPath := fs.String("sbom", "", "Path to CycloneDX or SPDX SBOM file")
361+
registry := fs.String("registry", "", "Ecosystem name for full registry mirror")
362+
concurrency := fs.Int("concurrency", 4, "Number of parallel downloads") //nolint:mnd // default concurrency
363+
dryRun := fs.Bool("dry-run", false, "Show what would be mirrored without downloading")
364+
365+
fs.Usage = func() {
366+
fmt.Fprintf(os.Stderr, "git-pkgs proxy - Pre-populate cache\n\n")
367+
fmt.Fprintf(os.Stderr, "Usage: proxy mirror [flags] [purl...]\n\n")
368+
fmt.Fprintf(os.Stderr, "Examples:\n")
369+
fmt.Fprintf(os.Stderr, " proxy mirror pkg:npm/lodash@4.17.21\n")
370+
fmt.Fprintf(os.Stderr, " proxy mirror --sbom sbom.cdx.json\n")
371+
fmt.Fprintf(os.Stderr, " proxy mirror pkg:npm/lodash # all versions\n")
372+
fmt.Fprintf(os.Stderr, " proxy mirror --registry npm\n\n")
373+
fmt.Fprintf(os.Stderr, "Flags:\n")
374+
fs.PrintDefaults()
375+
}
376+
377+
_ = fs.Parse(os.Args[1:])
378+
purls := fs.Args()
379+
380+
// Determine source
381+
var source mirror.Source
382+
switch {
383+
case *sbomPath != "":
384+
source = &mirror.SBOMSource{Path: *sbomPath}
385+
case *registry != "":
386+
source = &mirror.RegistrySource{Ecosystem: *registry}
387+
case len(purls) > 0:
388+
source = &mirror.PURLSource{PURLs: purls}
389+
default:
390+
fmt.Fprintf(os.Stderr, "error: provide PURLs, --sbom, or --registry\n")
391+
fs.Usage()
392+
os.Exit(1)
393+
}
394+
395+
// Load config
396+
cfg, err := loadConfig(*configPath)
397+
if err != nil {
398+
fmt.Fprintf(os.Stderr, "error loading config: %v\n", err)
399+
os.Exit(1)
400+
}
401+
cfg.LoadFromEnv()
402+
403+
if *storageURL != "" {
404+
cfg.Storage.URL = *storageURL
405+
}
406+
if *databaseDriver != "" {
407+
cfg.Database.Driver = *databaseDriver
408+
}
409+
if *databasePath != "" {
410+
cfg.Database.Path = *databasePath
411+
}
412+
if *databaseURL != "" {
413+
cfg.Database.URL = *databaseURL
414+
}
415+
416+
if err := cfg.Validate(); err != nil {
417+
fmt.Fprintf(os.Stderr, "invalid configuration: %v\n", err)
418+
os.Exit(1)
419+
}
420+
421+
logger := setupLogger("info", "text")
422+
423+
// Open database
424+
var db *database.DB
425+
switch cfg.Database.Driver {
426+
case "postgres":
427+
db, err = database.OpenPostgresOrCreate(cfg.Database.URL)
428+
default:
429+
db, err = database.OpenOrCreate(cfg.Database.Path)
430+
}
431+
if err != nil {
432+
fmt.Fprintf(os.Stderr, "error opening database: %v\n", err)
433+
os.Exit(1)
434+
}
435+
436+
if err := db.MigrateSchema(); err != nil {
437+
_ = db.Close()
438+
fmt.Fprintf(os.Stderr, "error migrating schema: %v\n", err)
439+
os.Exit(1)
440+
}
441+
442+
// Open storage
443+
sURL := cfg.Storage.URL
444+
if sURL == "" {
445+
sURL = "file://" + cfg.Storage.Path //nolint:staticcheck // backwards compat
446+
}
447+
store, err := storage.OpenBucket(context.Background(), sURL)
448+
if err != nil {
449+
_ = db.Close()
450+
fmt.Fprintf(os.Stderr, "error opening storage: %v\n", err)
451+
os.Exit(1)
452+
}
453+
454+
// Build proxy (reuses same pipeline as serve)
455+
fetcher := fetch.NewFetcher()
456+
resolver := fetch.NewResolver()
457+
proxy := handler.NewProxy(db, store, fetcher, resolver, logger)
458+
proxy.CacheMetadata = true // mirror always caches metadata
459+
460+
m := mirror.New(proxy, db, store, logger, *concurrency)
461+
462+
ctx, cancel := context.WithCancel(context.Background())
463+
go func() {
464+
sigCh := make(chan os.Signal, 1)
465+
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
466+
<-sigCh
467+
cancel()
468+
}()
469+
470+
if *dryRun {
471+
items, err := m.RunDryRun(ctx, source)
472+
if err != nil {
473+
_ = db.Close()
474+
fmt.Fprintf(os.Stderr, "error: %v\n", err)
475+
os.Exit(1)
476+
}
477+
fmt.Printf("Would mirror %d package versions:\n", len(items))
478+
for _, item := range items {
479+
fmt.Printf(" %s\n", item)
480+
}
481+
_ = db.Close()
482+
return
483+
}
484+
485+
progress, err := m.Run(ctx, source)
486+
if err != nil {
487+
_ = db.Close()
488+
fmt.Fprintf(os.Stderr, "error: %v\n", err)
489+
os.Exit(1)
490+
}
491+
492+
_ = db.Close()
493+
494+
fmt.Printf("Mirror complete: %d downloaded, %d skipped (cached), %d failed, %s total\n",
495+
progress.Completed, progress.Skipped, progress.Failed, formatSize(progress.Bytes))
496+
497+
if len(progress.Errors) > 0 {
498+
fmt.Fprintf(os.Stderr, "\nErrors:\n")
499+
for _, e := range progress.Errors {
500+
fmt.Fprintf(os.Stderr, " %s/%s@%s: %s\n", e.Ecosystem, e.Name, e.Version, e.Error)
501+
}
502+
}
503+
}
504+
343505
func printStats(db *database.DB, popular, recent int, asJSON bool) error {
344506
defer func() { _ = db.Close() }()
345507

docs/architecture.md

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,20 @@ vulnerabilities (
161161
updated_at DATETIME
162162
)
163163
-- indexes: (vuln_id, ecosystem, package_name) unique, (ecosystem, package_name)
164+
165+
metadata_cache (
166+
id INTEGER PRIMARY KEY,
167+
ecosystem TEXT NOT NULL,
168+
name TEXT NOT NULL,
169+
storage_path TEXT NOT NULL,
170+
etag TEXT,
171+
content_type TEXT,
172+
size INTEGER, -- BIGINT on Postgres
173+
fetched_at DATETIME,
174+
created_at DATETIME,
175+
updated_at DATETIME
176+
)
177+
-- indexes: (ecosystem, name) unique
164178
```
165179

166180
On PostgreSQL, `INTEGER PRIMARY KEY` becomes `SERIAL`, `DATETIME` becomes `TIMESTAMP`, `INTEGER DEFAULT 0` booleans become `BOOLEAN DEFAULT FALSE`, and size/count columns use `BIGINT`.
@@ -277,6 +291,12 @@ Version age filtering for supply chain attack mitigation. Configurable at global
277291

278292
Package metadata enrichment. Fetches license, description, homepage, repository URL, and vulnerability data from upstream registries. Powers the `/api/` endpoints and the web UI's package detail pages.
279293

294+
### `internal/mirror`
295+
296+
Selective package mirroring for pre-populating the proxy cache. Supports multiple input sources: individual PURLs (versioned or unversioned), CycloneDX/SPDX SBOM files, and full registry enumeration. Uses a bounded worker pool backed by `errgroup` to download artifacts in parallel, reusing `handler.Proxy.GetOrFetchArtifact()` for the actual fetch-and-cache work.
297+
298+
The package also provides a `MetadataCache` for storing raw upstream metadata blobs so the proxy can serve metadata responses offline. The `JobStore` manages async mirror jobs exposed via the `/api/mirror` endpoints.
299+
280300
### `internal/config`
281301

282302
Configuration loading.
@@ -326,10 +346,11 @@ Eviction can be implemented as:
326346
- Ensures clients fetch artifacts through proxy
327347
- Alternative: Let clients fetch directly, miss cache opportunity
328348

329-
**Why not cache metadata?**
349+
**Why not cache metadata (by default)?**
330350
- Simplicity - no invalidation logic needed
331351
- Fresh data - new versions visible immediately
332352
- Metadata is small, upstream fetch is fast
353+
- Opt in when offline availability matters: set `cache_metadata: true` or use the `proxy mirror` command, which stores upstream metadata via the `metadata_cache` table
333354

334355
**Why stream artifacts?**
335356
- Memory efficient - don't load large files into RAM

docs/configuration.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,40 @@ Currently supported for npm, PyPI, pub.dev, Composer, Cargo, NuGet, Conda, RubyG
213213

214214
Note: Hex cooldown requires disabling registry signature verification since the proxy re-encodes the protobuf payload without the original signature. Set `HEX_NO_VERIFY_REPO_ORIGIN=1` or configure your repo with `no_verify: true`.
215215

216+
## Metadata Caching
217+
218+
By default the proxy fetches metadata fresh from upstream on every request. Enable `cache_metadata` to store metadata responses in the database and storage backend for offline fallback. When upstream is unreachable, the proxy serves the last cached copy. ETag-based revalidation avoids re-downloading unchanged metadata.
219+
220+
```yaml
221+
cache_metadata: true
222+
```
223+
224+
Or via environment variable: `PROXY_CACHE_METADATA=true`.
225+
226+
The `proxy mirror` command always enables metadata caching regardless of this setting.
227+
228+
## Mirror Command
229+
230+
The `proxy mirror` command pre-populates the cache from various sources. It accepts the same storage and database flags as `serve`.
231+
232+
| Flag | Default | Description |
233+
|------|---------|-------------|
234+
| `--sbom` | | Path to CycloneDX or SPDX SBOM file |
235+
| `--registry` | | Ecosystem name for full registry mirror (`npm`, `pypi`, or `cargo`) |
236+
| `--concurrency` | `4` | Number of parallel downloads |
237+
| `--dry-run` | `false` | Show what would be mirrored without downloading |
238+
| `--config` | | Path to configuration file |
239+
| `--storage-url` | | Storage URL |
240+
| `--database-driver` | | Database driver |
241+
| `--database-path` | | SQLite database file |
242+
| `--database-url` | | PostgreSQL connection URL |
243+
244+
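Positional arguments are treated as PURLs; a PURL without a version mirrors all versions of that package. Only one source is used per run: `--sbom` takes precedence over `--registry`, which takes precedence over positional PURLs.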
245+
246+
```bash
247+
proxy mirror pkg:npm/lodash@4.17.21 pkg:cargo/serde@1.0.0
248+
```
249+
216250
## Docker
217251

218252
### SQLite with Local Storage

go.mod

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ module github.com/git-pkgs/proxy
33
go 1.25.6
44

55
require (
6+
github.com/CycloneDX/cyclonedx-go v0.10.0
67
github.com/git-pkgs/archives v0.2.2
78
github.com/git-pkgs/enrichment v0.2.2
89
github.com/git-pkgs/purl v0.1.10
@@ -15,8 +16,10 @@ require (
1516
github.com/lib/pq v1.12.2
1617
github.com/prometheus/client_golang v1.23.2
1718
github.com/prometheus/client_model v0.6.2
19+
github.com/spdx/tools-golang v0.5.7
1820
github.com/swaggo/swag v1.16.6
1921
gocloud.dev v0.45.0
22+
golang.org/x/sync v0.20.0
2023
google.golang.org/protobuf v1.36.11
2124
gopkg.in/yaml.v3 v3.0.1
2225
modernc.org/sqlite v1.48.0
@@ -52,6 +55,7 @@ require (
5255
github.com/alfatraining/structtag v1.0.0 // indirect
5356
github.com/alingse/asasalint v0.0.11 // indirect
5457
github.com/alingse/nilnesserr v0.2.0 // indirect
58+
github.com/anchore/go-struct-converter v0.1.0 // indirect
5559
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
5660
github.com/ashanbrown/forbidigo/v2 v2.3.0 // indirect
5761
github.com/ashanbrown/makezero/v2 v2.1.0 // indirect
@@ -277,7 +281,6 @@ require (
277281
golang.org/x/exp/typeparams v0.0.0-20260209203927-2842357ff358 // indirect
278282
golang.org/x/mod v0.33.0 // indirect
279283
golang.org/x/net v0.51.0 // indirect
280-
golang.org/x/sync v0.20.0 // indirect
281284
golang.org/x/sys v0.42.0 // indirect
282285
golang.org/x/text v0.34.0 // indirect
283286
golang.org/x/tools v0.42.0 // indirect
@@ -293,7 +296,7 @@ require (
293296
modernc.org/memory v1.11.0 // indirect
294297
mvdan.cc/gofumpt v0.9.2 // indirect
295298
mvdan.cc/unparam v0.0.0-20251027182757-5beb8c8f8f15 // indirect
296-
sigs.k8s.io/yaml v1.3.0 // indirect
299+
sigs.k8s.io/yaml v1.6.0 // indirect
297300
)
298301

299302
tool github.com/golangci/golangci-lint/v2/cmd/golangci-lint

0 commit comments

Comments
 (0)