Skip to content

Commit c01f0a5

Browse files
committed
Fix metadata caching, 404 propagation, mirror progress, and registry stubs
- ProxyCached now stores upstream Last-Modified in the cache and uses it (along with ETag) for conditional request handling, returning 304 when client validators match. It also adds Content-Length to cached responses.
- Handlers calling FetchOrCacheMetadata (pypi, composer, pub, nuget) now check for ErrUpstreamNotFound and return 404 instead of 502, matching the existing npm and cargo behavior.
- Mirror jobs report live progress via a periodic callback while running, so API polls return real counts instead of zeroed progress.
- Registry mirroring was removed from CLI flags, API acceptance, README, and docs, since every enumerator was a stub returning "not yet implemented".
- Added tests for the conditional metadata path (ETag/If-None-Match, Last-Modified/If-Modified-Since, 304 responses, header omission).
1 parent 4768106 commit c01f0a5

16 files changed

Lines changed: 342 additions & 97 deletions

File tree

README.md

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -474,9 +474,6 @@ proxy mirror pkg:npm/lodash
474474
# Mirror from a CycloneDX or SPDX SBOM
475475
proxy mirror --sbom sbom.cdx.json
476476
477-
# Full registry mirror (npm, pypi, cargo supported)
478-
proxy mirror --registry npm
479-
480477
# Preview what would be mirrored
481478
proxy mirror --dry-run pkg:npm/lodash
482479
@@ -579,7 +576,7 @@ Recently cached:
579576

580577
| Endpoint | Description |
581578
|----------|-------------|
582-
| `POST /api/mirror` | Start a mirror job (JSON body with `purls` or `registry`) |
579+
| `POST /api/mirror` | Start a mirror job (JSON body with `purls`) |
583580
| `GET /api/mirror/{id}` | Get job status and progress |
584581
| `DELETE /api/mirror/{id}` | Cancel a running job |
585582

cmd/proxy/main.go

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,6 @@ func runMirror() {
358358
databasePath := fs.String("database-path", "", "Path to SQLite database file")
359359
databaseURL := fs.String("database-url", "", "PostgreSQL connection URL")
360360
sbomPath := fs.String("sbom", "", "Path to CycloneDX or SPDX SBOM file")
361-
registry := fs.String("registry", "", "Ecosystem name for full registry mirror")
362361
concurrency := fs.Int("concurrency", 4, "Number of parallel downloads") //nolint:mnd // default concurrency
363362
dryRun := fs.Bool("dry-run", false, "Show what would be mirrored without downloading")
364363

@@ -368,8 +367,7 @@ func runMirror() {
368367
fmt.Fprintf(os.Stderr, "Examples:\n")
369368
fmt.Fprintf(os.Stderr, " proxy mirror pkg:npm/lodash@4.17.21\n")
370369
fmt.Fprintf(os.Stderr, " proxy mirror --sbom sbom.cdx.json\n")
371-
fmt.Fprintf(os.Stderr, " proxy mirror pkg:npm/lodash # all versions\n")
372-
fmt.Fprintf(os.Stderr, " proxy mirror --registry npm\n\n")
370+
fmt.Fprintf(os.Stderr, " proxy mirror pkg:npm/lodash # all versions\n\n")
373371
fmt.Fprintf(os.Stderr, "Flags:\n")
374372
fs.PrintDefaults()
375373
}
@@ -382,12 +380,10 @@ func runMirror() {
382380
switch {
383381
case *sbomPath != "":
384382
source = &mirror.SBOMSource{Path: *sbomPath}
385-
case *registry != "":
386-
source = &mirror.RegistrySource{Ecosystem: *registry}
387383
case len(purls) > 0:
388384
source = &mirror.PURLSource{PURLs: purls}
389385
default:
390-
fmt.Fprintf(os.Stderr, "error: provide PURLs, --sbom, or --registry\n")
386+
fmt.Fprintf(os.Stderr, "error: provide PURLs or --sbom\n")
391387
fs.Usage()
392388
os.Exit(1)
393389
}

docs/configuration.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,6 @@ The `proxy mirror` command pre-populates the cache from various sources. It acce
244244
| Flag | Default | Description |
245245
|------|---------|-------------|
246246
| `--sbom` | | Path to CycloneDX or SPDX SBOM file |
247-
| `--registry` | | Ecosystem name for full registry mirror |
248247
| `--concurrency` | `4` | Number of parallel downloads |
249248
| `--dry-run` | `false` | Show what would be mirrored without downloading |
250249
| `--config` | | Path to configuration file |

internal/database/queries.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -894,7 +894,7 @@ func (db *DB) GetMetadataCache(ecosystem, name string) (*MetadataCacheEntry, err
894894
var entry MetadataCacheEntry
895895
query := db.Rebind(`
896896
SELECT id, ecosystem, name, storage_path, etag, content_type,
897-
size, fetched_at, created_at, updated_at
897+
size, last_modified, fetched_at, created_at, updated_at
898898
FROM metadata_cache WHERE ecosystem = ? AND name = ?
899899
`)
900900
err := db.Get(&entry, query, ecosystem, name)
@@ -914,34 +914,36 @@ func (db *DB) UpsertMetadataCache(entry *MetadataCacheEntry) error {
914914
if db.dialect == DialectPostgres {
915915
query = `
916916
INSERT INTO metadata_cache (ecosystem, name, storage_path, etag, content_type,
917-
size, fetched_at, created_at, updated_at)
918-
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
917+
size, last_modified, fetched_at, created_at, updated_at)
918+
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
919919
ON CONFLICT(ecosystem, name) DO UPDATE SET
920920
storage_path = EXCLUDED.storage_path,
921921
etag = EXCLUDED.etag,
922922
content_type = EXCLUDED.content_type,
923923
size = EXCLUDED.size,
924+
last_modified = EXCLUDED.last_modified,
924925
fetched_at = EXCLUDED.fetched_at,
925926
updated_at = EXCLUDED.updated_at
926927
`
927928
} else {
928929
query = `
929930
INSERT INTO metadata_cache (ecosystem, name, storage_path, etag, content_type,
930-
size, fetched_at, created_at, updated_at)
931-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
931+
size, last_modified, fetched_at, created_at, updated_at)
932+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
932933
ON CONFLICT(ecosystem, name) DO UPDATE SET
933934
storage_path = excluded.storage_path,
934935
etag = excluded.etag,
935936
content_type = excluded.content_type,
936937
size = excluded.size,
938+
last_modified = excluded.last_modified,
937939
fetched_at = excluded.fetched_at,
938940
updated_at = excluded.updated_at
939941
`
940942
}
941943

942944
_, err := db.Exec(query,
943945
entry.Ecosystem, entry.Name, entry.StoragePath, entry.ETag,
944-
entry.ContentType, entry.Size, entry.FetchedAt, now, now,
946+
entry.ContentType, entry.Size, entry.LastModified, entry.FetchedAt, now, now,
945947
)
946948
if err != nil {
947949
return fmt.Errorf("upserting metadata cache: %w", err)

internal/database/schema.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ CREATE TABLE IF NOT EXISTS metadata_cache (
9999
etag TEXT,
100100
content_type TEXT,
101101
size INTEGER,
102+
last_modified DATETIME,
102103
fetched_at DATETIME,
103104
created_at DATETIME,
104105
updated_at DATETIME
@@ -198,6 +199,7 @@ CREATE TABLE IF NOT EXISTS metadata_cache (
198199
etag TEXT,
199200
content_type TEXT,
200201
size BIGINT,
202+
last_modified TIMESTAMP,
201203
fetched_at TIMESTAMP,
202204
created_at TIMESTAMP,
203205
updated_at TIMESTAMP
@@ -596,6 +598,7 @@ func (db *DB) EnsureMetadataCacheTable() error {
596598
etag TEXT,
597599
content_type TEXT,
598600
size BIGINT,
601+
last_modified TIMESTAMP,
599602
fetched_at TIMESTAMP,
600603
created_at TIMESTAMP,
601604
updated_at TIMESTAMP
@@ -612,6 +615,7 @@ func (db *DB) EnsureMetadataCacheTable() error {
612615
etag TEXT,
613616
content_type TEXT,
614617
size INTEGER,
618+
last_modified DATETIME,
615619
fetched_at DATETIME,
616620
created_at DATETIME,
617621
updated_at DATETIME

internal/database/types.go

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -78,16 +78,17 @@ func (a *Artifact) IsCached() bool {
7878

7979
// MetadataCacheEntry represents a cached metadata blob for offline serving.
8080
type MetadataCacheEntry struct {
81-
ID int64 `db:"id" json:"id"`
82-
Ecosystem string `db:"ecosystem" json:"ecosystem"`
83-
Name string `db:"name" json:"name"`
84-
StoragePath string `db:"storage_path" json:"storage_path"`
85-
ETag sql.NullString `db:"etag" json:"etag,omitempty"`
86-
ContentType sql.NullString `db:"content_type" json:"content_type,omitempty"`
87-
Size sql.NullInt64 `db:"size" json:"size,omitempty"`
88-
FetchedAt sql.NullTime `db:"fetched_at" json:"fetched_at,omitempty"`
89-
CreatedAt time.Time `db:"created_at" json:"created_at"`
90-
UpdatedAt time.Time `db:"updated_at" json:"updated_at"`
81+
ID int64 `db:"id" json:"id"`
82+
Ecosystem string `db:"ecosystem" json:"ecosystem"`
83+
Name string `db:"name" json:"name"`
84+
StoragePath string `db:"storage_path" json:"storage_path"`
85+
ETag sql.NullString `db:"etag" json:"etag,omitempty"`
86+
ContentType sql.NullString `db:"content_type" json:"content_type,omitempty"`
87+
Size sql.NullInt64 `db:"size" json:"size,omitempty"`
88+
LastModified sql.NullTime `db:"last_modified" json:"last_modified,omitempty"`
89+
FetchedAt sql.NullTime `db:"fetched_at" json:"fetched_at,omitempty"`
90+
CreatedAt time.Time `db:"created_at" json:"created_at"`
91+
UpdatedAt time.Time `db:"updated_at" json:"updated_at"`
9192
}
9293

9394
// Vulnerability represents a cached vulnerability record.

internal/handler/composer.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package handler
22

33
import (
44
"encoding/json"
5+
"errors"
56
"fmt"
67
"io"
78
"net/http"
@@ -91,6 +92,10 @@ func (h *ComposerHandler) handlePackageMetadata(w http.ResponseWriter, r *http.R
9192

9293
body, _, err := h.proxy.FetchOrCacheMetadata(r.Context(), "composer", packageName, upstreamURL)
9394
if err != nil {
95+
if errors.Is(err, ErrUpstreamNotFound) {
96+
http.Error(w, "not found", http.StatusNotFound)
97+
return
98+
}
9499
h.proxy.Logger.Error("upstream request failed", "error", err)
95100
http.Error(w, "upstream request failed", http.StatusBadGateway)
96101
return

internal/handler/handler.go

Lines changed: 73 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"io"
1111
"log/slog"
1212
"net/http"
13+
"strconv"
1314
"strings"
1415
"time"
1516

@@ -394,14 +395,14 @@ func (p *Proxy) FetchOrCacheMetadata(ctx context.Context, ecosystem, cacheKey, u
394395
}
395396

396397
// Try upstream
397-
body, contentType, etag, err := p.fetchUpstreamMetadata(ctx, upstreamURL, entry, accept)
398+
body, contentType, etag, lastModified, err := p.fetchUpstreamMetadata(ctx, upstreamURL, entry, accept)
398399
if errors.Is(err, errStale304) {
399400
// 304 but cached file is gone; retry without ETag
400-
body, contentType, etag, err = p.fetchUpstreamMetadata(ctx, upstreamURL, nil, accept)
401+
body, contentType, etag, lastModified, err = p.fetchUpstreamMetadata(ctx, upstreamURL, nil, accept)
401402
}
402403
if err == nil {
403404
if p.CacheMetadata {
404-
p.cacheMetadataBlob(ctx, ecosystem, cacheKey, storagePath, body, contentType, etag)
405+
p.cacheMetadataBlob(ctx, ecosystem, cacheKey, storagePath, body, contentType, etag, lastModified)
405406
}
406407
return body, contentType, nil
407408
}
@@ -435,11 +436,13 @@ func (p *Proxy) FetchOrCacheMetadata(ctx context.Context, ecosystem, cacheKey, u
435436
}
436437

437438
// fetchUpstreamMetadata fetches metadata from upstream, using ETag for conditional revalidation.
438-
// Returns the body, content type, ETag, and any error.
439-
func (p *Proxy) fetchUpstreamMetadata(ctx context.Context, upstreamURL string, entry *database.MetadataCacheEntry, accept string) ([]byte, string, string, error) {
439+
// Returns the body, content type, ETag, upstream Last-Modified time, and any error.
440+
func (p *Proxy) fetchUpstreamMetadata(ctx context.Context, upstreamURL string, entry *database.MetadataCacheEntry, accept string) ([]byte, string, string, time.Time, error) {
441+
var zeroTime time.Time
442+
440443
req, err := http.NewRequestWithContext(ctx, http.MethodGet, upstreamURL, nil)
441444
if err != nil {
442-
return nil, "", "", fmt.Errorf("creating request: %w", err)
445+
return nil, "", "", zeroTime, fmt.Errorf("creating request: %w", err)
443446
}
444447
req.Header.Set("Accept", accept)
445448

@@ -449,38 +452,42 @@ func (p *Proxy) fetchUpstreamMetadata(ctx context.Context, upstreamURL string, e
449452

450453
resp, err := p.HTTPClient.Do(req)
451454
if err != nil {
452-
return nil, "", "", fmt.Errorf("fetching metadata: %w", err)
455+
return nil, "", "", zeroTime, fmt.Errorf("fetching metadata: %w", err)
453456
}
454457
defer func() { _ = resp.Body.Close() }()
455458

456459
// 304 Not Modified -- our cached copy is still good
457460
if resp.StatusCode == http.StatusNotModified && entry != nil {
458461
cached, readErr := p.Storage.Open(ctx, entry.StoragePath)
459462
if readErr != nil {
460-
return nil, "", "", errStale304
463+
return nil, "", "", zeroTime, errStale304
461464
}
462465
defer func() { _ = cached.Close() }()
463466
data, readErr := ReadMetadata(cached)
464467
if readErr != nil {
465-
return nil, "", "", errStale304
468+
return nil, "", "", zeroTime, errStale304
466469
}
467470
ct := contentTypeJSON
468471
if entry.ContentType.Valid {
469472
ct = entry.ContentType.String
470473
}
471-
return data, ct, entry.ETag.String, nil
474+
lm := zeroTime
475+
if entry.LastModified.Valid {
476+
lm = entry.LastModified.Time
477+
}
478+
return data, ct, entry.ETag.String, lm, nil
472479
}
473480

474481
if resp.StatusCode == http.StatusNotFound {
475-
return nil, "", "", ErrUpstreamNotFound
482+
return nil, "", "", zeroTime, ErrUpstreamNotFound
476483
}
477484
if resp.StatusCode != http.StatusOK {
478-
return nil, "", "", fmt.Errorf("upstream returned %d", resp.StatusCode)
485+
return nil, "", "", zeroTime, fmt.Errorf("upstream returned %d", resp.StatusCode)
479486
}
480487

481488
body, err := ReadMetadata(resp.Body)
482489
if err != nil {
483-
return nil, "", "", fmt.Errorf("reading response: %w", err)
490+
return nil, "", "", zeroTime, fmt.Errorf("reading response: %w", err)
484491
}
485492

486493
contentType := resp.Header.Get("Content-Type")
@@ -489,11 +496,17 @@ func (p *Proxy) fetchUpstreamMetadata(ctx context.Context, upstreamURL string, e
489496
}
490497

491498
etag := resp.Header.Get("ETag")
492-
return body, contentType, etag, nil
499+
500+
var lastModified time.Time
501+
if lm := resp.Header.Get("Last-Modified"); lm != "" {
502+
lastModified, _ = http.ParseTime(lm)
503+
}
504+
505+
return body, contentType, etag, lastModified, nil
493506
}
494507

495508
// cacheMetadataBlob stores metadata bytes in storage and updates the database.
496-
func (p *Proxy) cacheMetadataBlob(ctx context.Context, ecosystem, cacheKey, storagePath string, data []byte, contentType, etag string) {
509+
func (p *Proxy) cacheMetadataBlob(ctx context.Context, ecosystem, cacheKey, storagePath string, data []byte, contentType, etag string, lastModified time.Time) {
497510
if p.DB == nil || p.Storage == nil {
498511
return
499512
}
@@ -505,13 +518,14 @@ func (p *Proxy) cacheMetadataBlob(ctx context.Context, ecosystem, cacheKey, stor
505518
}
506519

507520
_ = p.DB.UpsertMetadataCache(&database.MetadataCacheEntry{
508-
Ecosystem: ecosystem,
509-
Name: cacheKey,
510-
StoragePath: storagePath,
511-
ETag: sql.NullString{String: etag, Valid: etag != ""},
512-
ContentType: sql.NullString{String: contentType, Valid: contentType != ""},
513-
Size: sql.NullInt64{Int64: size, Valid: true},
514-
FetchedAt: sql.NullTime{Time: time.Now(), Valid: true},
521+
Ecosystem: ecosystem,
522+
Name: cacheKey,
523+
StoragePath: storagePath,
524+
ETag: sql.NullString{String: etag, Valid: etag != ""},
525+
ContentType: sql.NullString{String: contentType, Valid: contentType != ""},
526+
Size: sql.NullInt64{Int64: size, Valid: true},
527+
LastModified: sql.NullTime{Time: lastModified, Valid: !lastModified.IsZero()},
528+
FetchedAt: sql.NullTime{Time: time.Now(), Valid: true},
515529
})
516530
}
517531

@@ -537,7 +551,44 @@ func (p *Proxy) ProxyCached(w http.ResponseWriter, r *http.Request, upstreamURL,
537551
return
538552
}
539553

554+
// Look up cache entry to get ETag and upstream Last-Modified for conditional response headers
555+
var etag string
556+
var lastModified time.Time
557+
if p.DB != nil {
558+
if entry, err := p.DB.GetMetadataCache(ecosystem, cacheKey); err == nil && entry != nil {
559+
if entry.ETag.Valid {
560+
etag = entry.ETag.String
561+
}
562+
if entry.LastModified.Valid {
563+
lastModified = entry.LastModified.Time
564+
}
565+
}
566+
}
567+
568+
// Honor client conditional request headers
569+
if etag != "" {
570+
if match := r.Header.Get("If-None-Match"); match != "" && match == etag {
571+
w.WriteHeader(http.StatusNotModified)
572+
return
573+
}
574+
}
575+
if !lastModified.IsZero() {
576+
if ims := r.Header.Get("If-Modified-Since"); ims != "" {
577+
if t, err := http.ParseTime(ims); err == nil && !lastModified.After(t) {
578+
w.WriteHeader(http.StatusNotModified)
579+
return
580+
}
581+
}
582+
}
583+
540584
w.Header().Set("Content-Type", contentType)
585+
w.Header().Set("Content-Length", strconv.Itoa(len(body)))
586+
if etag != "" {
587+
w.Header().Set("ETag", etag)
588+
}
589+
if !lastModified.IsZero() {
590+
w.Header().Set("Last-Modified", lastModified.UTC().Format(http.TimeFormat))
591+
}
541592
w.WriteHeader(http.StatusOK)
542593
_, _ = w.Write(body)
543594
}

0 commit comments

Comments
 (0)