Skip to content

Commit c7cba7e

Browse files
committed
Live progress reporting from Python → Worker → loading page
- Worker /api/progress (POST): accepts a {phase, message, ...counts} JSON body, authenticated by a Bearer token that must equal GH_DISPATCH_TOKEN. The payload is cached per user in the Workers Cache (1h TTL).
- Worker /api/status: surfaces the latest /api/progress payload under a 'progress' field (alongside the CI step list, rate-limit info and log tail).
- generate_stats.py: new report_progress(phase, message, **counts) helper that POSTs to STATS_PROGRESS_URL; it is called at each major phase boundary so the loading page can show real-time updates.
- Action: passes the STATS_PROGRESS_TOKEN env var to the mining step.
- Loading page: new phase-msg panel above the step list that renders the current phase and counts (repos, commits, prs, issues, stars, repos_accessible).
1 parent 65daa38 commit c7cba7e

3 files changed

Lines changed: 213 additions & 1 deletion

File tree

.github/workflows/mine-and-deploy.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,9 @@ jobs:
8383
GH_TOKEN: ${{ secrets.GH_MINING_TOKEN }}
8484
CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
8585
CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }}
86+
# Posted by generate_stats.py to /api/progress so the Worker's
87+
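# loading page can render real-time phase info. NOTE: the Worker compares this
# Bearer token with its GH_DISPATCH_TOKEN, so both secrets must hold the same
# value or every progress POST is rejected with 401.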
88+
STATS_PROGRESS_TOKEN: ${{ secrets.GH_MINING_TOKEN }}
8689
run: |
8790
set -e
8891

cloudflare/worker/index.ts

Lines changed: 155 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ export default {
2727
if (url.pathname === "/api/status") {
2828
return handleStatus(req, env, url);
2929
}
30+
if (url.pathname === "/api/progress") {
31+
return handleProgress(req, env, url);
32+
}
3033

3134
// -- Static assets ----------------------------------------------------
3235
const assetResp = await env.ASSETS.fetch(req);
@@ -135,6 +138,46 @@ function json(obj: unknown, status = 200): Response {
135138
}
136139

137140

141+
/**
 * Handles `POST /api/progress?user=<login>`.
 *
 * Stores the posted JSON progress payload in the Workers cache, keyed by the
 * `user` query parameter, so that `/api/status` can read it back and expose
 * it to the loading page.
 *
 * Responses:
 *   - 405 when the method is not POST
 *   - 401 when the `Authorization: Bearer <token>` header does not match
 *     `env.GH_DISPATCH_TOKEN` (or that secret is empty)
 *   - 400 when `user` is missing/invalid, or the body is not a JSON object
 *   - 413 when the body is longer than 4096 characters
 *   - 200 `{ ok: true }` once the payload has been cached
 *
 * NOTE(review): the Workers Cache API is documented as local to the data
 * center that handled the request. If the poster and the viewer hit
 * different locations, the viewer may not see the update — confirm this is
 * acceptable or move the state to KV / Durable Objects.
 */
async function handleProgress(
  req: Request,
  env: Env,
  url: URL,
): Promise<Response> {
  if (req.method !== "POST") {
    return json({ error: "POST only" }, 405);
  }

  // Auth: the Bearer token must equal GH_DISPATCH_TOKEN so random clients
  // cannot spam fake progress. An unset/empty secret never authorizes anyone.
  const expected = env.GH_DISPATCH_TOKEN;
  const auth = req.headers.get("Authorization") ?? "";
  if (!expected || !auth.startsWith("Bearer ") || auth.slice(7) !== expected) {
    return json({ error: "unauthorized" }, 401);
  }

  const user = url.searchParams.get("user")?.trim();
  if (!user || !VALID_LOGIN.test(user)) {
    return json({ error: "invalid user" }, 400);
  }

  const body = await req.text();
  // Sanity cap — small JSON only.
  if (body.length > 4096) return json({ error: "too large" }, 413);

  let parsed: unknown;
  try {
    parsed = JSON.parse(body);
  } catch {
    return json({ error: "bad json" }, 400);
  }
  // /api/status attaches `received_at` to the parsed payload, which only
  // works on a plain object; reject null, primitives and arrays up front.
  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
    return json({ error: "expected a JSON object" }, 400);
  }

  // The cache key is a synthetic URL that is never fetched; it only has to
  // match the key used by the reader in /api/status.
  await caches.default.put(
    new Request(`https://internal-progress.invalid/${user}`),
    new Response(body, {
      headers: {
        "Cache-Control": "max-age=3600",
        "Content-Type": "application/json",
        "X-Received-At": new Date().toISOString(),
      },
    }),
  );
  return json({ ok: true });
}
179+
180+
138181
async function handleStatus(
139182
req: Request,
140183
env: Env,
@@ -215,6 +258,49 @@ async function handleStatus(
215258
}
216259
} catch {}
217260

261+
// Tail the job's live log for richer progress info (e.g. the Python
262+
// script's `>> [N/M] ...` lines). The GH API redirects to a signed
263+
// download URL — fetch() follows by default.
264+
let recentLog: string[] = [];
265+
if (job?.id) {
266+
try {
267+
const lr = await fetch(
268+
`https://api.github.com/repos/${repo}/actions/jobs/${job.id}/logs`,
269+
{
270+
headers: {
271+
Authorization: `Bearer ${env.GH_DISPATCH_TOKEN}`,
272+
"User-Agent": "githubusers-archivebox-io",
273+
Accept: "application/vnd.github+json",
274+
},
275+
},
276+
);
277+
if (lr.ok) {
278+
const txt = await lr.text();
279+
// Each line is "<ISO timestamp> <message>"; strip timestamp +
280+
// filter to lines that look like Python script output.
281+
const interesting = txt
282+
.split("\n")
283+
.map((l) => l.replace(/^\d{4}-\d{2}-\d{2}T[\d:.]+Z\s?/, ""))
284+
.filter((l) => /^(>>|\s*\[|\s*-{2}|\s*!|\s*resolved\b|\s*scanning |\s*fetching |\s*mining |\s*deploying|\s*search quota|\s*resolving )/i
285+
.test(l))
286+
.slice(-20);
287+
recentLog = interesting;
288+
}
289+
} catch {}
290+
}
291+
292+
// Read the latest progress update posted by the running Python script.
293+
let progress: any = null;
294+
try {
295+
const pres = await caches.default.match(
296+
new Request(`https://internal-progress.invalid/${user}`),
297+
);
298+
if (pres) {
299+
progress = await pres.json();
300+
progress.received_at = pres.headers.get("X-Received-At");
301+
}
302+
} catch {}
303+
218304
return json({
219305
ok: true,
220306
run_id: run.id,
@@ -227,6 +313,8 @@ async function handleStatus(
227313
?? steps.at(-1)?.name ?? null,
228314
steps,
229315
rate_limit: rateLimit,
316+
recent_log: recentLog,
317+
progress,
230318
});
231319
}
232320

@@ -323,6 +411,30 @@ function loadingPage(user: string): string {
323411
display: block; height: 100%; background: #3fb950;
324412
}
325413
.ratelimit.cooldown .gauge > span { background: #d97706; }
414+
.phase-msg {
415+
background: #0e2640; border: 1px solid #1f4d7a;
416+
color: #58a6ff; padding: 12px 14px; border-radius: 6px;
417+
margin: 0 0 14px; font-size: 13px;
418+
display: flex; justify-content: space-between; align-items: center;
419+
gap: 12px; flex-wrap: wrap;
420+
}
421+
.phase-msg .pm-msg { flex: 1; min-width: 200px; }
422+
.phase-msg .pm-counts {
423+
font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
424+
font-size: 11px; color: #8b949e;
425+
}
426+
.phase-msg .pm-counts strong { color: #c9d1d9; }
427+
.livelog {
428+
background: #0d1117; border: 1px solid #21262d;
429+
border-radius: 6px; padding: 10px 12px; margin: 14px 0 0;
430+
font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
431+
font-size: 11px; color: #c9d1d9;
432+
max-height: 180px; overflow-y: auto; line-height: 1.45;
433+
white-space: pre-wrap; word-break: break-all;
434+
}
435+
.livelog .l-hdr { color: #58a6ff; }
436+
.livelog .l-warn { color: #ffa657; }
437+
.livelog .l-err { color: #f85149; }
326438
a { color: #58a6ff; }
327439
code { background: #21262d; padding: 1px 5px; border-radius: 3px;
328440
font-size: 90%; font-family: inherit; }
@@ -348,10 +460,14 @@ function loadingPage(user: string): string {
348460
349461
<div class="progress-track"><div class="progress-fill" id="progress"></div></div>
350462
463+
<div id="phase-msg" class="phase-msg" style="display:none"></div>
464+
351465
<div id="ratelimit" class="ratelimit" style="display:none"></div>
352466
353467
<ol class="steps" id="steps"></ol>
354468
469+
<pre id="livelog" class="livelog" style="display:none"></pre>
470+
355471
<div id="error" class="err" style="display:none"></div>
356472
357473
<div class="footer-row">
@@ -374,6 +490,8 @@ const $err = document.getElementById("error");
374490
const $runLink = document.getElementById("run-link");
375491
const $spinner = document.getElementById("hdr-spinner");
376492
const $rl = document.getElementById("ratelimit");
493+
const $log = document.getElementById("livelog");
494+
const $pmsg = document.getElementById("phase-msg");
377495
378496
const startedAt = Date.now();
379497
function fmtElapsed(sec) {
@@ -440,6 +558,38 @@ async function checkDeployed() {
440558
}
441559
}
442560
561+
// Render the "current phase" banner from the progress payload returned by
// /api/status. Hides the banner when there is no payload or no phase.
//   p: object with `phase`, optional `message`, and optional numeric counts
//      (repos, commits, prs, issues, stars, repos_accessible).
function renderProgress(p) {
  if (!p || !p.phase) { $pmsg.style.display = "none"; return; }

  // Every field comes from a JSON body posted to /api/progress, so it must
  // be HTML-escaped before it is written through innerHTML.
  const esc = (v) => String(v)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");

  const countKeys = ["repos", "commits", "prs", "issues", "stars",
                     "repos_accessible"];
  // `!= null` keeps legitimate zero counts while dropping absent ones.
  const counts = countKeys
    .filter(k => p[k] != null)
    .map(k => '<strong>' + esc(p[k]) + '</strong> ' + k);

  $pmsg.innerHTML =
    '<div class="pm-msg">' +
      esc(p.message || p.phase) +
      ' <code style="font-size:10px;color:#8b949e;margin-left:6px">' +
      esc(p.phase) + '</code></div>' +
    (counts.length ? '<div class="pm-counts">' + counts.join(" · ") + '</div>' : "");
  $pmsg.style.display = "flex";
}
576+
577+
// Render the tail of the CI job log into the #livelog <pre>.
//   lines: array of log lines (already filtered server-side). Anything that
//          is not a non-empty array hides the panel.
// Each line is HTML-escaped and wrapped in a <span> whose class marks it as
// an error, a warning, a phase header, or plain output.
function renderLog(lines) {
  if (!Array.isArray(lines) || lines.length === 0) {
    $log.style.display = "none"; return;
  }
  // Coerce to string first so a single non-string entry cannot throw and
  // blank the whole panel.
  const esc = (s) => String(s)
    .replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
  // Newline written without a backslash escape so it stays intact if this
  // script is embedded in a template literal.
  // NOTE(review): confirm how loadingPage() quotes this script.
  const NL = String.fromCharCode(10);
  $log.innerHTML = lines.map(raw => {
    const l = String(raw);
    const cls = /^!|fail|error|❌/i.test(l) ? "l-err"
              : /quota|warn|^ !/i.test(l)   ? "l-warn"
              : /^>>/.test(l)               ? "l-hdr"
              : "";
    return '<span class="' + cls + '">' + esc(l) + '</span>';
  }).join(NL);
  $log.style.display = "block";
  // Keep the newest lines in view.
  $log.scrollTop = $log.scrollHeight;
}
592+
443593
function renderRateLimit(rl) {
444594
if (!rl || (!rl.search && !rl.core)) {
445595
$rl.style.display = "none"; return;
@@ -521,7 +671,11 @@ function renderSteps(status) {
521671
checkDeployed(),
522672
]);
523673
renderSteps(status);
524-
if (status) renderRateLimit(status.rate_limit);
674+
if (status) {
675+
renderProgress(status.progress);
676+
renderRateLimit(status.rate_limit);
677+
renderLog(status.recent_log);
678+
}
525679
if (deployed) {
526680
clearInterval(interval);
527681
$now.textContent = "Dashboard ready — reloading…";

generate_stats.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,15 @@
4343
GH_LOGIN = "pirate"
4444
GH_NAME = "Nick Sweeting"
4545

46+
# Optional: when set, the script POSTs phase updates to this URL so the
47+
# live "mining…" page can show real-time progress. The Worker's
48+
# /api/progress endpoint checks the Bearer token against GH_DISPATCH_TOKEN.
49+
PROGRESS_URL = os.environ.get(
50+
"STATS_PROGRESS_URL",
51+
"https://githubusers.archivebox.io/api/progress",
52+
)
53+
PROGRESS_TOKEN = os.environ.get("STATS_PROGRESS_TOKEN", "")
54+
4655
# Known author emails pirate has used over the years.
4756
PIRATE_EMAILS = {
4857
"nikisweeting@gmail.com",
@@ -2364,6 +2373,35 @@ def _collect_all_records() -> list[dict]:
23642373
return all_recs
23652374

23662375

2376+
def report_progress(phase: str, message: str = "", **extra) -> None:
    """POST a small JSON progress update to the live /api/progress endpoint
    so the user's loading page can render real-time phase info.

    Args:
        phase: Short machine-readable phase identifier (e.g. "phase-4-prs").
        message: Optional human-readable description of the phase.
        **extra: Additional JSON-serializable fields merged into the payload
            (e.g. repos=..., prs=..., issues=...).

    Best-effort: does nothing when PROGRESS_URL or PROGRESS_TOKEN is empty,
    and silently ignores every failure (network errors, non-2xx responses,
    unserializable extras) so progress reporting can never break mining.
    """
    if not PROGRESS_URL or not PROGRESS_TOKEN:
        return
    try:
        import urllib.parse
        import urllib.request

        payload = {
            "phase": phase,
            "message": message,
            "ts": time.time(),
            "user": GH_LOGIN,
            **extra,
        }
        data = json.dumps(payload).encode()
        # URL-encode the login so it can never corrupt the query string.
        url = f"{PROGRESS_URL}?user={urllib.parse.quote(GH_LOGIN, safe='')}"
        req = urllib.request.Request(
            url, data=data, method="POST",
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {PROGRESS_TOKEN}",
            },
        )
        # Context manager closes the connection even if read() raises.
        with urllib.request.urlopen(req, timeout=5) as resp:
            resp.read()
    except Exception:
        # Deliberately swallowed: progress updates are purely cosmetic.
        pass
2403+
2404+
23672405
def _render_now(*, mining_status: str = "complete") -> dict:
23682406
"""Aggregate from cache and write stats.html. Returns aggregate dict.
23692407
`mining_status` is embedded in the output so the live-mining UI knows
@@ -2539,6 +2577,8 @@ def main() -> int:
25392577
print(f"Wrote {OUTPUT_FILE}")
25402578
return 0
25412579

2580+
report_progress("starting", f"Mining @{GH_LOGIN}")
2581+
25422582
# ---- Phase 1: local filesystem mining --------------------------------
25432583
if not args.no_local:
25442584
print(">> [1/7] Walking filesystem for local git repos ...", file=sys.stderr)
@@ -2551,8 +2591,11 @@ def main() -> int:
25512591

25522592
# ---- Phase 2: bare-clone owned/org GH repos we don't have locally ----
25532593
if not args.no_clone:
2594+
report_progress("phase-2-listing", "Listing accessible GitHub repos")
25542595
print(">> [2/7] Listing accessible GitHub repos ...", file=sys.stderr)
25552596
accessible = list_accessible_repos()
2597+
report_progress("phase-2-cloning", f"{len(accessible)} repos accessible — cloning",
2598+
repos_accessible=len(accessible))
25562599
print(f">> {len(accessible)} accessible repos via API", file=sys.stderr)
25572600
# For generic --user runs, derive company patterns from the user's
25582601
# top org owners so the dashboard still has some color coding.
@@ -2620,14 +2663,19 @@ def main() -> int:
26202663
except Exception:
26212664
pass
26222665
if not args.no_prs:
2666+
report_progress("phase-4-prs", "Searching PRs + issues authored by user")
26232667
print(">> [4/7] Searching PRs + issues by the user ...", file=sys.stderr)
26242668
try:
26252669
prs, issues = list_prs_and_issues()
2670+
report_progress("phase-4-prs-done",
2671+
f"Found {len(prs)} PRs, {len(issues)} issues",
2672+
prs=len(prs), issues=len(issues))
26262673
print(f">> {len(prs)} PRs, {len(issues)} issues", file=sys.stderr)
26272674
except Exception as e:
26282675
print(f" ! PR/issue search failed: {e}", file=sys.stderr)
26292676

26302677
# ---- Phase 5: PR-detail fetch (lines added/removed) --------------
2678+
report_progress("phase-5-pr-details", "Fetching per-PR additions/deletions")
26312679
print(">> [5/7] Fetching merged-PR additions/deletions ...",
26322680
file=sys.stderr)
26332681
try:
@@ -2637,6 +2685,8 @@ def main() -> int:
26372685
print(f" ! PR-detail fetch failed: {e}", file=sys.stderr)
26382686

26392687
# ---- Phase 5a: per-PR commits (handles non-GH-linked author emails)
2688+
report_progress("phase-5a-pr-commits",
2689+
"Walking each merged PR's commit list")
26402690
try:
26412691
existing_shas: set[str] = {r.get("sha")
26422692
for r in _collect_all_records()
@@ -2697,6 +2747,8 @@ def main() -> int:
26972747

26982748
# ---- Phase 6: star counts (full coverage) ----------------------------
26992749
if not args.no_stars:
2750+
report_progress("phase-6-stars",
2751+
"Fetching star counts + total-commits per repo")
27002752
print(">> [6/7] Fetching star counts for all repos ...",
27012753
file=sys.stderr)
27022754
# Collect every canonical repo name in our data + every repo in
@@ -2743,6 +2795,9 @@ def main() -> int:
27432795
print(f">> DONE: {t['commits']} commits, {t['repos']} repos, "
27442796
f"{t['stars']:,}{t['prs_merged']}/{t['prs']} PRs merged, "
27452797
f"{t['issues']} issues", file=sys.stderr)
2798+
report_progress("done", "Mining complete",
2799+
commits=t['commits'], repos=t['repos'],
2800+
stars=t['stars'], prs=t['prs'], issues=t['issues'])
27462801
print(f"Wrote {OUTPUT_FILE}")
27472802
return 0
27482803

0 commit comments

Comments
 (0)