# Mine GitHub user stats + deploy to Cloudflare #60
# (Title text left over from a web-page capture of this workflow; the
# "Skip to content" navigation link and the repeated titles are not YAML.)

---
# Mines per-user GitHub statistics with generate_stats.py and publishes the
# resulting dashboards from cloudflare/ with `wrangler deploy`.
#
# Triggers: a daily cron, a manual dispatch (optionally limited to one login
# via inputs.user), and pushes to main that touch the generator, the HTML
# template, the cloudflare/ directory or this workflow file.
name: Mine GitHub user stats + deploy to Cloudflare

on:
  schedule:
    - cron: "0 6 * * *"
  workflow_dispatch:
    inputs:
      user:
        description: "Single GitHub login to mine (skips the full users.txt loop)."
        required: false
  push:
    branches: [main]
    paths:
      - "generate_stats.py"
      - "stats_template.html"
      - "cloudflare/**"
      - ".github/workflows/mine-and-deploy.yml"

# Runs share one concurrency group and are queued rather than cancelled.
concurrency:
  group: mine-and-deploy
  cancel-in-progress: false

jobs:
  mine-and-deploy:
    runs-on: ubuntu-latest
    permissions:
      contents: write  # for appending new users to users.txt
    steps:
      - uses: actions/checkout@v4
        with:
          token: ${{ secrets.GH_MINING_TOKEN || github.token }}

      - uses: actions/setup-python@v5
        with:
          python-version: "3.13"

      - uses: actions/setup-node@v4
        with:
          node-version: "24"

      # Persist per-user mining caches across runs so we don't re-fetch
      # PR details, commit stats, etc. that we've already pulled before.
      # Key includes inputs.user (or 'full' when no user was given) so
      # single-user dispatches restore that user's cache specifically.
      # The run_id suffix makes every key unique; restore-keys then pick
      # up the most recent matching snapshot.
      - name: Restore mining caches
        uses: actions/cache@v4
        with:
          path: |
            cache_*/api/
            cache_*/bare/
            cache/api/
            cache/bare/
          key: stats-cache-v1-${{ inputs.user || 'full' }}-${{ github.run_id }}
          restore-keys: |
            stats-cache-v1-${{ inputs.user || 'full' }}-
            stats-cache-v1-

      # Persist deployed user HTMLs across runs so that single-user
      # mining doesn't wipe other users from the CF bucket when wrangler
      # replaces the assets dir on deploy. Single shared key — every run
      # restores the latest snapshot of all deployed dashboards, adds /
      # refreshes its own user(s), and writes back the full set.
      #
      # Exclude committed dashboards (pirate, index, 404) from the cache
      # so old versions can't overwrite the just-checked-out repo files
      # when this step restores. We also explicitly re-apply them from
      # git after the restore (next step) — defends against the case
      # where the existing cache still contains them from a previous run.
      - name: Restore deployed dashboards
        uses: actions/cache@v4
        with:
          path: |
            cloudflare/public/*.html
            !cloudflare/public/pirate.html
            !cloudflare/public/index.html
            !cloudflare/public/404.html
          key: deployed-htmls-v1-${{ github.run_id }}
          restore-keys: |
            deployed-htmls-v1-

      - name: Re-apply committed dashboards from git
        run: |
          git checkout HEAD -- \
            cloudflare/public/pirate.html \
            cloudflare/public/index.html \
            cloudflare/public/404.html

      # Bootstrap from the live CF bucket: any user that's already
      # deployed in production but missing locally (because the previous
      # run was cancelled at the 6h cap and its post-step cache save
      # didn't run) gets downloaded here. Without this, the next run
      # would re-mine ~all users every time.
      - name: Bootstrap dashboards from deployed bucket
        run: |
          # Best effort: a failed download must not fail the job.
          set +e
          # Entries come from a remote file and end up in local paths and
          # URLs, so only login-shaped names are accepted.
          login_re='^[A-Za-z0-9][A-Za-z0-9_-]*$'
          deployed_json=$(curl -sSL --max-time 15 \
            "https://githubusers.archivebox.io/deployed.json" 2>/dev/null)
          if [ -z "$deployed_json" ] || ! echo "$deployed_json" | python3 -c "import json,sys;json.load(sys.stdin)" >/dev/null 2>&1; then
            echo "No deployed.json available — skipping bootstrap"
            exit 0
          fi
          users=$(echo "$deployed_json" \
            | python3 -c "import json,sys; [print(u) for u in json.load(sys.stdin)]")
          n=0
          # Read line by line so entries are never word-split or glob-expanded.
          while IFS= read -r u; do
            [ -z "$u" ] && continue
            if ! [[ "$u" =~ $login_re ]]; then
              echo "::warning::skipping unexpected entry in deployed.json"
              continue
            fi
            # Don't clobber pirate.html (committed/authoritative).
            [ "$u" = "pirate" ] && continue
            f="cloudflare/public/${u}.html"
            # Only fetch dashboards that are missing or empty locally.
            if [ ! -s "$f" ]; then
              if curl -sSL --max-time 30 --fail \
                "https://githubusers.archivebox.io/${u}.html" \
                -o "$f" 2>/dev/null; then
                n=$((n + 1))
              else
                # Drop any partial download so the user is mined again.
                rm -f "$f"
              fi
            fi
          done <<< "$users"
          echo "Bootstrapped $n dashboards from deployed bucket"

      - name: Install gh CLI
        run: |
          type -p gh >/dev/null || (
            sudo apt-get update -qq && sudo apt-get install -y gh
          )

      # gh authenticates via the GH_TOKEN environment variable (which we
      # already set on each step that calls it); no explicit `gh auth login`
      # step needed.
      - name: Determine target users
        id: targets
        working-directory: .
        env:
          INPUT_USER: ${{ inputs.user }}
        run: |
          set -e
          mkdir -p cloudflare/public
          # Build the list of users we'll mine THIS run.
          if [ -n "$INPUT_USER" ]; then
            # The login is later used as a file-name component
            # (stats_<user>.html, cloudflare/public/<user>.html), so reject
            # anything that is not login-shaped before using it. The value
            # itself is deliberately not echoed in the error.
            login_re='^[A-Za-z0-9][A-Za-z0-9_-]*$'
            if ! [[ "$INPUT_USER" =~ $login_re ]]; then
              echo "::error::inputs.user is not a plausible GitHub login"
              exit 1
            fi
            echo "Single-user mine (forced): $INPUT_USER"
            # Persist new users into users.txt so future scheduled runs
            # include them. Lines are normalised the same way the full-mine
            # loop below reads them (comments and whitespace stripped) and
            # compared as fixed strings, case-insensitively.
            if ! sed -E 's/#.*$//; s/[[:space:]]+//g' cloudflare/users.txt 2>/dev/null \
              | grep -qixF -- "$INPUT_USER"; then
              # Make sure the new entry starts on its own line even if the
              # file does not end with a newline.
              if [ -s cloudflare/users.txt ] && [ -n "$(tail -c1 cloudflare/users.txt)" ]; then
                echo >> cloudflare/users.txt
              fi
              echo "$INPUT_USER" >> cloudflare/users.txt
              echo "added=true" >> "$GITHUB_OUTPUT"
            fi
            echo "$INPUT_USER" > /tmp/targets.txt
          else
            # Full mine: only mine users that don't have a deployed
            # dashboard yet. Once a dashboard exists, it stays put
            # until someone clicks the manual "Refresh" button (which
            # dispatches with inputs.user set).
            echo "Full mine of users.txt (skip already-deployed)"
            : > /tmp/targets.txt
            # `|| [ -n "$u" ]` keeps a last line that has no trailing newline.
            while IFS= read -r u || [ -n "$u" ]; do
              u="${u%%#*}"
              u="${u//[[:space:]]/}"
              [ -z "$u" ] && continue
              if [ -f "cloudflare/public/${u}.html" ]; then
                echo "  skip @$u — dashboard already deployed"
                continue
              fi
              echo "$u" >> /tmp/targets.txt
            done < cloudflare/users.txt
          fi
          # Pre-stage pirate's enhanced version
          if [ -f stats.html ]; then
            cp stats.html cloudflare/public/pirate.html
          fi
          # Stage users.txt as a public asset so the Worker's dynamic /
          # handler can read it for the queued/mining list. Doing it here
          # (before the mining loop) means every interim deploy also has
          # a fresh users.txt available to the homepage.
          cp cloudflare/users.txt cloudflare/public/users.txt
          # Generate deployed.json — a JSON array of user logins whose
          # dashboard HTML is currently in /public. Updated on every
          # deploy_now() call below so the homepage stays accurate.
          ls cloudflare/public/*.html 2>/dev/null \
            | sed -E 's|cloudflare/public/||;s|\.html$||' \
            | grep -vE '^(index|404)$' \
            | python3 -c "import sys,json; print(json.dumps(sorted([l.strip() for l in sys.stdin if l.strip()])))" \
            > cloudflare/public/deployed.json
          echo "Targets:"
          cat /tmp/targets.txt

      - name: Mine each user (with live in-progress deploys)
        working-directory: .
        env:
          NO_COLOR: "1"
          GH_TOKEN: ${{ secrets.GH_MINING_TOKEN }}
          CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
          CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }}
          # Posted by generate_stats.py to /api/progress so the Worker's
          # loading page can render real-time phase info.
          # NOTE(review): this reuses the GitHub mining token as the progress
          # token — confirm that is intended rather than a dedicated secret.
          STATS_PROGRESS_TOKEN: ${{ secrets.GH_MINING_TOKEN }}
        run: |
          set -e

          # Rewrite deployed.json from the *.html files currently staged in
          # cloudflare/public (index and 404 are not user dashboards).
          regen_manifest() {
            ls cloudflare/public/*.html 2>/dev/null \
              | sed -E 's|cloudflare/public/||;s|\.html$||' \
              | grep -vE '^(index|404)$' \
              | python3 -c "import sys,json; print(json.dumps(sorted([l.strip() for l in sys.stdin if l.strip()])))" \
              > cloudflare/public/deployed.json
          }

          # Refresh the manifest and deploy; a failed deploy only warns.
          deploy_now() {
            regen_manifest
            # pipefail is enabled inside this subshell only: without it the
            # pipeline reports tail's exit status and a wrangler failure
            # would never reach the warning below.
            (
              set -o pipefail
              cd cloudflare \
                && npx --yes wrangler@latest deploy --minify 2>&1 | tail -2
            ) || echo "::warning::interim deploy failed"
          }

          # Watches stats_<user>.html while $1 (PID) is alive. Roughly
          # every 30s it copies an updated file into the deploy dir and
          # re-deploys so the live page shows partial data as mining
          # progresses.
          #   $1  PID of the mining process
          #   $2  login being mined
          watch_and_deploy() {
            local pid="$1"
            local user="$2"
            # Declared separately from `user` so ${user} below is the
            # local that was just assigned.
            local src="stats_${user}.html"
            local dst="cloudflare/public/${user}.html"
            local last_mtime=0 mtime ticks=0
            while kill -0 "$pid" 2>/dev/null; do
              # Poll once per second so the watcher ends promptly after
              # the miner exits; file checks still happen every 30 ticks.
              sleep 1
              ticks=$((ticks + 1))
              if [ "$ticks" -lt 30 ]; then
                continue
              fi
              ticks=0
              if [ -f "$src" ]; then
                # GNU stat first, BSD stat as fallback, 0 if both fail.
                mtime=$(stat -c %Y "$src" 2>/dev/null \
                  || stat -f %m "$src" 2>/dev/null || echo 0)
                if [ "$mtime" -gt "$last_mtime" ]; then
                  cp "$src" "$dst"
                  echo "::group::Interim deploy of @$user (live)"
                  deploy_now
                  echo "::endgroup::"
                  last_mtime="$mtime"
                fi
              fi
            done
          }

          # Per-user wallclock cap. Some users have hundreds of repos and
          # cold-mining them takes forever; bounding to 25min/user keeps
          # the queue moving (partial data already deployed via the
          # watcher's interim deploys).
          USER_TIMEOUT=1500

          while IFS= read -r user || [ -n "$user" ]; do
            user="${user%%#*}"
            user="${user//[[:space:]]/}"
            [ -z "$user" ] && continue
            [ "$user" = "pirate" ] && continue
            echo "::group::Mining @$user"
            # Run mining in the background; watch loop deploys partials.
            timeout --kill-after=30s "$USER_TIMEOUT" python3 \
              generate_stats.py --user "$user" \
              --no-search-commits \
              --max-api-fetches 800 &
            MINE_PID=$!
            watch_and_deploy "$MINE_PID" "$user" &
            WATCH_PID=$!
            wait "$MINE_PID" \
              || echo "::warning::mining @$user exited non-zero (timeout or error)"
            # Let the watcher finish on its own instead of killing it:
            # killing it mid-deploy would leave its wrangler process
            # running alongside the final deploy below.
            wait "$WATCH_PID" 2>/dev/null || true
            if [ -f "stats_$user.html" ]; then
              cp "stats_$user.html" "cloudflare/public/$user.html"
              echo "::group::Final deploy of @$user"
              deploy_now
              echo "::endgroup::"
            fi
            echo "::endgroup::"
          done < /tmp/targets.txt

      - name: Commit added users.txt entries
        if: steps.targets.outputs.added == 'true'
        working-directory: cloudflare
        env:
          # Passed through the environment rather than expanded into the
          # script text, so the input can never be parsed as shell code.
          INPUT_USER: ${{ inputs.user }}
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
          git add users.txt
          # Commit only when something is actually staged.
          git diff --staged --quiet || git commit -m "Add ${INPUT_USER} to users.txt [skip ci]"
          git push || echo "::warning::push failed (no commit permission?)"

      - name: Final deploy
        working-directory: .
        env:
          CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
          CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }}
        run: |
          # Regenerate deployed.json one last time before the final push.
          ls cloudflare/public/*.html 2>/dev/null \
            | sed -E 's|cloudflare/public/||;s|\.html$||' \
            | grep -vE '^(index|404)$' \
            | python3 -c "import sys,json; print(json.dumps(sorted([l.strip() for l in sys.stdin if l.strip()])))" \
            > cloudflare/public/deployed.json
          cd cloudflare && npx --yes wrangler@latest deploy