Improve repo intake closeout and site reference capture

RichardGeorgeDavis · RichardGeorgeDavis · commit 06d1e1416bc8 · 2026-04-17T14:35:01.000+02:00
diff --git a/README.md b/README.md
@@ -288,6 +288,7 @@ External skill catalogs such as [`openai/skills`](https://github.com/openai/skil
 - for repo-level Codex discoverability, prefer tracked `.codex/skills/` and keep `.agents/skills/` only when repo-local compatibility mirroring helps
 - keep workspace-wide reusable skill sources in `shared/skills/` and starter templates in `tools/templates/skills/`
 - keep the managed VoltAgent `DESIGN.md` catalog as an optional ability under `repos/abilities/voltagent-awesome-design-md` and copy only the specific `DESIGN.md` files a repo actually needs
+- use `tools/scripts/capture-site-reference.sh` when a repo needs a conservative `httrack`-based public-site reference capture plus a repo-local acquisition note under `ref/httrack/`
 - use `tools/scripts/use-design-md.sh` when you want a stable local mirror under `cache/design-md/catalog/` or need to copy one `DESIGN.md` into a repo root quickly
 - keep third-party orchestration layers and generated agent setup local-only unless there is a strong reason to publish them
 - do not add the whole upstream skill catalog to `repos/`, `tools/`, or as a submodule unless there is a very specific maintenance reason
diff --git a/docs/08-first-run-and-updates.md b/docs/08-first-run-and-updates.md
@@ -50,8 +50,9 @@ If you are starting a fresh repo-aware chat and want a cheaper wake-up path, use
    `tools/scripts/generate-context-cache.sh --workspace --run`
 3. for `workspace-hub` work, also run:
    `tools/scripts/generate-context-cache.sh --repo workspace-hub --run`
-4. treat generated files under `cache/context/` as compact `L0` and `L1` summaries only
-5. trust tracked docs, manifests, and repo files over generated summaries whenever they differ
+4. if a repo was just added or its setup changed, run `tools/bin/workspace-memory save-repo <repo-path-or-name>` so workspace memory captures the repo state together with the current Codex thread
+5. treat generated files under `cache/context/` as compact `L0` and `L1` summaries only
+6. trust tracked docs, manifests, and repo files over generated summaries whenever they differ
 
 Suggested instruction for a fresh chat:
 
diff --git a/docs/09-new-repo-baseline.md b/docs/09-new-repo-baseline.md
@@ -39,6 +39,7 @@ When useful, add small explicit metadata instead of hidden assumptions:
 - `.workspace/agent-stack.json` when the repo intentionally supports tracked multi-tool agent hints such as OMX-ready or OpenCode-ready setup
 - `README.md` for repo purpose, setup, run instructions, preview notes, and cover block
 - `docs/cover.png` as the default repo-local cover path, even if it starts as a placeholder
+- a runnable launcher command file, preferably `local/commands/run-<repo>` inside the repo or `tools/local/commands/Run <repo>.command` in the workspace
 - `HANDOVER.md` when the repo needs a resumable state document
 - repo-level `AGENTS.md` only when the repo genuinely needs rules beyond the workspace baseline
 - `.codex/skills/` and optional `.codex/config.toml` when the repo should expose official Codex repo-local surfaces
@@ -72,10 +73,11 @@ Recommended intake order:
 2. Read existing setup sources such as `README.md`, `package.json`, `composer.json`, lockfiles, shell scripts, Local notes, or repo-local instruction files.
 3. Classify the repo conservatively and choose a runtime mode only after checking the files.
 4. Create `README.md` if it is missing, or tighten the current one if it exists but does not explain setup and preview.
-5. Add a repo-local cover image reference in the README, even if the image is a placeholder at first.
-6. Add `.workspace/project.json` only when runtime behavior is not obvious from the repo files.
-7. Add repo-level `AGENTS.md`, `HANDOVER.md`, or repo-local skills only when they solve a real repo-specific need.
-8. If README, HANDOVER, or durable setup docs were created or materially updated, run `tools/bin/workspace-memory save-repo <repo-name>` so the shared memory layer captures the repo state, related workspace docs, and the current Codex thread in one closeout step.
+5. Add a runnable launcher command file so the repo can be started without remembering the shell incantation.
+6. Add a repo-local cover image reference in the README, even if the image is a placeholder at first.
+7. Add `.workspace/project.json` only when runtime behavior is not obvious from the repo files.
+8. Add repo-level `AGENTS.md`, `HANDOVER.md`, or repo-local skills only when they solve a real repo-specific need.
+9. If README, HANDOVER, or durable setup docs were created or materially updated, run `tools/bin/workspace-memory save-repo <repo-path-or-name>` so the shared memory layer captures the repo state, related workspace docs, and the current Codex thread in one closeout step.
 
 For MemPalace target metadata, prefer `.workspace/mempalace/` inside the repo rather than dropping `mempalace.yaml` or `entities.json` at the repo root.
 
@@ -131,6 +133,7 @@ Do not present a public-site mirror as if it were the original source project.
 
 1. Record the public source URL and the capture date in `README.md` or `HANDOVER.md`.
 2. State the acquisition method clearly, such as `wget`, `httrack`, or manual asset capture.
+   If `httrack` is available, prefer the workspace wrapper `tools/scripts/capture-site-reference.sh --run <url> <target-dir>` so the repo gets a consistent capture note under `ref/httrack/`.
 3. Document what the repo is: deployed mirror, working local reference copy, or rebuild.
 4. Document what is not present: original source files, build tooling, history, server-side code, private APIs, and environment variables unless they were actually supplied.
 5. Serve the repo through a lightweight local server for testing; do not treat `file://` opening as the default verification path.
@@ -183,11 +186,12 @@ Keep the manifest lightweight. It should clarify runtime behaviour, not become a
 
 For a repo to feel workspace-ready, it should ideally have:
 
-1. a clear way to run or preview it
+1. a clear way to run or preview it, preferably via a tracked launcher command file
 2. a known runtime mode: `direct`, `external`, or explicit repo-native server mode
 3. a `README.md` that captures setup, run, and preview expectations
 4. a repo-local cover image path in the README, even if it begins as a placeholder
-5. enough docs that another person can resume work without guessing
+5. a Codex-friendly closeout path such as `tools/bin/workspace-memory save-repo <repo-path-or-name>`
+6. enough docs that another person can resume work without guessing
 
 For GitHub-backed repos, that usually means enough repo-local docs to explain setup plus a readable issue and PR path. For local-only or git-only repos, it means the tracked local docs carry the same resumable context without pretending GitHub is required.
 
diff --git a/docs/20-ai-context-side-load.md b/docs/20-ai-context-side-load.md
@@ -85,6 +85,7 @@ Recommended sequence:
 3. let the chat use generated `entry.md`, `abstract.md`, and `overview.md` as the fast entry layer
 4. fall back to tracked docs, manifests, and repo files for any real decision or ambiguity
 5. regenerate or ignore the cache if Workspace Hub reports the side-load state as `stale` or `missing`
+6. if a repo intake just created or updated setup docs, run `tools/bin/workspace-memory save-repo <repo-path-or-name>` before continuing so workspace memory captures the repo state and the current Codex thread
 
 Practical operator flow:
 
@@ -99,6 +100,13 @@ Then start the chat with a handover instruction such as:
 
 For repo-specific work, point the chat at the repo `entry.md` first, then the repo README or handover note only when the side-load packet is insufficient.
 
+When a new repo folder was just added under `repos/`, use this order:
+
+1. run repo intake in Workspace Hub or the equivalent repo-doc setup flow
+2. make sure the repo has a runnable launcher command file
+3. close the repo into memory with `tools/bin/workspace-memory save-repo <repo-path-or-name>`
+4. reopen the generated repo `entry.md` if you want the compact chat packet before deeper docs
+
 ## Source set
 
 ### Workspace
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## 2026-04-17
+
+- Installed `httrack` for this workspace environment and added `tools/scripts/capture-site-reference.sh` as the repo-intake wrapper for public-site reference copies, with dry-run-by-default behavior, conservative same-domain defaults, and repo-local capture notes under `ref/httrack/`.
+- Updated the public repo-intake surfaces in [README](../README.md), [docs/README](README.md), [09-new-repo-baseline](09-new-repo-baseline.md), and [tools/templates/repo-docs/README.site-reference.template.md](../tools/templates/repo-docs/README.site-reference.template.md) so site-reference repos have one documented `httrack` path.
+
 ## 2026-04-12
 
 - Added `tools/local/commands/Run Knowledge Palace UI.command` as a workspace-level launcher for the Knowledge Palace local web UI, and aligned the workspace docs index plus repo docs so the command, current state, and next UI batches are documented together.
diff --git a/docs/HANDOVER.md b/docs/HANDOVER.md
@@ -123,6 +123,7 @@ The current repo intake flow in `workspace-hub` is intentionally conservative.
 - it writes `.workspace/project.json` only when the repo appears to need explicit runtime metadata
 - it keeps an existing manifest if one is already present
 - it does not auto-install dependencies or auto-start runtimes as part of intake
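+- it now attempts a best-effort `tools/bin/workspace-memory save-repo <repo-path-or-name>` closeout after repo intake so workspace memory captures the new repo setup alongside the current Codex thread; if the wrapper is missing or the command fails, intake still completes and reports the outcome in its notes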
 
 This keeps intake focused on first-pass repo clarity rather than hidden setup side effects.
 
@@ -135,12 +136,14 @@ Current stance:
 - treat the result as one of: deployed mirror, working local reference copy, or clean rebuild
 - do not describe a public-site mirror as the original source project
 - record source URL, capture date, and acquisition method in `README.md` or `HANDOVER.md`
+- when `httrack` is available, prefer `tools/scripts/capture-site-reference.sh --run <url> <target-dir>` so the capture stays conservative by default and records a repo-local note under `ref/httrack/`
 - if automated capture misses files because of permissions or other blockers, provide the direct asset URLs to the user in chat
 - store any user-downloaded fallback files in a repo-local `ref/` folder with source notes
 - create a separate rebuild repo if maintainable editing is the real goal
 
-The tracked template for this flow now lives at:
+The tracked helper surfaces for this flow now live at:
 
+- `tools/scripts/capture-site-reference.sh`
 - `tools/templates/repo-docs/README.site-reference.template.md`
 
 ## Release verification status
@@ -908,3 +911,17 @@ Verification status for this local slice:
 
 - `pnpm --dir "repos/workspace-hub" typecheck`
 - `pnpm --dir "repos/workspace-hub" test`
+
+### Implementation update (2026-04-17, HTTrack site-reference intake wrapper)
+
+Completed in the workspace root:
+
+1. Installed `httrack` for the active workspace environment and verified it is available on `PATH`.
+2. Added `tools/scripts/capture-site-reference.sh` as the dry-run-by-default wrapper for public-site reference captures, with conservative same-domain defaults, optional passthrough HTTrack args, and repo-local capture notes under `ref/httrack/`.
+3. Updated the public repo-intake docs and the site-reference README template so the workspace now has one documented `httrack` path instead of leaving site capture as an ad hoc command.
+
+Verification status for this local slice:
+
+- `sh -n tools/scripts/capture-site-reference.sh`
+- `tools/scripts/capture-site-reference.sh https://example.com /Users/richard/Local\ Sites/Codex\ Workspace/cache/httrack-smoke-test`
+- `tools/scripts/capture-site-reference.sh --run https://example.com /Users/richard/Local\ Sites/Codex\ Workspace/cache/httrack-smoke-test-run`
diff --git a/docs/README.md b/docs/README.md
@@ -86,6 +86,7 @@ Useful maintenance scripts:
 - `tools/scripts/setup-workspace-profile.sh` provides a guided, non-destructive profile check for `core`, `hub`, `mixed-stack`, `wordpress`, `agent-enhanced`, `workflow-state`, `spec-driven`, and `ui-previews`.
 - `tools/scripts/manage-workspace-capabilities.sh` lists, installs, updates, enables, disables, or uninstalls tracked workspace abilities and core services, with dry-run mode by default.
 - `tools/scripts/update-github-refs.sh` remains the compatibility wrapper for update-only reviewed GitHub-ref flows and delegates to the capability lifecycle command.
+- `tools/scripts/capture-site-reference.sh` previews or runs an `httrack` capture for a public-site reference repo, using conservative same-domain defaults and writing capture notes under repo-local `ref/httrack/`.
 - `tools/scripts/use-design-md.sh` mirrors the managed VoltAgent `DESIGN.md` catalog ability into `cache/design-md/catalog/`, lists available site ids, and can copy a selected `DESIGN.md` into a repo root.
 - `tools/scripts/sync-reference-snapshots.sh` previews or refreshes ignored upstream reference snapshots under `tools/ref/`, with dry-run mode by default.
 - `tools/scripts/sync-codex-skills.sh` previews or syncs tracked workspace skill sources into repo `.codex/skills/` folders plus optional `.agents/skills/` compatibility mirrors, with dry-run mode by default.
diff --git a/repos/workspace-hub/server/repo-intake.ts b/repos/workspace-hub/server/repo-intake.ts
@@ -1,9 +1,12 @@
+import { execFile } from 'node:child_process'
 import { access, copyFile, mkdir, readFile, writeFile } from 'node:fs/promises'
 import path from 'node:path'
+import { promisify } from 'node:util'
 
 import type { WorkspaceRepo } from '../src/types/workspace.ts'
 import { writeRepoManifest } from './repo-manifest.ts'
 
+const execFileAsync = promisify(execFile)
 const coverBlockStart = '<!-- workspace-hub:cover:start -->'
 const coverBlockEnd = '<!-- workspace-hub:cover:end -->'
 const readmeTemplateRelativePath = path.join(
@@ -40,6 +43,26 @@ async function fileExists(targetPath: string) {
   }
 }
 
+async function saveRepoCloseout(workspaceRoot: string, repoPath: string) {
+  const workspaceMemoryPath = path.join(workspaceRoot, 'tools', 'bin', 'workspace-memory')
+
+  if (!(await fileExists(workspaceMemoryPath))) {
+    return 'Workspace memory wrapper not found, so the Codex-thread closeout was skipped.'
+  }
+
+  try {
+    await execFileAsync(workspaceMemoryPath, ['save-repo', repoPath], {
+      cwd: workspaceRoot,
+      env: process.env,
+      maxBuffer: 1024 * 1024 * 16,
+    })
+    return 'Repo closeout was saved to workspace memory for the current Codex thread.'
+  } catch (error) {
+    const message = error instanceof Error ? error.message : 'unknown error'
+    return `Workspace memory closeout failed: ${message}`
+  }
+}
+
 function sanitizeAltText(value: string) {
   return value.replace(/[[\]]/g, '').trim() || 'Repo cover'
 }
@@ -318,6 +341,8 @@ export async function runRepoIntake(
     notes.push('Manifest skipped because runtime behavior looks clear from the repo files.')
   }
 
+  notes.push(await saveRepoCloseout(workspaceRoot, repo.relativePath))
+
   return {
     coverCreated,
     coverImagePath,
diff --git a/tools/scripts/capture-site-reference.sh b/tools/scripts/capture-site-reference.sh
diff --git a/tools/templates/repo-docs/README.site-reference.template.md b/tools/templates/repo-docs/README.site-reference.template.md
diff --git a/tools/templates/repo-docs/README.template.md b/tools/templates/repo-docs/README.template.md