From a1e9df06d0392d0bb7f0db3f987c39f909c744aa Mon Sep 17 00:00:00 2001 From: Ronan Hevenor Date: Thu, 7 May 2026 12:01:11 -0400 Subject: [PATCH 1/3] feat(legacy-archive): backfill kickers, subdecks, featured images, slugs, shortlinks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 5 of the legacy-archive cleanup, driven by the completeness audit at /tmp/legacy-import-logs/wp-completeness-audit.md. * kickers + subdecks: 4732 articles got correct kickers from poly-online type_db (2303) and WP `Kicker` postmeta (1357 fills + 1072 overwrites of generic "Editorial/Opinion"). Plus 768 poly-online subdecks from blurb_db. * featured images: 1531 WP articles got real featured_image_id + caption + photographer attribution from `Photo`/`PhotoCaption`/`PhotoByline`/ `Photographer` postmeta. * WP slug regen: 2921 run-together slugs (e.g. `midnpartakeinmarathon`) rebuilt from source `post_name` (e.g. `midn-partake-in-marathon`). Old slugs are saved on a new `previous_slug` column so middleware can 301 to the new URL. New `articles_previous_slug_idx` makes the lookup cheap. * shortlinks: new `legacy_shortlinks` lookup table populated from the `pluginSL_shorturl` plugin (5014 of 12,872 source rows resolve to a real destination — 4091 polymer URLs, 871 archive media files, 50 external, 2 mirror fallbacks). Middleware checks the table after the hand-curated override map. Regex broadened from 5-digit to 5-char alphanumeric. Backfill scripts live under scripts/legacy-import/ and run idempotently (dry run by default; --write to commit). All four were used to update the production DB directly via the SSH tunnel before this PR. 
--- collections/Articles.ts | 10 + middleware.ts | 126 ++++++++-- ...60507_000000_add_articles_previous_slug.ts | 25 ++ .../20260507_010000_add_legacy_shortlinks.ts | 26 ++ migrations/index.ts | 12 + .../backfill-kickers-subdecks.ts | 229 ++++++++++++++++++ .../backfill-legacy-shortlinks.ts | 198 +++++++++++++++ .../backfill-wp-featured-images.ts | 215 ++++++++++++++++ scripts/legacy-import/reslugify-wordpress.ts | 168 +++++++++++++ scripts/run_deploy_sql_migrations.sh | 19 +- 10 files changed, 1013 insertions(+), 15 deletions(-) create mode 100644 migrations/20260507_000000_add_articles_previous_slug.ts create mode 100644 migrations/20260507_010000_add_legacy_shortlinks.ts create mode 100644 scripts/legacy-import/backfill-kickers-subdecks.ts create mode 100644 scripts/legacy-import/backfill-legacy-shortlinks.ts create mode 100644 scripts/legacy-import/backfill-wp-featured-images.ts create mode 100644 scripts/legacy-import/reslugify-wordpress.ts diff --git a/collections/Articles.ts b/collections/Articles.ts index b62185d..ab945d1 100644 --- a/collections/Articles.ts +++ b/collections/Articles.ts @@ -422,6 +422,16 @@ const Articles: CollectionConfig = { description: 'Original category/section name from the source system. Preserved for display and search; does not affect routing.', }, }, + { + name: 'previousSlug', + type: 'text', + label: 'Previous Slug', + index: true, + admin: { + position: 'sidebar', + description: 'Old slug retained for 301 redirects after a rename. The middleware redirects requests for this slug to the current one.', + }, + }, ], } diff --git a/middleware.ts b/middleware.ts index 0287ffe..8c765eb 100644 --- a/middleware.ts +++ b/middleware.ts @@ -11,20 +11,24 @@ const ARTICLE_URL_RE = /^\/([a-z]+)\/(\d{4})\/(\d{2})\/([a-z0-9][a-z0-9-]*)\/?$/ // Old WordPress permalink shape from the 2009-2019 era. Years restricted // to 2009-2019 so we don't accidentally swallow other paths. 
const LEGACY_WP_URL_RE = /^\/(20(?:0[9]|1[0-9]))\/(\d{2})\/(\d{2})\/([a-z0-9][a-z0-9_-]*)\/?$/ -// 5-digit ID URL shape from the WP-era pluginSL_shorturl plugin. Articles -// from 2013-2014 link to documents via these short codes. -const LEGACY_WP_SHORTLINK_RE = /^\/(\d{5})\/?$/ +// 5-char ID URL shape from the WP-era `pluginSL_shorturl` plugin. The +// codes are 5-digit numeric *or* 5-char alphanumeric (case-sensitive). +// Resolved against the `legacy_shortlinks` lookup table at request time. +const LEGACY_WP_SHORTLINK_RE = /^\/([A-Za-z0-9]{5})\/?$/ const VALID_SECTIONS = new Set(['news', 'sports', 'features', 'opinion']) /** - * 33 shortlinks recovered from the legacy WordPress `pluginSL_shorturl` table. - * Most of these point at student-senate document portals (still up at - * docs.studentsenate.rpi.edu) or external services (Google Docs, Eventbrite, - * etc.). Some chain to old `poly.rpi.edu/YYYY/...` URLs that the - * LEGACY_WP_URL_RE branch then redirects to their new polymer URL. + * Hand-curated overrides for the WordPress `pluginSL_shorturl` map. Anything + * not present here falls through to the `legacy_shortlinks` table (5,014 + * rows backfilled from the source plugin). Kept for two reasons: * - * Source: `pluginSL_shorturl` table in - * /home/red/poly/recon/archives/wordpress/db/wordpress-archive-2026-05-06.sql.gz + * 1. Documents are now hosted off-site (docs.studentsenate.rpi.edu, etc.); + * the override map keeps those redirect targets explicit and reviewable. + * 2. A few codes have been edited at runtime over the years (e.g. NASA's + * mission_pages URL was renamed) and we want our pin to win. + * + * The DB-backed table can be regenerated from the dump via + * `scripts/legacy-import/backfill-legacy-shortlinks.ts`. 
*/ const WP_LEGACY_SHORTLINKS: Record<string, string> = { '06735': 'http://poly.rpi.edu/2013/03/06/pss_breaking_the_third_wall/', @@ -68,6 +72,34 @@ const CACHE_TTL_MS = 60_000 type RedirectEntry = { to: string | null; expiresAt: number } const legacyRedirectCache = new Map<string, RedirectEntry>() +const previousSlugRedirectCache = new Map<string, RedirectEntry>() +const shortlinkRedirectCache = new Map<string, RedirectEntry>() + +/** + * Look up the destination URL for a 5-char WP shortlink. Pulls from the + * `legacy_shortlinks` table backfilled from the `pluginSL_shorturl` plugin. + * + * No Payload collection wraps this table — the rows aren't editorial — so + * we go through the raw `pg.Pool` exposed by the postgres db adapter. + */ +async function lookupShortlinkRedirect(code: string): Promise<string | null> { + const cached = shortlinkRedirectCache.get(code) + const now = Date.now() + if (cached && cached.expiresAt > now) return cached.to + + const payload = await getPayload({ config: payloadConfig }) + const pool = (payload.db as unknown as { pool?: import('pg').Pool }).pool + let to: string | null = null + if (pool) { + const r = await pool.query<{ target_url: string }>( + 'SELECT target_url FROM legacy_shortlinks WHERE short_code = $1 LIMIT 1', + [code], + ) + to = r.rows[0]?.target_url ?? null + } + shortlinkRedirectCache.set(code, { to, expiresAt: now + CACHE_TTL_MS }) + return to +} async function isArticleGone(section: string, slug: string): Promise<boolean> { if (!VALID_SECTIONS.has(section)) return false @@ -100,6 +132,49 @@ async function isArticleGone(section: string, slug: string): Promise<boolean> { return gone } +/** + * Look up the canonical URL for an article whose slug was renamed. Returns + * the new URL when `previous_slug` matches; null otherwise. Used by the 301 + * redirect fallback in the article-URL branch when the live `slug` lookup + * fails. 
+ */ +async function lookupPreviousSlugRedirect( + section: string, + oldSlug: string, +): Promise<string | null> { + const cacheKey = `${section}:${oldSlug}` + const cached = previousSlugRedirectCache.get(cacheKey) + const now = Date.now() + if (cached && cached.expiresAt > now) return cached.to + + const payload = await getPayload({ config: payloadConfig }) + const result = await payload.find({ + collection: 'articles', + where: { + and: [ + { previousSlug: { equals: oldSlug } }, + { section: { equals: section } }, + { _status: { equals: 'published' } }, + ], + }, + limit: 1, + depth: 0, + select: { slug: true, section: true, publishedDate: true }, + }) + const doc = result.docs[0] as + | { slug?: string; section?: string; publishedDate?: string } + | undefined + let to: string | null = null + if (doc?.slug && doc?.section && doc?.publishedDate) { + const dt = new Date(doc.publishedDate) + const yy = dt.getUTCFullYear().toString() + const mm = String(dt.getUTCMonth() + 1).padStart(2, '0') + to = `/${doc.section}/${yy}/${mm}/${doc.slug}` + } + previousSlugRedirectCache.set(cacheKey, { to, expiresAt: now + CACHE_TTL_MS }) + return to +} + /** * Look up the polymer URL for an old WordPress permalink shape * `/{year}/{month}/{day}/{slug}/`. Returns the new URL or null if no match. @@ -142,12 +217,26 @@ async function lookupLegacyWpRedirect( } export async function middleware(req: NextRequest) { - // Legacy WP shortlink (5-digit IDs from pluginSL_shorturl) → original URL. + // Legacy WP shortlink (5-char IDs from pluginSL_shorturl) → target URL. + // The hand-curated `WP_LEGACY_SHORTLINKS` map wins; otherwise we look up + // the DB-backed `legacy_shortlinks` table. 
const shortMatch = req.nextUrl.pathname.match(LEGACY_WP_SHORTLINK_RE) if (shortMatch) { - const target = WP_LEGACY_SHORTLINKS[shortMatch[1]] - if (target) { - return NextResponse.redirect(target, 301) + const code = shortMatch[1] + const override = WP_LEGACY_SHORTLINKS[code] + if (override) { + return NextResponse.redirect(override, 301) + } + try { + const target = await lookupShortlinkRedirect(code) + if (target) { + // External (absolute) targets pass through; relative paths get + // resolved against the request origin. + const url = /^https?:\/\//i.test(target) ? target : new URL(target, req.url).toString() + return NextResponse.redirect(url, 301) + } + } catch { + // Fall through — let the request 404 normally if lookup fails. } } @@ -171,6 +260,15 @@ export async function middleware(req: NextRequest) { const [, section, , , slug] = match try { + // Renamed-slug 301: if no row has `slug=$slug` but one has + // `previous_slug=$slug`, redirect to the new canonical URL. This kicks in + // for the legacy slug-cleanup pass (post_name `_`-stripping) and any + // future editor-driven rename. + const renamedTo = await lookupPreviousSlugRedirect(section, slug) + if (renamedTo) { + return NextResponse.redirect(new URL(renamedTo, req.url), 301) + } + if (await isArticleGone(section, slug)) { // 410 Gone tells search engines the URL is permanently removed so they // de-index faster than they would from a bare 404. diff --git a/migrations/20260507_000000_add_articles_previous_slug.ts b/migrations/20260507_000000_add_articles_previous_slug.ts new file mode 100644 index 0000000..5344159 --- /dev/null +++ b/migrations/20260507_000000_add_articles_previous_slug.ts @@ -0,0 +1,25 @@ +import { MigrateUpArgs, MigrateDownArgs, sql } from '@payloadcms/db-postgres' + +/** + * Add `previous_slug` to `articles` (+ version shadow). When a slug is + * renamed (e.g. 
legacy slug-cleanup), set `previous_slug` to the old value + * so the request middleware can issue a 301 redirect to the new URL. + * + * Single-string for now (one historical slug per article). If we ever need + * multiple aliases we can swap to a text[]. + */ +export async function up({ db }: MigrateUpArgs): Promise<void> { + await db.execute(sql` + ALTER TABLE "articles" ADD COLUMN IF NOT EXISTS "previous_slug" varchar; + ALTER TABLE "_articles_v" ADD COLUMN IF NOT EXISTS "version_previous_slug" varchar; + CREATE INDEX IF NOT EXISTS "articles_previous_slug_idx" ON "articles" ("previous_slug"); + `) +} + +export async function down({ db }: MigrateDownArgs): Promise<void> { + await db.execute(sql` + DROP INDEX IF EXISTS "articles_previous_slug_idx"; + ALTER TABLE "_articles_v" DROP COLUMN IF EXISTS "version_previous_slug"; + ALTER TABLE "articles" DROP COLUMN IF EXISTS "previous_slug"; + `) +} diff --git a/migrations/20260507_010000_add_legacy_shortlinks.ts b/migrations/20260507_010000_add_legacy_shortlinks.ts new file mode 100644 index 0000000..9a4ad37 --- /dev/null +++ b/migrations/20260507_010000_add_legacy_shortlinks.ts @@ -0,0 +1,26 @@ +import { MigrateUpArgs, MigrateDownArgs, sql } from '@payloadcms/db-postgres' + +/** + * Add a `legacy_shortlinks` table mapping the 5-char codes from the WordPress + * `pluginSL_shorturl` plugin (12,872 rows) to their target URL. Used by the + * request middleware to 301 `/<code>` to either a canonical polymer URL or + * the original external destination. + * + * Not a Payload collection — there's no editorial reason to surface these + * in the admin UI, and exposing 12K rows there would be noisy. Pure DB + * lookup table. 
+ */ +export async function up({ db }: MigrateUpArgs): Promise<void> { + await db.execute(sql` + CREATE TABLE IF NOT EXISTS "legacy_shortlinks" ( + "short_code" varchar PRIMARY KEY, + "target_url" varchar NOT NULL, + "hit_count" integer NOT NULL DEFAULT 0, + "created_at" timestamp(3) with time zone NOT NULL DEFAULT NOW() + ); + `) +} + +export async function down({ db }: MigrateDownArgs): Promise<void> { + await db.execute(sql`DROP TABLE IF EXISTS "legacy_shortlinks";`) +} diff --git a/migrations/index.ts b/migrations/index.ts index 3f060cb..9da6bef 100644 --- a/migrations/index.ts +++ b/migrations/index.ts @@ -42,6 +42,8 @@ import * as migration_20260428_100000_add_audio_transcription from './20260428_1 import * as migration_20260506_000000_add_articles_legacy_archive from './20260506_000000_add_articles_legacy_archive'; import * as migration_20260506_010000_add_articles_legacy_id_and_category from './20260506_010000_add_articles_legacy_id_and_category'; import * as migration_20260506_020000_add_articles_plain_content from './20260506_020000_add_articles_plain_content'; +import * as migration_20260507_000000_add_articles_previous_slug from './20260507_000000_add_articles_previous_slug'; +import * as migration_20260507_010000_add_legacy_shortlinks from './20260507_010000_add_legacy_shortlinks'; export const migrations = [ { @@ -264,4 +266,14 @@ export const migrations = [ down: migration_20260506_020000_add_articles_plain_content.down, name: '20260506_020000_add_articles_plain_content', }, + { + up: migration_20260507_000000_add_articles_previous_slug.up, + down: migration_20260507_000000_add_articles_previous_slug.down, + name: '20260507_000000_add_articles_previous_slug', + }, + { + up: migration_20260507_010000_add_legacy_shortlinks.up, + down: migration_20260507_010000_add_legacy_shortlinks.down, + name: '20260507_010000_add_legacy_shortlinks', + }, ]; diff --git a/scripts/legacy-import/backfill-kickers-subdecks.ts b/scripts/legacy-import/backfill-kickers-subdecks.ts new 
file mode 100644 index 0000000..f8e301d --- /dev/null +++ b/scripts/legacy-import/backfill-kickers-subdecks.ts @@ -0,0 +1,229 @@ +/** + * Backfill kickers and subdecks on legacy articles. + * + * poly-online: read `type_db` / `blurb_db` from the manifest + * wordpress: read `Kicker` / `Subdeck` postmeta from the source DB dump + * + * Originally we ignored these fields during import; the audit flagged 2,303 + * poly-online + 1,366 WP rows missing kickers despite the source having them, + * plus 768 poly-online subdecks. This script does idempotent direct-SQL + * updates against the polymer DB (no hooks fire — we're not editing + * narrative content). + * + * Run with `pnpm tsx scripts/legacy-import/backfill-kickers-subdecks.ts`. + * Defaults to a dry run; pass `--write` to actually update rows. + */ + +import { Pool } from 'pg' +import { readFileSync } from 'fs' +import { execFileSync } from 'child_process' +import { decodeEntities } from './wordpress/html-tokenizer' + +type PolyOnlineArticle = { + kind: string + articleID: number + type_db?: string + blurb_db?: string +} + +type Mode = { write: boolean; era: 'poly-online' | 'wordpress' | 'all' } + +function parseArgs(): Mode { + const args = process.argv.slice(2) + const eraFlag = args.find((a) => a.startsWith('--era='))?.split('=')[1] as + | 'poly-online' + | 'wordpress' + | 'all' + | undefined + return { + write: args.includes('--write'), + era: eraFlag ?? 'all', + } +} + +const POLY_ONLINE_MANIFEST = '/home/red/poly/recon/archives/polytechnic-online/manifest.json' +const WP_SQLITE = '/tmp/audit/wp.db' + +// Source kicker values that are generic category names (not real kickers) +// and should be overwritten if the source `Kicker` postmeta has something better. 
+const GENERIC_KICKER_VALUES = new Set([ + 'editorial/opinion', + 'editorial / opinion', + 'opinion', + 'news', + 'sports', + 'features', + 'feature', +]) + +function isGenericKicker(v: string | null): boolean { + if (!v) return false + return GENERIC_KICKER_VALUES.has(v.trim().toLowerCase()) +} + +// "none" is the manifest's null marker for type_db. Skip it. Otherwise return +// the cleaned-up label (the manifest values are already title-cased). +function cleanTypeDb(raw: string | undefined | null): string | null { + if (!raw) return null + const v = raw.trim() + if (!v) return null + if (v.toLowerCase() === 'none') return null + return v +} + +// blurb_db can have the same `&#xx;` HTML entities as titles/bodies. Decode +// before saving so the subdeck renders correctly. +function cleanBlurb(raw: string | undefined | null): string | null { + if (!raw) return null + const v = decodeEntities(raw).trim() + return v || null +} + +async function main() { + const mode = parseArgs() + console.log(`Mode: era=${mode.era} write=${mode.write}`) + + const pool = new Pool({ + host: '127.0.0.1', + port: 5433, + user: 'poly', + password: 'poly', + database: 'polymer2', + }) + + let kickerUpdates = 0 + let subdeckUpdates = 0 + + // ---- poly-online ---- + if (mode.era === 'all' || mode.era === 'poly-online') { + console.log('\n=== poly-online ===') + const manifest = JSON.parse(readFileSync(POLY_ONLINE_MANIFEST, 'utf-8')) as { + articles: PolyOnlineArticle[] + } + const articles = manifest.articles.filter((a) => a.kind === 'article') + console.log(`Manifest articles: ${articles.length}`) + + // De-dupe by articleID; manifest has multiple part rows per article. 
+ const seen = new Set() + const updates: { id: number; kicker: string | null; subdeck: string | null }[] = [] + for (const a of articles) { + if (seen.has(a.articleID)) continue + seen.add(a.articleID) + const k = cleanTypeDb(a.type_db) + const s = cleanBlurb(a.blurb_db) + if (!k && !s) continue + updates.push({ id: a.articleID, kicker: k, subdeck: s }) + } + console.log(`Distinct articleIDs with type_db/blurb_db: ${updates.length}`) + + for (const u of updates) { + // Only overwrite if the destination is empty. Don't trample any + // editor's manual cleanup. + const result = await pool.query( + `SELECT id, kicker, subdeck FROM articles WHERE legacy_source='polytechnic-online' AND legacy_article_id=$1`, + [String(u.id)] + ) + if (result.rowCount === 0) continue + const row = result.rows[0] + const setKicker = u.kicker && !row.kicker + const setSubdeck = u.subdeck && !row.subdeck + if (!setKicker && !setSubdeck) continue + if (mode.write) { + await pool.query( + `UPDATE articles SET kicker=COALESCE($1, kicker), subdeck=COALESCE($2, subdeck), updated_at=NOW() WHERE id=$3`, + [setKicker ? u.kicker : null, setSubdeck ? u.subdeck : null, row.id] + ) + } + if (setKicker) kickerUpdates++ + if (setSubdeck) subdeckUpdates++ + } + + console.log(`poly-online: kicker updates=${kickerUpdates} subdeck updates=${subdeckUpdates}`) + } + + // ---- wordpress ---- + if (mode.era === 'all' || mode.era === 'wordpress') { + console.log('\n=== wordpress ===') + // Use sqlite CLI to pull Kicker and Subdeck postmeta into TSV. + // Avoids adding a dependency on better-sqlite3 just for this script. 
+ const tsv = execFileSync( + 'sqlite3', + [ + '-separator', + '\t', + WP_SQLITE, + "SELECT post_id, meta_key, meta_value FROM postmeta WHERE meta_key IN ('Kicker','Subdeck') AND meta_value IS NOT NULL AND meta_value != ''", + ], + { maxBuffer: 64 * 1024 * 1024 } + ).toString() + + type Postmeta = { kicker?: string; subdeck?: string } + const byWpId = new Map() + for (const line of tsv.split('\n')) { + if (!line) continue + const [pidStr, key, value] = line.split('\t') + const pid = Number(pidStr) + if (!pid) continue + const cur = byWpId.get(pid) ?? {} + // Decode entities (Kicker postmeta can carry ’ from the WP editor). + const v = decodeEntities(value).trim() + if (!v) continue + if (key === 'Kicker') cur.kicker = v + else if (key === 'Subdeck') cur.subdeck = v + byWpId.set(pid, cur) + } + console.log(`WP postmeta rows: ${byWpId.size} wp_ids with Kicker or Subdeck`) + + let kickerOverwrites = 0 + let kickerFills = 0 + let subdeckFills = 0 + + for (const [wpId, meta] of byWpId) { + const result = await pool.query( + `SELECT id, kicker, subdeck FROM articles WHERE legacy_source='wordpress' AND legacy_article_id=$1`, + [String(wpId)] + ) + if (result.rowCount === 0) continue + const row = result.rows[0] + + let newKicker: string | null = null + if (meta.kicker) { + if (!row.kicker) { + newKicker = meta.kicker + kickerFills++ + } else if (isGenericKicker(row.kicker) && meta.kicker.toLowerCase() !== row.kicker.toLowerCase()) { + newKicker = meta.kicker + kickerOverwrites++ + } + } + + let newSubdeck: string | null = null + if (meta.subdeck && !row.subdeck) { + newSubdeck = meta.subdeck + subdeckFills++ + } + + if (!newKicker && !newSubdeck) continue + if (mode.write) { + await pool.query( + `UPDATE articles SET kicker=COALESCE($1, kicker), subdeck=COALESCE($2, subdeck), updated_at=NOW() WHERE id=$3`, + [newKicker, newSubdeck, row.id] + ) + } + } + + console.log( + `wordpress: kicker fills=${kickerFills} kicker overwrites=${kickerOverwrites} subdeck 
fills=${subdeckFills}` + ) + kickerUpdates += kickerFills + kickerOverwrites + subdeckUpdates += subdeckFills + } + + console.log(`\nTotal: kicker=${kickerUpdates} subdeck=${subdeckUpdates} (${mode.write ? 'WRITTEN' : 'DRY RUN'})`) + await pool.end() +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/scripts/legacy-import/backfill-legacy-shortlinks.ts b/scripts/legacy-import/backfill-legacy-shortlinks.ts new file mode 100644 index 0000000..ab4d492 --- /dev/null +++ b/scripts/legacy-import/backfill-legacy-shortlinks.ts @@ -0,0 +1,198 @@ +/** + * Populate the `legacy_shortlinks` lookup table from the WordPress + * `pluginSL_shorturl` table. + * + * Each WP shortlink is one of: + * - id_post != 0: link to a WP post → resolve to the canonical polymer URL + * (via legacy_source='wordpress' + legacy_article_id=id_post), + * or fall back to the source `/YYYY/MM/DD/<post_name>/` + * shape (which the existing `LEGACY_WP_URL_RE` middleware + * branch then re-resolves). + * - url_externe set: external destination → copy as-is + * + * Run with `pnpm tsx scripts/legacy-import/backfill-legacy-shortlinks.ts`. + * Defaults to a dry run; pass `--write` to commit. + */ + +import { Pool } from 'pg' +import { execFileSync } from 'child_process' + +const WP_SQLITE = '/tmp/audit/wp.db' + +function parseArgs() { + const args = process.argv.slice(2) + return { write: args.includes('--write') } +} + +async function main() { + const { write } = parseArgs() + console.log(`Mode: write=${write}`) + + // Pull every shortlink + its target post info in one shot. + // post_type='attachment' rows carry the upload's URL in `guid` so we can + // route shortlinks pointing at media files too. 
+ const tsv = execFileSync( + 'sqlite3', + [ + '-separator', + '\t', + WP_SQLITE, + `SELECT s.short_url, s.id_post, s.url_externe, s.nb_hits, + p.post_name, substr(p.post_date, 1, 10) AS post_date, + p.post_status, p.post_type, p.guid + FROM pluginSL_shorturl s + LEFT JOIN posts p ON p.id = s.id_post`, + ], + { maxBuffer: 64 * 1024 * 1024 }, + ).toString() + + type WpRow = { + code: string + idPost: number + external: string + hits: number + postName: string + postDate: string + postStatus: string + postType: string + guid: string + } + const wpRows: WpRow[] = [] + for (const line of tsv.split('\n')) { + if (!line) continue + const parts = line.split('\t') + if (parts.length < 9) continue + const [code, idPostStr, external, hitsStr, postName, postDate, postStatus, postType, guid] = parts + if (!code) continue + wpRows.push({ + code, + idPost: Number(idPostStr) || 0, + external: external || '', + hits: Number(hitsStr) || 0, + postName: postName || '', + postDate: postDate || '', + postStatus: postStatus || '', + postType: postType || '', + guid: guid || '', + }) + } + console.log(`Source pluginSL_shorturl rows: ${wpRows.length}`) + + // Pre-load polymer's wordpress-era articles for fast id_post → URL lookup. 
+ const pool = new Pool({ + host: '127.0.0.1', + port: 5433, + user: 'poly', + password: 'poly', + database: 'polymer2', + }) + + const polymerArticles = await pool.query<{ + legacy_article_id: string + section: string + slug: string + published_date: string | null + }>( + `SELECT legacy_article_id, section, slug, published_date FROM articles + WHERE legacy_source='wordpress' AND _status='published'`, + ) + const polymerByWpId = new Map() + for (const r of polymerArticles.rows) { + if (!r.published_date) continue + const dt = new Date(r.published_date) + const yy = dt.getUTCFullYear().toString() + const mm = String(dt.getUTCMonth() + 1).padStart(2, '0') + polymerByWpId.set(Number(r.legacy_article_id), { section: r.section, slug: r.slug, year: yy, month: mm }) + } + console.log(`Polymer wp-era published articles: ${polymerByWpId.size}`) + + let toPolymer = 0 + let toMirrorFallback = 0 + let toExternal = 0 + let toAttachment = 0 + let skipped = 0 + + // Use a single multi-row INSERT for speed. Chunk to keep the parameter list + // under Postgres' 65535 limit. + type Insert = { code: string; target: string; hits: number } + const inserts: Insert[] = [] + + for (const r of wpRows) { + let target: string | null = null + + if (r.idPost > 0) { + const matched = polymerByWpId.get(r.idPost) + if (matched) { + target = `/${matched.section}/${matched.year}/${matched.month}/${matched.slug}` + toPolymer++ + } else if (r.postName && r.postDate && r.postStatus === 'publish' && r.postType === 'post') { + // Polymer hasn't ingested this post (rare). Fall back to the WP-era + // permalink shape, which our LEGACY_WP_URL_RE branch will then try + // to resolve at request time. + const [y, m, d] = r.postDate.split('-') + if (y && m && d) { + target = `/${y}/${m}/${d}/${r.postName}/` + toMirrorFallback++ + } + } else if (r.postType === 'attachment' && r.guid) { + // Media attachments live in the archive proxy under + // /archive/wordpress-media/uploads/... 
We rewrite the guid path- + // suffix the same way image-rewriter does for body images. + const m = r.guid.match(/^https?:\/\/[^/]+\/wp-content\/uploads\/(.+)$/i) + if (m) { + target = `/archive/wordpress-media/uploads/${m[1]}` + toAttachment++ + } + } + } + + if (!target && r.external) { + target = r.external + toExternal++ + } + + if (!target) { + skipped++ + continue + } + + inserts.push({ code: r.code, target, hits: r.hits }) + } + + console.log(`\nResolution:`) + console.log(` → polymer URL : ${toPolymer}`) + console.log(` → mirror fallback: ${toMirrorFallback}`) + console.log(` → attachment : ${toAttachment}`) + console.log(` → external URL : ${toExternal}`) + console.log(` skipped : ${skipped}`) + console.log(` total inserts : ${inserts.length}`) + + if (write) { + // Truncate and re-insert so re-runs are idempotent. + await pool.query(`TRUNCATE TABLE legacy_shortlinks`) + const CHUNK = 500 + for (let i = 0; i < inserts.length; i += CHUNK) { + const batch = inserts.slice(i, i + CHUNK) + const values: string[] = [] + const params: (string | number)[] = [] + let p = 1 + for (const ins of batch) { + values.push(`($${p++}, $${p++}, $${p++})`) + params.push(ins.code, ins.target, ins.hits) + } + await pool.query( + `INSERT INTO legacy_shortlinks (short_code, target_url, hit_count) VALUES ${values.join(', ')} + ON CONFLICT (short_code) DO UPDATE SET target_url = EXCLUDED.target_url, hit_count = EXCLUDED.hit_count`, + params, + ) + } + } + + console.log(`\nMode: ${write ? 'WRITTEN' : 'DRY RUN'}`) + await pool.end() +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/scripts/legacy-import/backfill-wp-featured-images.ts b/scripts/legacy-import/backfill-wp-featured-images.ts new file mode 100644 index 0000000..1a47801 --- /dev/null +++ b/scripts/legacy-import/backfill-wp-featured-images.ts @@ -0,0 +1,215 @@ +/** + * Backfill featured images on wordpress-era articles. 
+ * + * Reads source postmeta from /tmp/audit/wp.db (Kicker/Photo/PhotoCaption/ + * PhotoByline/Photographer) and: + * 1. converts the Photo path to its archived URL + * `/wp-content/uploads/...` → `/archive/wordpress-media/uploads/...` + * 2. resolves it via LegacyMediaResolver (gets-or-creates a media row) + * 3. populates `articles.featured_image_id` + * 4. populates `articles.image_caption` from PhotoCaption (entity-decoded) + * 5. populates `media.write_in_photographer` from PhotoByline / Photographer + * + * The audit confirmed 1473 distinct Photo paths exist; 1470 of those map to + * actual files on disk in `recon/archives/wordpress/uploads-extracted/`. + * + * Run with `pnpm tsx scripts/legacy-import/backfill-wp-featured-images.ts`. + * Defaults to a dry run; pass `--write` to actually update rows. + */ + +import { Pool } from 'pg' +import { execFileSync } from 'child_process' +import { LegacyMediaResolver } from './media-resolver' +import { decodeEntities } from './wordpress/html-tokenizer' + +const WP_SQLITE = '/tmp/audit/wp.db' + +type Postmeta = { + photo?: string + photoCaption?: string + photoByline?: string + photographer?: string + origPhoto?: string +} + +function parseArgs() { + const args = process.argv.slice(2) + return { write: args.includes('--write') } +} + +// Convert source Photo path to the polymer archive URL. +// /wp-content/uploads/X → /archive/wordpress-media/uploads/X +// Returns null for values that don't look like a real path (e.g. "0", URLs to +// poly.rpi.edu, absolute http:// links, etc). +function toArchiveUrl(rawPath: string | undefined): string | null { + if (!rawPath) return null + const v = rawPath.trim() + if (!v || v === '0') return null + if (v.startsWith('/wp-content/uploads/')) { + return '/archive/wordpress-media/uploads/' + v.slice('/wp-content/uploads/'.length) + } + // 32 rows are full URLs. Convert if they're pointing at poly.rpi.edu uploads. 
+ const m = v.match(/^https?:\/\/[^/]+\/wp-content\/uploads\/(.+)$/i) + if (m) { + return '/archive/wordpress-media/uploads/' + m[1] + } + // 1 row references /wp-includes/, ignore. + return null +} + +// Strip "The Polytechnic"-style trailing publication credit from a +// byline. WP postmeta consistently formats as "Name/The Polytechnic" +// or "Name/The Polytechnic"; we want just "Name". +function cleanByline(raw: string | undefined): string | null { + if (!raw) return null + let v = decodeEntities(raw).trim() + // Drop trailing "/The Polytechnic" (with or without italic markup). + v = v.replace(/<[^>]+>/g, '').trim() + v = v.replace(/\s*\/\s*The Polytechnic\s*$/i, '').trim() + v = v.replace(/\s*-\s*The Polytechnic\s*$/i, '').trim() + return v || null +} + +function cleanCaption(raw: string | undefined): string | null { + if (!raw) return null + const v = decodeEntities(raw).trim().replace(/\s+/g, ' ') + return v || null +} + +async function main() { + const { write } = parseArgs() + console.log(`Mode: write=${write}`) + + // Pull all relevant postmeta in one shot. + const tsv = execFileSync( + 'sqlite3', + [ + '-separator', + '\t', + WP_SQLITE, + "SELECT post_id, meta_key, meta_value FROM postmeta WHERE meta_key IN ('Photo','PhotoCaption','PhotoByline','Photographer','OrigPhoto') AND meta_value IS NOT NULL AND meta_value != ''", + ], + { maxBuffer: 64 * 1024 * 1024 }, + ).toString() + + const byWpId = new Map() + for (const line of tsv.split('\n')) { + if (!line) continue + const tab1 = line.indexOf('\t') + const tab2 = line.indexOf('\t', tab1 + 1) + if (tab1 < 0 || tab2 < 0) continue + const pid = Number(line.slice(0, tab1)) + const key = line.slice(tab1 + 1, tab2) + const value = line.slice(tab2 + 1) + if (!pid || !value) continue + const cur = byWpId.get(pid) ?? 
{} + if (key === 'Photo') cur.photo = value + else if (key === 'PhotoCaption') cur.photoCaption = value + else if (key === 'PhotoByline') cur.photoByline = value + else if (key === 'Photographer') cur.photographer = value + else if (key === 'OrigPhoto') cur.origPhoto = value + byWpId.set(pid, cur) + } + + console.log(`WP postmeta: ${byWpId.size} wp_ids with photo-related meta`) + + const pool = new Pool({ + host: '127.0.0.1', + port: 5433, + user: 'poly', + password: 'poly', + database: 'polymer2', + }) + const resolver = new LegacyMediaResolver(pool) + + let imageFills = 0 + let captionFills = 0 + let mediaCreated = 0 + let mediaReused = 0 + let bylineFills = 0 + let pathSkipped = 0 + let articleNotFound = 0 + + for (const [wpId, meta] of byWpId) { + const archiveUrl = toArchiveUrl(meta.photo) + if (!archiveUrl) { + // Even without a Photo path we may still want to set the caption alone, + // but image_caption belongs to the article's hero image; if there's no + // image, skip. Caption-only would render as orphaned text. + pathSkipped++ + continue + } + + const article = await pool.query<{ id: number; featured_image_id: number | null; image_caption: string | null }>( + `SELECT id, featured_image_id, image_caption FROM articles WHERE legacy_source='wordpress' AND legacy_article_id=$1`, + [String(wpId)], + ) + if (article.rowCount === 0) { + articleNotFound++ + continue + } + const row = article.rows[0] + + // Skip articles that already have a featured image — be conservative. + if (row.featured_image_id) continue + + // Resolve (or create) the media row. + const beforeSize = resolver.size() + const mediaId = await resolver.resolve(archiveUrl, null) + if (!mediaId) continue + if (resolver.size() > beforeSize) mediaCreated++ + else mediaReused++ + + // Update photographer attribution on the media row if absent. + // Prefer Photographer (clean name) over PhotoByline (markup-laden). 
+ const byline = cleanByline(meta.photographer) || cleanByline(meta.photoByline) + if (byline && write) { + const r = await pool.query( + `UPDATE media + SET write_in_photographer = COALESCE(write_in_photographer, $1), + updated_at = NOW() + WHERE id = $2 AND (write_in_photographer IS NULL OR write_in_photographer = '')`, + [byline, mediaId], + ) + if ((r.rowCount ?? 0) > 0) bylineFills++ + } else if (byline) { + bylineFills++ + } + + const caption = cleanCaption(meta.photoCaption) + + if (write) { + await pool.query( + `UPDATE articles + SET featured_image_id = $1, + image_caption = COALESCE(image_caption, $2), + updated_at = NOW() + WHERE id = $3`, + [mediaId, caption, row.id], + ) + } + imageFills++ + if (caption && !row.image_caption) captionFills++ + + if (imageFills % 200 === 0) { + console.log(` …${imageFills} images filled`) + } + } + + console.log(`\nResults:`) + console.log(` featured_image_id filled : ${imageFills}`) + console.log(` image_caption filled : ${captionFills}`) + console.log(` media rows created : ${mediaCreated}`) + console.log(` media rows reused : ${mediaReused}`) + console.log(` byline backfills on media: ${bylineFills}`) + console.log(` Photo path skipped : ${pathSkipped}`) + console.log(` article not found : ${articleNotFound}`) + console.log(` Mode: ${write ? 'WRITTEN' : 'DRY RUN'}`) + + await pool.end() +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/scripts/legacy-import/reslugify-wordpress.ts b/scripts/legacy-import/reslugify-wordpress.ts new file mode 100644 index 0000000..3800457 --- /dev/null +++ b/scripts/legacy-import/reslugify-wordpress.ts @@ -0,0 +1,168 @@ +/** + * Regenerate slugs for wordpress-era articles whose original `_`-separated + * `post_name` was stripped (not hyphen-replaced) during import, producing + * keyword-mash slugs like `2017-04-05-votingimpactsstudents`. 
+ *
+ * Approach:
+ * - read source `posts.post_name` from the WP DB dump (sqlite)
+ * - replace `_` with `-`, drop other non-[a-z0-9-] chars, collapse `--`s
+ * - new slug: `YYYY-MM-DD-{cleaned post_name}`
+ * - if the new slug differs from the current one, save the current slug to
+ *   `previous_slug` and overwrite `slug` + `_articles_v.version_slug`
+ *
+ * Collisions: handled by appending `-{wp_id}` to the new slug. polymer's slug
+ * column is UNIQUE so this is necessary in the (rare) cases where two
+ * articles have the same date+post_name in the source.
+ *
+ * Run with `pnpm tsx scripts/legacy-import/reslugify-wordpress.ts`. Defaults
+ * to a dry run; pass `--write` to commit.
+ */
+
+import { Pool } from 'pg'
+import { execFileSync } from 'child_process'
+
+const WP_SQLITE = '/tmp/audit/wp.db'
+
+function parseArgs() {
+  const args = process.argv.slice(2)
+  return { write: args.includes('--write') }
+}
+
+// Match polymer's slugify rule but operate on text that uses `_` as the
+// word separator (the WP `post_name` convention). Underscores become hyphens
+// here, then we run the standard cleanup.
+function cleanWpPostname(postName: string): string {
+  if (!postName) return ''
+  return postName
+    .toLowerCase()
+    .replace(/_/g, '-')
+    .replace(/[^a-z0-9\s-]/g, '')
+    .trim()
+    .replace(/\s+/g, '-')
+    .replace(/-+/g, '-')
+    .replace(/(^-|-$)/g, '')
+}
+
+async function main() {
+  const { write } = parseArgs()
+  console.log(`Mode: write=${write}`)
+
+  // Pull post_name + post_date from sqlite in one shot.
+ const tsv = execFileSync( + 'sqlite3', + [ + '-separator', + '\t', + WP_SQLITE, + "SELECT id, post_name, substr(post_date, 1, 10) AS post_date FROM posts WHERE post_status='publish' AND post_type='post' AND post_name IS NOT NULL AND post_name != ''", + ], + { maxBuffer: 64 * 1024 * 1024 }, + ).toString() + + type WpRow = { id: number; postName: string; date: string } + const wpRows: WpRow[] = [] + for (const line of tsv.split('\n')) { + if (!line) continue + const [idStr, postName, date] = line.split('\t') + const id = Number(idStr) + if (!id || !postName || !date) continue + wpRows.push({ id, postName, date }) + } + console.log(`WP source rows: ${wpRows.length}`) + + const pool = new Pool({ + host: '127.0.0.1', + port: 5433, + user: 'poly', + password: 'poly', + database: 'polymer2', + }) + + let totalChecked = 0 + let unchanged = 0 + let renamed = 0 + let collisions = 0 + let articleNotFound = 0 + const samples: { wpId: number; old: string; new: string }[] = [] + + // Pre-build a map of existing slugs to detect collisions before we attempt + // an UPDATE (cheaper than catching unique-constraint violations). + const existing = await pool.query<{ slug: string; id: number }>( + `SELECT slug, id FROM articles WHERE slug IS NOT NULL`, + ) + const slugToId = new Map() + for (const r of existing.rows) slugToId.set(r.slug, r.id) + + for (const w of wpRows) { + const article = await pool.query<{ id: number; slug: string | null }>( + `SELECT id, slug FROM articles WHERE legacy_source='wordpress' AND legacy_article_id=$1`, + [String(w.id)], + ) + if (article.rowCount === 0) { + articleNotFound++ + continue + } + const row = article.rows[0] + if (!row.slug) continue + + const cleaned = cleanWpPostname(w.postName) + if (!cleaned) continue + let newSlug = `${w.date}-${cleaned}` + + totalChecked++ + + if (newSlug === row.slug) { + unchanged++ + continue + } + + // Collision check. 
If the target slug already belongs to a *different* + // article, append `-{wp_id}` to disambiguate. + const taker = slugToId.get(newSlug) + if (taker !== undefined && taker !== row.id) { + newSlug = `${newSlug}-${w.id}` + collisions++ + if (slugToId.has(newSlug)) { + // Highly unlikely but surface it. + console.warn(`double-collision skipped: wp_id=${w.id} → ${newSlug}`) + continue + } + } + + if (write) { + await pool.query( + `UPDATE articles SET previous_slug=$1, slug=$2, updated_at=NOW() WHERE id=$3`, + [row.slug, newSlug, row.id], + ) + // Also patch the latest version-shadow row so admin previews stay + // aligned with the live row. + await pool.query( + `UPDATE "_articles_v" SET version_slug=$1 WHERE parent_id=$2 AND version_slug=$3`, + [newSlug, row.id, row.slug], + ) + } + slugToId.delete(row.slug) + slugToId.set(newSlug, row.id) + renamed++ + + if (samples.length < 12) samples.push({ wpId: w.id, old: row.slug, new: newSlug }) + if (renamed % 500 === 0) console.log(` …${renamed} renamed`) + } + + console.log(`\nResults:`) + console.log(` checked : ${totalChecked}`) + console.log(` renamed : ${renamed}`) + console.log(` unchanged : ${unchanged}`) + console.log(` collisions handled : ${collisions}`) + console.log(` article not found : ${articleNotFound}`) + console.log(` Mode: ${write ? 
'WRITTEN' : 'DRY RUN'}`) + console.log(`\nSample renames:`) + for (const s of samples) console.log(` wp_id=${s.wpId}\n old: ${s.old}\n new: ${s.new}`) + + await pool.end() +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/scripts/run_deploy_sql_migrations.sh b/scripts/run_deploy_sql_migrations.sh index 4feae0b..583dcff 100755 --- a/scripts/run_deploy_sql_migrations.sh +++ b/scripts/run_deploy_sql_migrations.sh @@ -51,7 +51,9 @@ VALUES ('20260428_000000_add_media_image_sizes', 26, NOW(), NOW()), ('20260506_000000_add_articles_legacy_archive', 27, NOW(), NOW()), ('20260506_010000_add_articles_legacy_id_and_category', 27, NOW(), NOW()), - ('20260506_020000_add_articles_plain_content', 27, NOW(), NOW()) + ('20260506_020000_add_articles_plain_content', 27, NOW(), NOW()), + ('20260507_000000_add_articles_previous_slug', 28, NOW(), NOW()), + ('20260507_010000_add_legacy_shortlinks', 28, NOW(), NOW()) ON CONFLICT DO NOTHING; -- 20260317: Add opinion_type and image_caption columns @@ -1377,4 +1379,19 @@ CREATE INDEX IF NOT EXISTS "articles_legacy_source_legacy_article_id_idx" ON "ar -- document). Nullable until the legacy backfill completes. ALTER TABLE "articles" ADD COLUMN IF NOT EXISTS "plain_content" text; ALTER TABLE "_articles_v" ADD COLUMN IF NOT EXISTS "version_plain_content" text; + +-- 20260507_000000: Track a previous slug so renames (e.g. legacy slug +-- regen) can 301-redirect old polymer URLs to the new ones. +ALTER TABLE "articles" ADD COLUMN IF NOT EXISTS "previous_slug" varchar; +ALTER TABLE "_articles_v" ADD COLUMN IF NOT EXISTS "version_previous_slug" varchar; +CREATE INDEX IF NOT EXISTS "articles_previous_slug_idx" ON "articles" ("previous_slug"); + +-- 20260507_010000: Lookup table for the 12,872 5-char WordPress shortlinks +-- (pluginSL_shorturl). Middleware 301s / to target_url. 
+CREATE TABLE IF NOT EXISTS "legacy_shortlinks" ( + "short_code" varchar PRIMARY KEY, + "target_url" varchar NOT NULL, + "hit_count" integer NOT NULL DEFAULT 0, + "created_at" timestamp(3) with time zone NOT NULL DEFAULT NOW() +); SQL From 6997ce6b75337aae14a8126e05c7b5917de19e19 Mon Sep 17 00:00:00 2001 From: Ronan Hevenor Date: Thu, 7 May 2026 12:04:30 -0400 Subject: [PATCH 2/3] fix(legacy-archive): satisfy CodeQL multi-char sanitization on byline strip CodeQL flagged a single `replace(/<[^>]+>/g, '')` as an incomplete sanitizer because unbalanced angle brackets could survive. Replace with an indexOf fixpoint loop that drops the trailing fragment when a `<` has no matching `>`. --- .../backfill-wp-featured-images.ts | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/scripts/legacy-import/backfill-wp-featured-images.ts b/scripts/legacy-import/backfill-wp-featured-images.ts index 1a47801..8b3d2d5 100644 --- a/scripts/legacy-import/backfill-wp-featured-images.ts +++ b/scripts/legacy-import/backfill-wp-featured-images.ts @@ -60,11 +60,31 @@ function toArchiveUrl(rawPath: string | undefined): string | null { // Strip "The Polytechnic"-style trailing publication credit from a // byline. WP postmeta consistently formats as "Name/The Polytechnic" // or "Name/The Polytechnic"; we want just "Name". +// +// Uses indexOf scanning + a fixpoint loop instead of a single regex so the +// CodeQL `incomplete-multi-character-sanitization` rule is satisfied — a +// single `replace(/<[^>]+>/g, '')` pass can leave a leading `<` if the input +// has unbalanced angle brackets. +function stripTags(input: string): string { + let s = input + for (;;) { + const open = s.indexOf('<') + if (open === -1) break + const close = s.indexOf('>', open + 1) + if (close === -1) { + // Unterminated tag — drop everything from `<` onward. 
+ s = s.slice(0, open) + break + } + s = s.slice(0, open) + s.slice(close + 1) + } + return s +} + function cleanByline(raw: string | undefined): string | null { if (!raw) return null let v = decodeEntities(raw).trim() - // Drop trailing "/The Polytechnic" (with or without italic markup). - v = v.replace(/<[^>]+>/g, '').trim() + v = stripTags(v).trim() v = v.replace(/\s*\/\s*The Polytechnic\s*$/i, '').trim() v = v.replace(/\s*-\s*The Polytechnic\s*$/i, '').trim() return v || null From 09e0bfb28c581989dd1b5fedc35eaade4d103790 Mon Sep 17 00:00:00 2001 From: Ronan Hevenor Date: Thu, 7 May 2026 12:09:36 -0400 Subject: [PATCH 3/3] feat(legacy-archive): expand [gallery]/[gview]/iframe shortcodes + re-import wp_id 7191/7421 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * expand-wp-shortcodes.ts: 75 articles re-rendered. 58 [gallery] shortcodes produce real upload nodes (988 images resolved against the attachment guid map), 19 [gview] PDFs become 'Download PDF' links, and 2 iframes (1 YouTube, 1 Google Form) become outbound links. Pre-expands the shortcodes to / tags before handing off to the existing WP→Lexical pipeline + media-resolver. * fix-wp-7191-7421.ts: wp_id 7191 (Tate Boucher's neuromarketing letter, previously mis-imported as a day-archive listing titled 'RENSSELAER UNION') gets its real content + Letter-to-the-Editor kicker + author. wp_id 7421 has zero source content; demoted to draft. 
--- scripts/legacy-import/expand-wp-shortcodes.ts | 241 ++++++++++++++++++ scripts/legacy-import/fix-wp-7191-7421.ts | 208 +++++++++++++++ 2 files changed, 449 insertions(+) create mode 100644 scripts/legacy-import/expand-wp-shortcodes.ts create mode 100644 scripts/legacy-import/fix-wp-7191-7421.ts diff --git a/scripts/legacy-import/expand-wp-shortcodes.ts b/scripts/legacy-import/expand-wp-shortcodes.ts new file mode 100644 index 0000000..5d4ed98 --- /dev/null +++ b/scripts/legacy-import/expand-wp-shortcodes.ts @@ -0,0 +1,241 @@ +/** + * Expand WP body shortcodes that the original importer dropped. + * + * Targets 77 published WP posts whose source `post_content` includes one of: + * - [gallery ids="N1,N2,..."] (56 posts, 1012 attachment refs) + * - [gview file="URL"] (19 PDF "Full Issue" embeds) + * -