From 993f680a0844bf4fbcfdbc9daa8cc9934973cbeb Mon Sep 17 00:00:00 2001
From: Ronan Hevenor
Date: Thu, 7 May 2026 12:11:52 -0400
Subject: [PATCH 1/2] feat(legacy-archive): restore italic titles + decode
 entities + URL-encode legacy paths

* restore-wp-italic-titles.ts: 165 WP rich-text titles rebuilt to preserve
  the source <i>/<em>/<b>/<strong> formatting (e.g. "Super MNC fails to
  live up to predecessor" now renders italic). plain_title was already
  correct; only the Lexical title node was wrong.

* clean-plain-titles.ts: 27 plain_titles cleaned of residual
  `&#8217;`-style numeric entities (22 pipeline) and literal `<i>...</i>`
  tags (poly-online).

* legacy_html_url URL-encoding: 23 chips with literal spaces (Chris Mooney
  discusses... etc.) now use %20 so the chip clicks through to a working
  /archive/ path.
---
 scripts/legacy-import/clean-plain-titles.ts   |  71 +++++++
 .../legacy-import/restore-wp-italic-titles.ts | 195 ++++++++++++++++++
 2 files changed, 266 insertions(+)
 create mode 100644 scripts/legacy-import/clean-plain-titles.ts
 create mode 100644 scripts/legacy-import/restore-wp-italic-titles.ts

diff --git a/scripts/legacy-import/clean-plain-titles.ts b/scripts/legacy-import/clean-plain-titles.ts
new file mode 100644
index 0000000..fe4158b
--- /dev/null
+++ b/scripts/legacy-import/clean-plain-titles.ts
@@ -0,0 +1,71 @@
+/**
+ * Decode HTML entities and strip residual inline tags from `plain_title`
+ * for legacy rows where the original importer let them slip through.
+ *
+ * Audit found:
+ *   - 22 pipeline plain_titles with `&#8217;`/`&amp;` etc.
+ *   - 7 poly-online plain_titles with literal `<i>...</i>` tags
+ *
+ * Run with `pnpm tsx scripts/legacy-import/clean-plain-titles.ts [--write]`.
+ */
+
+import { Pool } from 'pg'
+import { decodeEntities } from './wordpress/html-tokenizer'
+
+function parseArgs() {
+  return { write: process.argv.slice(2).includes('--write') }
+}
+
+function stripTags(input: string): string {
+  let s = input
+  for (;;) {
+    const open = s.indexOf('<')
+    if (open === -1) break
+    const close = s.indexOf('>', open + 1)
+    if (close === -1) {
+      s = s.slice(0, open)
+      break
+    }
+    s = s.slice(0, open) + s.slice(close + 1)
+  }
+  return s
+}
+
+function clean(s: string): string {
+  return decodeEntities(stripTags(s)).replace(/\s+/g, ' ').trim()
+}
+
+async function main() {
+  const { write } = parseArgs()
+  const pool = new Pool({
+    host: '127.0.0.1',
+    port: 5433,
+    user: 'poly',
+    password: 'poly',
+    database: 'polymer2',
+  })
+
+  const r = await pool.query<{ id: number; plain_title: string; legacy_source: string | null }>(
+    `SELECT id, plain_title, legacy_source FROM articles
+     WHERE plain_title IS NOT NULL
+       AND (plain_title ~ '<[a-z]+>' OR plain_title ~ '&#[0-9]+;' OR plain_title ~ '&[a-z]+;')`,
+  )
+  console.log(`candidates: ${r.rows.length}`)
+
+  let updated = 0
+  for (const row of r.rows) {
+    const cleaned = clean(row.plain_title)
+    if (cleaned === row.plain_title) continue
+    if (write) {
+      await pool.query(`UPDATE articles SET plain_title=$1, updated_at=NOW() WHERE id=$2`, [cleaned, row.id])
+    }
+    updated++
+  }
+  console.log(`updated: ${updated} (${write ? 'WRITTEN' : 'DRY RUN'})`)
+  await pool.end()
+}
+
+main().catch((e) => {
+  console.error(e)
+  process.exit(1)
+})
diff --git a/scripts/legacy-import/restore-wp-italic-titles.ts b/scripts/legacy-import/restore-wp-italic-titles.ts
new file mode 100644
index 0000000..d637a4b
--- /dev/null
+++ b/scripts/legacy-import/restore-wp-italic-titles.ts
@@ -0,0 +1,195 @@
+/**
+ * Rebuild rich-text titles for the 165 WP posts whose source `post_title`
+ * carried inline `<i>`/`<em>`/`<b>`/`<strong>` markup the original importer
+ * dropped.
The plain_title field is fine (the importer already strips tags
+ * for display); only the Lexical title doc is wrong.
+ *
+ * Approach: tokenize the source title (very small subset — just the four
+ * format tags + plain text), emit alternating text nodes with `format` bit
+ * flags, wrap in a paragraph + root.
+ *
+ * Run with `pnpm tsx scripts/legacy-import/restore-wp-italic-titles.ts [--write]`.
+ */
+
+import { Pool } from 'pg'
+import { execFileSync } from 'child_process'
+import { decodeEntities } from './wordpress/html-tokenizer'
+
+const WP_SQLITE = '/tmp/audit/wp.db'
+const FMT_BOLD = 1
+const FMT_ITALIC = 2
+
+function parseArgs() {
+  return { write: process.argv.slice(2).includes('--write') }
+}
+
+type Run = { text: string; format: number }
+
+// Very small tokenizer covering the cases we see in real titles. Self-closing
+// tags shouldn't appear here. The tag set is fixed; anything else is dropped.
+function parseTitleRuns(html: string): Run[] {
+  const runs: Run[] = []
+  let format = 0
+  let i = 0
+  let buf = ''
+
+  const FMT_TAGS: Record<string, number> = { i: FMT_ITALIC, em: FMT_ITALIC, b: FMT_BOLD, strong: FMT_BOLD }
+  const flush = () => {
+    if (buf) {
+      runs.push({ text: buf, format })
+      buf = ''
+    }
+  }
+
+  while (i < html.length) {
+    const open = html.indexOf('<', i)
+    if (open === -1) {
+      buf += html.slice(i)
+      break
+    }
+    buf += html.slice(i, open)
+    const close = html.indexOf('>', open + 1)
+    if (close === -1) {
+      // Unterminated — treat as literal text.
+      buf += html.slice(open)
+      break
+    }
+    const inner = html.slice(open + 1, close).trim()
+    const isClosing = inner.startsWith('/')
+    const tagName = (isClosing ? inner.slice(1) : inner).split(/\s/)[0].toLowerCase()
+    const fmt = FMT_TAGS[tagName]
+    if (fmt !== undefined) {
+      flush()
+      format = isClosing ? format & ~fmt : format | fmt
+    }
+    // Drop any other tags silently.
+    i = close + 1
+  }
+  flush()
+  return runs.filter((r) => r.text.length > 0).map((r) => ({ ...r, text: decodeEntities(r.text) }))
+}
+
+function buildTitleDoc(runs: Run[]): unknown {
+  // Drop any leading/trailing whitespace runs that decoding+tag-stripping can
+  // produce, then collapse internal double-spaces (the WP source frequently
+  // had `<i>X </i>` with a trailing space that's now adjacent to an italic-
+  // boundary).
+  const collapsed: Run[] = []
+  for (const r of runs) {
+    const text = r.text.replace(/\s+/g, ' ')
+    if (!text) continue
+    collapsed.push({ text, format: r.format })
+  }
+  // Strip leading whitespace on the very first run + trailing on the last.
+  if (collapsed.length > 0) {
+    collapsed[0].text = collapsed[0].text.replace(/^\s+/, '')
+    collapsed[collapsed.length - 1].text = collapsed[collapsed.length - 1].text.replace(/\s+$/, '')
+  }
+  const children = collapsed
+    .filter((r) => r.text.length > 0)
+    .map((r) => ({
+      type: 'text',
+      version: 1,
+      format: r.format,
+      detail: 0,
+      mode: 'normal',
+      style: '',
+      text: r.text,
+    }))
+
+  return {
+    root: {
+      type: 'root',
+      version: 1,
+      format: '',
+      indent: 0,
+      direction: 'ltr',
+      children: [
+        {
+          type: 'paragraph',
+          version: 1,
+          format: '',
+          indent: 0,
+          direction: 'ltr',
+          textFormat: 0,
+          children,
+        },
+      ],
+    },
+  }
+}
+
+async function main() {
+  const { write } = parseArgs()
+  console.log(`Mode: write=${write}`)
+
+  // Pull every published WP post whose title carries any of the four format
+  // tags. Using JSON mode avoids tab/newline edge cases.
+  const json = execFileSync(
+    'sqlite3',
+    [
+      '-json',
+      WP_SQLITE,
+      `SELECT ID AS id, post_title FROM posts
+       WHERE post_status='publish' AND post_type='post'
+         AND (post_title LIKE '%<i>%' OR post_title LIKE '%</i>%'
+           OR post_title LIKE '%<em>%' OR post_title LIKE '%</em>%'
+           OR post_title LIKE '%<b>%' OR post_title LIKE '%</b>%'
+           OR post_title LIKE '%<strong>%' OR post_title LIKE '%</strong>%')`,
+    ],
+    { maxBuffer: 64 * 1024 * 1024 },
+  ).toString()
+  const wpRows = JSON.parse(json) as { id: number; post_title: string }[]
+  console.log(`Source rows with tagged titles: ${wpRows.length}`)
+
+  const pool = new Pool({
+    host: '127.0.0.1',
+    port: 5433,
+    user: 'poly',
+    password: 'poly',
+    database: 'polymer2',
+  })
+
+  let updated = 0
+  let articleNotFound = 0
+  let noFormat = 0
+
+  for (const w of wpRows) {
+    const runs = parseTitleRuns(w.post_title)
+    if (runs.every((r) => r.format === 0)) {
+      noFormat++
+      continue
+    }
+    const doc = buildTitleDoc(runs)
+
+    const r = await pool.query<{ id: number }>(
+      `SELECT id FROM articles WHERE legacy_source='wordpress' AND legacy_article_id=$1`,
+      [String(w.id)],
+    )
+    if (r.rowCount === 0) {
+      articleNotFound++
+      continue
+    }
+    const articleId = r.rows[0].id
+
+    if (write) {
+      await pool.query(`UPDATE articles SET title=$1::jsonb, updated_at=NOW() WHERE id=$2`, [
+        JSON.stringify(doc),
+        articleId,
+      ])
+    }
+    updated++
+  }
+
+  console.log(`\nResults:`)
+  console.log(`  titles updated    : ${updated}`)
+  console.log(`  no formatting     : ${noFormat}`)
+  console.log(`  article not found : ${articleNotFound}`)
+  console.log(`  Mode: ${write ?
'WRITTEN' : 'DRY RUN'}`)
+  await pool.end()
+}
+
+main().catch((err) => {
+  console.error(err)
+  process.exit(1)
+})

From 2820619140d431f852dd3dce195a21a06d6ae225 Mon Sep 17 00:00:00 2001
From: Ronan Hevenor
Date: Thu, 7 May 2026 12:15:03 -0400
Subject: [PATCH 2/2] feat(legacy-archive): fix poly-online dates, dedup
 author casings, split a slash byline

* 11 polytechnic-online articles had the wrong published_date (MM/DD swap
  from manifest parse ambiguity, plus 2 off-by-ones). Slugs adjusted to
  reflect the corrected date; collisions disambiguated with the legacy
  article id suffix. previous_slug retained for the old shape so existing
  links 301.

* normalize-author-casing.ts: 53 author groups had 2+ casings ("James
  Lenze II" vs "JAmes Lenze II" etc.). Picked the most-common form per
  group, renamed 123 article rows to it.

* wp_id 2558's mis-OCR'd "Russell Brown/Paul O'Neil" byline split into
  two write-in author rows.
---
 .../legacy-import/normalize-author-casing.ts  | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 scripts/legacy-import/normalize-author-casing.ts

diff --git a/scripts/legacy-import/normalize-author-casing.ts b/scripts/legacy-import/normalize-author-casing.ts
new file mode 100644
index 0000000..cbde818
--- /dev/null
+++ b/scripts/legacy-import/normalize-author-casing.ts
@@ -0,0 +1,72 @@
+/**
+ * Author-casing dedup pass. Audit found ~40 cases where the same person was
+ * stored with two different casings (e.g. `James Lenze II` + `JAmes Lenze
+ * II`). For each lower-cased canonical name, we pick the most-common casing
+ * and rename every occurrence to it.
+ *
+ * Run with `pnpm tsx scripts/legacy-import/normalize-author-casing.ts [--write]`.
+ */
+
+import { Pool } from 'pg'
+
+function parseArgs() {
+  return { write: process.argv.slice(2).includes('--write') }
+}
+
+async function main() {
+  const { write } = parseArgs()
+  const pool = new Pool({
+    host: '127.0.0.1',
+    port: 5433,
+    user: 'poly',
+    password: 'poly',
+    database: 'polymer2',
+  })
+
+  // Group by lowercase name. The most-common casing wins. Ties go to whichever
+  // form is first in alphabetical order — deterministic.
+  const r = await pool.query<{ name: string; cnt: number }>(
+    `SELECT name, COUNT(*)::int AS cnt FROM articles_write_in_authors GROUP BY name`,
+  )
+  const byKey = new Map<string, { name: string; cnt: number }[]>()
+  for (const row of r.rows) {
+    const key = row.name.toLowerCase().trim()
+    if (!key) continue
+    const list = byKey.get(key) ?? []
+    list.push({ name: row.name, cnt: row.cnt })
+    byKey.set(key, list)
+  }
+
+  let groups = 0
+  let renames = 0
+  let rowsAffected = 0
+  for (const [, list] of byKey) {
+    if (list.length < 2) continue
+    groups++
+    list.sort((a, b) => b.cnt - a.cnt || a.name.localeCompare(b.name))
+    const winner = list[0].name
+    for (let i = 1; i < list.length; i++) {
+      const loser = list[i]
+      // Skip if the only difference is invisible (whitespace).
+      if (winner === loser.name) continue
+      if (write) {
+        const u = await pool.query(`UPDATE articles_write_in_authors SET name=$1 WHERE name=$2`, [winner, loser.name])
+        rowsAffected += u.rowCount ?? 0
+      } else {
+        rowsAffected += loser.cnt
+      }
+      renames++
+    }
+  }
+
+  console.log(`groups with multiple casings: ${groups}`)
+  console.log(`renamed casings             : ${renames}`)
+  console.log(`rows affected               : ${rowsAffected}`)
+  console.log(`Mode: ${write ? 'WRITTEN' : 'DRY RUN'}`)
+  await pool.end()
+}
+
+main().catch((e) => {
+  console.error(e)
+  process.exit(1)
+})