From 993f680a0844bf4fbcfdbc9daa8cc9934973cbeb Mon Sep 17 00:00:00 2001
From: Ronan Hevenor
Date: Thu, 7 May 2026 12:11:52 -0400
Subject: [PATCH 1/2] feat(legacy-archive): restore italic titles + decode
 entities + URL-encode legacy paths

* restore-wp-italic-titles.ts: 165 WP rich-text titles rebuilt to preserve
  the source <i>/<em>/<b>/<strong> formatting (e.g. "Super MNC fails to
  live up to predecessor" now renders italic). plain_title was already
  correct; only the Lexical title node was wrong.

* clean-plain-titles.ts: 27 plain_titles cleaned of residual
  `&#8217;`-style numeric entities (22 pipeline) and literal `<i>...</i>`
  tags (poly-online).

* legacy_html_url URL-encoding: 23 chips with literal spaces (Chris Mooney
  discusses... etc.) now use %20 so the chip clicks through to a working
  /archive/ path.
---
 scripts/legacy-import/clean-plain-titles.ts   |  71 +++++++
 .../legacy-import/restore-wp-italic-titles.ts | 195 ++++++++++++++++++
 2 files changed, 266 insertions(+)
 create mode 100644 scripts/legacy-import/clean-plain-titles.ts
 create mode 100644 scripts/legacy-import/restore-wp-italic-titles.ts

diff --git a/scripts/legacy-import/clean-plain-titles.ts b/scripts/legacy-import/clean-plain-titles.ts
new file mode 100644
index 0000000..fe4158b
--- /dev/null
+++ b/scripts/legacy-import/clean-plain-titles.ts
@@ -0,0 +1,71 @@
+/**
+ * Decode HTML entities and strip residual inline tags from `plain_title`
+ * for legacy rows where the original importer let them slip through.
+ *
+ * Audit found:
+ *   - 22 pipeline plain_titles with `&#8217;`/`&amp;` etc.
+ *   - 7 poly-online plain_titles with literal `<i>...</i>` tags
+ *
+ * Run with `pnpm tsx scripts/legacy-import/clean-plain-titles.ts [--write]`.
+ */
+
+import { Pool } from 'pg'
+import { decodeEntities } from './wordpress/html-tokenizer'
+
+function parseArgs() {
+  return { write: process.argv.slice(2).includes('--write') }
+}
+
+function stripTags(input: string): string {
+  let s = input
+  for (;;) {
+    const open = s.indexOf('<')
+    if (open === -1) break
+    const close = s.indexOf('>', open + 1)
+    if (close === -1) {
+      s = s.slice(0, open)
+      break
+    }
+    s = s.slice(0, open) + s.slice(close + 1)
+  }
+  return s
+}
+
+function clean(s: string): string {
+  return decodeEntities(stripTags(s)).replace(/\s+/g, ' ').trim()
+}
+
+async function main() {
+  const { write } = parseArgs()
+  const pool = new Pool({
+    host: '127.0.0.1',
+    port: 5433,
+    user: 'poly',
+    password: 'poly',
+    database: 'polymer2',
+  })
+
+  const r = await pool.query<{ id: number; plain_title: string; legacy_source: string | null }>(
+    `SELECT id, plain_title, legacy_source FROM articles
+     WHERE plain_title IS NOT NULL
+       AND (plain_title ~ '<[a-z]+>' OR plain_title ~ '&#[0-9]+;' OR plain_title ~ '&[a-z]+;')`,
+  )
+  console.log(`candidates: ${r.rows.length}`)
+
+  let updated = 0
+  for (const row of r.rows) {
+    const cleaned = clean(row.plain_title)
+    if (cleaned === row.plain_title) continue
+    if (write) {
+      await pool.query(`UPDATE articles SET plain_title=$1, updated_at=NOW() WHERE id=$2`, [cleaned, row.id])
+    }
+    updated++
+  }
+  console.log(`updated: ${updated} (${write ? 'WRITTEN' : 'DRY RUN'})`)
+  await pool.end()
+}
+
+main().catch((e) => {
+  console.error(e)
+  process.exit(1)
+})
diff --git a/scripts/legacy-import/restore-wp-italic-titles.ts b/scripts/legacy-import/restore-wp-italic-titles.ts
new file mode 100644
index 0000000..d637a4b
--- /dev/null
+++ b/scripts/legacy-import/restore-wp-italic-titles.ts
@@ -0,0 +1,195 @@
+/**
+ * Rebuild rich-text titles for the 165 WP posts whose source `post_title`
+ * carried inline `<i>`/`<em>`/`<b>`/`<strong>` markup the original importer
+ * dropped.
The plain_title field is fine (the importer already strips tags
+ * for display); only the Lexical title doc is wrong.
+ *
+ * Approach: tokenize the source title (very small subset — just the four
+ * format tags + plain text), emit alternating text nodes with `format` bit
+ * flags, wrap in a paragraph + root.
+ *
+ * Run with `pnpm tsx scripts/legacy-import/restore-wp-italic-titles.ts [--write]`.
+ */
+
+import { Pool } from 'pg'
+import { execFileSync } from 'child_process'
+import { decodeEntities } from './wordpress/html-tokenizer'
+
+const WP_SQLITE = '/tmp/audit/wp.db'
+const FMT_BOLD = 1
+const FMT_ITALIC = 2
+
+function parseArgs() {
+  return { write: process.argv.slice(2).includes('--write') }
+}
+
+type Run = { text: string; format: number }
+
+// Very small tokenizer covering the cases we see in real titles. Self-closing
+// tags shouldn't appear here. The tag set is fixed; anything else is dropped.
+function parseTitleRuns(html: string): Run[] {
+  const runs: Run[] = []
+  let format = 0
+  let i = 0
+  let buf = ''
+
+  const FMT_TAGS: Record<string, number> = { i: FMT_ITALIC, em: FMT_ITALIC, b: FMT_BOLD, strong: FMT_BOLD }
+  const flush = () => {
+    if (buf) {
+      runs.push({ text: buf, format })
+      buf = ''
+    }
+  }
+
+  while (i < html.length) {
+    const open = html.indexOf('<', i)
+    if (open === -1) {
+      buf += html.slice(i)
+      break
+    }
+    buf += html.slice(i, open)
+    const close = html.indexOf('>', open + 1)
+    if (close === -1) {
+      // Unterminated — treat as literal text.
+      buf += html.slice(open)
+      break
+    }
+    const inner = html.slice(open + 1, close).trim()
+    const isClosing = inner.startsWith('/')
+    const tagName = (isClosing ? inner.slice(1) : inner).split(/\s/)[0].toLowerCase()
+    const fmt = FMT_TAGS[tagName]
+    if (fmt !== undefined) {
+      flush()
+      format = isClosing ? format & ~fmt : format | fmt
+    }
+    // Drop any other tags silently.
+    i = close + 1
+  }
+  flush()
+  return runs.filter((r) => r.text.length > 0).map((r) => ({ ...r, text: decodeEntities(r.text) }))
+}
+
+function buildTitleDoc(runs: Run[]): unknown {
+  // Drop any leading/trailing whitespace runs that decoding+tag-stripping can
+  // produce, then collapse internal double-spaces (the WP source frequently
+  // had `<i>X </i>` with a trailing space that's now adjacent to an italic-
+  // boundary).
+  const collapsed: Run[] = []
+  for (const r of runs) {
+    const text = r.text.replace(/\s+/g, ' ')
+    if (!text) continue
+    collapsed.push({ text, format: r.format })
+  }
+  // Strip leading whitespace on the very first run + trailing on the last.
+  if (collapsed.length > 0) {
+    collapsed[0].text = collapsed[0].text.replace(/^\s+/, '')
+    collapsed[collapsed.length - 1].text = collapsed[collapsed.length - 1].text.replace(/\s+$/, '')
+  }
+  const children = collapsed
+    .filter((r) => r.text.length > 0)
+    .map((r) => ({
+      type: 'text',
+      version: 1,
+      format: r.format,
+      detail: 0,
+      mode: 'normal',
+      style: '',
+      text: r.text,
+    }))
+
+  return {
+    root: {
+      type: 'root',
+      version: 1,
+      format: '',
+      indent: 0,
+      direction: 'ltr',
+      children: [
+        {
+          type: 'paragraph',
+          version: 1,
+          format: '',
+          indent: 0,
+          direction: 'ltr',
+          textFormat: 0,
+          children,
+        },
+      ],
+    },
+  }
+}
+
+async function main() {
+  const { write } = parseArgs()
+  console.log(`Mode: write=${write}`)
+
+  // Pull every published WP post whose title carries any of the four format
+  // tags. Using JSON mode avoids tab/newline edge cases.
+  const json = execFileSync(
+    'sqlite3',
+    [
+      '-json',
+      WP_SQLITE,
+      `SELECT ID AS id, post_title FROM posts
+       WHERE post_status='publish' AND post_type='post'
+         AND (post_title LIKE '%<i>%' OR post_title LIKE '%</i>%'
+           OR post_title LIKE '%<em>%' OR post_title LIKE '%</em>%'
+           OR post_title LIKE '%<b>%' OR post_title LIKE '%</b>%'
+           OR post_title LIKE '%<strong>%' OR post_title LIKE '%</strong>%')`,
+    ],
+    { maxBuffer: 64 * 1024 * 1024 },
+  ).toString()
+  const wpRows = JSON.parse(json) as { id: number; post_title: string }[]
+  console.log(`Source rows with tagged titles: ${wpRows.length}`)
+
+  const pool = new Pool({
+    host: '127.0.0.1',
+    port: 5433,
+    user: 'poly',
+    password: 'poly',
+    database: 'polymer2',
+  })
+
+  let updated = 0
+  let articleNotFound = 0
+  let noFormat = 0
+
+  for (const w of wpRows) {
+    const runs = parseTitleRuns(w.post_title)
+    if (runs.every((r) => r.format === 0)) {
+      noFormat++
+      continue
+    }
+    const doc = buildTitleDoc(runs)
+
+    const r = await pool.query<{ id: number }>(
+      `SELECT id FROM articles WHERE legacy_source='wordpress' AND legacy_article_id=$1`,
+      [String(w.id)],
+    )
+    if (r.rowCount === 0) {
+      articleNotFound++
+      continue
+    }
+    const articleId = r.rows[0].id
+
+    if (write) {
+      await pool.query(`UPDATE articles SET title=$1::jsonb, updated_at=NOW() WHERE id=$2`, [
+        JSON.stringify(doc),
+        articleId,
+      ])
+    }
+    updated++
+  }
+
+  console.log(`\nResults:`)
+  console.log(`  titles updated    : ${updated}`)
+  console.log(`  no formatting     : ${noFormat}`)
+  console.log(`  article not found : ${articleNotFound}`)
+  console.log(`  Mode: ${write ?
'WRITTEN' : 'DRY RUN'}`)
+  await pool.end()
+}
+
+main().catch((err) => {
+  console.error(err)
+  process.exit(1)
+})

From 2820619140d431f852dd3dce195a21a06d6ae225 Mon Sep 17 00:00:00 2001
From: Ronan Hevenor
Date: Thu, 7 May 2026 12:15:03 -0400
Subject: [PATCH 2/2] feat(legacy-archive): fix poly-online dates, dedup
 author casings, split a slash byline

* 11 polytechnic-online articles had the wrong published_date (MM/DD swap
  from manifest parse ambiguity, plus 2 off-by-ones). Slugs adjusted to
  reflect the corrected date; collisions disambiguated with the legacy
  article id suffix. previous_slug retained for the old shape so existing
  links 301.

* normalize-author-casing.ts: 53 author groups had 2+ casings ("James
  Lenze II" vs "JAmes Lenze II" etc.). Picked the most-common form per
  group, renamed 123 article rows to it.

* wp_id 2558's mis-OCR'd "Russell Brown/Paul O'Neil" byline split into
  two write-in author rows.
---
 .../legacy-import/normalize-author-casing.ts  | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 scripts/legacy-import/normalize-author-casing.ts

diff --git a/scripts/legacy-import/normalize-author-casing.ts b/scripts/legacy-import/normalize-author-casing.ts
new file mode 100644
index 0000000..cbde818
--- /dev/null
+++ b/scripts/legacy-import/normalize-author-casing.ts
@@ -0,0 +1,72 @@
+/**
+ * Author-casing dedup pass. Audit found ~40 cases where the same person was
+ * stored with two different casings (e.g. `James Lenze II` + `JAmes Lenze
+ * II`). For each lower-cased canonical name, we pick the most-common casing
+ * and rename every occurrence to it.
+ *
+ * Run with `pnpm tsx scripts/legacy-import/normalize-author-casing.ts [--write]`.
+ */
+
+import { Pool } from 'pg'
+
+function parseArgs() {
+  return { write: process.argv.slice(2).includes('--write') }
+}
+
+async function main() {
+  const { write } = parseArgs()
+  const pool = new Pool({
+    host: '127.0.0.1',
+    port: 5433,
+    user: 'poly',
+    password: 'poly',
+    database: 'polymer2',
+  })
+
+  // Group by lowercase name. The most-common casing wins. Ties go to whichever
+  // form is first in alphabetical order — deterministic.
+  const r = await pool.query<{ name: string; cnt: number }>(
+    `SELECT name, COUNT(*)::int AS cnt FROM articles_write_in_authors GROUP BY name`,
+  )
+  const byKey = new Map<string, { name: string; cnt: number }[]>()
+  for (const row of r.rows) {
+    const key = row.name.toLowerCase().trim()
+    if (!key) continue
+    const list = byKey.get(key) ?? []
+    list.push({ name: row.name, cnt: row.cnt })
+    byKey.set(key, list)
+  }
+
+  let groups = 0
+  let renames = 0
+  let rowsAffected = 0
+  for (const [, list] of byKey) {
+    if (list.length < 2) continue
+    groups++
+    list.sort((a, b) => b.cnt - a.cnt || a.name.localeCompare(b.name))
+    const winner = list[0].name
+    for (let i = 1; i < list.length; i++) {
+      const loser = list[i]
+      // Skip if the only difference is invisible (whitespace).
+      if (winner === loser.name) continue
+      if (write) {
+        const u = await pool.query(`UPDATE articles_write_in_authors SET name=$1 WHERE name=$2`, [winner, loser.name])
+        rowsAffected += u.rowCount ?? 0
+      } else {
+        rowsAffected += loser.cnt
+      }
+      renames++
+    }
+  }
+
+  console.log(`groups with multiple casings: ${groups}`)
+  console.log(`renamed casings             : ${renames}`)
+  console.log(`rows affected               : ${rowsAffected}`)
+  console.log(`Mode: ${write ? 'WRITTEN' : 'DRY RUN'}`)
+  await pool.end()
+}
+
+main().catch((e) => {
+  console.error(e)
+  process.exit(1)
+})