From a1e9df06d0392d0bb7f0db3f987c39f909c744aa Mon Sep 17 00:00:00 2001 From: Ronan Hevenor Date: Thu, 7 May 2026 12:01:11 -0400 Subject: [PATCH 1/3] feat(legacy-archive): backfill kickers, subdecks, featured images, slugs, shortlinks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 5 of the legacy-archive cleanup, driven by the completeness audit at /tmp/legacy-import-logs/wp-completeness-audit.md. * kickers + subdecks: 4732 articles got correct kickers from poly-online type_db (2303) and WP `Kicker` postmeta (1357 fills + 1072 overwrites of generic "Editorial/Opinion"). Plus 768 poly-online subdecks from blurb_db. * featured images: 1531 WP articles got real featured_image_id + caption + photographer attribution from `Photo`/`PhotoCaption`/`PhotoByline`/ `Photographer` postmeta. * WP slug regen: 2921 run-together slugs (e.g. `midnpartakeinmarathon`) rebuilt from source `post_name` (e.g. `midn-partake-in-marathon`). Old slugs are saved on a new `previous_slug` column so middleware can 301 to the new URL. New `articles_previous_slug_idx` makes the lookup cheap. * shortlinks: new `legacy_shortlinks` lookup table populated from the `pluginSL_shorturl` plugin (5014 of 12,872 source rows resolve to a real destination — 4091 polymer URLs, 871 archive media files, 50 external, 2 mirror fallbacks). Middleware checks the table after the hand-curated override map. Regex broadened from 5-digit to 5-char alphanumeric. Backfill scripts live under scripts/legacy-import/ and run idempotently (dry run by default; --write to commit). All four were used to update the production DB directly via the SSH tunnel before this PR. 
--- collections/Articles.ts | 10 + middleware.ts | 126 ++++++++-- ...60507_000000_add_articles_previous_slug.ts | 25 ++ .../20260507_010000_add_legacy_shortlinks.ts | 26 ++ migrations/index.ts | 12 + .../backfill-kickers-subdecks.ts | 229 ++++++++++++++++++ .../backfill-legacy-shortlinks.ts | 198 +++++++++++++++ .../backfill-wp-featured-images.ts | 215 ++++++++++++++++ scripts/legacy-import/reslugify-wordpress.ts | 168 +++++++++++++ scripts/run_deploy_sql_migrations.sh | 19 +- 10 files changed, 1013 insertions(+), 15 deletions(-) create mode 100644 migrations/20260507_000000_add_articles_previous_slug.ts create mode 100644 migrations/20260507_010000_add_legacy_shortlinks.ts create mode 100644 scripts/legacy-import/backfill-kickers-subdecks.ts create mode 100644 scripts/legacy-import/backfill-legacy-shortlinks.ts create mode 100644 scripts/legacy-import/backfill-wp-featured-images.ts create mode 100644 scripts/legacy-import/reslugify-wordpress.ts diff --git a/collections/Articles.ts b/collections/Articles.ts index b62185d..ab945d1 100644 --- a/collections/Articles.ts +++ b/collections/Articles.ts @@ -422,6 +422,16 @@ const Articles: CollectionConfig = { description: 'Original category/section name from the source system. Preserved for display and search; does not affect routing.', }, }, + { + name: 'previousSlug', + type: 'text', + label: 'Previous Slug', + index: true, + admin: { + position: 'sidebar', + description: 'Old slug retained for 301 redirects after a rename. The middleware redirects requests for this slug to the current one.', + }, + }, ], } diff --git a/middleware.ts b/middleware.ts index 0287ffe..8c765eb 100644 --- a/middleware.ts +++ b/middleware.ts @@ -11,20 +11,24 @@ const ARTICLE_URL_RE = /^\/([a-z]+)\/(\d{4})\/(\d{2})\/([a-z0-9][a-z0-9-]*)\/?$/ // Old WordPress permalink shape from the 2009-2019 era. Years restricted // to 2009-2019 so we don't accidentally swallow other paths. 
const LEGACY_WP_URL_RE = /^\/(20(?:0[9]|1[0-9]))\/(\d{2})\/(\d{2})\/([a-z0-9][a-z0-9_-]*)\/?$/ -// 5-digit ID URL shape from the WP-era pluginSL_shorturl plugin. Articles -// from 2013-2014 link to documents via these short codes. -const LEGACY_WP_SHORTLINK_RE = /^\/(\d{5})\/?$/ +// 5-char ID URL shape from the WP-era `pluginSL_shorturl` plugin. The +// codes are 5-digit numeric *or* 5-char alphanumeric (case-sensitive). +// Resolved against the `legacy_shortlinks` lookup table at request time. +const LEGACY_WP_SHORTLINK_RE = /^\/([A-Za-z0-9]{5})\/?$/ const VALID_SECTIONS = new Set(['news', 'sports', 'features', 'opinion']) /** - * 33 shortlinks recovered from the legacy WordPress `pluginSL_shorturl` table. - * Most of these point at student-senate document portals (still up at - * docs.studentsenate.rpi.edu) or external services (Google Docs, Eventbrite, - * etc.). Some chain to old `poly.rpi.edu/YYYY/...` URLs that the - * LEGACY_WP_URL_RE branch then redirects to their new polymer URL. + * Hand-curated overrides for the WordPress `pluginSL_shorturl` map. Anything + * not present here falls through to the `legacy_shortlinks` table (5,014 + * rows backfilled from the source plugin). Kept for two reasons: * - * Source: `pluginSL_shorturl` table in - * /home/red/poly/recon/archives/wordpress/db/wordpress-archive-2026-05-06.sql.gz + * 1. Documents are now hosted off-site (docs.studentsenate.rpi.edu, etc.); + * the override map keeps those redirect targets explicit and reviewable. + * 2. A few codes have been edited at runtime over the years (e.g. NASA's + * mission_pages URL was renamed) and we want our pin to win. + * + * The DB-backed table can be regenerated from the dump via + * `scripts/legacy-import/backfill-legacy-shortlinks.ts`. 
*/ const WP_LEGACY_SHORTLINKS: Record<string, string> = { '06735': 'http://poly.rpi.edu/2013/03/06/pss_breaking_the_third_wall/', @@ -68,6 +72,34 @@ const CACHE_TTL_MS = 60_000 type RedirectEntry = { to: string | null; expiresAt: number } const legacyRedirectCache = new Map<string, RedirectEntry>() +const previousSlugRedirectCache = new Map<string, RedirectEntry>() +const shortlinkRedirectCache = new Map<string, RedirectEntry>() + +/** + * Look up the destination URL for a 5-char WP shortlink. Pulls from the + * `legacy_shortlinks` table backfilled from the `pluginSL_shorturl` plugin. + * + * No Payload collection wraps this table — the rows aren't editorial — so + * we go through the raw `pg.Pool` exposed by the postgres db adapter. + */ +async function lookupShortlinkRedirect(code: string): Promise<string | null> { + const cached = shortlinkRedirectCache.get(code) + const now = Date.now() + if (cached && cached.expiresAt > now) return cached.to + + const payload = await getPayload({ config: payloadConfig }) + const pool = (payload.db as unknown as { pool?: import('pg').Pool }).pool + let to: string | null = null + if (pool) { + const r = await pool.query<{ target_url: string }>( + 'SELECT target_url FROM legacy_shortlinks WHERE short_code = $1 LIMIT 1', + [code], + ) + to = r.rows[0]?.target_url ?? null + } + shortlinkRedirectCache.set(code, { to, expiresAt: now + CACHE_TTL_MS }) + return to +} async function isArticleGone(section: string, slug: string): Promise<boolean> { if (!VALID_SECTIONS.has(section)) return false @@ -100,6 +132,49 @@ async function isArticleGone(section: string, slug: string): Promise<boolean> { return gone } +/** + * Look up the canonical URL for an article whose slug was renamed. Returns + * the new URL when `previous_slug` matches; null otherwise. Used by the 301 + * redirect fallback in the article-URL branch when the live `slug` lookup + * fails. 
+ */ +async function lookupPreviousSlugRedirect( + section: string, + oldSlug: string, +): Promise<string | null> { + const cacheKey = `${section}:${oldSlug}` + const cached = previousSlugRedirectCache.get(cacheKey) + const now = Date.now() + if (cached && cached.expiresAt > now) return cached.to + + const payload = await getPayload({ config: payloadConfig }) + const result = await payload.find({ + collection: 'articles', + where: { + and: [ + { previousSlug: { equals: oldSlug } }, + { section: { equals: section } }, + { _status: { equals: 'published' } }, + ], + }, + limit: 1, + depth: 0, + select: { slug: true, section: true, publishedDate: true }, + }) + const doc = result.docs[0] as + | { slug?: string; section?: string; publishedDate?: string } + | undefined + let to: string | null = null + if (doc?.slug && doc?.section && doc?.publishedDate) { + const dt = new Date(doc.publishedDate) + const yy = dt.getUTCFullYear().toString() + const mm = String(dt.getUTCMonth() + 1).padStart(2, '0') + to = `/${doc.section}/${yy}/${mm}/${doc.slug}` + } + previousSlugRedirectCache.set(cacheKey, { to, expiresAt: now + CACHE_TTL_MS }) + return to +} + /** * Look up the polymer URL for an old WordPress permalink shape * `/{year}/{month}/{day}/{slug}/`. Returns the new URL or null if no match. @@ -142,12 +217,26 @@ async function lookupLegacyWpRedirect( } export async function middleware(req: NextRequest) { - // Legacy WP shortlink (5-digit IDs from pluginSL_shorturl) → original URL. + // Legacy WP shortlink (5-char IDs from pluginSL_shorturl) → target URL. + // The hand-curated `WP_LEGACY_SHORTLINKS` map wins; otherwise we look up + // the DB-backed `legacy_shortlinks` table. 
const shortMatch = req.nextUrl.pathname.match(LEGACY_WP_SHORTLINK_RE) if (shortMatch) { - const target = WP_LEGACY_SHORTLINKS[shortMatch[1]] - if (target) { - return NextResponse.redirect(target, 301) + const code = shortMatch[1] + const override = WP_LEGACY_SHORTLINKS[code] + if (override) { + return NextResponse.redirect(override, 301) + } + try { + const target = await lookupShortlinkRedirect(code) + if (target) { + // External (absolute) targets pass through; relative paths get + // resolved against the request origin. + const url = /^https?:\/\//i.test(target) ? target : new URL(target, req.url).toString() + return NextResponse.redirect(url, 301) + } + } catch { + // Fall through — let the request 404 normally if lookup fails. } } @@ -171,6 +260,15 @@ export async function middleware(req: NextRequest) { const [, section, , , slug] = match try { + // Renamed-slug 301: if no row has `slug=$slug` but one has + // `previous_slug=$slug`, redirect to the new canonical URL. This kicks in + // for the legacy slug-cleanup pass (post_name `_`-stripping) and any + // future editor-driven rename. + const renamedTo = await lookupPreviousSlugRedirect(section, slug) + if (renamedTo) { + return NextResponse.redirect(new URL(renamedTo, req.url), 301) + } + if (await isArticleGone(section, slug)) { // 410 Gone tells search engines the URL is permanently removed so they // de-index faster than they would from a bare 404. diff --git a/migrations/20260507_000000_add_articles_previous_slug.ts b/migrations/20260507_000000_add_articles_previous_slug.ts new file mode 100644 index 0000000..5344159 --- /dev/null +++ b/migrations/20260507_000000_add_articles_previous_slug.ts @@ -0,0 +1,25 @@ +import { MigrateUpArgs, MigrateDownArgs, sql } from '@payloadcms/db-postgres' + +/** + * Add `previous_slug` to `articles` (+ version shadow). When a slug is + * renamed (e.g. 
legacy slug-cleanup), set `previous_slug` to the old value + * so the request middleware can issue a 301 redirect to the new URL. + * + * Single-string for now (one historical slug per article). If we ever need + * multiple aliases we can swap to a text[]. + */ +export async function up({ db }: MigrateUpArgs): Promise<void> { + await db.execute(sql` + ALTER TABLE "articles" ADD COLUMN IF NOT EXISTS "previous_slug" varchar; + ALTER TABLE "_articles_v" ADD COLUMN IF NOT EXISTS "version_previous_slug" varchar; + CREATE INDEX IF NOT EXISTS "articles_previous_slug_idx" ON "articles" ("previous_slug"); + `) +} + +export async function down({ db }: MigrateDownArgs): Promise<void> { + await db.execute(sql` + DROP INDEX IF EXISTS "articles_previous_slug_idx"; + ALTER TABLE "_articles_v" DROP COLUMN IF EXISTS "version_previous_slug"; + ALTER TABLE "articles" DROP COLUMN IF EXISTS "previous_slug"; + `) +} diff --git a/migrations/20260507_010000_add_legacy_shortlinks.ts b/migrations/20260507_010000_add_legacy_shortlinks.ts new file mode 100644 index 0000000..9a4ad37 --- /dev/null +++ b/migrations/20260507_010000_add_legacy_shortlinks.ts @@ -0,0 +1,26 @@ +import { MigrateUpArgs, MigrateDownArgs, sql } from '@payloadcms/db-postgres' + +/** + * Add a `legacy_shortlinks` table mapping the 5-char codes from the WordPress + * `pluginSL_shorturl` plugin (12,872 rows) to their target URL. Used by the + * request middleware to 301 `/<code>` to either a canonical polymer URL or + * the original external destination. + * + * Not a Payload collection — there's no editorial reason to surface these + * in the admin UI, and exposing 12K rows there would be noisy. Pure DB + * lookup table. 
+ */ +export async function up({ db }: MigrateUpArgs): Promise<void> { + await db.execute(sql` + CREATE TABLE IF NOT EXISTS "legacy_shortlinks" ( + "short_code" varchar PRIMARY KEY, + "target_url" varchar NOT NULL, + "hit_count" integer NOT NULL DEFAULT 0, + "created_at" timestamp(3) with time zone NOT NULL DEFAULT NOW() + ); + `) +} + +export async function down({ db }: MigrateDownArgs): Promise<void> { + await db.execute(sql`DROP TABLE IF EXISTS "legacy_shortlinks";`) +} diff --git a/migrations/index.ts b/migrations/index.ts index 3f060cb..9da6bef 100644 --- a/migrations/index.ts +++ b/migrations/index.ts @@ -42,6 +42,8 @@ import * as migration_20260428_100000_add_audio_transcription from './20260428_1 import * as migration_20260506_000000_add_articles_legacy_archive from './20260506_000000_add_articles_legacy_archive'; import * as migration_20260506_010000_add_articles_legacy_id_and_category from './20260506_010000_add_articles_legacy_id_and_category'; import * as migration_20260506_020000_add_articles_plain_content from './20260506_020000_add_articles_plain_content'; +import * as migration_20260507_000000_add_articles_previous_slug from './20260507_000000_add_articles_previous_slug'; +import * as migration_20260507_010000_add_legacy_shortlinks from './20260507_010000_add_legacy_shortlinks'; export const migrations = [ { @@ -264,4 +266,14 @@ export const migrations = [ down: migration_20260506_020000_add_articles_plain_content.down, name: '20260506_020000_add_articles_plain_content', }, + { + up: migration_20260507_000000_add_articles_previous_slug.up, + down: migration_20260507_000000_add_articles_previous_slug.down, + name: '20260507_000000_add_articles_previous_slug', + }, + { + up: migration_20260507_010000_add_legacy_shortlinks.up, + down: migration_20260507_010000_add_legacy_shortlinks.down, + name: '20260507_010000_add_legacy_shortlinks', + }, ]; diff --git a/scripts/legacy-import/backfill-kickers-subdecks.ts b/scripts/legacy-import/backfill-kickers-subdecks.ts new 
file mode 100644 index 0000000..f8e301d --- /dev/null +++ b/scripts/legacy-import/backfill-kickers-subdecks.ts @@ -0,0 +1,229 @@ +/** + * Backfill kickers and subdecks on legacy articles. + * + * poly-online: read `type_db` / `blurb_db` from the manifest + * wordpress: read `Kicker` / `Subdeck` postmeta from the source DB dump + * + * Originally we ignored these fields during import; the audit flagged 2,303 + * poly-online + 1,366 WP rows missing kickers despite the source having them, + * plus 768 poly-online subdecks. This script does idempotent direct-SQL + * updates against the polymer DB (no hooks fire — we're not editing + * narrative content). + * + * Run with `pnpm tsx scripts/legacy-import/backfill-kickers-subdecks.ts`. + * Defaults to a dry run; pass `--write` to actually update rows. + */ + +import { Pool } from 'pg' +import { readFileSync } from 'fs' +import { execFileSync } from 'child_process' +import { decodeEntities } from './wordpress/html-tokenizer' + +type PolyOnlineArticle = { + kind: string + articleID: number + type_db?: string + blurb_db?: string +} + +type Mode = { write: boolean; era: 'poly-online' | 'wordpress' | 'all' } + +function parseArgs(): Mode { + const args = process.argv.slice(2) + const eraFlag = args.find((a) => a.startsWith('--era='))?.split('=')[1] as + | 'poly-online' + | 'wordpress' + | 'all' + | undefined + return { + write: args.includes('--write'), + era: eraFlag ?? 'all', + } +} + +const POLY_ONLINE_MANIFEST = '/home/red/poly/recon/archives/polytechnic-online/manifest.json' +const WP_SQLITE = '/tmp/audit/wp.db' + +// Source kicker values that are generic category names (not real kickers) +// and should be overwritten if the source `Kicker` postmeta has something better. 
+const GENERIC_KICKER_VALUES = new Set([ + 'editorial/opinion', + 'editorial / opinion', + 'opinion', + 'news', + 'sports', + 'features', + 'feature', +]) + +function isGenericKicker(v: string | null): boolean { + if (!v) return false + return GENERIC_KICKER_VALUES.has(v.trim().toLowerCase()) +} + +// "none" is the manifest's null marker for type_db. Skip it. Otherwise return +// the cleaned-up label (the manifest values are already title-cased). +function cleanTypeDb(raw: string | undefined | null): string | null { + if (!raw) return null + const v = raw.trim() + if (!v) return null + if (v.toLowerCase() === 'none') return null + return v +} + +// blurb_db can have the same `&#xx;` HTML entities as titles/bodies. Decode +// before saving so the subdeck renders correctly. +function cleanBlurb(raw: string | undefined | null): string | null { + if (!raw) return null + const v = decodeEntities(raw).trim() + return v || null +} + +async function main() { + const mode = parseArgs() + console.log(`Mode: era=${mode.era} write=${mode.write}`) + + const pool = new Pool({ + host: '127.0.0.1', + port: 5433, + user: 'poly', + password: 'poly', + database: 'polymer2', + }) + + let kickerUpdates = 0 + let subdeckUpdates = 0 + + // ---- poly-online ---- + if (mode.era === 'all' || mode.era === 'poly-online') { + console.log('\n=== poly-online ===') + const manifest = JSON.parse(readFileSync(POLY_ONLINE_MANIFEST, 'utf-8')) as { + articles: PolyOnlineArticle[] + } + const articles = manifest.articles.filter((a) => a.kind === 'article') + console.log(`Manifest articles: ${articles.length}`) + + // De-dupe by articleID; manifest has multiple part rows per article. 
+ const seen = new Set() + const updates: { id: number; kicker: string | null; subdeck: string | null }[] = [] + for (const a of articles) { + if (seen.has(a.articleID)) continue + seen.add(a.articleID) + const k = cleanTypeDb(a.type_db) + const s = cleanBlurb(a.blurb_db) + if (!k && !s) continue + updates.push({ id: a.articleID, kicker: k, subdeck: s }) + } + console.log(`Distinct articleIDs with type_db/blurb_db: ${updates.length}`) + + for (const u of updates) { + // Only overwrite if the destination is empty. Don't trample any + // editor's manual cleanup. + const result = await pool.query( + `SELECT id, kicker, subdeck FROM articles WHERE legacy_source='polytechnic-online' AND legacy_article_id=$1`, + [String(u.id)] + ) + if (result.rowCount === 0) continue + const row = result.rows[0] + const setKicker = u.kicker && !row.kicker + const setSubdeck = u.subdeck && !row.subdeck + if (!setKicker && !setSubdeck) continue + if (mode.write) { + await pool.query( + `UPDATE articles SET kicker=COALESCE($1, kicker), subdeck=COALESCE($2, subdeck), updated_at=NOW() WHERE id=$3`, + [setKicker ? u.kicker : null, setSubdeck ? u.subdeck : null, row.id] + ) + } + if (setKicker) kickerUpdates++ + if (setSubdeck) subdeckUpdates++ + } + + console.log(`poly-online: kicker updates=${kickerUpdates} subdeck updates=${subdeckUpdates}`) + } + + // ---- wordpress ---- + if (mode.era === 'all' || mode.era === 'wordpress') { + console.log('\n=== wordpress ===') + // Use sqlite CLI to pull Kicker and Subdeck postmeta into TSV. + // Avoids adding a dependency on better-sqlite3 just for this script. 
+ const tsv = execFileSync( + 'sqlite3', + [ + '-separator', + '\t', + WP_SQLITE, + "SELECT post_id, meta_key, meta_value FROM postmeta WHERE meta_key IN ('Kicker','Subdeck') AND meta_value IS NOT NULL AND meta_value != ''", + ], + { maxBuffer: 64 * 1024 * 1024 } + ).toString() + + type Postmeta = { kicker?: string; subdeck?: string } + const byWpId = new Map() + for (const line of tsv.split('\n')) { + if (!line) continue + const [pidStr, key, value] = line.split('\t') + const pid = Number(pidStr) + if (!pid) continue + const cur = byWpId.get(pid) ?? {} + // Decode entities (Kicker postmeta can carry ’ from the WP editor). + const v = decodeEntities(value).trim() + if (!v) continue + if (key === 'Kicker') cur.kicker = v + else if (key === 'Subdeck') cur.subdeck = v + byWpId.set(pid, cur) + } + console.log(`WP postmeta rows: ${byWpId.size} wp_ids with Kicker or Subdeck`) + + let kickerOverwrites = 0 + let kickerFills = 0 + let subdeckFills = 0 + + for (const [wpId, meta] of byWpId) { + const result = await pool.query( + `SELECT id, kicker, subdeck FROM articles WHERE legacy_source='wordpress' AND legacy_article_id=$1`, + [String(wpId)] + ) + if (result.rowCount === 0) continue + const row = result.rows[0] + + let newKicker: string | null = null + if (meta.kicker) { + if (!row.kicker) { + newKicker = meta.kicker + kickerFills++ + } else if (isGenericKicker(row.kicker) && meta.kicker.toLowerCase() !== row.kicker.toLowerCase()) { + newKicker = meta.kicker + kickerOverwrites++ + } + } + + let newSubdeck: string | null = null + if (meta.subdeck && !row.subdeck) { + newSubdeck = meta.subdeck + subdeckFills++ + } + + if (!newKicker && !newSubdeck) continue + if (mode.write) { + await pool.query( + `UPDATE articles SET kicker=COALESCE($1, kicker), subdeck=COALESCE($2, subdeck), updated_at=NOW() WHERE id=$3`, + [newKicker, newSubdeck, row.id] + ) + } + } + + console.log( + `wordpress: kicker fills=${kickerFills} kicker overwrites=${kickerOverwrites} subdeck 
fills=${subdeckFills}` + ) + kickerUpdates += kickerFills + kickerOverwrites + subdeckUpdates += subdeckFills + } + + console.log(`\nTotal: kicker=${kickerUpdates} subdeck=${subdeckUpdates} (${mode.write ? 'WRITTEN' : 'DRY RUN'})`) + await pool.end() +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/scripts/legacy-import/backfill-legacy-shortlinks.ts b/scripts/legacy-import/backfill-legacy-shortlinks.ts new file mode 100644 index 0000000..ab4d492 --- /dev/null +++ b/scripts/legacy-import/backfill-legacy-shortlinks.ts @@ -0,0 +1,198 @@ +/** + * Populate the `legacy_shortlinks` lookup table from the WordPress + * `pluginSL_shorturl` table. + * + * Each WP shortlink is one of: + * - id_post != 0: link to a WP post → resolve to the canonical polymer URL + * (via legacy_source='wordpress' + legacy_article_id=id_post), + * or fall back to the source `/YYYY/MM/DD/<post_name>/` + * shape (which the existing `LEGACY_WP_URL_RE` middleware + * branch then re-resolves). + * - url_externe set: external destination → copy as-is + * + * Run with `pnpm tsx scripts/legacy-import/backfill-legacy-shortlinks.ts`. + * Defaults to a dry run; pass `--write` to commit. + */ + +import { Pool } from 'pg' +import { execFileSync } from 'child_process' + +const WP_SQLITE = '/tmp/audit/wp.db' + +function parseArgs() { + const args = process.argv.slice(2) + return { write: args.includes('--write') } +} + +async function main() { + const { write } = parseArgs() + console.log(`Mode: write=${write}`) + + // Pull every shortlink + its target post info in one shot. + // post_type='attachment' rows carry the upload's URL in `guid` so we can + // route shortlinks pointing at media files too. 
+ const tsv = execFileSync( + 'sqlite3', + [ + '-separator', + '\t', + WP_SQLITE, + `SELECT s.short_url, s.id_post, s.url_externe, s.nb_hits, + p.post_name, substr(p.post_date, 1, 10) AS post_date, + p.post_status, p.post_type, p.guid + FROM pluginSL_shorturl s + LEFT JOIN posts p ON p.id = s.id_post`, + ], + { maxBuffer: 64 * 1024 * 1024 }, + ).toString() + + type WpRow = { + code: string + idPost: number + external: string + hits: number + postName: string + postDate: string + postStatus: string + postType: string + guid: string + } + const wpRows: WpRow[] = [] + for (const line of tsv.split('\n')) { + if (!line) continue + const parts = line.split('\t') + if (parts.length < 9) continue + const [code, idPostStr, external, hitsStr, postName, postDate, postStatus, postType, guid] = parts + if (!code) continue + wpRows.push({ + code, + idPost: Number(idPostStr) || 0, + external: external || '', + hits: Number(hitsStr) || 0, + postName: postName || '', + postDate: postDate || '', + postStatus: postStatus || '', + postType: postType || '', + guid: guid || '', + }) + } + console.log(`Source pluginSL_shorturl rows: ${wpRows.length}`) + + // Pre-load polymer's wordpress-era articles for fast id_post → URL lookup. 
+ const pool = new Pool({ + host: '127.0.0.1', + port: 5433, + user: 'poly', + password: 'poly', + database: 'polymer2', + }) + + const polymerArticles = await pool.query<{ + legacy_article_id: string + section: string + slug: string + published_date: string | null + }>( + `SELECT legacy_article_id, section, slug, published_date FROM articles + WHERE legacy_source='wordpress' AND _status='published'`, + ) + const polymerByWpId = new Map() + for (const r of polymerArticles.rows) { + if (!r.published_date) continue + const dt = new Date(r.published_date) + const yy = dt.getUTCFullYear().toString() + const mm = String(dt.getUTCMonth() + 1).padStart(2, '0') + polymerByWpId.set(Number(r.legacy_article_id), { section: r.section, slug: r.slug, year: yy, month: mm }) + } + console.log(`Polymer wp-era published articles: ${polymerByWpId.size}`) + + let toPolymer = 0 + let toMirrorFallback = 0 + let toExternal = 0 + let toAttachment = 0 + let skipped = 0 + + // Use a single multi-row INSERT for speed. Chunk to keep the parameter list + // under Postgres' 65535 limit. + type Insert = { code: string; target: string; hits: number } + const inserts: Insert[] = [] + + for (const r of wpRows) { + let target: string | null = null + + if (r.idPost > 0) { + const matched = polymerByWpId.get(r.idPost) + if (matched) { + target = `/${matched.section}/${matched.year}/${matched.month}/${matched.slug}` + toPolymer++ + } else if (r.postName && r.postDate && r.postStatus === 'publish' && r.postType === 'post') { + // Polymer hasn't ingested this post (rare). Fall back to the WP-era + // permalink shape, which our LEGACY_WP_URL_RE branch will then try + // to resolve at request time. + const [y, m, d] = r.postDate.split('-') + if (y && m && d) { + target = `/${y}/${m}/${d}/${r.postName}/` + toMirrorFallback++ + } + } else if (r.postType === 'attachment' && r.guid) { + // Media attachments live in the archive proxy under + // /archive/wordpress-media/uploads/... 
We rewrite the guid path- + // suffix the same way image-rewriter does for body images. + const m = r.guid.match(/^https?:\/\/[^/]+\/wp-content\/uploads\/(.+)$/i) + if (m) { + target = `/archive/wordpress-media/uploads/${m[1]}` + toAttachment++ + } + } + } + + if (!target && r.external) { + target = r.external + toExternal++ + } + + if (!target) { + skipped++ + continue + } + + inserts.push({ code: r.code, target, hits: r.hits }) + } + + console.log(`\nResolution:`) + console.log(` → polymer URL : ${toPolymer}`) + console.log(` → mirror fallback: ${toMirrorFallback}`) + console.log(` → attachment : ${toAttachment}`) + console.log(` → external URL : ${toExternal}`) + console.log(` skipped : ${skipped}`) + console.log(` total inserts : ${inserts.length}`) + + if (write) { + // Truncate and re-insert so re-runs are idempotent. + await pool.query(`TRUNCATE TABLE legacy_shortlinks`) + const CHUNK = 500 + for (let i = 0; i < inserts.length; i += CHUNK) { + const batch = inserts.slice(i, i + CHUNK) + const values: string[] = [] + const params: (string | number)[] = [] + let p = 1 + for (const ins of batch) { + values.push(`($${p++}, $${p++}, $${p++})`) + params.push(ins.code, ins.target, ins.hits) + } + await pool.query( + `INSERT INTO legacy_shortlinks (short_code, target_url, hit_count) VALUES ${values.join(', ')} + ON CONFLICT (short_code) DO UPDATE SET target_url = EXCLUDED.target_url, hit_count = EXCLUDED.hit_count`, + params, + ) + } + } + + console.log(`\nMode: ${write ? 'WRITTEN' : 'DRY RUN'}`) + await pool.end() +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/scripts/legacy-import/backfill-wp-featured-images.ts b/scripts/legacy-import/backfill-wp-featured-images.ts new file mode 100644 index 0000000..1a47801 --- /dev/null +++ b/scripts/legacy-import/backfill-wp-featured-images.ts @@ -0,0 +1,215 @@ +/** + * Backfill featured images on wordpress-era articles. 
+ * + * Reads source postmeta from /tmp/audit/wp.db (Kicker/Photo/PhotoCaption/ + * PhotoByline/Photographer) and: + * 1. converts the Photo path to its archived URL + * `/wp-content/uploads/...` → `/archive/wordpress-media/uploads/...` + * 2. resolves it via LegacyMediaResolver (gets-or-creates a media row) + * 3. populates `articles.featured_image_id` + * 4. populates `articles.image_caption` from PhotoCaption (entity-decoded) + * 5. populates `media.write_in_photographer` from PhotoByline / Photographer + * + * The audit confirmed 1473 distinct Photo paths exist; 1470 of those map to + * actual files on disk in `recon/archives/wordpress/uploads-extracted/`. + * + * Run with `pnpm tsx scripts/legacy-import/backfill-wp-featured-images.ts`. + * Defaults to a dry run; pass `--write` to actually update rows. + */ + +import { Pool } from 'pg' +import { execFileSync } from 'child_process' +import { LegacyMediaResolver } from './media-resolver' +import { decodeEntities } from './wordpress/html-tokenizer' + +const WP_SQLITE = '/tmp/audit/wp.db' + +type Postmeta = { + photo?: string + photoCaption?: string + photoByline?: string + photographer?: string + origPhoto?: string +} + +function parseArgs() { + const args = process.argv.slice(2) + return { write: args.includes('--write') } +} + +// Convert source Photo path to the polymer archive URL. +// /wp-content/uploads/X → /archive/wordpress-media/uploads/X +// Returns null for values that don't look like a real path (e.g. "0", URLs to +// poly.rpi.edu, absolute http:// links, etc). +function toArchiveUrl(rawPath: string | undefined): string | null { + if (!rawPath) return null + const v = rawPath.trim() + if (!v || v === '0') return null + if (v.startsWith('/wp-content/uploads/')) { + return '/archive/wordpress-media/uploads/' + v.slice('/wp-content/uploads/'.length) + } + // 32 rows are full URLs. Convert if they're pointing at poly.rpi.edu uploads. 
+ const m = v.match(/^https?:\/\/[^/]+\/wp-content\/uploads\/(.+)$/i) + if (m) { + return '/archive/wordpress-media/uploads/' + m[1] + } + // 1 row references /wp-includes/, ignore. + return null +} + +// Strip "The Polytechnic"-style trailing publication credit from a +// byline. WP postmeta consistently formats as "Name/The Polytechnic" +// or "Name/The Polytechnic"; we want just "Name". +function cleanByline(raw: string | undefined): string | null { + if (!raw) return null + let v = decodeEntities(raw).trim() + // Drop trailing "/The Polytechnic" (with or without italic markup). + v = v.replace(/<[^>]+>/g, '').trim() + v = v.replace(/\s*\/\s*The Polytechnic\s*$/i, '').trim() + v = v.replace(/\s*-\s*The Polytechnic\s*$/i, '').trim() + return v || null +} + +function cleanCaption(raw: string | undefined): string | null { + if (!raw) return null + const v = decodeEntities(raw).trim().replace(/\s+/g, ' ') + return v || null +} + +async function main() { + const { write } = parseArgs() + console.log(`Mode: write=${write}`) + + // Pull all relevant postmeta in one shot. + const tsv = execFileSync( + 'sqlite3', + [ + '-separator', + '\t', + WP_SQLITE, + "SELECT post_id, meta_key, meta_value FROM postmeta WHERE meta_key IN ('Photo','PhotoCaption','PhotoByline','Photographer','OrigPhoto') AND meta_value IS NOT NULL AND meta_value != ''", + ], + { maxBuffer: 64 * 1024 * 1024 }, + ).toString() + + const byWpId = new Map() + for (const line of tsv.split('\n')) { + if (!line) continue + const tab1 = line.indexOf('\t') + const tab2 = line.indexOf('\t', tab1 + 1) + if (tab1 < 0 || tab2 < 0) continue + const pid = Number(line.slice(0, tab1)) + const key = line.slice(tab1 + 1, tab2) + const value = line.slice(tab2 + 1) + if (!pid || !value) continue + const cur = byWpId.get(pid) ?? 
{} + if (key === 'Photo') cur.photo = value + else if (key === 'PhotoCaption') cur.photoCaption = value + else if (key === 'PhotoByline') cur.photoByline = value + else if (key === 'Photographer') cur.photographer = value + else if (key === 'OrigPhoto') cur.origPhoto = value + byWpId.set(pid, cur) + } + + console.log(`WP postmeta: ${byWpId.size} wp_ids with photo-related meta`) + + const pool = new Pool({ + host: '127.0.0.1', + port: 5433, + user: 'poly', + password: 'poly', + database: 'polymer2', + }) + const resolver = new LegacyMediaResolver(pool) + + let imageFills = 0 + let captionFills = 0 + let mediaCreated = 0 + let mediaReused = 0 + let bylineFills = 0 + let pathSkipped = 0 + let articleNotFound = 0 + + for (const [wpId, meta] of byWpId) { + const archiveUrl = toArchiveUrl(meta.photo) + if (!archiveUrl) { + // Even without a Photo path we may still want to set the caption alone, + // but image_caption belongs to the article's hero image; if there's no + // image, skip. Caption-only would render as orphaned text. + pathSkipped++ + continue + } + + const article = await pool.query<{ id: number; featured_image_id: number | null; image_caption: string | null }>( + `SELECT id, featured_image_id, image_caption FROM articles WHERE legacy_source='wordpress' AND legacy_article_id=$1`, + [String(wpId)], + ) + if (article.rowCount === 0) { + articleNotFound++ + continue + } + const row = article.rows[0] + + // Skip articles that already have a featured image — be conservative. + if (row.featured_image_id) continue + + // Resolve (or create) the media row. + const beforeSize = resolver.size() + const mediaId = await resolver.resolve(archiveUrl, null) + if (!mediaId) continue + if (resolver.size() > beforeSize) mediaCreated++ + else mediaReused++ + + // Update photographer attribution on the media row if absent. + // Prefer Photographer (clean name) over PhotoByline (markup-laden). 
+ const byline = cleanByline(meta.photographer) || cleanByline(meta.photoByline) + if (byline && write) { + const r = await pool.query( + `UPDATE media + SET write_in_photographer = COALESCE(write_in_photographer, $1), + updated_at = NOW() + WHERE id = $2 AND (write_in_photographer IS NULL OR write_in_photographer = '')`, + [byline, mediaId], + ) + if ((r.rowCount ?? 0) > 0) bylineFills++ + } else if (byline) { + bylineFills++ + } + + const caption = cleanCaption(meta.photoCaption) + + if (write) { + await pool.query( + `UPDATE articles + SET featured_image_id = $1, + image_caption = COALESCE(image_caption, $2), + updated_at = NOW() + WHERE id = $3`, + [mediaId, caption, row.id], + ) + } + imageFills++ + if (caption && !row.image_caption) captionFills++ + + if (imageFills % 200 === 0) { + console.log(` …${imageFills} images filled`) + } + } + + console.log(`\nResults:`) + console.log(` featured_image_id filled : ${imageFills}`) + console.log(` image_caption filled : ${captionFills}`) + console.log(` media rows created : ${mediaCreated}`) + console.log(` media rows reused : ${mediaReused}`) + console.log(` byline backfills on media: ${bylineFills}`) + console.log(` Photo path skipped : ${pathSkipped}`) + console.log(` article not found : ${articleNotFound}`) + console.log(` Mode: ${write ? 'WRITTEN' : 'DRY RUN'}`) + + await pool.end() +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/scripts/legacy-import/reslugify-wordpress.ts b/scripts/legacy-import/reslugify-wordpress.ts new file mode 100644 index 0000000..3800457 --- /dev/null +++ b/scripts/legacy-import/reslugify-wordpress.ts @@ -0,0 +1,168 @@ +/** + * Regenerate slugs for wordpress-era articles whose original `_`-separated + * `post_name` was stripped (not hyphen-replaced) during import, producing + * keyword-mash slugs like `2017-04-05-votingimpactsstudents`. 
+ *
+ * Approach:
+ * - read source `posts.post_name` from the WP DB dump (sqlite)
+ * - replace `_` with `-`, drop other non-[a-z0-9-] chars, collapse `--`s
+ * - new slug: `YYYY-MM-DD-{cleaned post_name}`
+ * - if the new slug differs from the current one, save the current slug to
+ *   `previous_slug` and overwrite `slug` + `_articles_v.version_slug`
+ *
+ * Collisions: handled by appending `-{wp_id}` to the new slug. polymer's slug
+ * column is UNIQUE so this is necessary in the (rare) cases where two
+ * articles have the same date+post_name in the source.
+ *
+ * Run with `pnpm tsx scripts/legacy-import/reslugify-wordpress.ts`. Defaults
+ * to a dry run; pass `--write` to commit.
+ */
+
+import { Pool } from 'pg'
+import { execFileSync } from 'child_process'
+
+const WP_SQLITE = '/tmp/audit/wp.db'
+
+function parseArgs() {
+  const args = process.argv.slice(2)
+  return { write: args.includes('--write') }
+}
+
+// Match polymer's slugify rule but operate on text that uses `_` as the
+// word separator (the WP `post_name` convention). Underscores become hyphens
+// here, then we run the standard cleanup.
+function cleanWpPostname(postName: string): string {
+  if (!postName) return ''
+  return postName
+    .toLowerCase()
+    .replace(/_/g, '-')
+    .replace(/[^a-z0-9\s-]/g, '')
+    .trim()
+    .replace(/\s+/g, '-')
+    .replace(/-+/g, '-')
+    .replace(/(^-|-$)/g, '')
+}
+
+async function main() {
+  const { write } = parseArgs()
+  console.log(`Mode: write=${write}`)
+
+  // Pull post_name + post_date from sqlite in one shot.
+ const tsv = execFileSync( + 'sqlite3', + [ + '-separator', + '\t', + WP_SQLITE, + "SELECT id, post_name, substr(post_date, 1, 10) AS post_date FROM posts WHERE post_status='publish' AND post_type='post' AND post_name IS NOT NULL AND post_name != ''", + ], + { maxBuffer: 64 * 1024 * 1024 }, + ).toString() + + type WpRow = { id: number; postName: string; date: string } + const wpRows: WpRow[] = [] + for (const line of tsv.split('\n')) { + if (!line) continue + const [idStr, postName, date] = line.split('\t') + const id = Number(idStr) + if (!id || !postName || !date) continue + wpRows.push({ id, postName, date }) + } + console.log(`WP source rows: ${wpRows.length}`) + + const pool = new Pool({ + host: '127.0.0.1', + port: 5433, + user: 'poly', + password: 'poly', + database: 'polymer2', + }) + + let totalChecked = 0 + let unchanged = 0 + let renamed = 0 + let collisions = 0 + let articleNotFound = 0 + const samples: { wpId: number; old: string; new: string }[] = [] + + // Pre-build a map of existing slugs to detect collisions before we attempt + // an UPDATE (cheaper than catching unique-constraint violations). + const existing = await pool.query<{ slug: string; id: number }>( + `SELECT slug, id FROM articles WHERE slug IS NOT NULL`, + ) + const slugToId = new Map() + for (const r of existing.rows) slugToId.set(r.slug, r.id) + + for (const w of wpRows) { + const article = await pool.query<{ id: number; slug: string | null }>( + `SELECT id, slug FROM articles WHERE legacy_source='wordpress' AND legacy_article_id=$1`, + [String(w.id)], + ) + if (article.rowCount === 0) { + articleNotFound++ + continue + } + const row = article.rows[0] + if (!row.slug) continue + + const cleaned = cleanWpPostname(w.postName) + if (!cleaned) continue + let newSlug = `${w.date}-${cleaned}` + + totalChecked++ + + if (newSlug === row.slug) { + unchanged++ + continue + } + + // Collision check. 
If the target slug already belongs to a *different* + // article, append `-{wp_id}` to disambiguate. + const taker = slugToId.get(newSlug) + if (taker !== undefined && taker !== row.id) { + newSlug = `${newSlug}-${w.id}` + collisions++ + if (slugToId.has(newSlug)) { + // Highly unlikely but surface it. + console.warn(`double-collision skipped: wp_id=${w.id} → ${newSlug}`) + continue + } + } + + if (write) { + await pool.query( + `UPDATE articles SET previous_slug=$1, slug=$2, updated_at=NOW() WHERE id=$3`, + [row.slug, newSlug, row.id], + ) + // Also patch the latest version-shadow row so admin previews stay + // aligned with the live row. + await pool.query( + `UPDATE "_articles_v" SET version_slug=$1 WHERE parent_id=$2 AND version_slug=$3`, + [newSlug, row.id, row.slug], + ) + } + slugToId.delete(row.slug) + slugToId.set(newSlug, row.id) + renamed++ + + if (samples.length < 12) samples.push({ wpId: w.id, old: row.slug, new: newSlug }) + if (renamed % 500 === 0) console.log(` …${renamed} renamed`) + } + + console.log(`\nResults:`) + console.log(` checked : ${totalChecked}`) + console.log(` renamed : ${renamed}`) + console.log(` unchanged : ${unchanged}`) + console.log(` collisions handled : ${collisions}`) + console.log(` article not found : ${articleNotFound}`) + console.log(` Mode: ${write ? 
'WRITTEN' : 'DRY RUN'}`) + console.log(`\nSample renames:`) + for (const s of samples) console.log(` wp_id=${s.wpId}\n old: ${s.old}\n new: ${s.new}`) + + await pool.end() +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/scripts/run_deploy_sql_migrations.sh b/scripts/run_deploy_sql_migrations.sh index 4feae0b..583dcff 100755 --- a/scripts/run_deploy_sql_migrations.sh +++ b/scripts/run_deploy_sql_migrations.sh @@ -51,7 +51,9 @@ VALUES ('20260428_000000_add_media_image_sizes', 26, NOW(), NOW()), ('20260506_000000_add_articles_legacy_archive', 27, NOW(), NOW()), ('20260506_010000_add_articles_legacy_id_and_category', 27, NOW(), NOW()), - ('20260506_020000_add_articles_plain_content', 27, NOW(), NOW()) + ('20260506_020000_add_articles_plain_content', 27, NOW(), NOW()), + ('20260507_000000_add_articles_previous_slug', 28, NOW(), NOW()), + ('20260507_010000_add_legacy_shortlinks', 28, NOW(), NOW()) ON CONFLICT DO NOTHING; -- 20260317: Add opinion_type and image_caption columns @@ -1377,4 +1379,19 @@ CREATE INDEX IF NOT EXISTS "articles_legacy_source_legacy_article_id_idx" ON "ar -- document). Nullable until the legacy backfill completes. ALTER TABLE "articles" ADD COLUMN IF NOT EXISTS "plain_content" text; ALTER TABLE "_articles_v" ADD COLUMN IF NOT EXISTS "version_plain_content" text; + +-- 20260507_000000: Track a previous slug so renames (e.g. legacy slug +-- regen) can 301-redirect old polymer URLs to the new ones. +ALTER TABLE "articles" ADD COLUMN IF NOT EXISTS "previous_slug" varchar; +ALTER TABLE "_articles_v" ADD COLUMN IF NOT EXISTS "version_previous_slug" varchar; +CREATE INDEX IF NOT EXISTS "articles_previous_slug_idx" ON "articles" ("previous_slug"); + +-- 20260507_010000: Lookup table for the 12,872 5-char WordPress shortlinks +-- (pluginSL_shorturl). Middleware 301s / to target_url. 
+CREATE TABLE IF NOT EXISTS "legacy_shortlinks" ( + "short_code" varchar PRIMARY KEY, + "target_url" varchar NOT NULL, + "hit_count" integer NOT NULL DEFAULT 0, + "created_at" timestamp(3) with time zone NOT NULL DEFAULT NOW() +); SQL From 6997ce6b75337aae14a8126e05c7b5917de19e19 Mon Sep 17 00:00:00 2001 From: Ronan Hevenor Date: Thu, 7 May 2026 12:04:30 -0400 Subject: [PATCH 2/3] fix(legacy-archive): satisfy CodeQL multi-char sanitization on byline strip CodeQL flagged a single `replace(/<[^>]+>/g, '')` as an incomplete sanitizer because unbalanced angle brackets could survive. Replace with an indexOf fixpoint loop that drops the trailing fragment when a `<` has no matching `>`. --- .../backfill-wp-featured-images.ts | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/scripts/legacy-import/backfill-wp-featured-images.ts b/scripts/legacy-import/backfill-wp-featured-images.ts index 1a47801..8b3d2d5 100644 --- a/scripts/legacy-import/backfill-wp-featured-images.ts +++ b/scripts/legacy-import/backfill-wp-featured-images.ts @@ -60,11 +60,31 @@ function toArchiveUrl(rawPath: string | undefined): string | null { // Strip "The Polytechnic"-style trailing publication credit from a // byline. WP postmeta consistently formats as "Name/The Polytechnic" // or "Name/The Polytechnic"; we want just "Name". +// +// Uses indexOf scanning + a fixpoint loop instead of a single regex so the +// CodeQL `incomplete-multi-character-sanitization` rule is satisfied — a +// single `replace(/<[^>]+>/g, '')` pass can leave a leading `<` if the input +// has unbalanced angle brackets. +function stripTags(input: string): string { + let s = input + for (;;) { + const open = s.indexOf('<') + if (open === -1) break + const close = s.indexOf('>', open + 1) + if (close === -1) { + // Unterminated tag — drop everything from `<` onward. 
+ s = s.slice(0, open) + break + } + s = s.slice(0, open) + s.slice(close + 1) + } + return s +} + function cleanByline(raw: string | undefined): string | null { if (!raw) return null let v = decodeEntities(raw).trim() - // Drop trailing "/The Polytechnic" (with or without italic markup). - v = v.replace(/<[^>]+>/g, '').trim() + v = stripTags(v).trim() v = v.replace(/\s*\/\s*The Polytechnic\s*$/i, '').trim() v = v.replace(/\s*-\s*The Polytechnic\s*$/i, '').trim() return v || null From 09e0bfb28c581989dd1b5fedc35eaade4d103790 Mon Sep 17 00:00:00 2001 From: Ronan Hevenor Date: Thu, 7 May 2026 12:09:36 -0400 Subject: [PATCH 3/3] feat(legacy-archive): expand [gallery]/[gview]/iframe shortcodes + re-import wp_id 7191/7421 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * expand-wp-shortcodes.ts: 75 articles re-rendered. 58 [gallery] shortcodes produce real upload nodes (988 images resolved against the attachment guid map), 19 [gview] PDFs become 'Download PDF' links, and 2 iframes (1 YouTube, 1 Google Form) become outbound links. Pre-expands the shortcodes to / tags before handing off to the existing WP→Lexical pipeline + media-resolver. * fix-wp-7191-7421.ts: wp_id 7191 (Tate Boucher's neuromarketing letter, previously mis-imported as a day-archive listing titled 'RENSSELAER UNION') gets its real content + Letter-to-the-Editor kicker + author. wp_id 7421 has zero source content; demoted to draft. 
--- scripts/legacy-import/expand-wp-shortcodes.ts | 241 ++++++++++++++++++ scripts/legacy-import/fix-wp-7191-7421.ts | 208 +++++++++++++++ 2 files changed, 449 insertions(+) create mode 100644 scripts/legacy-import/expand-wp-shortcodes.ts create mode 100644 scripts/legacy-import/fix-wp-7191-7421.ts diff --git a/scripts/legacy-import/expand-wp-shortcodes.ts b/scripts/legacy-import/expand-wp-shortcodes.ts new file mode 100644 index 0000000..5d4ed98 --- /dev/null +++ b/scripts/legacy-import/expand-wp-shortcodes.ts @@ -0,0 +1,241 @@ +/** + * Expand WP body shortcodes that the original importer dropped. + * + * Targets 77 published WP posts whose source `post_content` includes one of: + * - [gallery ids="N1,N2,..."] (56 posts, 1012 attachment refs) + * - [gview file="URL"] (19 PDF "Full Issue" embeds) + * -