diff --git a/collections/Articles.ts b/collections/Articles.ts index b62185d..ab945d1 100644 --- a/collections/Articles.ts +++ b/collections/Articles.ts @@ -422,6 +422,16 @@ const Articles: CollectionConfig = { description: 'Original category/section name from the source system. Preserved for display and search; does not affect routing.', }, }, + { + name: 'previousSlug', + type: 'text', + label: 'Previous Slug', + index: true, + admin: { + position: 'sidebar', + description: 'Old slug retained for 301 redirects after a rename. The middleware redirects requests for this slug to the current one.', + }, + }, ], } diff --git a/middleware.ts b/middleware.ts index 0287ffe..8c765eb 100644 --- a/middleware.ts +++ b/middleware.ts @@ -11,20 +11,24 @@ const ARTICLE_URL_RE = /^\/([a-z]+)\/(\d{4})\/(\d{2})\/([a-z0-9][a-z0-9-]*)\/?$/ // Old WordPress permalink shape from the 2009-2019 era. Years restricted // to 2009-2019 so we don't accidentally swallow other paths. const LEGACY_WP_URL_RE = /^\/(20(?:0[9]|1[0-9]))\/(\d{2})\/(\d{2})\/([a-z0-9][a-z0-9_-]*)\/?$/ -// 5-digit ID URL shape from the WP-era pluginSL_shorturl plugin. Articles -// from 2013-2014 link to documents via these short codes. -const LEGACY_WP_SHORTLINK_RE = /^\/(\d{5})\/?$/ +// 5-char ID URL shape from the WP-era `pluginSL_shorturl` plugin. The +// codes are 5-digit numeric *or* 5-char alphanumeric (case-sensitive). +// Resolved against the `legacy_shortlinks` lookup table at request time. +const LEGACY_WP_SHORTLINK_RE = /^\/([A-Za-z0-9]{5})\/?$/ const VALID_SECTIONS = new Set(['news', 'sports', 'features', 'opinion']) /** - * 33 shortlinks recovered from the legacy WordPress `pluginSL_shorturl` table. - * Most of these point at student-senate document portals (still up at - * docs.studentsenate.rpi.edu) or external services (Google Docs, Eventbrite, - * etc.). Some chain to old `poly.rpi.edu/YYYY/...` URLs that the - * LEGACY_WP_URL_RE branch then redirects to their new polymer URL. 
+ * Hand-curated overrides for the WordPress `pluginSL_shorturl` map. Anything + * not present here falls through to the `legacy_shortlinks` table (5,014 + * rows backfilled from the source plugin). Kept for two reasons: * - * Source: `pluginSL_shorturl` table in - * /home/red/poly/recon/archives/wordpress/db/wordpress-archive-2026-05-06.sql.gz + * 1. Documents are now hosted off-site (docs.studentsenate.rpi.edu, etc.); + * the override map keeps those redirect targets explicit and reviewable. + * 2. A few codes have been edited at runtime over the years (e.g. NASA's + * mission_pages URL was renamed) and we want our pin to win. + * + * The DB-backed table can be regenerated from the dump via + * `scripts/legacy-import/backfill-legacy-shortlinks.ts`. */ const WP_LEGACY_SHORTLINKS: Record<string, string> = { '06735': 'http://poly.rpi.edu/2013/03/06/pss_breaking_the_third_wall/', @@ -68,6 +72,34 @@ const CACHE_TTL_MS = 60_000 type RedirectEntry = { to: string | null; expiresAt: number } const legacyRedirectCache = new Map<string, RedirectEntry>() +const previousSlugRedirectCache = new Map<string, RedirectEntry>() +const shortlinkRedirectCache = new Map<string, RedirectEntry>() + +/** + * Look up the destination URL for a 5-char WP shortlink. Pulls from the + * `legacy_shortlinks` table backfilled from the `pluginSL_shorturl` plugin. + * + * No Payload collection wraps this table — the rows aren't editorial — so + * we go through the raw `pg.Pool` exposed by the postgres db adapter. + */ +async function lookupShortlinkRedirect(code: string): Promise<string | null> { + const cached = shortlinkRedirectCache.get(code) + const now = Date.now() + if (cached && cached.expiresAt > now) return cached.to + + const payload = await getPayload({ config: payloadConfig }) + const pool = (payload.db as unknown as { pool?: import('pg').Pool }).pool + let to: string | null = null + if (pool) { + const r = await pool.query<{ target_url: string }>( + 'SELECT target_url FROM legacy_shortlinks WHERE short_code = $1 LIMIT 1', + [code], + ) + to = r.rows[0]?.target_url ?? 
null + } + shortlinkRedirectCache.set(code, { to, expiresAt: now + CACHE_TTL_MS }) + return to +} async function isArticleGone(section: string, slug: string): Promise { if (!VALID_SECTIONS.has(section)) return false @@ -100,6 +132,49 @@ async function isArticleGone(section: string, slug: string): Promise { return gone } +/** + * Look up the canonical URL for an article whose slug was renamed. Returns + * the new URL when `previous_slug` matches; null otherwise. Used by the 301 + * redirect fallback in the article-URL branch when the live `slug` lookup + * fails. + */ +async function lookupPreviousSlugRedirect( + section: string, + oldSlug: string, +): Promise { + const cacheKey = `${section}:${oldSlug}` + const cached = previousSlugRedirectCache.get(cacheKey) + const now = Date.now() + if (cached && cached.expiresAt > now) return cached.to + + const payload = await getPayload({ config: payloadConfig }) + const result = await payload.find({ + collection: 'articles', + where: { + and: [ + { previousSlug: { equals: oldSlug } }, + { section: { equals: section } }, + { _status: { equals: 'published' } }, + ], + }, + limit: 1, + depth: 0, + select: { slug: true, section: true, publishedDate: true }, + }) + const doc = result.docs[0] as + | { slug?: string; section?: string; publishedDate?: string } + | undefined + let to: string | null = null + if (doc?.slug && doc?.section && doc?.publishedDate) { + const dt = new Date(doc.publishedDate) + const yy = dt.getUTCFullYear().toString() + const mm = String(dt.getUTCMonth() + 1).padStart(2, '0') + to = `/${doc.section}/${yy}/${mm}/${doc.slug}` + } + previousSlugRedirectCache.set(cacheKey, { to, expiresAt: now + CACHE_TTL_MS }) + return to +} + /** * Look up the polymer URL for an old WordPress permalink shape * `/{year}/{month}/{day}/{slug}/`. Returns the new URL or null if no match. 
@@ -142,12 +217,26 @@ async function lookupLegacyWpRedirect( } export async function middleware(req: NextRequest) { - // Legacy WP shortlink (5-digit IDs from pluginSL_shorturl) → original URL. + // Legacy WP shortlink (5-char IDs from pluginSL_shorturl) → target URL. + // The hand-curated `WP_LEGACY_SHORTLINKS` map wins; otherwise we look up + // the DB-backed `legacy_shortlinks` table. const shortMatch = req.nextUrl.pathname.match(LEGACY_WP_SHORTLINK_RE) if (shortMatch) { - const target = WP_LEGACY_SHORTLINKS[shortMatch[1]] - if (target) { - return NextResponse.redirect(target, 301) + const code = shortMatch[1] + const override = WP_LEGACY_SHORTLINKS[code] + if (override) { + return NextResponse.redirect(override, 301) + } + try { + const target = await lookupShortlinkRedirect(code) + if (target) { + // External (absolute) targets pass through; relative paths get + // resolved against the request origin. + const url = /^https?:\/\//i.test(target) ? target : new URL(target, req.url).toString() + return NextResponse.redirect(url, 301) + } + } catch { + // Fall through — let the request 404 normally if lookup fails. } } @@ -171,6 +260,15 @@ export async function middleware(req: NextRequest) { const [, section, , , slug] = match try { + // Renamed-slug 301: if no row has `slug=$slug` but one has + // `previous_slug=$slug`, redirect to the new canonical URL. This kicks in + // for the legacy slug-cleanup pass (post_name `_`-stripping) and any + // future editor-driven rename. + const renamedTo = await lookupPreviousSlugRedirect(section, slug) + if (renamedTo) { + return NextResponse.redirect(new URL(renamedTo, req.url), 301) + } + if (await isArticleGone(section, slug)) { // 410 Gone tells search engines the URL is permanently removed so they // de-index faster than they would from a bare 404. 
diff --git a/migrations/20260507_000000_add_articles_previous_slug.ts b/migrations/20260507_000000_add_articles_previous_slug.ts new file mode 100644 index 0000000..5344159 --- /dev/null +++ b/migrations/20260507_000000_add_articles_previous_slug.ts @@ -0,0 +1,25 @@ +import { MigrateUpArgs, MigrateDownArgs, sql } from '@payloadcms/db-postgres' + +/** + * Add `previous_slug` to `articles` (+ version shadow). When a slug is + * renamed (e.g. legacy slug-cleanup), set `previous_slug` to the old value + * so the request middleware can issue a 301 redirect to the new URL. + * + * Single-string for now (one historical slug per article). If we ever need + * multiple aliases we can swap to a text[]. + */ +export async function up({ db }: MigrateUpArgs): Promise<void> { + await db.execute(sql` + ALTER TABLE "articles" ADD COLUMN IF NOT EXISTS "previous_slug" varchar; + ALTER TABLE "_articles_v" ADD COLUMN IF NOT EXISTS "version_previous_slug" varchar; + CREATE INDEX IF NOT EXISTS "articles_previous_slug_idx" ON "articles" ("previous_slug"); + `) +} + +export async function down({ db }: MigrateDownArgs): Promise<void> { + await db.execute(sql` + DROP INDEX IF EXISTS "articles_previous_slug_idx"; + ALTER TABLE "_articles_v" DROP COLUMN IF EXISTS "version_previous_slug"; + ALTER TABLE "articles" DROP COLUMN IF EXISTS "previous_slug"; + `) +} diff --git a/migrations/20260507_010000_add_legacy_shortlinks.ts b/migrations/20260507_010000_add_legacy_shortlinks.ts new file mode 100644 index 0000000..9a4ad37 --- /dev/null +++ b/migrations/20260507_010000_add_legacy_shortlinks.ts @@ -0,0 +1,26 @@ +import { MigrateUpArgs, MigrateDownArgs, sql } from '@payloadcms/db-postgres' + +/** + * Add a `legacy_shortlinks` table mapping the 5-char codes from the WordPress + * `pluginSL_shorturl` plugin (12,872 rows) to their target URL. Used by the + * request middleware to 301 `/<code>` to either a canonical polymer URL or + * the original external destination. 
+ * + * Not a Payload collection — there's no editorial reason to surface these + * in the admin UI, and exposing 12K rows there would be noisy. Pure DB + * lookup table. + */ +export async function up({ db }: MigrateUpArgs): Promise<void> { + await db.execute(sql` + CREATE TABLE IF NOT EXISTS "legacy_shortlinks" ( + "short_code" varchar PRIMARY KEY, + "target_url" varchar NOT NULL, + "hit_count" integer NOT NULL DEFAULT 0, + "created_at" timestamp(3) with time zone NOT NULL DEFAULT NOW() + ); + `) +} + +export async function down({ db }: MigrateDownArgs): Promise<void> { + await db.execute(sql`DROP TABLE IF EXISTS "legacy_shortlinks";`) +} diff --git a/migrations/index.ts b/migrations/index.ts index 3f060cb..9da6bef 100644 --- a/migrations/index.ts +++ b/migrations/index.ts @@ -42,6 +42,8 @@ import * as migration_20260428_100000_add_audio_transcription from './20260428_1 import * as migration_20260506_000000_add_articles_legacy_archive from './20260506_000000_add_articles_legacy_archive'; import * as migration_20260506_010000_add_articles_legacy_id_and_category from './20260506_010000_add_articles_legacy_id_and_category'; import * as migration_20260506_020000_add_articles_plain_content from './20260506_020000_add_articles_plain_content'; +import * as migration_20260507_000000_add_articles_previous_slug from './20260507_000000_add_articles_previous_slug'; +import * as migration_20260507_010000_add_legacy_shortlinks from './20260507_010000_add_legacy_shortlinks'; export const migrations = [ { @@ -264,4 +266,14 @@ export const migrations = [ down: migration_20260506_020000_add_articles_plain_content.down, name: '20260506_020000_add_articles_plain_content', }, + { + up: migration_20260507_000000_add_articles_previous_slug.up, + down: migration_20260507_000000_add_articles_previous_slug.down, + name: '20260507_000000_add_articles_previous_slug', + }, + { + up: migration_20260507_010000_add_legacy_shortlinks.up, + down: migration_20260507_010000_add_legacy_shortlinks.down, + name: 
'20260507_010000_add_legacy_shortlinks', + }, ]; diff --git a/scripts/legacy-import/backfill-kickers-subdecks.ts b/scripts/legacy-import/backfill-kickers-subdecks.ts new file mode 100644 index 0000000..f8e301d --- /dev/null +++ b/scripts/legacy-import/backfill-kickers-subdecks.ts @@ -0,0 +1,229 @@ +/** + * Backfill kickers and subdecks on legacy articles. + * + * poly-online: read `type_db` / `blurb_db` from the manifest + * wordpress: read `Kicker` / `Subdeck` postmeta from the source DB dump + * + * Originally we ignored these fields during import; the audit flagged 2,303 + * poly-online + 1,366 WP rows missing kickers despite the source having them, + * plus 768 poly-online subdecks. This script does idempotent direct-SQL + * updates against the polymer DB (no hooks fire — we're not editing + * narrative content). + * + * Run with `pnpm tsx scripts/legacy-import/backfill-kickers-subdecks.ts`. + * Defaults to a dry run; pass `--write` to actually update rows. + */ + +import { Pool } from 'pg' +import { readFileSync } from 'fs' +import { execFileSync } from 'child_process' +import { decodeEntities } from './wordpress/html-tokenizer' + +type PolyOnlineArticle = { + kind: string + articleID: number + type_db?: string + blurb_db?: string +} + +type Mode = { write: boolean; era: 'poly-online' | 'wordpress' | 'all' } + +function parseArgs(): Mode { + const args = process.argv.slice(2) + const eraFlag = args.find((a) => a.startsWith('--era='))?.split('=')[1] as + | 'poly-online' + | 'wordpress' + | 'all' + | undefined + return { + write: args.includes('--write'), + era: eraFlag ?? 'all', + } +} + +const POLY_ONLINE_MANIFEST = '/home/red/poly/recon/archives/polytechnic-online/manifest.json' +const WP_SQLITE = '/tmp/audit/wp.db' + +// Source kicker values that are generic category names (not real kickers) +// and should be overwritten if the source `Kicker` postmeta has something better. 
+const GENERIC_KICKER_VALUES = new Set([ + 'editorial/opinion', + 'editorial / opinion', + 'opinion', + 'news', + 'sports', + 'features', + 'feature', +]) + +function isGenericKicker(v: string | null): boolean { + if (!v) return false + return GENERIC_KICKER_VALUES.has(v.trim().toLowerCase()) +} + +// "none" is the manifest's null marker for type_db. Skip it. Otherwise return +// the cleaned-up label (the manifest values are already title-cased). +function cleanTypeDb(raw: string | undefined | null): string | null { + if (!raw) return null + const v = raw.trim() + if (!v) return null + if (v.toLowerCase() === 'none') return null + return v +} + +// blurb_db can have the same `&#xx;` HTML entities as titles/bodies. Decode +// before saving so the subdeck renders correctly. +function cleanBlurb(raw: string | undefined | null): string | null { + if (!raw) return null + const v = decodeEntities(raw).trim() + return v || null +} + +async function main() { + const mode = parseArgs() + console.log(`Mode: era=${mode.era} write=${mode.write}`) + + const pool = new Pool({ + host: '127.0.0.1', + port: 5433, + user: 'poly', + password: 'poly', + database: 'polymer2', + }) + + let kickerUpdates = 0 + let subdeckUpdates = 0 + + // ---- poly-online ---- + if (mode.era === 'all' || mode.era === 'poly-online') { + console.log('\n=== poly-online ===') + const manifest = JSON.parse(readFileSync(POLY_ONLINE_MANIFEST, 'utf-8')) as { + articles: PolyOnlineArticle[] + } + const articles = manifest.articles.filter((a) => a.kind === 'article') + console.log(`Manifest articles: ${articles.length}`) + + // De-dupe by articleID; manifest has multiple part rows per article. 
+ const seen = new Set() + const updates: { id: number; kicker: string | null; subdeck: string | null }[] = [] + for (const a of articles) { + if (seen.has(a.articleID)) continue + seen.add(a.articleID) + const k = cleanTypeDb(a.type_db) + const s = cleanBlurb(a.blurb_db) + if (!k && !s) continue + updates.push({ id: a.articleID, kicker: k, subdeck: s }) + } + console.log(`Distinct articleIDs with type_db/blurb_db: ${updates.length}`) + + for (const u of updates) { + // Only overwrite if the destination is empty. Don't trample any + // editor's manual cleanup. + const result = await pool.query( + `SELECT id, kicker, subdeck FROM articles WHERE legacy_source='polytechnic-online' AND legacy_article_id=$1`, + [String(u.id)] + ) + if (result.rowCount === 0) continue + const row = result.rows[0] + const setKicker = u.kicker && !row.kicker + const setSubdeck = u.subdeck && !row.subdeck + if (!setKicker && !setSubdeck) continue + if (mode.write) { + await pool.query( + `UPDATE articles SET kicker=COALESCE($1, kicker), subdeck=COALESCE($2, subdeck), updated_at=NOW() WHERE id=$3`, + [setKicker ? u.kicker : null, setSubdeck ? u.subdeck : null, row.id] + ) + } + if (setKicker) kickerUpdates++ + if (setSubdeck) subdeckUpdates++ + } + + console.log(`poly-online: kicker updates=${kickerUpdates} subdeck updates=${subdeckUpdates}`) + } + + // ---- wordpress ---- + if (mode.era === 'all' || mode.era === 'wordpress') { + console.log('\n=== wordpress ===') + // Use sqlite CLI to pull Kicker and Subdeck postmeta into TSV. + // Avoids adding a dependency on better-sqlite3 just for this script. 
+ const tsv = execFileSync( + 'sqlite3', + [ + '-separator', + '\t', + WP_SQLITE, + "SELECT post_id, meta_key, meta_value FROM postmeta WHERE meta_key IN ('Kicker','Subdeck') AND meta_value IS NOT NULL AND meta_value != ''", + ], + { maxBuffer: 64 * 1024 * 1024 } + ).toString() + + type Postmeta = { kicker?: string; subdeck?: string } + const byWpId = new Map() + for (const line of tsv.split('\n')) { + if (!line) continue + const [pidStr, key, value] = line.split('\t') + const pid = Number(pidStr) + if (!pid) continue + const cur = byWpId.get(pid) ?? {} + // Decode entities (Kicker postmeta can carry ’ from the WP editor). + const v = decodeEntities(value).trim() + if (!v) continue + if (key === 'Kicker') cur.kicker = v + else if (key === 'Subdeck') cur.subdeck = v + byWpId.set(pid, cur) + } + console.log(`WP postmeta rows: ${byWpId.size} wp_ids with Kicker or Subdeck`) + + let kickerOverwrites = 0 + let kickerFills = 0 + let subdeckFills = 0 + + for (const [wpId, meta] of byWpId) { + const result = await pool.query( + `SELECT id, kicker, subdeck FROM articles WHERE legacy_source='wordpress' AND legacy_article_id=$1`, + [String(wpId)] + ) + if (result.rowCount === 0) continue + const row = result.rows[0] + + let newKicker: string | null = null + if (meta.kicker) { + if (!row.kicker) { + newKicker = meta.kicker + kickerFills++ + } else if (isGenericKicker(row.kicker) && meta.kicker.toLowerCase() !== row.kicker.toLowerCase()) { + newKicker = meta.kicker + kickerOverwrites++ + } + } + + let newSubdeck: string | null = null + if (meta.subdeck && !row.subdeck) { + newSubdeck = meta.subdeck + subdeckFills++ + } + + if (!newKicker && !newSubdeck) continue + if (mode.write) { + await pool.query( + `UPDATE articles SET kicker=COALESCE($1, kicker), subdeck=COALESCE($2, subdeck), updated_at=NOW() WHERE id=$3`, + [newKicker, newSubdeck, row.id] + ) + } + } + + console.log( + `wordpress: kicker fills=${kickerFills} kicker overwrites=${kickerOverwrites} subdeck 
fills=${subdeckFills}` ) kickerUpdates += kickerFills + kickerOverwrites subdeckUpdates += subdeckFills } console.log(`\nTotal: kicker=${kickerUpdates} subdeck=${subdeckUpdates} (${mode.write ? 'WRITTEN' : 'DRY RUN'})`) await pool.end() } main().catch((err) => { console.error(err) process.exit(1) }) diff --git a/scripts/legacy-import/backfill-legacy-shortlinks.ts b/scripts/legacy-import/backfill-legacy-shortlinks.ts new file mode 100644 index 0000000..ab4d492 --- /dev/null +++ b/scripts/legacy-import/backfill-legacy-shortlinks.ts @@ -0,0 +1,198 @@ +/** + * Populate the `legacy_shortlinks` lookup table from the WordPress + * `pluginSL_shorturl` table. + * + * Each WP shortlink is one of: + * - id_post != 0: link to a WP post → resolve to the canonical polymer URL + * (via legacy_source='wordpress' + legacy_article_id=id_post), + * or fall back to the source `/YYYY/MM/DD/<post_name>/` + * shape (which the existing `LEGACY_WP_URL_RE` middleware + * branch then re-resolves). + * - url_externe set: external destination → copy as-is + * + * Run with `pnpm tsx scripts/legacy-import/backfill-legacy-shortlinks.ts`. + * Defaults to a dry run; pass `--write` to commit. + */ + +import { Pool } from 'pg' +import { execFileSync } from 'child_process' + +const WP_SQLITE = '/tmp/audit/wp.db' + +function parseArgs() { + const args = process.argv.slice(2) + return { write: args.includes('--write') } +} + +async function main() { + const { write } = parseArgs() + console.log(`Mode: write=${write}`) + + // Pull every shortlink + its target post info in one shot. + // post_type='attachment' rows carry the upload's URL in `guid` so we can + // route shortlinks pointing at media files too. 
+ const tsv = execFileSync( + 'sqlite3', + [ + '-separator', + '\t', + WP_SQLITE, + `SELECT s.short_url, s.id_post, s.url_externe, s.nb_hits, + p.post_name, substr(p.post_date, 1, 10) AS post_date, + p.post_status, p.post_type, p.guid + FROM pluginSL_shorturl s + LEFT JOIN posts p ON p.id = s.id_post`, + ], + { maxBuffer: 64 * 1024 * 1024 }, + ).toString() + + type WpRow = { + code: string + idPost: number + external: string + hits: number + postName: string + postDate: string + postStatus: string + postType: string + guid: string + } + const wpRows: WpRow[] = [] + for (const line of tsv.split('\n')) { + if (!line) continue + const parts = line.split('\t') + if (parts.length < 9) continue + const [code, idPostStr, external, hitsStr, postName, postDate, postStatus, postType, guid] = parts + if (!code) continue + wpRows.push({ + code, + idPost: Number(idPostStr) || 0, + external: external || '', + hits: Number(hitsStr) || 0, + postName: postName || '', + postDate: postDate || '', + postStatus: postStatus || '', + postType: postType || '', + guid: guid || '', + }) + } + console.log(`Source pluginSL_shorturl rows: ${wpRows.length}`) + + // Pre-load polymer's wordpress-era articles for fast id_post → URL lookup. 
+ const pool = new Pool({ + host: '127.0.0.1', + port: 5433, + user: 'poly', + password: 'poly', + database: 'polymer2', + }) + + const polymerArticles = await pool.query<{ + legacy_article_id: string + section: string + slug: string + published_date: string | null + }>( + `SELECT legacy_article_id, section, slug, published_date FROM articles + WHERE legacy_source='wordpress' AND _status='published'`, + ) + const polymerByWpId = new Map() + for (const r of polymerArticles.rows) { + if (!r.published_date) continue + const dt = new Date(r.published_date) + const yy = dt.getUTCFullYear().toString() + const mm = String(dt.getUTCMonth() + 1).padStart(2, '0') + polymerByWpId.set(Number(r.legacy_article_id), { section: r.section, slug: r.slug, year: yy, month: mm }) + } + console.log(`Polymer wp-era published articles: ${polymerByWpId.size}`) + + let toPolymer = 0 + let toMirrorFallback = 0 + let toExternal = 0 + let toAttachment = 0 + let skipped = 0 + + // Use a single multi-row INSERT for speed. Chunk to keep the parameter list + // under Postgres' 65535 limit. + type Insert = { code: string; target: string; hits: number } + const inserts: Insert[] = [] + + for (const r of wpRows) { + let target: string | null = null + + if (r.idPost > 0) { + const matched = polymerByWpId.get(r.idPost) + if (matched) { + target = `/${matched.section}/${matched.year}/${matched.month}/${matched.slug}` + toPolymer++ + } else if (r.postName && r.postDate && r.postStatus === 'publish' && r.postType === 'post') { + // Polymer hasn't ingested this post (rare). Fall back to the WP-era + // permalink shape, which our LEGACY_WP_URL_RE branch will then try + // to resolve at request time. + const [y, m, d] = r.postDate.split('-') + if (y && m && d) { + target = `/${y}/${m}/${d}/${r.postName}/` + toMirrorFallback++ + } + } else if (r.postType === 'attachment' && r.guid) { + // Media attachments live in the archive proxy under + // /archive/wordpress-media/uploads/... 
We rewrite the guid path- + // suffix the same way image-rewriter does for body images. + const m = r.guid.match(/^https?:\/\/[^/]+\/wp-content\/uploads\/(.+)$/i) + if (m) { + target = `/archive/wordpress-media/uploads/${m[1]}` + toAttachment++ + } + } + } + + if (!target && r.external) { + target = r.external + toExternal++ + } + + if (!target) { + skipped++ + continue + } + + inserts.push({ code: r.code, target, hits: r.hits }) + } + + console.log(`\nResolution:`) + console.log(` → polymer URL : ${toPolymer}`) + console.log(` → mirror fallback: ${toMirrorFallback}`) + console.log(` → attachment : ${toAttachment}`) + console.log(` → external URL : ${toExternal}`) + console.log(` skipped : ${skipped}`) + console.log(` total inserts : ${inserts.length}`) + + if (write) { + // Truncate and re-insert so re-runs are idempotent. + await pool.query(`TRUNCATE TABLE legacy_shortlinks`) + const CHUNK = 500 + for (let i = 0; i < inserts.length; i += CHUNK) { + const batch = inserts.slice(i, i + CHUNK) + const values: string[] = [] + const params: (string | number)[] = [] + let p = 1 + for (const ins of batch) { + values.push(`($${p++}, $${p++}, $${p++})`) + params.push(ins.code, ins.target, ins.hits) + } + await pool.query( + `INSERT INTO legacy_shortlinks (short_code, target_url, hit_count) VALUES ${values.join(', ')} + ON CONFLICT (short_code) DO UPDATE SET target_url = EXCLUDED.target_url, hit_count = EXCLUDED.hit_count`, + params, + ) + } + } + + console.log(`\nMode: ${write ? 'WRITTEN' : 'DRY RUN'}`) + await pool.end() +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/scripts/legacy-import/backfill-wp-featured-images.ts b/scripts/legacy-import/backfill-wp-featured-images.ts new file mode 100644 index 0000000..8b3d2d5 --- /dev/null +++ b/scripts/legacy-import/backfill-wp-featured-images.ts @@ -0,0 +1,235 @@ +/** + * Backfill featured images on wordpress-era articles. 
+ * + * Reads source postmeta from /tmp/audit/wp.db (Kicker/Photo/PhotoCaption/ + * PhotoByline/Photographer) and: + * 1. converts the Photo path to its archived URL + * `/wp-content/uploads/...` → `/archive/wordpress-media/uploads/...` + * 2. resolves it via LegacyMediaResolver (gets-or-creates a media row) + * 3. populates `articles.featured_image_id` + * 4. populates `articles.image_caption` from PhotoCaption (entity-decoded) + * 5. populates `media.write_in_photographer` from PhotoByline / Photographer + * + * The audit confirmed 1473 distinct Photo paths exist; 1470 of those map to + * actual files on disk in `recon/archives/wordpress/uploads-extracted/`. + * + * Run with `pnpm tsx scripts/legacy-import/backfill-wp-featured-images.ts`. + * Defaults to a dry run; pass `--write` to actually update rows. + */ + +import { Pool } from 'pg' +import { execFileSync } from 'child_process' +import { LegacyMediaResolver } from './media-resolver' +import { decodeEntities } from './wordpress/html-tokenizer' + +const WP_SQLITE = '/tmp/audit/wp.db' + +type Postmeta = { + photo?: string + photoCaption?: string + photoByline?: string + photographer?: string + origPhoto?: string +} + +function parseArgs() { + const args = process.argv.slice(2) + return { write: args.includes('--write') } +} + +// Convert source Photo path to the polymer archive URL. +// /wp-content/uploads/X → /archive/wordpress-media/uploads/X +// Returns null for values that don't look like a real path (e.g. "0", URLs to +// poly.rpi.edu, absolute http:// links, etc). +function toArchiveUrl(rawPath: string | undefined): string | null { + if (!rawPath) return null + const v = rawPath.trim() + if (!v || v === '0') return null + if (v.startsWith('/wp-content/uploads/')) { + return '/archive/wordpress-media/uploads/' + v.slice('/wp-content/uploads/'.length) + } + // 32 rows are full URLs. Convert if they're pointing at poly.rpi.edu uploads. 
+ const m = v.match(/^https?:\/\/[^/]+\/wp-content\/uploads\/(.+)$/i) + if (m) { + return '/archive/wordpress-media/uploads/' + m[1] + } + // 1 row references /wp-includes/, ignore. + return null +} + +// Strip "The Polytechnic"-style trailing publication credit from a +// byline. WP postmeta consistently formats as "Name/The Polytechnic" +// or "Name/The Polytechnic"; we want just "Name". +// +// Uses indexOf scanning + a fixpoint loop instead of a single regex so the +// CodeQL `incomplete-multi-character-sanitization` rule is satisfied — a +// single `replace(/<[^>]+>/g, '')` pass can leave a leading `<` if the input +// has unbalanced angle brackets. +function stripTags(input: string): string { + let s = input + for (;;) { + const open = s.indexOf('<') + if (open === -1) break + const close = s.indexOf('>', open + 1) + if (close === -1) { + // Unterminated tag — drop everything from `<` onward. + s = s.slice(0, open) + break + } + s = s.slice(0, open) + s.slice(close + 1) + } + return s +} + +function cleanByline(raw: string | undefined): string | null { + if (!raw) return null + let v = decodeEntities(raw).trim() + v = stripTags(v).trim() + v = v.replace(/\s*\/\s*The Polytechnic\s*$/i, '').trim() + v = v.replace(/\s*-\s*The Polytechnic\s*$/i, '').trim() + return v || null +} + +function cleanCaption(raw: string | undefined): string | null { + if (!raw) return null + const v = decodeEntities(raw).trim().replace(/\s+/g, ' ') + return v || null +} + +async function main() { + const { write } = parseArgs() + console.log(`Mode: write=${write}`) + + // Pull all relevant postmeta in one shot. 
+ const tsv = execFileSync( + 'sqlite3', + [ + '-separator', + '\t', + WP_SQLITE, + "SELECT post_id, meta_key, meta_value FROM postmeta WHERE meta_key IN ('Photo','PhotoCaption','PhotoByline','Photographer','OrigPhoto') AND meta_value IS NOT NULL AND meta_value != ''", + ], + { maxBuffer: 64 * 1024 * 1024 }, + ).toString() + + const byWpId = new Map() + for (const line of tsv.split('\n')) { + if (!line) continue + const tab1 = line.indexOf('\t') + const tab2 = line.indexOf('\t', tab1 + 1) + if (tab1 < 0 || tab2 < 0) continue + const pid = Number(line.slice(0, tab1)) + const key = line.slice(tab1 + 1, tab2) + const value = line.slice(tab2 + 1) + if (!pid || !value) continue + const cur = byWpId.get(pid) ?? {} + if (key === 'Photo') cur.photo = value + else if (key === 'PhotoCaption') cur.photoCaption = value + else if (key === 'PhotoByline') cur.photoByline = value + else if (key === 'Photographer') cur.photographer = value + else if (key === 'OrigPhoto') cur.origPhoto = value + byWpId.set(pid, cur) + } + + console.log(`WP postmeta: ${byWpId.size} wp_ids with photo-related meta`) + + const pool = new Pool({ + host: '127.0.0.1', + port: 5433, + user: 'poly', + password: 'poly', + database: 'polymer2', + }) + const resolver = new LegacyMediaResolver(pool) + + let imageFills = 0 + let captionFills = 0 + let mediaCreated = 0 + let mediaReused = 0 + let bylineFills = 0 + let pathSkipped = 0 + let articleNotFound = 0 + + for (const [wpId, meta] of byWpId) { + const archiveUrl = toArchiveUrl(meta.photo) + if (!archiveUrl) { + // Even without a Photo path we may still want to set the caption alone, + // but image_caption belongs to the article's hero image; if there's no + // image, skip. Caption-only would render as orphaned text. 
+ pathSkipped++ + continue + } + + const article = await pool.query<{ id: number; featured_image_id: number | null; image_caption: string | null }>( + `SELECT id, featured_image_id, image_caption FROM articles WHERE legacy_source='wordpress' AND legacy_article_id=$1`, + [String(wpId)], + ) + if (article.rowCount === 0) { + articleNotFound++ + continue + } + const row = article.rows[0] + + // Skip articles that already have a featured image — be conservative. + if (row.featured_image_id) continue + + // Resolve (or create) the media row. + const beforeSize = resolver.size() + const mediaId = await resolver.resolve(archiveUrl, null) + if (!mediaId) continue + if (resolver.size() > beforeSize) mediaCreated++ + else mediaReused++ + + // Update photographer attribution on the media row if absent. + // Prefer Photographer (clean name) over PhotoByline (markup-laden). + const byline = cleanByline(meta.photographer) || cleanByline(meta.photoByline) + if (byline && write) { + const r = await pool.query( + `UPDATE media + SET write_in_photographer = COALESCE(write_in_photographer, $1), + updated_at = NOW() + WHERE id = $2 AND (write_in_photographer IS NULL OR write_in_photographer = '')`, + [byline, mediaId], + ) + if ((r.rowCount ?? 
0) > 0) bylineFills++ + } else if (byline) { + bylineFills++ + } + + const caption = cleanCaption(meta.photoCaption) + + if (write) { + await pool.query( + `UPDATE articles + SET featured_image_id = $1, + image_caption = COALESCE(image_caption, $2), + updated_at = NOW() + WHERE id = $3`, + [mediaId, caption, row.id], + ) + } + imageFills++ + if (caption && !row.image_caption) captionFills++ + + if (imageFills % 200 === 0) { + console.log(` …${imageFills} images filled`) + } + } + + console.log(`\nResults:`) + console.log(` featured_image_id filled : ${imageFills}`) + console.log(` image_caption filled : ${captionFills}`) + console.log(` media rows created : ${mediaCreated}`) + console.log(` media rows reused : ${mediaReused}`) + console.log(` byline backfills on media: ${bylineFills}`) + console.log(` Photo path skipped : ${pathSkipped}`) + console.log(` article not found : ${articleNotFound}`) + console.log(` Mode: ${write ? 'WRITTEN' : 'DRY RUN'}`) + + await pool.end() +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/scripts/legacy-import/expand-wp-shortcodes.ts b/scripts/legacy-import/expand-wp-shortcodes.ts new file mode 100644 index 0000000..5d4ed98 --- /dev/null +++ b/scripts/legacy-import/expand-wp-shortcodes.ts @@ -0,0 +1,241 @@ +/** + * Expand WP body shortcodes that the original importer dropped. + * + * Targets 77 published WP posts whose source `post_content` includes one of: + * - [gallery ids="N1,N2,..."] (56 posts, 1012 attachment refs) + * - [gview file="URL"] (19 PDF "Full Issue" embeds) + * -