Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions scripts/legacy-import/clean-plain-titles.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/**
* Decode HTML entities and strip residual inline tags from `plain_title`
* for legacy rows where the original importer let them through.
*
* Audit found:
* - 22 pipeline plain_titles with undecoded HTML entities (`&#8217;`/`&amp;` etc.)
* - 7 poly-online plain_titles with literal `<i>...</i>` tags
*
* Run with `pnpm tsx scripts/legacy-import/clean-plain-titles.ts [--write]`.
*/

import { Pool } from 'pg'
import { decodeEntities } from './wordpress/html-tokenizer'

/**
 * Parse CLI flags. `write` is true only when `--write` appears among the
 * arguments that follow the first two entries of `process.argv`.
 */
function parseArgs(): { write: boolean } {
  const [, , ...flags] = process.argv
  return { write: flags.includes('--write') }
}

/**
 * Remove every `<...>` span from `input`, scanning once from left to right.
 *
 * A `<` that is never followed by a `>` is kept as literal text instead of
 * being treated as the start of a tag, so a title such as `Profits < losses`
 * is returned unchanged rather than being cut off at the `<`.
 *
 * @param input Text that may contain inline tags.
 * @returns `input` with each `<...>` span removed.
 */
function stripTags(input: string): string {
  let out = ''
  let pos = 0
  while (pos < input.length) {
    const open = input.indexOf('<', pos)
    if (open === -1) break
    const close = input.indexOf('>', open + 1)
    // Unterminated `<`: nothing more to strip; the tail is appended verbatim below.
    if (close === -1) break
    out += input.slice(pos, open)
    pos = close + 1
  }
  return out + input.slice(pos)
}

/**
 * Produce the cleaned form of a plain title: strip tags with `stripTags`,
 * pass the result through `decodeEntities`, collapse every whitespace run to
 * a single space, and trim both ends.
 *
 * Tags are stripped before decoding, so text produced by decoding is not
 * re-examined for tags.
 */
function clean(s: string): string {
  const untagged = stripTags(s)
  const decoded = decodeEntities(untagged)
  const singleSpaced = decoded.replace(/\s+/g, ' ')
  return singleSpaced.trim()
}

/**
 * Select `articles` rows whose `plain_title` matches a tag-like or
 * entity-like pattern, run each title through `clean`, and count the rows
 * whose title changes. With `--write` each changed row is updated (also
 * setting `updated_at`); without it nothing is written and the count is the
 * number of rows a write run would change.
 *
 * The pool is closed in a `finally` so a failing query does not leave
 * connections open.
 */
async function main(): Promise<void> {
  const { write } = parseArgs()
  const pool = new Pool({
    host: '127.0.0.1',
    port: 5433,
    user: 'poly',
    password: 'poly',
    database: 'polymer2',
  })

  try {
    const r = await pool.query<{ id: number; plain_title: string; legacy_source: string | null }>(
      `SELECT id, plain_title, legacy_source FROM articles
WHERE plain_title IS NOT NULL
AND (plain_title ~ '<[a-z]+>' OR plain_title ~ '&#[0-9]+;' OR plain_title ~ '&[a-z]+;')`,
    )
    console.log(`candidates: ${r.rows.length}`)

    let updated = 0
    for (const row of r.rows) {
      const cleaned = clean(row.plain_title)
      // The SQL filter is only a coarse pre-selection; skip rows that cleaning leaves unchanged.
      if (cleaned === row.plain_title) continue
      if (write) {
        await pool.query(`UPDATE articles SET plain_title=$1, updated_at=NOW() WHERE id=$2`, [cleaned, row.id])
      }
      updated++
    }
    console.log(`updated: ${updated} (${write ? 'WRITTEN' : 'DRY RUN'})`)
  } finally {
    await pool.end()
  }
}

// Script entry point: print any failure and exit with status 1.
main().catch((failure) => {
  console.error(failure)
  process.exit(1)
})
72 changes: 72 additions & 0 deletions scripts/legacy-import/normalize-author-casing.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/**
* Author-casing dedup pass. Audit found ~40 cases where the same person was
* stored with two different casings (e.g. `James Lenze II` + `JAmes Lenze
* II`). For each lower-cased canonical name, we pick the most-common casing
* and rename every occurrence to it.
*
* Run with `pnpm tsx scripts/legacy-import/normalize-author-casing.ts [--write]`.
*/

import { Pool } from 'pg'

/**
 * Read the script's command-line flags.
 *
 * @returns `{ write: true }` when `--write` is among the arguments after the
 *          first two `process.argv` entries, otherwise `{ write: false }`.
 */
function parseArgs(): { write: boolean } {
  const userArgs = process.argv.slice(2)
  const write = userArgs.includes('--write')
  return { write }
}

/**
 * Collapse differently-cased spellings of the same author name in
 * `articles_write_in_authors` onto one spelling.
 *
 * Spellings are grouped by their lower-cased, trimmed form. Within a group
 * the spelling used by the most rows wins; ties go to whichever spelling
 * `localeCompare` orders first. With `--write` every other spelling in the
 * group is renamed to the winner; without it nothing is written and the
 * counters report what a write run would do.
 *
 * The pool is closed in a `finally` so a failing query does not leave
 * connections open.
 */
async function main(): Promise<void> {
  const { write } = parseArgs()
  const pool = new Pool({
    host: '127.0.0.1',
    port: 5433,
    user: 'poly',
    password: 'poly',
    database: 'polymer2',
  })

  try {
    // One result row per distinct stored spelling, with the number of rows using it.
    const r = await pool.query<{ name: string; cnt: number }>(
      `SELECT name, COUNT(*)::int AS cnt FROM articles_write_in_authors GROUP BY name`,
    )

    const byKey = new Map<string, { name: string; cnt: number }[]>()
    for (const row of r.rows) {
      const key = row.name.toLowerCase().trim()
      // Names that are empty or whitespace-only are left alone.
      if (!key) continue
      const list = byKey.get(key) ?? []
      list.push({ name: row.name, cnt: row.cnt })
      byKey.set(key, list)
    }

    let groups = 0
    let renames = 0
    let rowsAffected = 0
    for (const list of byKey.values()) {
      if (list.length < 2) continue
      groups++
      // Most-used spelling first; equal counts fall back to localeCompare order.
      list.sort((a, b) => b.cnt - a.cnt || a.name.localeCompare(b.name))
      const winner = list[0].name
      for (const loser of list.slice(1)) {
        // Defensive only: GROUP BY name returns each spelling once, so a loser
        // is normally never identical to the winner. Spellings that differ only
        // in surrounding whitespace share a key (the key is trimmed) and are
        // therefore renamed like any other variant.
        if (winner === loser.name) continue
        if (write) {
          const u = await pool.query(`UPDATE articles_write_in_authors SET name=$1 WHERE name=$2`, [winner, loser.name])
          rowsAffected += u.rowCount ?? 0
        } else {
          // Dry run: estimate from the counts gathered above.
          rowsAffected += loser.cnt
        }
        renames++
      }
    }

    console.log(`groups with multiple casings: ${groups}`)
    console.log(`renamed casings : ${renames}`)
    console.log(`rows affected : ${rowsAffected}`)
    console.log(`Mode: ${write ? 'WRITTEN' : 'DRY RUN'}`)
  } finally {
    await pool.end()
  }
}

// Run the script; on any rejection log it and exit with status 1.
main().catch((reason) => {
  console.error(reason)
  process.exit(1)
})
195 changes: 195 additions & 0 deletions scripts/legacy-import/restore-wp-italic-titles.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
/**
* Rebuild rich-text titles for the 165 WP posts whose source `post_title`
* carried inline `<i>`/`<em>`/`<b>`/`<strong>` markup the original importer
* dropped. The plain_title field is fine (the importer already strips tags
* for display); only the Lexical title doc is wrong.
*
* Approach: tokenize the source title (very small subset — just the four
* format tags + plain text), emit alternating text nodes with `format` bit
* flags, wrap in a paragraph + root.
*
* Run with `pnpm tsx scripts/legacy-import/restore-wp-italic-titles.ts [--write]`.
*/

import { Pool } from 'pg'
import { execFileSync } from 'child_process'
import { decodeEntities } from './wordpress/html-tokenizer'

// Path of the SQLite database file handed to the `sqlite3` CLI to read source post titles.
const WP_SQLITE = '/tmp/audit/wp.db'
// Bit flags combined into a text run's numeric `format` value.
// NOTE(review): presumably these mirror the editor's own bold/italic format bits — confirm.
const FMT_BOLD = 1
const FMT_ITALIC = 2

/**
 * Inspect the command line for the `--write` flag (looking only at the
 * arguments after the first two `process.argv` entries).
 */
function parseArgs(): { write: boolean } {
  const write = process.argv.slice(2).some((arg) => arg === '--write')
  return { write }
}

/** A stretch of title text together with the format bit flags that apply to all of it. */
type Run = { text: string; format: number }

/**
 * Split a title containing inline markup into formatted text runs.
 *
 * Only `<i>`, `<em>`, `<b>` and `<strong>` (opening and closing) affect the
 * result: an opening tag sets the matching format bit, a closing tag clears
 * it, and the text gathered so far is emitted as a run at each such tag.
 * Any other `<...>` span is discarded without ending the current run. A `<`
 * with no later `>` is kept, together with everything after it, as literal
 * text. Each run's text is passed through `decodeEntities` at the end.
 *
 * @param html Source title, possibly containing inline tags.
 * @returns Non-empty runs in source order.
 */
function parseTitleRuns(html: string): Run[] {
  const formatBits: Record<string, number> = { i: FMT_ITALIC, em: FMT_ITALIC, b: FMT_BOLD, strong: FMT_BOLD }
  const collected: Run[] = []
  let activeFormat = 0
  let pending = ''
  let cursor = 0

  // Emit the text gathered so far (if any) under the format currently in force.
  const emitPending = (): void => {
    if (!pending) return
    collected.push({ text: pending, format: activeFormat })
    pending = ''
  }

  while (cursor < html.length) {
    const lt = html.indexOf('<', cursor)
    const gt = lt === -1 ? -1 : html.indexOf('>', lt + 1)
    if (gt === -1) {
      // Either no more tags, or an unterminated `<`: the rest is literal text.
      pending += html.slice(cursor)
      break
    }

    pending += html.slice(cursor, lt)
    cursor = gt + 1

    const body = html.slice(lt + 1, gt).trim()
    const closing = body.startsWith('/')
    const bareName = closing ? body.slice(1) : body
    const bit = formatBits[bareName.split(/\s/)[0].toLowerCase()]
    // Tags outside the format set are dropped; the surrounding text stays in one run.
    if (bit === undefined) continue

    emitPending()
    if (closing) {
      activeFormat = activeFormat & ~bit
    } else {
      activeFormat = activeFormat | bit
    }
  }
  emitPending()

  return collected
    .filter((run) => run.text.length > 0)
    .map((run) => ({ ...run, text: decodeEntities(run.text) }))
}

/**
 * Build the rich-text title document for a list of formatted runs: a root
 * node holding one paragraph whose children are text nodes, one per run,
 * each carrying that run's `format` bit flags.
 *
 * Whitespace is normalised across the whole title rather than per run:
 * - every whitespace run collapses to one space, including one that
 *   straddles a run boundary (`X ` followed by ` Y` yields `X ` + `Y`);
 * - leading and trailing whitespace is removed even when the first or last
 *   run consists of nothing but whitespace;
 * - runs left empty are omitted.
 *
 * The input array and its objects are not modified.
 *
 * @param runs Text runs in display order.
 * @returns The document object; the paragraph's `children` is empty when no text remains.
 */
function buildTitleDoc(runs: Run[]): unknown {
  const normalized: Run[] = []
  // Start as if preceded by whitespace so leading whitespace is dropped.
  let prevEndsWithSpace = true
  for (const r of runs) {
    let text = r.text.replace(/\s+/g, ' ')
    // Avoid a double space across a run boundary (and any space at the very start).
    if (prevEndsWithSpace && text.startsWith(' ')) text = text.slice(1)
    if (!text) continue
    normalized.push({ text, format: r.format })
    prevEndsWithSpace = text.endsWith(' ')
  }

  // Trim the end of the title, discarding trailing runs that become empty.
  while (normalized.length > 0) {
    const last = normalized[normalized.length - 1]
    const trimmed = last.text.replace(/\s+$/, '')
    if (trimmed) {
      last.text = trimmed
      break
    }
    normalized.pop()
  }

  const children = normalized.map((r) => ({
    type: 'text',
    version: 1,
    format: r.format,
    detail: 0,
    mode: 'normal',
    style: '',
    text: r.text,
  }))

  return {
    root: {
      type: 'root',
      version: 1,
      format: '',
      indent: 0,
      direction: 'ltr',
      children: [
        {
          type: 'paragraph',
          version: 1,
          format: '',
          indent: 0,
          direction: 'ltr',
          textFormat: 0,
          children,
        },
      ],
    },
  }
}

/**
 * Rebuild the rich-text `title` of articles whose source post title carries
 * inline format tags.
 *
 * Steps: query the source SQLite database through the `sqlite3` CLI for
 * published posts whose title contains one of the four format tags, parse
 * each title into runs, skip titles where no run ends up formatted, look up
 * the matching article by `legacy_article_id`, and (with `--write`) store the
 * rebuilt document. Without `--write` nothing is written and the counters
 * report what a write run would do.
 *
 * The pool is closed in a `finally` so a failing query does not leave
 * connections open.
 *
 * @throws Error when the `sqlite3` output is not a JSON array.
 */
async function main(): Promise<void> {
  const { write } = parseArgs()
  console.log(`Mode: write=${write}`)

  // Pull every published WP post whose title carries any of the four format
  // tags. Using JSON mode avoids tab/newline edge cases.
  const json = execFileSync(
    'sqlite3',
    [
      '-json',
      WP_SQLITE,
      `SELECT ID AS id, post_title FROM posts
WHERE post_status='publish' AND post_type='post'
AND (post_title LIKE '%<i>%' OR post_title LIKE '%</i>%'
OR post_title LIKE '%<em>%' OR post_title LIKE '%</em>%'
OR post_title LIKE '%<b>%' OR post_title LIKE '%</b>%'
OR post_title LIKE '%<strong>%' OR post_title LIKE '%</strong>%')`,
    ],
    { maxBuffer: 64 * 1024 * 1024 },
  ).toString()

  // Empty output is treated as "no rows": JSON.parse('') would throw, and the
  // sqlite3 shell can print nothing at all for an empty result set.
  const parsed: unknown = json.trim() === '' ? [] : JSON.parse(json)
  if (!Array.isArray(parsed)) {
    throw new Error('sqlite3 -json output was not a JSON array')
  }
  const wpRows = parsed as { id: number; post_title: string }[]
  console.log(`Source rows with tagged titles: ${wpRows.length}`)

  const pool = new Pool({
    host: '127.0.0.1',
    port: 5433,
    user: 'poly',
    password: 'poly',
    database: 'polymer2',
  })

  try {
    let updated = 0
    let articleNotFound = 0
    let noFormat = 0

    for (const w of wpRows) {
      const runs = parseTitleRuns(w.post_title)
      // The LIKE filter also matches stray or unbalanced tags that format nothing.
      if (runs.every((r) => r.format === 0)) {
        noFormat++
        continue
      }
      const doc = buildTitleDoc(runs)

      const r = await pool.query<{ id: number }>(
        `SELECT id FROM articles WHERE legacy_source='wordpress' AND legacy_article_id=$1`,
        [String(w.id)],
      )
      if (r.rows.length === 0) {
        articleNotFound++
        continue
      }
      const articleId = r.rows[0].id

      if (write) {
        await pool.query(`UPDATE articles SET title=$1::jsonb, updated_at=NOW() WHERE id=$2`, [
          JSON.stringify(doc),
          articleId,
        ])
      }
      updated++
    }

    console.log(`\nResults:`)
    console.log(` titles updated : ${updated}`)
    console.log(` no formatting : ${noFormat}`)
    console.log(` article not found : ${articleNotFound}`)
    console.log(` Mode: ${write ? 'WRITTEN' : 'DRY RUN'}`)
  } finally {
    await pool.end()
  }
}

// Kick off the script; a rejected run is logged and ends the process with status 1.
main().catch((error) => {
  console.error(error)
  process.exit(1)
})
Loading