Skip to content

Commit 5d7ab9b

Browse files
committed
fix: Handle repeat content across cards
1 parent 387d99a commit 5d7ab9b

7 files changed

Lines changed: 104 additions & 14 deletions

File tree

apps/web/src/components/LatexText.tsx

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,9 @@ function renderMarkdown(html: string): string {
157157
html = html.replace(/\*\*([^*]+?)\*\*/g, "<strong>$1</strong>");
158158

159159
// Italic: *text* → <em> (but not inside <strong> tags' asterisks)
160-
html = html.replace(/(?<!\*)\*([^*]+?)\*(?!\*)/g, "<em>$1</em>");
160+
// Require no space after opening * and no space before closing * (CommonMark flanking rules)
161+
// so that bare multiplication like 1 * 2 * 3 is not treated as emphasis.
162+
html = html.replace(/(?<!\*)\*(?! )([^*]+?)(?<! )\*(?!\*)/g, "<em>$1</em>");
161163

162164
// Process block-level elements within paragraph breaks
163165
const blocks = html.split(/\n\n+/);

apps/web/src/lib/cards/generate.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import type { AIProvider, AIUsage, ImagePart } from '../ai/index.ts'
22
import type { Document, Chunk, InsertCard } from '@scroll-reader/db'
33
import type { CardType, CardStrategy, CardContent, DocumentType, ReadingGoal } from '@scroll-reader/shared-types'
44
import { resolveCardStrategy } from '@scroll-reader/shared-types'
5-
import { buildSmartPrompt } from './prompts.ts'
5+
import { buildSmartPrompt, summarizeCard } from './prompts.ts'
66

77
interface AICard {
88
type: CardType
@@ -29,6 +29,7 @@ export async function generateCardsForChunk(
2929
provider: AIProvider,
3030
cardTypes?: CardType[],
3131
images?: { base64: string; mimeType: string; alt: string }[],
32+
recentCardSummaries?: string[],
3233
): Promise<CardGenResult> {
3334
const strategy: CardStrategy = cardTypes
3435
? { cardTypes, chunkInterval: 1 }
@@ -43,7 +44,7 @@ export async function generateCardsForChunk(
4344
mimeType: img.mimeType,
4445
}))
4546

46-
const prompt = buildSmartPrompt(chunk, prevChunk, doc, strategy, imageAlts)
47+
const prompt = buildSmartPrompt(chunk, prevChunk, doc, strategy, imageAlts, recentCardSummaries)
4748
const response = await provider.generate(prompt, imageParts)
4849
const aiCards = parseAIResponse(response.text, strategy)
4950

apps/web/src/lib/cards/prompts.ts

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,33 @@
11
import type { Document, Chunk } from "@scroll-reader/db";
2-
import type { CardType, CardStrategy } from "@scroll-reader/shared-types";
2+
import type { CardType, CardStrategy, CardContent } from "@scroll-reader/shared-types";
3+
4+
/**
 * Compact one-line summary of a card for dedup context (keeps prompt small).
 *
 * The result has the form `<type>: <key text>`, where the key text is the
 * content field that identifies the card's topic (title, question, term, …),
 * shortened via `truncate`.
 *
 * @param type - Card type; selects which content field is summarized.
 * @param content - Card payload, read defensively as a loose record.
 * @returns A single-line summary; unrecognized types yield `<type>: (card)`.
 */
export function summarizeCard(type: CardType, content: CardContent): string {
  const c = (content ?? {}) as Record<string, unknown>

  // Returns the first listed field that holds usable text. `??` alone only
  // skips null/undefined, so an empty title would block the fallback to body;
  // object values are skipped because String() would give "[object Object]".
  const pick = (...keys: string[]): string => {
    for (const key of keys) {
      const value = c[key]
      if (value == null || typeof value === 'object') continue
      const text = String(value)
      if (text.trim() !== '') return text
    }
    return ''
  }

  switch (type) {
    case 'discover':
    case 'raw_commentary':
    case 'connect':
      return `${type}: ${truncate(pick('title', 'body'), 80)}`
    case 'flashcard':
    case 'quiz':
      return `${type}: ${truncate(pick('question'), 80)}`
    case 'glossary':
      return `glossary: ${truncate(pick('term'), 40)}`
    case 'contrast':
      return `contrast: ${truncate(pick('itemA'), 30)} vs ${truncate(pick('itemB'), 30)}`
    case 'passage':
      return `passage: ${truncate(pick('excerpt'), 80)}`
    default:
      return `${type}: (card)`
  }
}
26+
27+
/**
 * Collapse `s` onto a single line and cap it at `max` UTF-16 code units.
 *
 * Every run of whitespace (newlines, `\r\n`, tabs, repeated spaces) becomes
 * one space, and the ends are trimmed. If the text is longer than `max`, it is
 * cut and an ellipsis is appended.
 *
 * @param s - Raw text, possibly spanning several lines.
 * @param max - Maximum length of the kept text, not counting the ellipsis.
 * @returns The single-line text, with `…` appended when it was shortened.
 */
function truncate(s: string, max: number): string {
  const oneLine = s.replace(/\s+/g, ' ').trim()
  if (oneLine.length <= max) return oneLine

  // If the last kept unit is a high surrogate, its low half would be cut off;
  // drop it too so the output never contains a lone surrogate.
  const lastKept = oneLine.charCodeAt(max - 1)
  const end = lastKept >= 0xd800 && lastKept <= 0xdbff ? max - 1 : max
  return oneLine.slice(0, end) + '…'
}
331

432
const CARD_TYPE_DESCRIPTIONS: Record<CardType, string> = {
533
discover:
@@ -30,6 +58,7 @@ export function buildSmartPrompt(
3058
doc: Document,
3159
strategy?: CardStrategy | null,
3260
imageAlts?: string[],
61+
recentCardSummaries?: string[],
3362
): string {
3463
const docLabel = doc.title ?? "Untitled";
3564

@@ -76,8 +105,10 @@ You are a reading companion AI. Analyze the ${isCodeChunk ? "code sample" : "pas
76105
77106
SUGGESTED CARD TYPES (you may adjust based on the content):
78107
${typeDescriptions}
79-
${codeInstructions}${imageBlock}
80-
INSTRUCTIONS:
108+
${codeInstructions}${imageBlock}${recentCardSummaries && recentCardSummaries.length > 0 ? `ALREADY GENERATED (do NOT repeat these topics — find a fresh angle or skip if the passage covers the same ground):
109+
${recentCardSummaries.map((s) => ` - ${s}`).join("\n")}
110+
111+
` : ""}INSTRUCTIONS:
81112
1. First, understand what kind of content this is (prose, reference table, notation, formula, code sample, exercises/questions, table of contents, etc.).
82113
2. If the content TEACHES something (explains a concept, presents an argument, demonstrates a technique), generate a "discover" card first when it is in the suggested types — it is the primary card type. Then add other types only if the content warrants them.
83114
3. If the content is primarily exercises, homework questions, discussion prompts, review questions, or a table of contents/index — do NOT generate a "discover" card. These are questions, not teachings. Instead, generate flashcard or quiz cards that ANSWER the most important questions if possible, or return an empty array if the questions are too open-ended or require external work.

apps/web/src/lib/pipeline.ts

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ import {
1313
import type { ImageElement, TocEntry, ExtractConfig, ChunkerConfig, AIUsage, PipelineChunk, TocSection } from '@scroll-reader/pipeline'
1414
import { createProvider } from './ai/index.ts'
1515
import { generateCardsForChunk } from './cards/generate.ts'
16-
import type { CardType, DocumentType, ReadingGoal, Tier } from '@scroll-reader/shared-types'
16+
import { summarizeCard } from './cards/prompts.ts'
17+
import type { CardType, CardContent, DocumentType, ReadingGoal, Tier } from '@scroll-reader/shared-types'
1718
import { TIER_LIMITS, resolveCardStrategy } from '@scroll-reader/shared-types'
1819
import { BATCH_SIZE, EXTRACTOR_BIN, CHUNKER_BIN, FIGURE_EXTRACT_BIN } from 'astro:env/server'
1920
import { downloadDocument, deleteDocument, uploadImage } from './storage.ts'
@@ -676,6 +677,9 @@ export async function processDocument(doc: Document, cardBudget: number): Promis
676677
}
677678
}
678679

680+
const recentCardSummaries: string[] = []
681+
const MAX_RECENT_SUMMARIES = 15
682+
679683
for (let i = 0; i < chunksNeedingCards.length && budgetLeft > 0; i += BATCH_SIZE) {
680684
const batch = chunksNeedingCards.slice(i, i + BATCH_SIZE)
681685

@@ -724,7 +728,7 @@ export async function processDocument(doc: Document, cardBudget: number): Promis
724728
const chunkCardTypes = existingTypesForChunk
725729
? (cardTypes as CardType[]).filter((t) => !existingTypesForChunk.has(t))
726730
: cardTypes
727-
const { cards: newCards, usage: cardUsage } = await generateCardsForChunk(chunk, prevChunk, doc as Document, provider, chunkCardTypes, images)
731+
const { cards: newCards, usage: cardUsage } = await generateCardsForChunk(chunk, prevChunk, doc as Document, provider, chunkCardTypes, images, recentCardSummaries.length > 0 ? recentCardSummaries : undefined)
728732
attempted.add(chunk.id)
729733

730734
if (cardUsage) {
@@ -736,6 +740,14 @@ export async function processDocument(doc: Document, cardBudget: number): Promis
736740
cardsGenerated += newCards.length
737741
budgetLeft -= newCards.length
738742

743+
// Accumulate summaries for dedup context (rolling window)
744+
for (const card of newCards) {
745+
recentCardSummaries.push(summarizeCard(card.cardType as CardType, card.content as CardContent))
746+
}
747+
if (recentCardSummaries.length > MAX_RECENT_SUMMARIES) {
748+
recentCardSummaries.splice(0, recentCardSummaries.length - MAX_RECENT_SUMMARIES)
749+
}
750+
739751
// Write back to catalog_cards cache for future users
740752
if (chunkToCatalogChunk) {
741753
const catalogChunkId = chunkToCatalogChunk.get(chunk.id)

apps/worker/src/cards/generate.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import type { AIProvider, AIUsage } from '../ai/index.ts'
22
import type { Document, Chunk, InsertCard } from '@scroll-reader/db'
33
import type { CardType, CardStrategy, CardContent, DocumentType, ReadingGoal } from '@scroll-reader/shared-types'
44
import { resolveCardStrategy } from '@scroll-reader/shared-types'
5-
import { buildSmartPrompt } from './prompts.ts'
5+
import { buildSmartPrompt, summarizeCard } from './prompts.ts'
66

77
interface AICard {
88
type: CardType
@@ -28,6 +28,7 @@ export async function generateCardsForChunk(
2828
doc: Document,
2929
provider: AIProvider,
3030
cardTypes?: CardType[],
31+
recentCardSummaries?: string[],
3132
): Promise<CardGenResult> {
3233
const strategy: CardStrategy = cardTypes
3334
? { cardTypes, chunkInterval: 1 }
@@ -36,7 +37,7 @@ export async function generateCardsForChunk(
3637
(doc.readingGoal ?? 'reflective') as ReadingGoal,
3738
)
3839

39-
const prompt = buildSmartPrompt(chunk, prevChunk, doc, strategy)
40+
const prompt = buildSmartPrompt(chunk, prevChunk, doc, strategy, recentCardSummaries)
4041
const response = await provider.generate(prompt)
4142
const aiCards = parseAIResponse(response.text, strategy)
4243

apps/worker/src/cards/prompts.ts

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,33 @@
11
import type { Document, Chunk } from "@scroll-reader/db";
2-
import type { CardType, CardStrategy } from "@scroll-reader/shared-types";
2+
import type { CardType, CardStrategy, CardContent } from "@scroll-reader/shared-types";
3+
4+
/**
 * Compact one-line summary of a card for dedup context (keeps prompt small).
 *
 * The result has the form `<type>: <key text>`, where the key text is the
 * content field that identifies the card's topic (title, question, term, …),
 * shortened via `truncate`.
 *
 * @param type - Card type; selects which content field is summarized.
 * @param content - Card payload, read defensively as a loose record.
 * @returns A single-line summary; unrecognized types yield `<type>: (card)`.
 */
export function summarizeCard(type: CardType, content: CardContent): string {
  const c = (content ?? {}) as Record<string, unknown>

  // Returns the first listed field that holds usable text. `??` alone only
  // skips null/undefined, so an empty title would block the fallback to body;
  // object values are skipped because String() would give "[object Object]".
  const pick = (...keys: string[]): string => {
    for (const key of keys) {
      const value = c[key]
      if (value == null || typeof value === 'object') continue
      const text = String(value)
      if (text.trim() !== '') return text
    }
    return ''
  }

  switch (type) {
    case 'discover':
    case 'raw_commentary':
    case 'connect':
      return `${type}: ${truncate(pick('title', 'body'), 80)}`
    case 'flashcard':
    case 'quiz':
      return `${type}: ${truncate(pick('question'), 80)}`
    case 'glossary':
      return `glossary: ${truncate(pick('term'), 40)}`
    case 'contrast':
      return `contrast: ${truncate(pick('itemA'), 30)} vs ${truncate(pick('itemB'), 30)}`
    case 'passage':
      return `passage: ${truncate(pick('excerpt'), 80)}`
    default:
      return `${type}: (card)`
  }
}
26+
27+
/**
 * Collapse `s` onto a single line and cap it at `max` UTF-16 code units.
 *
 * Every run of whitespace (newlines, `\r\n`, tabs, repeated spaces) becomes
 * one space, and the ends are trimmed. If the text is longer than `max`, it is
 * cut and an ellipsis is appended.
 *
 * @param s - Raw text, possibly spanning several lines.
 * @param max - Maximum length of the kept text, not counting the ellipsis.
 * @returns The single-line text, with `…` appended when it was shortened.
 */
function truncate(s: string, max: number): string {
  const oneLine = s.replace(/\s+/g, ' ').trim()
  if (oneLine.length <= max) return oneLine

  // If the last kept unit is a high surrogate, its low half would be cut off;
  // drop it too so the output never contains a lone surrogate.
  const lastKept = oneLine.charCodeAt(max - 1)
  const end = lastKept >= 0xd800 && lastKept <= 0xdbff ? max - 1 : max
  return oneLine.slice(0, end) + '…'
}
331

432
const CARD_TYPE_DESCRIPTIONS: Record<CardType, string> = {
533
discover:
@@ -29,6 +57,7 @@ export function buildSmartPrompt(
2957
prevChunk: Chunk | null,
3058
doc: Document,
3159
strategy?: CardStrategy | null,
60+
recentCardSummaries?: string[],
3261
): string {
3362
const docLabel = doc.title ?? "Untitled";
3463

@@ -68,8 +97,10 @@ You are a reading companion AI. Analyze the ${isCodeChunk ? "code sample" : "pas
6897
6998
SUGGESTED CARD TYPES (you may adjust based on the content):
7099
${typeDescriptions}
71-
${codeInstructions}
72-
INSTRUCTIONS:
100+
${codeInstructions}${recentCardSummaries && recentCardSummaries.length > 0 ? `ALREADY GENERATED (do NOT repeat these topics — find a fresh angle or skip if the passage covers the same ground):
101+
${recentCardSummaries.map((s) => ` - ${s}`).join("\n")}
102+
103+
` : ""}INSTRUCTIONS:
73104
1. First, understand what kind of content this is (prose, reference table, notation, formula, code sample, exercises/questions, table of contents, etc.).
74105
2. If the content TEACHES something (explains a concept, presents an argument, demonstrates a technique), generate a "discover" card first when it is in the suggested types — it is the primary card type. Then add other types only if the content warrants them.
75106
3. If the content is primarily exercises, homework questions, discussion prompts, review questions, or a table of contents/index — do NOT generate a "discover" card. These are questions, not teachings. Instead, generate flashcard or quiz cards that ANSWER the most important questions if possible, or return an empty array if the questions are too open-ended or require external work.

apps/worker/src/pipeline.ts

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@ import { db } from './db.ts'
55
import { documents, chunks, cards, aiUsageLogs } from '@scroll-reader/db'
66
import type { Document, Chunk } from '@scroll-reader/db'
77
import type { AIProvider, AIUsage } from './ai/index.ts'
8+
import type { CardType, CardContent } from '@scroll-reader/shared-types'
89
import {
910
extractDocument, callSegmenter, callChunker, aiChunk,
1011
mergeConsecutiveCode, foldSmallCodeIntoText,
1112
} from '@scroll-reader/pipeline'
1213
import type { ExtractConfig, ChunkerConfig, PipelineChunk } from '@scroll-reader/pipeline'
1314
import { generateCardsForChunk } from './cards/generate.ts'
15+
import { summarizeCard } from './cards/prompts.ts'
1416

1517
const HERE = dirname(fileURLToPath(import.meta.url))
1618
const extractConfig: ExtractConfig = {
@@ -224,18 +226,28 @@ export async function processDocument(filePath: string, userId: string): Promise
224226
// Generate cards for text and code chunks (skip image-only chunks)
225227
const cardChunkRows = insertedChunks.filter((c) => c.chunkType === 'text' || c.chunkType === 'code')
226228
let totalCards = 0
229+
const recentCardSummaries: string[] = []
230+
const MAX_RECENT_SUMMARIES = 15
227231

228232
for (let i = 0; i < cardChunkRows.length; i++) {
229233
const chunk = cardChunkRows[i]
230234
// Fetch chunk N-1 regardless of type — image chunks provide alt text context
231235
const prevChunk: Chunk | null = insertedChunks[insertedChunks.indexOf(chunk) - 1] ?? null
232236

233-
const { cards: newCards, usage: cardUsage } = await generateCardsForChunk(chunk, prevChunk, doc as Document, provider)
237+
const { cards: newCards, usage: cardUsage } = await generateCardsForChunk(chunk, prevChunk, doc as Document, provider, undefined, recentCardSummaries.length > 0 ? recentCardSummaries : undefined)
234238
if (cardUsage) {
235239
logUsage(userId, doc.id, 'card_generation', provider.name, provider.model, cardUsage, chunk.id)
236240
}
237241
await db.insert(cards).values(newCards)
238242
totalCards += newCards.length
243+
244+
// Accumulate summaries for dedup context (rolling window)
245+
for (const card of newCards) {
246+
recentCardSummaries.push(summarizeCard(card.cardType as CardType, card.content as CardContent))
247+
}
248+
if (recentCardSummaries.length > MAX_RECENT_SUMMARIES) {
249+
recentCardSummaries.splice(0, recentCardSummaries.length - MAX_RECENT_SUMMARIES)
250+
}
239251
}
240252

241253
// --- Done ---

0 commit comments

Comments
 (0)