Skip to content

Commit 6588879

Browse files
authored
Eng 1595 single prompt based extraction pass prompt encodes dg (#972)
* ENG-1592, ENG-1593, ENG-1594: Wire up sidebar inputs for extraction page PDF upload, research question textarea, and toggleable node types with shared NodeTypeDefinition type. Evidence and Claim selected by default. * ENG-1595: Wire extract button; build system prompt from selected node types * Move extraction question into system prompt * Use shadcn Button/Label and default Tailwind utilities in extraction Sidebar Address PR #958 review: swap native <button>/<label> for @repo/ui Button/Label so the sidebar uses the shared component presentation, and replace arbitrary text-[Npx]/tracking-[Nem]/rounded-[Npx]/border-[Npx] classes with Tailwind defaults (text-lg/base/sm/xs, tracking-tight/wide, rounded-3xl, border-2). Adds a new Label component to @repo/ui with @radix-ui/react-label as a direct dep (previously transitive). * Drop arbitrary sidebar width/shadow in favor of Tailwind defaults Swap lg:w-[390px] + xl:w-[420px] for lg:w-96, and the custom shadow-[0_26px_52px_-38px_rgba(15,23,42,0.6)] for shadow-xl. Defers exact visual tuning to the design phase. * Enlarge research question textarea to avoid scrollbar on placeholder min-h-20 (80px) clipped the 3-line placeholder. Bump to min-h-36 (144px) so typical research questions fit without an inner scrollbar. * Surface extraction failures in sidebar, drop debug logs Extraction errors now show a user-facing message under the Extract button while still logging the raw error via console.error so it remains visible in devtools/analytics. Also adds a response.ok check so API failures actually reach the catch path.
1 parent ad6afb8 commit 6588879

5 files changed

Lines changed: 206 additions & 67 deletions

File tree

apps/website/app/(extract)/extract-nodes/components/Sidebar.tsx

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ type SidebarProps = {
1717
onResearchQuestionChange: (value: string) => void;
1818
selectedTypes: Set<string>;
1919
onToggleType: (candidateTag: string) => void;
20+
onExtract: () => void;
21+
canExtract: boolean;
22+
isExtracting: boolean;
23+
extractionError: string | null;
2024
};
2125

2226
export const Sidebar = ({
@@ -26,6 +30,10 @@ export const Sidebar = ({
2630
onResearchQuestionChange,
2731
selectedTypes,
2832
onToggleType,
33+
onExtract,
34+
canExtract,
35+
isExtracting,
36+
extractionError,
2937
}: SidebarProps): React.ReactElement => {
3038
const fileInputRef = useRef<HTMLInputElement>(null);
3139

@@ -153,11 +161,21 @@ export const Sidebar = ({
153161
</div>
154162

155163
<div className="border-t border-slate-200/90 bg-white/95 p-4 backdrop-blur-xl">
156-
<p className="mb-2 text-sm font-medium text-slate-500">
157-
Ready to run extraction.
158-
</p>
159-
<Button className="w-full rounded-xl bg-slate-900 py-6 text-lg font-semibold text-white hover:bg-slate-800">
160-
Re-Extract
164+
{extractionError ? (
165+
<p role="alert" className="mb-2 text-sm font-medium text-red-600">
166+
{extractionError}
167+
</p>
168+
) : (
169+
<p className="mb-2 text-sm font-medium text-slate-500">
170+
Ready to run extraction.
171+
</p>
172+
)}
173+
<Button
174+
onClick={onExtract}
175+
disabled={!canExtract}
176+
className="w-full rounded-xl bg-slate-900 py-6 text-lg font-semibold text-white hover:bg-slate-800"
177+
>
178+
{isExtracting ? "Extracting…" : "Re-Extract"}
161179
</Button>
162180
</div>
163181
</aside>

apps/website/app/(extract)/extract-nodes/page.tsx

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,22 @@
11
"use client";
22

33
import { useCallback, useState } from "react";
4-
import type { ExtractedNode } from "~/types/extraction";
4+
import { NODE_TYPE_DEFINITIONS, type ExtractedNode } from "~/types/extraction";
5+
import { buildSystemPrompt } from "~/prompts/extraction";
56
import { MainContent } from "./components/MainContent";
67
import { Sidebar } from "./components/Sidebar";
78

9+
/**
 * Reads a file in the browser and resolves with its contents as a bare
 * base64 string.
 *
 * `FileReader.readAsDataURL` produces `data:<mime>;base64,<payload>`; only the
 * payload after the first comma is returned.
 *
 * @param file - The file to read.
 * @returns A promise resolving to the base64 payload (without the `data:` prefix).
 * @throws Rejects with the reader's error when the read fails, or with an
 * `Error` when the reader's result is not a string containing a comma
 * separator (previously such results were resolved as if they were base64).
 */
const readFileAsBase64 = (file: File): Promise<string> =>
  new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => {
      const { result } = reader;
      // readAsDataURL should always yield a string; anything else is a failed read.
      if (typeof result !== "string") {
        reject(new Error("Unable to read the selected file as a base64 data URL."));
        return;
      }
      const separatorIndex = result.indexOf(",");
      // Without a comma there is no payload section to extract.
      if (separatorIndex === -1) {
        reject(new Error("Unable to read the selected file as a base64 data URL."));
        return;
      }
      resolve(result.slice(separatorIndex + 1));
    };
    reader.onerror = () =>
      reject(reader.error ?? new Error("Unable to read the selected file."));
    reader.readAsDataURL(file);
  });
19+
820
const SAMPLE_NODES: ExtractedNode[] = [
921
{
1022
nodeType: "Claim",
@@ -86,6 +98,8 @@ const ExtractNodesPage = (): React.ReactElement => {
8698
const [selectedTypes, setSelectedTypes] = useState(
8799
() => new Set(["#evd-candidate", "#clm-candidate"]),
88100
);
101+
const [isExtracting, setIsExtracting] = useState(false);
102+
const [extractionError, setExtractionError] = useState<string | null>(null);
89103
const [nodes] = useState<ExtractedNode[]>(SAMPLE_NODES);
90104

91105
const toggleType = useCallback((candidateTag: string) => {
@@ -100,6 +114,46 @@ const ExtractNodesPage = (): React.ReactElement => {
100114
});
101115
}, []);
102116

117+
const canExtract = !!pdfFile && selectedTypes.size > 0 && !isExtracting;
118+
119+
// Runs one extraction request for the currently selected PDF.
// Flow: mark as extracting and clear any previous error -> read the PDF as
// base64 -> build the system prompt from the selected node types and the
// research question -> POST the JSON payload to /api/ai/extract.
// Any failure (file read, network, non-2xx status) is logged to the console
// and surfaced through `extractionError`; `isExtracting` is always reset.
// NOTE(review): the response body is not read in this block, so a successful
// call does not change any state here — confirm this is intentional for now.
// NOTE(review): provider and model are hard-coded in the request body below.
const handleExtract = useCallback(async () => {
120+
// Nothing to extract without a selected PDF.
if (!pdfFile) return;
121+
setIsExtracting(true);
122+
setExtractionError(null);
123+
try {
124+
const pdfBase64 = await readFileAsBase64(pdfFile);
125+
// Keep only the definitions whose candidate tag is currently selected.
const nodeTypes = NODE_TYPE_DEFINITIONS.filter((t) =>
126+
selectedTypes.has(t.candidateTag),
127+
);
128+
const systemPrompt = buildSystemPrompt({
129+
nodeTypes,
130+
researchQuestion: researchQuestion || undefined,
131+
});
132+
const requestBody = {
133+
pdfBase64,
134+
provider: "anthropic",
135+
model: "claude-sonnet-4-6",
136+
systemPrompt,
137+
};
138+
const response = await fetch("/api/ai/extract", {
139+
method: "POST",
140+
// eslint-disable-next-line @typescript-eslint/naming-convention
141+
headers: { "Content-Type": "application/json" },
142+
body: JSON.stringify(requestBody),
143+
});
144+
// fetch only rejects on network failure; throw so HTTP errors reach the catch below.
if (!response.ok) {
145+
throw new Error(`Request failed with status ${response.status}`);
146+
}
147+
} catch (error) {
148+
// Raw error goes to the console; the user sees a generic message.
console.error("extraction failed:", error);
149+
setExtractionError(
150+
"We couldn't extract nodes from this PDF. Please try again.",
151+
);
152+
} finally {
153+
setIsExtracting(false);
154+
}
155+
}, [pdfFile, researchQuestion, selectedTypes]);
156+
103157
return (
104158
<div className="flex h-full w-full flex-1 flex-col gap-4 p-4 lg:flex-row lg:gap-5 lg:p-5">
105159
<Sidebar
@@ -109,6 +163,10 @@ const ExtractNodesPage = (): React.ReactElement => {
109163
onResearchQuestionChange={setResearchQuestion}
110164
selectedTypes={selectedTypes}
111165
onToggleType={toggleType}
166+
onExtract={() => void handleExtract()}
167+
canExtract={canExtract}
168+
isExtracting={isExtracting}
169+
extractionError={extractionError}
112170
/>
113171
<MainContent nodes={nodes} />
114172
</div>

apps/website/app/api/ai/extract/route.ts

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,7 @@ import {
1111
openaiConfig,
1212
geminiConfig,
1313
} from "~/utils/llm/providers";
14-
import {
15-
DEFAULT_EXTRACTION_PROMPT,
16-
buildUserPrompt,
17-
} from "~/prompts/extraction";
14+
import { buildUserPrompt } from "~/prompts/extraction";
1815
import { parseExtractionResponse } from "~/utils/ai/parseExtractionResponse";
1916

2017
export const runtime = "nodejs";
@@ -108,8 +105,7 @@ export const POST = async (
108105
);
109106
}
110107

111-
const { pdfBase64, researchQuestion, model, provider, systemPrompt } =
112-
validated.data;
108+
const { pdfBase64, model, provider, systemPrompt } = validated.data;
113109

114110
const config = PROVIDER_CONFIGS[provider];
115111
const apiKey = process.env[config.apiKeyEnvVar];
@@ -124,14 +120,14 @@ export const POST = async (
124120
const messages = buildExtractionMessages({
125121
provider,
126122
pdfBase64,
127-
userPrompt: buildUserPrompt(researchQuestion),
123+
userPrompt: buildUserPrompt(),
128124
});
129125

130126
const settings: Settings = {
131127
model,
132128
maxTokens: 16384,
133129
temperature: 0.6,
134-
systemPrompt: systemPrompt ?? DEFAULT_EXTRACTION_PROMPT,
130+
systemPrompt,
135131
outputSchema: EXTRACTION_RESULT_JSON_SCHEMA,
136132
};
137133

Lines changed: 119 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,127 @@
1-
export const DEFAULT_EXTRACTION_PROMPT = `You are a research analyst extracting discourse graph nodes from academic papers.
1+
import type { NodeTypeDefinition } from "~/types/extraction";
22

3-
Extract discrete, atomic nodes from the paper. Each node is one idea: one claim, one observation, one question.
4-
5-
## Node Types
6-
7-
- **Evidence**: A discrete observation from a published dataset or experiment cited in the paper (prior work). Past tense. Includes observable, model system, method. Quantitative details when available.
8-
- **Claim**: An interpretive assertion by the authors. Debatable — goes beyond data to state what it means. Specific enough to test or argue against.
9-
- **Question**: A research question — explicitly stated or implied by a gap in the literature. Open-ended.
10-
- **Result**: A discrete observation from this paper's own experiments. Same structure as Evidence but from the current work, not prior studies. Past tense.
11-
- **Theory**: A theoretical framework or model used or proposed. Name it, state its core proposition.
12-
- **Source**: A cited publication. Author(s) and year.
13-
14-
## Quality
15-
16-
- Atomic: one idea per node. Split compound sentences.
17-
- Self-contained: understandable without the paper.
18-
- Faithful: no inference or editorializing.
19-
- Specific: "X reduced Y by 43% in Z" not "X was effective."
20-
- 8–25 nodes. Quality over quantity. Cover all sections.
21-
- Evidence = prior work cited. Result = this paper's experiments.
22-
23-
## Example
3+
// Quality rules for extracted nodes; buildSystemPrompt interpolates this text
// verbatim into the <quality> section of the system prompt.
// NOTE(review): the previous prompt described Evidence as including
// "observable, model system, method"; the last line here reads
// "observation, model, system, and method" — confirm the split of
// "model system" into two items is intended.
const QUALITY_CRITERIA = `Atomic: one idea per node. Split compound sentences.
4+
Self-contained: understandable without the paper.
5+
Faithful: no inference or editorializing.
6+
Specific: "X reduced Y by 43% in Z" not "X was effective."
7+
8–25 nodes. Quality over quantity. Cover all sections.
8+
For evidence: include the observation, model, system, and method, in past tense`;
249

10+
const FEW_SHOT_EXAMPLES = `<example>
2511
Excerpt (Results):
2612
"CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro (Fig 3A), longer than controls which declined after week 4 (p<0.001). This correlated with elevated CD62L and CCR7 (Fig 3B), suggesting a memory-like phenotype resisting exhaustion."
27-
2813
{
29-
"nodes": [
30-
{
31-
"nodeType": "Result",
32-
"content": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro, significantly longer than unedited controls which declined after week 4",
33-
"supportSnippet": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro (Fig 3A), longer than controls which declined after week 4 (p<0.001)",
34-
"sourceSection": "Results"
35-
},
36-
{
37-
"nodeType": "Result",
38-
"content": "Sustained cytotoxic activity of CRISPR-edited T cells correlated with elevated CD62L and CCR7 expression",
39-
"supportSnippet": "This correlated with elevated CD62L and CCR7 (Fig 3B)",
40-
"sourceSection": "Results"
41-
},
42-
{
43-
"nodeType": "Claim",
44-
"content": "CRISPR editing may promote a memory-like T cell phenotype that resists exhaustion",
45-
"supportSnippet": "suggesting a memory-like phenotype resisting exhaustion",
46-
"sourceSection": "Results"
47-
}
48-
]
49-
}`;
50-
51-
export const buildUserPrompt = (researchQuestion?: string): string => {
52-
let prompt = "Extract discourse graph nodes from the attached paper.";
14+
"nodes": [
15+
{
16+
"nodeType": "Result",
17+
"content": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro, significantly longer than unedited controls which declined after week 4",
18+
"supportSnippet": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro (Fig 3A), longer than controls which declined after week 4 (p<0.001)",
19+
"sourceSection": "Results"
20+
},
21+
{
22+
"nodeType": "Result",
23+
"content": "Sustained cytotoxic activity of CRISPR-edited T cells correlated with elevated CD62L and CCR7 expression",
24+
"supportSnippet": "This correlated with elevated CD62L and CCR7 (Fig 3B)",
25+
"sourceSection": "Results"
26+
},
27+
{
28+
"nodeType": "Claim",
29+
"content": "CRISPR editing may promote a memory-like T cell phenotype that resists exhaustion",
30+
"supportSnippet": "suggesting a memory-like phenotype resisting exhaustion",
31+
"sourceSection": "Results"
32+
}
33+
]
34+
}
35+
</example>
36+
<example>
37+
Excerpt (Results):
38+
"The AFM analysis showed that the light chain-free lattice reached a height of approximately 6 nm above the carbon film, and when nano newton-range orthogonal forces were applied, it exhibited significant resistance to further compression compared to lattices constructed from native clathrin with light chains, which could be compressed reversibly from 12 nm to 6 nm in height."
39+
{
40+
"nodes": [
41+
{
42+
"nodeType": "Result",
43+
"content": "Applying force to clathrin lattices on carbon-coated films caused a larger change in height when clathrin light chain was also present based on AFM",
44+
"supportSnippet": "it exhibited significant resistance to further compression compared to lattices constructed from native clathrin with light chains, which could be compressed reversibly from 12 nm to 6 nm in height",
45+
"sourceSection": "Results"
46+
},
47+
{
48+
"nodeType": "Claim",
49+
"content": "Clathrin light chain increases the rigidity of clathrin-coated vesicles",
50+
"supportSnippet": "The vertical elasticity of the clathrin lattice is dependent on clathrin light chains, suggesting that light chains are important for both, the conformational stability of the clathrin triskelion and that of the lattice.",
51+
"sourceSection": "Results"
52+
}
53+
]
54+
}
55+
</example>
56+
<example>
57+
Excerpt (Results):
58+
"We found that under low tension (0.015 pN/nm), endocytic pits internalize strongly and few barbed ends encounter the base of the pit, with fewer Arp2/3 complexes recruited to the network and a correspondingly low filament bending energy (Figure 7H). Under >50 x higher membrane tension (1 pN/nm), endocytic internalization slowed but was not abolished. For these pits, more barbed ends encountered the base of the pit, binding more Arp2/3 complexes to nucleate more actin filaments and increasing the total actin filament bending energy near the base of the pit (Figure 7H)."
59+
{
60+
"nodes": [
61+
{
62+
"nodeType": "Result",
63+
"content": "Total bending energy of actin filaments increased as a function of membrane tension in endocytic simulations",
64+
"supportSnippet": "Under >50 x higher membrane tension (1 pN/nm), endocytic internalization slowed but was not abolished. For these pits, more barbed ends encountered the base of the pit, binding more Arp2/3 complexes to nucleate more actin filaments and increasing the total actin filament bending energy near the base of the pit (Figure 7H).",
65+
"sourceSection": "Results"
66+
},
67+
{
68+
"nodeType": "Claim",
69+
"content": "Actin filament bending energy associated with endocytic internalization increases with membrane tension",
70+
"supportSnippet": "Here, the distribution of Hip1R linkers around the pit directs more filaments to grow toward the base of the pit (Figure 4), which nucleates more filaments autocatalytically and increases filament bending (Figure 5)",
71+
"sourceSection": "Results"
72+
}
73+
]
74+
}
75+
</example>
76+
<example>
77+
Excerpt (Results):
78+
"Pangram's text classifier is the only model that achieves production-ready levels of accuracy, false positive rate, and false negative rate. Our model is the most accurate at 99%, compared to commercial competitors which do not even clear 95%. Our false positive rate is better than the second best model, GPTZero, by a factor of 3, which achieving 7 times better negative error rate."
79+
{
80+
"nodes": [
81+
{
82+
"nodeType": "Result",
83+
"content": "Pangram had a lower false positive and false negative rate in detecting AI-generated writing than GPTZero, Originality, or DetectGPT, based on classification of 2000 test documents",
84+
"supportSnippet": " Our model is the most accurate at 99%, compared to commercial competitors which do not even clear 95%. Our false positive rate is better than the second best model, GPTZero, by a factor of 3, which achieving 7 times better negative error rate.",
85+
"sourceSection": "Results"
86+
},
87+
{
88+
"nodeType": "Claim",
89+
"content": "Pangram achieves higher accuracy and fewer false positives than other AI writing detection algorithms",
90+
"supportSnippet": " Pangram Text outperforms zero-shot methods such as DetectGPT as well as leading commercial AI detection tools with over 38 times lower error rates on a comprehensive benchmark comprised of 10 text domains (student writing, creative writing, scientific writing, books, encyclopedias, news, email, scientific papers, short-form Q&A) and 8 open and closed-source large language models",
91+
"sourceSection": "Abstract"
92+
}
93+
]
94+
}
95+
</example>`;
5396

54-
if (researchQuestion) {
55-
prompt += `\n\nFocus extraction around this research question: ${researchQuestion}`;
56-
}
97+
/**
 * Builds the system prompt for node extraction.
 *
 * The prompt consists of a fixed two-line introduction, an optional
 * research-question part, and three tagged sections: `<node-types>` (one
 * `label: definition` line per entry in `nodeTypes`), `<quality>`
 * (QUALITY_CRITERIA) and `<examples>` (FEW_SHOT_EXAMPLES).
 *
 * @param nodeTypes - Node type definitions to list in the `<node-types>` section.
 * @param researchQuestion - Optional question; it is trimmed, and when the
 * trimmed value is empty or the argument is omitted, neither the focus
 * sentence nor the `<research-question>` block is emitted.
 * @returns The assembled system prompt string.
 *
 * NOTE(review): the few-shot examples are a fixed constant whose nodes use the
 * "Result" and "Claim" node types regardless of which `nodeTypes` are passed
 * in — confirm this does not lead the model to emit unselected types.
 * NOTE(review): the research question is inserted verbatim (no escaping of
 * tag-like text) — confirm that is acceptable for this prompt.
 */
export const buildSystemPrompt = ({
98+
nodeTypes,
99+
researchQuestion,
100+
}: {
101+
nodeTypes: NodeTypeDefinition[];
102+
researchQuestion?: string;
103+
}): string => {
104+
// One "label: definition" line per node type.
const nodeTypesBlock = nodeTypes
105+
.map((t) => `${t.label}: ${t.definition}`)
106+
.join("\n");
107+
// A whitespace-only question trims to "" and is treated as absent.
const trimmedResearchQuestion = researchQuestion?.trim();
108+
const researchQuestionBlock = trimmedResearchQuestion
109+
? `\n<research-question>\n${trimmedResearchQuestion}\n</research-question>`
110+
: "";
57111

58-
return prompt;
112+
return `You are a research analyst extracting discourse graph nodes from academic papers.
113+
Extract discrete, atomic nodes from the paper. Each node is one idea: one claim, one observation, one question.
114+
${trimmedResearchQuestion ? `Focus extraction around the research question provided below when it is relevant.\n` : ""}${researchQuestionBlock}
115+
<node-types>
116+
${nodeTypesBlock}
117+
</node-types>
118+
<quality>
119+
${QUALITY_CRITERIA}
120+
</quality>
121+
<examples>
122+
${FEW_SHOT_EXAMPLES}
123+
</examples>`;
59124
};
125+
126+
/**
 * Returns the fixed user-message text for an extraction request.
 *
 * Takes no arguments: the research question is now part of the system prompt
 * built by buildSystemPrompt rather than the user prompt.
 */
export const buildUserPrompt = (): string =>
127+
"Extract discourse graph nodes from the attached paper.";

apps/website/app/types/extraction.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,7 @@ export const ExtractionRequestSchema = z.object({
2626
pdfBase64: z.string().min(1).max(44_000_000),
2727
provider: z.enum(PROVIDER_IDS),
2828
model: z.string().min(1),
29-
researchQuestion: z.string().optional(),
30-
systemPrompt: z.string().optional(),
29+
systemPrompt: z.string().min(1),
3130
});
3231

3332
export type ExtractionRequest = z.infer<typeof ExtractionRequestSchema>;

0 commit comments

Comments
 (0)