Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 13 additions & 14 deletions apps/website/app/(extract)/extract-nodes/components/Sidebar.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { useRef } from "react";
import { Button } from "@repo/ui/components/ui/button";
import { Checkbox } from "@repo/ui/components/ui/checkbox";
import { Textarea } from "@repo/ui/components/ui/textarea";
import { ChevronDown, Upload } from "lucide-react";
import { Upload } from "lucide-react";
import { NODE_TYPE_DEFINITIONS } from "~/types/extraction";

const SECTION_LABEL_CLASS =
Expand All @@ -16,6 +16,9 @@ type SidebarProps = {
onResearchQuestionChange: (value: string) => void;
selectedTypes: Set<string>;
onToggleType: (candidateTag: string) => void;
onExtract: () => void;
canExtract: boolean;
isExtracting: boolean;
};

export const Sidebar = ({
Expand All @@ -25,6 +28,9 @@ export const Sidebar = ({
onResearchQuestionChange,
selectedTypes,
onToggleType,
onExtract,
canExtract,
isExtracting,
}: SidebarProps): React.ReactElement => {
const fileInputRef = useRef<HTMLInputElement>(null);

Expand Down Expand Up @@ -92,17 +98,6 @@ export const Sidebar = ({
)}
</section>

<section className="mb-6">
<h3 className={SECTION_LABEL_CLASS}>Model</h3>
<Button
variant="outline"
className="w-full justify-between rounded-xl border-slate-300 px-3.5 py-3 text-[16px] font-medium text-slate-700"
>
<span>Claude Sonnet 4.6</span>
<ChevronDown className="h-4 w-4 text-slate-500" />
</Button>
</section>

<section className="mb-5">
<h3 className={SECTION_LABEL_CLASS}>Research Question</h3>
<Textarea
Expand Down Expand Up @@ -153,8 +148,12 @@ export const Sidebar = ({
<p className="mb-2 text-[14px] font-medium text-slate-500">
Ready to run extraction.
</p>
<Button className="w-full rounded-xl bg-slate-900 py-6 text-[17px] font-semibold text-white hover:bg-slate-800">
Re-Extract
<Button
onClick={onExtract}
disabled={!canExtract}
className="w-full rounded-xl bg-slate-900 py-6 text-[17px] font-semibold text-white hover:bg-slate-800"
>
{isExtracting ? "Extracting…" : "Re-Extract"}
</Button>
</div>
</aside>
Expand Down
54 changes: 54 additions & 0 deletions apps/website/app/(extract)/extract-nodes/page.tsx
Original file line number Diff line number Diff line change
@@ -1,15 +1,29 @@
"use client";

import { useCallback, useState } from "react";
import { NODE_TYPE_DEFINITIONS } from "~/types/extraction";
import { buildSystemPrompt } from "~/prompts/extraction";
import { MainContent } from "./components/MainContent";
import { Sidebar } from "./components/Sidebar";

/**
 * Reads `file` with a FileReader as a data URL and resolves with everything
 * that follows the first comma of that URL (the payload after the
 * `data:...;base64,` header). Rejects with the reader's error if reading fails.
 */
const readFileAsBase64 = (file: File): Promise<string> =>
  new Promise<string>((resolve, reject) => {
    const fileReader = new FileReader();

    fileReader.onerror = () => {
      reject(fileReader.error);
    };

    fileReader.onload = () => {
      const asDataUrl = String(fileReader.result);
      // indexOf yields -1 when there is no comma, so the slice then starts at 0.
      const payloadStart = asDataUrl.indexOf(",") + 1;
      resolve(asDataUrl.slice(payloadStart));
    };

    fileReader.readAsDataURL(file);
  });

const ExtractNodesPage = (): React.ReactElement => {
const [pdfFile, setPdfFile] = useState<File | null>(null);
const [researchQuestion, setResearchQuestion] = useState("");
const [selectedTypes, setSelectedTypes] = useState(
() => new Set(["#evd-candidate", "#clm-candidate"]),
);
const [isExtracting, setIsExtracting] = useState(false);

const toggleType = useCallback((candidateTag: string) => {
setSelectedTypes((prev) => {
Expand All @@ -23,6 +37,43 @@ const ExtractNodesPage = (): React.ReactElement => {
});
}, []);

const canExtract = !!pdfFile && selectedTypes.size > 0 && !isExtracting;

// Runs one extraction request for the currently selected PDF:
//   1. reads the PDF as base64,
//   2. builds the system prompt from the selected node types and the
//      (optional) research question,
//   3. POSTs the request to /api/ai/extract and logs the parsed JSON reply.
// Does nothing when no PDF is selected. `isExtracting` is set for the duration
// of the request and always cleared in `finally`. Failures are logged, not
// rethrown.
const handleExtract = useCallback(async () => {
  if (!pdfFile) return;
  setIsExtracting(true);
  try {
    const pdfBase64 = await readFileAsBase64(pdfFile);
    const nodeTypes = NODE_TYPE_DEFINITIONS.filter((t) =>
      selectedTypes.has(t.candidateTag),
    );
    const systemPrompt = buildSystemPrompt({
      nodeTypes,
      // An empty string means "no research question".
      researchQuestion: researchQuestion || undefined,
    });
    const requestBody = {
      pdfBase64,
      provider: "anthropic",
      model: "claude-sonnet-4-6",
      systemPrompt,
    };
    console.log("extraction request body:", requestBody);
    console.log("extraction system prompt:\n" + systemPrompt);
    const response = await fetch("/api/ai/extract", {
      method: "POST",
      // eslint-disable-next-line @typescript-eslint/naming-convention
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(requestBody),
    });
    // fetch() only rejects on network failure; an HTTP error status still
    // resolves, so it has to be turned into a failure explicitly.
    if (!response.ok) {
      // Best effort: include whatever body the server sent with the error.
      const detail = await response.text().catch(() => "");
      throw new Error(
        `extraction request failed with status ${response.status}` +
          (detail ? `: ${detail}` : ""),
      );
    }
    const json: unknown = await response.json();
    console.log("extraction result:", json);
  } catch (error) {
    console.error("extraction failed:", error);
  } finally {
    setIsExtracting(false);
  }
}, [pdfFile, researchQuestion, selectedTypes]);

return (
<div className="flex h-full w-full flex-1 flex-col gap-4 p-4 lg:flex-row lg:gap-5 lg:p-5">
<Sidebar
Expand All @@ -32,6 +83,9 @@ const ExtractNodesPage = (): React.ReactElement => {
onResearchQuestionChange={setResearchQuestion}
selectedTypes={selectedTypes}
onToggleType={toggleType}
onExtract={() => void handleExtract()}
canExtract={canExtract}
isExtracting={isExtracting}
/>
<MainContent />
</div>
Expand Down
12 changes: 4 additions & 8 deletions apps/website/app/api/ai/extract/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@ import {
openaiConfig,
geminiConfig,
} from "~/utils/llm/providers";
import {
DEFAULT_EXTRACTION_PROMPT,
buildUserPrompt,
} from "~/prompts/extraction";
import { buildUserPrompt } from "~/prompts/extraction";
import { parseExtractionResponse } from "~/utils/ai/parseExtractionResponse";

export const runtime = "nodejs";
Expand Down Expand Up @@ -108,8 +105,7 @@ export const POST = async (
);
}

const { pdfBase64, researchQuestion, model, provider, systemPrompt } =
validated.data;
const { pdfBase64, model, provider, systemPrompt } = validated.data;

const config = PROVIDER_CONFIGS[provider];
const apiKey = process.env[config.apiKeyEnvVar];
Expand All @@ -124,14 +120,14 @@ export const POST = async (
const messages = buildExtractionMessages({
provider,
pdfBase64,
userPrompt: buildUserPrompt(researchQuestion),
userPrompt: buildUserPrompt(),
});

const settings: Settings = {
model,
maxTokens: 16384,
temperature: 0.6,
systemPrompt: systemPrompt ?? DEFAULT_EXTRACTION_PROMPT,
systemPrompt,
outputSchema: EXTRACTION_RESULT_JSON_SCHEMA,
};

Expand Down
170 changes: 119 additions & 51 deletions apps/website/app/prompts/extraction.ts
Original file line number Diff line number Diff line change
@@ -1,59 +1,127 @@
export const DEFAULT_EXTRACTION_PROMPT = `You are a research analyst extracting discourse graph nodes from academic papers.
import type { NodeTypeDefinition } from "~/types/extraction";

Extract discrete, atomic nodes from the paper. Each node is one idea: one claim, one observation, one question.

## Node Types

- **Evidence**: A discrete observation from a published dataset or experiment cited in the paper (prior work). Past tense. Includes observable, model system, method. Quantitative details when available.
- **Claim**: An interpretive assertion by the authors. Debatable — goes beyond data to state what it means. Specific enough to test or argue against.
- **Question**: A research question — explicitly stated or implied by a gap in the literature. Open-ended.
- **Result**: A discrete observation from this paper's own experiments. Same structure as Evidence but from the current work, not prior studies. Past tense.
- **Theory**: A theoretical framework or model used or proposed. Name it, state its core proposition.
- **Source**: A cited publication. Author(s) and year.

## Quality

- Atomic: one idea per node. Split compound sentences.
- Self-contained: understandable without the paper.
- Faithful: no inference or editorializing.
- Specific: "X reduced Y by 43% in Z" not "X was effective."
- 8–25 nodes. Quality over quantity. Cover all sections.
- Evidence = prior work cited. Result = this paper's experiments.

## Example
// Quality rules for extracted nodes, one rule per line. buildSystemPrompt
// interpolates this text verbatim inside the <quality> section of the system
// prompt, so any edit here changes what is sent to the model.
const QUALITY_CRITERIA = `Atomic: one idea per node. Split compound sentences.
Self-contained: understandable without the paper.
Faithful: no inference or editorializing.
Specific: "X reduced Y by 43% in Z" not "X was effective."
8–25 nodes. Quality over quantity. Cover all sections.
For evidence: include the observation, model, system, and method, in past tense`;

const FEW_SHOT_EXAMPLES = `<example>
Excerpt (Results):
"CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro (Fig 3A), longer than controls which declined after week 4 (p<0.001). This correlated with elevated CD62L and CCR7 (Fig 3B), suggesting a memory-like phenotype resisting exhaustion."

{
"nodes": [
{
"nodeType": "Result",
"content": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro, significantly longer than unedited controls which declined after week 4",
"supportSnippet": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro (Fig 3A), longer than controls which declined after week 4 (p<0.001)",
"sourceSection": "Results"
},
{
"nodeType": "Result",
"content": "Sustained cytotoxic activity of CRISPR-edited T cells correlated with elevated CD62L and CCR7 expression",
"supportSnippet": "This correlated with elevated CD62L and CCR7 (Fig 3B)",
"sourceSection": "Results"
},
{
"nodeType": "Claim",
"content": "CRISPR editing may promote a memory-like T cell phenotype that resists exhaustion",
"supportSnippet": "suggesting a memory-like phenotype resisting exhaustion",
"sourceSection": "Results"
}
]
}`;

export const buildUserPrompt = (researchQuestion?: string): string => {
let prompt = "Extract discourse graph nodes from the attached paper.";
"nodes": [
{
"nodeType": "Result",
"content": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro, significantly longer than unedited controls which declined after week 4",
"supportSnippet": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro (Fig 3A), longer than controls which declined after week 4 (p<0.001)",
"sourceSection": "Results"
},
{
"nodeType": "Result",
"content": "Sustained cytotoxic activity of CRISPR-edited T cells correlated with elevated CD62L and CCR7 expression",
"supportSnippet": "This correlated with elevated CD62L and CCR7 (Fig 3B)",
"sourceSection": "Results"
},
{
"nodeType": "Claim",
"content": "CRISPR editing may promote a memory-like T cell phenotype that resists exhaustion",
"supportSnippet": "suggesting a memory-like phenotype resisting exhaustion",
"sourceSection": "Results"
}
]
}
</example>
<example>
Excerpt (Results):
"The AFM analysis showed that the light chain-free lattice reached a height of approximately 6 nm above the carbon film, and when nano newton-range orthogonal forces were applied, it exhibited significant resistance to further compression compared to lattices constructed from native clathrin with light chains, which could be compressed reversibly from 12 nm to 6 nm in height."
{
"nodes": [
{
"nodeType": "Result",
"content": "Applying force to clathrin lattices on carbon-coated films caused a larger change in height when clathrin light chain was also present based on AFM",
"supportSnippet": "it exhibited significant resistance to further compression compared to lattices constructed from native clathrin with light chains, which could be compressed reversibly from 12 nm to 6 nm in height",
"sourceSection": "Results"
},
{
"nodeType": "Claim",
"content": "Clathrin light chain increases the rigidity of clathrin-coated vesicles",
"supportSnippet": "The vertical elasticity of the clathrin lattice is dependent on clathrin light chains, suggesting that light chains are important for both, the conformational stability of the clathrin triskelion and that of the lattice.",
"sourceSection": "Results"
}
]
}
</example>
<example>
Excerpt (Results):
"We found that under low tension (0.015 pN/nm), endocytic pits internalize strongly and few barbed ends encounter the base of the pit, with fewer Arp2/3 complexes recruited to the network and a correspondingly low filament bending energy (Figure 7H). Under >50 x higher membrane tension (1 pN/nm), endocytic internalization slowed but was not abolished. For these pits, more barbed ends encountered the base of the pit, binding more Arp2/3 complexes to nucleate more actin filaments and increasing the total actin filament bending energy near the base of the pit (Figure 7H)."
{
"nodes": [
{
"nodeType": "Result",
"content": "Total bending energy of actin filaments increased as a function of membrane tension in endocytic simulations",
"supportSnippet": "Under >50 x higher membrane tension (1 pN/nm), endocytic internalization slowed but was not abolished. For these pits, more barbed ends encountered the base of the pit, binding more Arp2/3 complexes to nucleate more actin filaments and increasing the total actin filament bending energy near the base of the pit (Figure 7H).",
"sourceSection": "Results"
},
{
"nodeType": "Claim",
"content": "Actin filament bending energy associated with endocytic internalization increases with membrane tension",
"supportSnippet": "Here, the distribution of Hip1R linkers around the pit directs more filaments to grow toward the base of the pit (Figure 4), which nucleates more filaments autocatalytically and increases filament bending (Figure 5)",
"sourceSection": "Results"
}
]
}
</example>
<example>
Excerpt (Results):
"Pangram's text classifier is the only model that achieves production-ready levels of accuracy, false positive rate, and false negative rate. Our model is the most accurate at 99%, compared to commercial competitors which do not even clear 95%. Our false positive rate is better than the second best model, GPTZero, by a factor of 3, which achieving 7 times better negative error rate."
{
"nodes": [
{
"nodeType": "Result",
"content": "Pangram had a lower false positive and false negative rate in detecting AI-generated writing than GPTZero, Originality, or DetectGPT, based on classification of 2000 test documents",
"supportSnippet": " Our model is the most accurate at 99%, compared to commercial competitors which do not even clear 95%. Our false positive rate is better than the second best model, GPTZero, by a factor of 3, which achieving 7 times better negative error rate.",
"sourceSection": "Results"
},
{
"nodeType": "Claim",
"content": "Pangram achieves higher accuracy and fewer false positives than other AI writing detection algorithms",
"supportSnippet": " Pangram Text outperforms zero-shot methods such as DetectGPT as well as leading commercial AI detection tools with over 38 times lower error rates on a comprehensive benchmark comprised of 10 text domains (student writing, creative writing, scientific writing, books, encyclopedias, news, email, scientific papers, short-form Q&A) and 8 open and closed-source large language models",
"sourceSection": "Abstract"
}
]
}
</example>`;
Comment thread
sid597 marked this conversation as resolved.
Comment thread
sid597 marked this conversation as resolved.

if (researchQuestion) {
prompt += `\n\nFocus extraction around this research question: ${researchQuestion}`;
}
/**
 * Builds the system prompt for an extraction request.
 *
 * The prompt is made of a fixed preamble, an optional research-question part,
 * and three tagged sections: `<node-types>` (one `label: definition` line per
 * entry of `nodeTypes`), `<quality>` (QUALITY_CRITERIA) and `<examples>`
 * (FEW_SHOT_EXAMPLES).
 *
 * @param nodeTypes - Node type definitions to list in the `<node-types>` section.
 * @param researchQuestion - Optional question; it is trimmed, and when the
 *   trimmed text is empty the research-question part is left out.
 * @returns The assembled system prompt.
 */
export const buildSystemPrompt = ({
  nodeTypes,
  researchQuestion,
}: {
  nodeTypes: NodeTypeDefinition[];
  researchQuestion?: string;
}): string => {
  const nodeTypesBlock = nodeTypes
    .map((t) => `${t.label}: ${t.definition}`)
    .join("\n");
  const trimmedResearchQuestion = researchQuestion?.trim();
  // The focus instruction and the tagged question always appear together, so
  // both come from a single conditional.
  const researchQuestionBlock = trimmedResearchQuestion
    ? `Focus extraction around the research question provided below when it is relevant.\n\n<research-question>\n${trimmedResearchQuestion}\n</research-question>`
    : "";

  return `You are a research analyst extracting discourse graph nodes from academic papers.
Extract discrete, atomic nodes from the paper. Each node is one idea: one claim, one observation, one question.
${researchQuestionBlock}
<node-types>
${nodeTypesBlock}
</node-types>
<quality>
${QUALITY_CRITERIA}
</quality>
<examples>
${FEW_SHOT_EXAMPLES}
</examples>`;
};

/** Returns the fixed user-turn instruction for extraction; takes no arguments. */
export const buildUserPrompt = (): string => {
  const instruction = "Extract discourse graph nodes from the attached paper.";
  return instruction;
};
3 changes: 1 addition & 2 deletions apps/website/app/types/extraction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@ export const ExtractionRequestSchema = z.object({
pdfBase64: z.string().min(1).max(44_000_000),
provider: z.enum(PROVIDER_IDS),
model: z.string().min(1),
researchQuestion: z.string().optional(),
systemPrompt: z.string().optional(),
systemPrompt: z.string().min(1),
});

export type ExtractionRequest = z.infer<typeof ExtractionRequestSchema>;
Expand Down
Loading