ENG-1595: Single-prompt-based extraction pass; prompt encodes the discourse graph (#972)

sid597 · web-flow · commit 6588879c0759 · 2026-04-21T21:58:28.000+05:30
* ENG-1592, ENG-1593, ENG-1594: Wire up sidebar inputs for the extraction page: PDF upload, research question textarea, and toggleable node types with a shared NodeTypeDefinition type. Evidence and Claim are selected by default.
* ENG-1595: Wire the Extract button; build the system prompt from the selected node types.
* Move the extraction question into the system prompt.
* Use shadcn Button/Label and default Tailwind utilities in the extraction Sidebar. Address PR #958 review: swap native <button>/<label> for @repo/ui Button/Label so the sidebar uses the shared component presentation, and replace arbitrary text-[Npx]/tracking-[Nem]/rounded-[Npx]/border-[Npx] classes with Tailwind defaults (text-lg/base/sm/xs, tracking-tight/wide, rounded-3xl, border-2). Adds a new Label component to @repo/ui with @radix-ui/react-label as a direct dep (previously transitive).
* Drop the arbitrary sidebar width/shadow in favor of Tailwind defaults. Swap lg:w-[390px] + xl:w-[420px] for lg:w-96, and the custom shadow-[0_26px_52px_-38px_rgba(15,23,42,0.6)] for shadow-xl. Defers exact visual tuning to the design phase.
* Enlarge the research question textarea to avoid a scrollbar on the placeholder. min-h-20 (80px) clipped the 3-line placeholder. Bump to min-h-36 (144px) so typical research questions fit without an inner scrollbar.
* Surface extraction failures in the sidebar; drop debug logs. Extraction errors now show a user-facing message under the Extract button while still logging the raw error via console.error so it remains visible in devtools/analytics. Also adds a response.ok check so API failures actually reach the catch path.
diff --git a/apps/website/app/(extract)/extract-nodes/components/Sidebar.tsx b/apps/website/app/(extract)/extract-nodes/components/Sidebar.tsx
@@ -17,6 +17,10 @@ type SidebarProps = {
   onResearchQuestionChange: (value: string) => void;
   selectedTypes: Set<string>;
   onToggleType: (candidateTag: string) => void;
+  onExtract: () => void;
+  canExtract: boolean;
+  isExtracting: boolean;
+  extractionError: string | null;
 };
 
 export const Sidebar = ({
@@ -26,6 +30,10 @@ export const Sidebar = ({
   onResearchQuestionChange,
   selectedTypes,
   onToggleType,
+  onExtract,
+  canExtract,
+  isExtracting,
+  extractionError,
 }: SidebarProps): React.ReactElement => {
   const fileInputRef = useRef<HTMLInputElement>(null);
 
@@ -153,11 +161,21 @@ export const Sidebar = ({
       </div>
 
       <div className="border-t border-slate-200/90 bg-white/95 p-4 backdrop-blur-xl">
-        <p className="mb-2 text-sm font-medium text-slate-500">
-          Ready to run extraction.
-        </p>
-        <Button className="w-full rounded-xl bg-slate-900 py-6 text-lg font-semibold text-white hover:bg-slate-800">
-          Re-Extract
+        {extractionError ? (
+          <p role="alert" className="mb-2 text-sm font-medium text-red-600">
+            {extractionError}
+          </p>
+        ) : (
+          <p className="mb-2 text-sm font-medium text-slate-500">
+            Ready to run extraction.
+          </p>
+        )}
+        <Button
+          onClick={onExtract}
+          disabled={!canExtract}
+          className="w-full rounded-xl bg-slate-900 py-6 text-lg font-semibold text-white hover:bg-slate-800"
+        >
+          {isExtracting ? "Extracting…" : "Re-Extract"}
         </Button>
       </div>
     </aside>
diff --git a/apps/website/app/(extract)/extract-nodes/page.tsx b/apps/website/app/(extract)/extract-nodes/page.tsx
@@ -1,10 +1,22 @@
 "use client";
 
 import { useCallback, useState } from "react";
-import type { ExtractedNode } from "~/types/extraction";
+import { NODE_TYPE_DEFINITIONS, type ExtractedNode } from "~/types/extraction";
+import { buildSystemPrompt } from "~/prompts/extraction";
 import { MainContent } from "./components/MainContent";
 import { Sidebar } from "./components/Sidebar";
 
+const readFileAsBase64 = (file: File): Promise<string> => // Reads `file` with a FileReader and resolves with the base64 part of its data URL.
+  new Promise((resolve, reject) => {
+    const reader = new FileReader();
+    reader.onload = () => {
+      const dataUrl = String(reader.result); // result of readAsDataURL, coerced to a string
+      resolve(dataUrl.slice(dataUrl.indexOf(",") + 1)); // keep only what follows the first comma (drops the data-URL header)
+    };
+    reader.onerror = () => reject(reader.error); // a failed read rejects with the reader's own error
+    reader.readAsDataURL(file);
+  });
+
 const SAMPLE_NODES: ExtractedNode[] = [
   {
     nodeType: "Claim",
@@ -86,6 +98,8 @@ const ExtractNodesPage = (): React.ReactElement => {
   const [selectedTypes, setSelectedTypes] = useState(
     () => new Set(["#evd-candidate", "#clm-candidate"]),
   );
+  const [isExtracting, setIsExtracting] = useState(false);
+  const [extractionError, setExtractionError] = useState<string | null>(null);
   const [nodes] = useState<ExtractedNode[]>(SAMPLE_NODES);
 
   const toggleType = useCallback((candidateTag: string) => {
@@ -100,6 +114,46 @@ const ExtractNodesPage = (): React.ReactElement => {
     });
   }, []);
 
+  const canExtract = !!pdfFile && selectedTypes.size > 0 && !isExtracting;
+
+  const handleExtract = useCallback(async () => { // Posts the selected PDF plus a generated system prompt to /api/ai/extract and tracks progress/error state.
+    if (!pdfFile) return; // nothing to extract without an uploaded PDF
+    setIsExtracting(true);
+    setExtractionError(null); // clear any error left over from a previous attempt
+    try {
+      const pdfBase64 = await readFileAsBase64(pdfFile);
+      const nodeTypes = NODE_TYPE_DEFINITIONS.filter((t) => // keep only definitions whose candidateTag is currently selected
+        selectedTypes.has(t.candidateTag),
+      );
+      const systemPrompt = buildSystemPrompt({
+        nodeTypes,
+        researchQuestion: researchQuestion || undefined, // an empty question is passed as undefined
+      });
+      const requestBody = {
+        pdfBase64,
+        provider: "anthropic",
+        model: "claude-sonnet-4-6",
+        systemPrompt,
+      };
+      const response = await fetch("/api/ai/extract", {
+        method: "POST",
+        // eslint-disable-next-line @typescript-eslint/naming-convention
+        headers: { "Content-Type": "application/json" },
+        body: JSON.stringify(requestBody),
+      });
+      if (!response.ok) { // fetch resolves even on HTTP error statuses, so throw to reach the catch below
+        throw new Error(`Request failed with status ${response.status}`);
+      }
+    } catch (error) { // log the raw error for debugging; show the user a generic message
+      console.error("extraction failed:", error);
+      setExtractionError(
+        "We couldn't extract nodes from this PDF. Please try again.",
+      );
+    } finally {
+      setIsExtracting(false); // runs on success and on failure
+    }
+  }, [pdfFile, researchQuestion, selectedTypes]); // NOTE(review): on success the response body is never read, so extracted nodes are not used here — confirm this is intended for now
+
   return (
     <div className="flex h-full w-full flex-1 flex-col gap-4 p-4 lg:flex-row lg:gap-5 lg:p-5">
       <Sidebar
@@ -109,6 +163,10 @@ const ExtractNodesPage = (): React.ReactElement => {
         onResearchQuestionChange={setResearchQuestion}
         selectedTypes={selectedTypes}
         onToggleType={toggleType}
+        onExtract={() => void handleExtract()}
+        canExtract={canExtract}
+        isExtracting={isExtracting}
+        extractionError={extractionError}
       />
       <MainContent nodes={nodes} />
     </div>
diff --git a/apps/website/app/api/ai/extract/route.ts b/apps/website/app/api/ai/extract/route.ts
@@ -11,10 +11,7 @@ import {
   openaiConfig,
   geminiConfig,
 } from "~/utils/llm/providers";
-import {
-  DEFAULT_EXTRACTION_PROMPT,
-  buildUserPrompt,
-} from "~/prompts/extraction";
+import { buildUserPrompt } from "~/prompts/extraction";
 import { parseExtractionResponse } from "~/utils/ai/parseExtractionResponse";
 
 export const runtime = "nodejs";
@@ -108,8 +105,7 @@ export const POST = async (
     );
   }
 
-  const { pdfBase64, researchQuestion, model, provider, systemPrompt } =
-    validated.data;
+  const { pdfBase64, model, provider, systemPrompt } = validated.data;
 
   const config = PROVIDER_CONFIGS[provider];
   const apiKey = process.env[config.apiKeyEnvVar];
@@ -124,14 +120,14 @@ export const POST = async (
   const messages = buildExtractionMessages({
     provider,
     pdfBase64,
-    userPrompt: buildUserPrompt(researchQuestion),
+    userPrompt: buildUserPrompt(),
   });
 
   const settings: Settings = {
     model,
     maxTokens: 16384,
     temperature: 0.6,
-    systemPrompt: systemPrompt ?? DEFAULT_EXTRACTION_PROMPT,
+    systemPrompt,
     outputSchema: EXTRACTION_RESULT_JSON_SCHEMA,
   };
 
diff --git a/apps/website/app/prompts/extraction.ts b/apps/website/app/prompts/extraction.ts
@@ -1,59 +1,127 @@
-export const DEFAULT_EXTRACTION_PROMPT = `You are a research analyst extracting discourse graph nodes from academic papers.
+import type { NodeTypeDefinition } from "~/types/extraction";
 
-Extract discrete, atomic nodes from the paper. Each node is one idea: one claim, one observation, one question.
-
-## Node Types
-
-- **Evidence**: A discrete observation from a published dataset or experiment cited in the paper (prior work). Past tense. Includes observable, model system, method. Quantitative details when available.
-- **Claim**: An interpretive assertion by the authors. Debatable — goes beyond data to state what it means. Specific enough to test or argue against.
-- **Question**: A research question — explicitly stated or implied by a gap in the literature. Open-ended.
-- **Result**: A discrete observation from this paper's own experiments. Same structure as Evidence but from the current work, not prior studies. Past tense.
-- **Theory**: A theoretical framework or model used or proposed. Name it, state its core proposition.
-- **Source**: A cited publication. Author(s) and year.
-
-## Quality
-
-- Atomic: one idea per node. Split compound sentences.
-- Self-contained: understandable without the paper.
-- Faithful: no inference or editorializing.
-- Specific: "X reduced Y by 43% in Z" not "X was effective."
-- 8–25 nodes. Quality over quantity. Cover all sections.
-- Evidence = prior work cited. Result = this paper's experiments.
-
-## Example
+const QUALITY_CRITERIA = `Atomic: one idea per node. Split compound sentences.
+Self-contained: understandable without the paper.
+Faithful: no inference or editorializing.
+Specific: "X reduced Y by 43% in Z" not "X was effective."
+8–25 nodes. Quality over quantity. Cover all sections.
+For evidence: include the observation, model, system, and method, in past tense`; // Quality rules, one per line, interpolated into the <quality> section of the system prompt. NOTE(review): the removed prompt said "model system" (one term) — confirm "model, system" is intended.
 
+const FEW_SHOT_EXAMPLES = `<example>
 Excerpt (Results):
 "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro (Fig 3A), longer than controls which declined after week 4 (p<0.001). This correlated with elevated CD62L and CCR7 (Fig 3B), suggesting a memory-like phenotype resisting exhaustion."
-
 {
-  "nodes": [
-    {
-      "nodeType": "Result",
-      "content": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro, significantly longer than unedited controls which declined after week 4",
-      "supportSnippet": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro (Fig 3A), longer than controls which declined after week 4 (p<0.001)",
-      "sourceSection": "Results"
-    },
-    {
-      "nodeType": "Result",
-      "content": "Sustained cytotoxic activity of CRISPR-edited T cells correlated with elevated CD62L and CCR7 expression",
-      "supportSnippet": "This correlated with elevated CD62L and CCR7 (Fig 3B)",
-      "sourceSection": "Results"
-    },
-    {
-      "nodeType": "Claim",
-      "content": "CRISPR editing may promote a memory-like T cell phenotype that resists exhaustion",
-      "supportSnippet": "suggesting a memory-like phenotype resisting exhaustion",
-      "sourceSection": "Results"
-    }
-  ]
-}`;
-
-export const buildUserPrompt = (researchQuestion?: string): string => {
-  let prompt = "Extract discourse graph nodes from the attached paper.";
+"nodes": [
+{
+  "nodeType": "Result",
+  "content": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro, significantly longer than unedited controls which declined after week 4",
+  "supportSnippet": "CRISPR-edited T cells maintained cytotoxic activity for 12 weeks in vitro (Fig 3A), longer than controls which declined after week 4 (p<0.001)",
+  "sourceSection": "Results"
+},
+{
+  "nodeType": "Result",
+  "content": "Sustained cytotoxic activity of CRISPR-edited T cells correlated with elevated CD62L and CCR7 expression",
+  "supportSnippet": "This correlated with elevated CD62L and CCR7 (Fig 3B)",
+  "sourceSection": "Results"
+},
+{
+  "nodeType": "Claim",
+  "content": "CRISPR editing may promote a memory-like T cell phenotype that resists exhaustion",
+  "supportSnippet": "suggesting a memory-like phenotype resisting exhaustion",
+  "sourceSection": "Results"
+}
+]
+}
+</example>
+<example>
+Excerpt (Results):
+"The AFM analysis showed that the light chain-free lattice reached a height of approximately 6 nm above the carbon film, and when nano newton-range orthogonal forces were applied, it exhibited significant resistance to further compression compared to lattices constructed from native clathrin with light chains, which could be compressed reversibly from 12 nm to 6 nm in height."
+{
+"nodes": [
+{
+  "nodeType": "Result",
+  "content": "Applying force to clathrin lattices on carbon-coated films caused a larger change in height when clathrin light chain was also present based on AFM",
+  "supportSnippet": "it exhibited significant resistance to further compression compared to lattices constructed from native clathrin with light chains, which could be compressed reversibly from 12 nm to 6 nm in height",
+  "sourceSection": "Results"
+},
+{
+  "nodeType": "Claim",
+  "content": "Clathrin light chain increases the rigidity of clathrin-coated vesicles",
+  "supportSnippet": "The vertical elasticity of the clathrin lattice is dependent on clathrin light chains, suggesting that light chains are important for both, the conformational stability of the clathrin triskelion and that of the lattice.",
+  "sourceSection": "Results"
+}
+]
+}
+</example>
+<example>
+Excerpt (Results):
+"We found that under low tension (0.015 pN/nm), endocytic pits internalize strongly and few barbed ends encounter the base of the pit, with fewer Arp2/3 complexes recruited to the network and a correspondingly low filament bending energy (Figure 7H). Under >50 x higher membrane tension (1 pN/nm), endocytic internalization slowed but was not abolished. For these pits, more barbed ends encountered the base of the pit, binding more Arp2/3 complexes to nucleate more actin filaments and increasing the total actin filament bending energy near the base of the pit (Figure 7H)."
+{
+"nodes": [
+{
+  "nodeType": "Result",
+  "content": "Total bending energy of actin filaments increased as a function of membrane tension in endocytic simulations",
+  "supportSnippet": "Under >50 x higher membrane tension (1 pN/nm), endocytic internalization slowed but was not abolished. For these pits, more barbed ends encountered the base of the pit, binding more Arp2/3 complexes to nucleate more actin filaments and increasing the total actin filament bending energy near the base of the pit (Figure 7H).",
+  "sourceSection": "Results"
+},
+{
+  "nodeType": "Claim",
+  "content": "Actin filament bending energy associated with endocytic internalization increases with membrane tension",
+  "supportSnippet": "Here, the distribution of Hip1R linkers around the pit directs more filaments to grow toward the base of the pit (Figure 4), which nucleates more filaments autocatalytically and increases filament bending (Figure 5)",
+  "sourceSection": "Results"
+}
+]
+}
+</example>
+<example>
+Excerpt (Results):
+"Pangram's text classifier is the only model that achieves production-ready levels of accuracy, false positive rate, and false negative rate. Our model is the most accurate at 99%, compared to commercial competitors which do not even clear 95%. Our false positive rate is better than the second best model, GPTZero, by a factor of 3, which achieving 7 times better negative error rate."
+{
+"nodes": [
+{
+  "nodeType": "Result",
+  "content": "Pangram had a lower false positive and false negative rate in detecting AI-generated writing than GPTZero, Originality, or DetectGPT, based on classification of 2000 test documents",
+  "supportSnippet": " Our model is the most accurate at 99%, compared to commercial competitors which do not even clear 95%. Our false positive rate is better than the second best model, GPTZero, by a factor of 3, which achieving 7 times better negative error rate.",
+  "sourceSection": "Results"
+},
+{
+  "nodeType": "Claim",
+  "content": "Pangram achieves higher accuracy and fewer false positives than other AI writing detection algorithms",
+  "supportSnippet": " Pangram Text outperforms zero-shot methods such as DetectGPT as well as leading commercial AI detection tools with over 38 times lower error rates on a comprehensive benchmark comprised of 10 text domains (student writing, creative writing, scientific writing, books, encyclopedias, news, email, scientific papers, short-form Q&A) and 8 open and closed-source large language models",
+  "sourceSection": "Abstract"
+}
+]
+}
+</example>`;
 
-  if (researchQuestion) {
-    prompt += `\n\nFocus extraction around this research question: ${researchQuestion}`;
-  }
+export const buildSystemPrompt = ({ // Builds the extraction system prompt from the given node types and an optional research question.
+  nodeTypes,
+  researchQuestion,
+}: {
+  nodeTypes: NodeTypeDefinition[];
+  researchQuestion?: string;
+}): string => {
+  const nodeTypesBlock = nodeTypes
+    .map((t) => `${t.label}: ${t.definition}`) // one "label: definition" line per node type
+    .join("\n");
+  const trimmedResearchQuestion = researchQuestion?.trim(); // a whitespace-only question becomes "" and is treated as absent
+  const researchQuestionBlock = trimmedResearchQuestion
+    ? `\n<research-question>\n${trimmedResearchQuestion}\n</research-question>` // question wrapped in its own tagged section
+    : ""; // no question: the section and the "Focus extraction" sentence are both omitted
 
-  return prompt;
+  return `You are a research analyst extracting discourse graph nodes from academic papers.
+Extract discrete, atomic nodes from the paper. Each node is one idea: one claim, one observation, one question.
+${trimmedResearchQuestion ? `Focus extraction around the research question provided below when it is relevant.\n` : ""}${researchQuestionBlock}
+<node-types>
+${nodeTypesBlock}
+</node-types>
+<quality>
+${QUALITY_CRITERIA}
+</quality>
+<examples>
+${FEW_SHOT_EXAMPLES}
+</examples>`;
 };
+
+export const buildUserPrompt = (): string => // Returns the constant user-prompt text; takes no arguments.
+  "Extract discourse graph nodes from the attached paper.";
diff --git a/apps/website/app/types/extraction.ts b/apps/website/app/types/extraction.ts
@@ -26,8 +26,7 @@ export const ExtractionRequestSchema = z.object({
   pdfBase64: z.string().min(1).max(44_000_000),
   provider: z.enum(PROVIDER_IDS),
   model: z.string().min(1),
-  researchQuestion: z.string().optional(),
-  systemPrompt: z.string().optional(),
+  systemPrompt: z.string().min(1),
 });
 
 export type ExtractionRequest = z.infer<typeof ExtractionRequestSchema>;