diff --git a/chemical-identity-stereochemistry-guard/README.md b/chemical-identity-stereochemistry-guard/README.md new file mode 100644 index 00000000..f1c8b009 --- /dev/null +++ b/chemical-identity-stereochemistry-guard/README.md @@ -0,0 +1,44 @@ +# Chemical Identity Stereochemistry Guard + +This self-contained module adds a deterministic chemical identity and stereochemistry gate for SCIBASE knowledge graph workflows. It is scoped to issue #17 and focuses on whether compound nodes and compound graph edges are safe to merge or publish in recommendations. + +The guard does not call external APIs, payment systems, identity providers, live projects, or private data stores. Fixtures are synthetic and every check runs with Node built-ins. + +## What It Checks + +- InChIKey format readiness. +- Isomeric SMILES presence. +- Missing stereochemistry for compounds that require stereochemical identity. +- Salt, hydrate, or other form conflation with parent freebase nodes. +- Isotope-label metadata for labeled tracers. +- DOI-backed chemical identity evidence. +- Synonym collisions across unrelated chemical skeletons. +- `same_as` edges that merge different skeletons, forms, or stereochemical records. +- Assay-context completeness before graph recommendations. +- DOI-backed relationship evidence for compound graph edges. 
+ +## Local Validation + +```sh +npm --prefix chemical-identity-stereochemistry-guard run check +npm --prefix chemical-identity-stereochemistry-guard test +npm --prefix chemical-identity-stereochemistry-guard run demo +npm --prefix chemical-identity-stereochemistry-guard run make-demo-video +npm --prefix chemical-identity-stereochemistry-guard run verify-video +``` + +## Generated Artifacts + +Running the demo writes: + +- `reports/clean-chemical-identity-report.json` +- `reports/risky-chemical-identity-report.json` +- `reports/risky-chemical-identity-handoff.md` +- `reports/chemical-identity-dashboard.svg` +- `reports/demo.mp4` + +The risky packet intentionally demonstrates release blockers: missing stereochemistry, invalid InChIKeys, missing isotope labels, synonym collisions, `same_as` skeleton mismatch, salt-form conflation, incomplete assay context, missing edge evidence, and missing graph nodes. + +## Issue Fit + +This is a distinct Scientific Knowledge Graph Integration slice. It complements the existing broad extraction/navigation, ontology drift, aliasing, biological accession crosswalk, measurement harmonization, geospatial provenance, sample custody/cold-chain, protocol deviation/reagent lot, software dependency, image metadata, funding provenance, temporal consistency, and recommendation visibility/diversity work by focusing specifically on chemical identity and stereochemistry before graph merge or recommendation publication. 
/**
 * Write a value as pretty-printed JSON (two-space indent, trailing newline)
 * into the reports directory.
 *
 * @param {string} name - File name, resolved relative to the reports directory.
 * @param {*} value - JSON-serializable value to persist.
 * @returns {void}
 */
function writeJson(name, value) {
  fs.writeFileSync(path.join(reportsDir, name), `${JSON.stringify(value, null, 2)}\n`);
}

/**
 * Escape the characters that are significant in XML text and double-quoted
 * attribute values so arbitrary strings can be embedded in the SVG dashboard.
 *
 * All four characters are handled in a single pass, so an ampersand produced
 * by one substitution can never be escaped a second time.
 *
 * @param {*} value - Value to escape; it is coerced with String().
 * @returns {string} The escaped text.
 */
function escapeXml(value) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;"
  };
  return String(value).replace(/[&<>"]/g, (character) => entities[character]);
}

/**
 * Render the leading findings of a report as markdown table rows
 * (`| severity | code | action |`), one row per finding, joined by newlines.
 *
 * @param {{findings: Array<{severity: string, code: string, action: string}>}} report
 *   Report whose findings are rendered in their existing order.
 * @param {number} [limit=12] - Maximum number of rows to render.
 * @returns {string} The table body without a header row.
 */
function findingTable(report, limit = 12) {
  return report.findings
    .slice(0, limit)
    .map((finding) => `| ${finding.severity} | ${finding.code} | ${finding.action} |`)
    .join("\n");
}

/**
 * Write the curator handoff document (markdown) for a report: the summary
 * counters, the priority findings table, and the per-node and per-edge actions.
 *
 * @param {object} report - Result object with `summary`, `findings`,
 *   `compounds` and `edges`, as read below.
 * @param {string} [fileName="risky-chemical-identity-handoff.md"] - Output file
 *   name inside the reports directory.
 * @returns {void}
 */
function writeHandoff(report, fileName = "risky-chemical-identity-handoff.md") {
  const { summary } = report;
  // An empty action list is rendered as "none" so the table cell is never blank.
  const actionCell = (actions) => actions.join(", ") || "none";

  const lines = [
    "# Chemical Identity Graph Handoff",
    "",
    `Decision: ${summary.decision}`,
    `Compounds reviewed: ${summary.compoundsReviewed}`,
    `Edges reviewed: ${summary.edgesReviewed}`,
    `Held nodes: ${summary.heldNodes}`,
    `Held edges: ${summary.heldEdges}`,
    `Audit digest: ${summary.auditDigest}`,
    "",
    "## Priority Findings",
    "",
    "| Severity | Code | Remediation |",
    "| --- | --- | --- |",
    findingTable(report),
    "",
    "## Node Actions",
    "",
    "| Compound | Status | Actions |",
    "| --- | --- | --- |",
    ...report.compounds.map((node) => `| ${node.id} | ${node.status} | ${actionCell(node.actions)} |`),
    "",
    "## Edge Actions",
    "",
    "| Edge | Status | Actions |",
    "| --- | --- | --- |",
    ...report.edges.map((edge) => `| ${edge.id} | ${edge.status} | ${actionCell(edge.actions)} |`),
    ""
  ];
  fs.writeFileSync(path.join(reportsDir, fileName), `${lines.join("\n")}\n`);
}
// Severity names mapped to a sortable rank. A Map is used so that inherited
// object keys (for example "constructor") can never resolve to a rank.
const SEVERITY_RANKS = new Map([
  ["critical", 4],
  ["high", 3],
  ["medium", 2],
  ["low", 1]
]);

// Shape check only: 14 uppercase letters, 10 uppercase letters, 1 uppercase letter.
const INCHIKEY_PATTERN = /^[A-Z]{14}-[A-Z]{10}-[A-Z]$/;

// Shape check only: "10.", a 4-9 digit registrant code, then a slash.
const DOI_PATTERN = /^10\.\d{4,9}\//;

/**
 * Return `value` when it is an array, otherwise an empty array.
 *
 * @param {*} value
 * @returns {Array}
 */
function asArray(value) {
  return Array.isArray(value) ? value : [];
}

/**
 * Serialize a value to a JSON-like string whose object keys are sorted, so
 * two objects with the same entries produce the same text regardless of key
 * order. Array order is preserved. Values that JSON.stringify cannot
 * represent (such as `undefined`) are not normalized here.
 *
 * @param {*} value
 * @returns {string}
 */
function stableJson(value) {
  if (Array.isArray(value)) {
    return `[${value.map(stableJson).join(",")}]`;
  }
  if (value && typeof value === "object") {
    const entries = Object.keys(value)
      .sort()
      .map((key) => `${JSON.stringify(key)}:${stableJson(value[key])}`);
    return `{${entries.join(",")}}`;
  }
  return JSON.stringify(value);
}

/**
 * Hex SHA-256 digest of the key-sorted serialization of `value`.
 *
 * @param {*} value
 * @returns {string} 64 lowercase hexadecimal characters.
 */
function sha256(value) {
  return crypto.createHash("sha256").update(stableJson(value)).digest("hex");
}

/**
 * Numeric rank of a severity name: critical 4, high 3, medium 2, low 1.
 *
 * @param {string} severity
 * @returns {number} The rank, or 0 for any unrecognized value.
 */
function severityRank(severity) {
  return SEVERITY_RANKS.get(severity) || 0;
}

/**
 * Append one finding record to `findings` (mutates the array).
 *
 * @param {Array<object>} findings - Accumulator shared by all checks.
 * @param {string} severity
 * @param {string} code
 * @param {string} message
 * @param {Array<string>} refs - Identifiers the finding refers to; a non-array is stored as [].
 * @param {string} action - Remediation action identifier.
 * @returns {void}
 */
function addFinding(findings, severity, code, message, refs, action) {
  findings.push({
    severity,
    code,
    message,
    refs: asArray(refs),
    action
  });
}

/**
 * Trim and lowercase a value for comparisons; falsy input becomes "".
 *
 * @param {*} value
 * @returns {string}
 */
function normalize(value) {
  return String(value || "").trim().toLowerCase();
}

/**
 * First hyphen-separated segment of an InChIKey, uppercased and trimmed.
 * A key without a hyphen is returned whole; a missing key yields "".
 *
 * @param {*} inchiKey
 * @returns {string}
 */
function inchiSkeleton(inchiKey) {
  const value = String(inchiKey || "").trim().toUpperCase();
  return value.includes("-") ? value.split("-")[0] : value;
}

/**
 * Whether a SMILES string contains at least one of the characters "@", "/"
 * or "\".
 *
 * @param {*} smiles
 * @returns {boolean}
 */
function hasStereoToken(smiles) {
  return /[@/\\]/.test(String(smiles || ""));
}

/**
 * Comparison key for a synonym: normalized, then stripped of everything
 * except a-z and 0-9, so "alpha compound" and "Alpha-Compound" collide.
 *
 * @param {*} value
 * @returns {string}
 */
function synonymKey(value) {
  return normalize(value).replace(/[^a-z0-9]/g, "");
}

/**
 * Whether an evidence record carries a DOI of the expected shape and a
 * non-empty source type.
 *
 * @param {{doi?: string, sourceType?: string}|null|undefined} evidence
 * @returns {boolean} Always a strict boolean.
 */
function evidenceOk(evidence) {
  return Boolean(
    evidence &&
      evidence.doi &&
      DOI_PATTERN.test(String(evidence.doi)) &&
      evidence.sourceType
  );
}

/**
 * Index compound nodes by their `id`.
 *
 * @param {Array<object>} nodes
 * @returns {Map<string, object>}
 */
function nodeById(nodes) {
  return new Map(asArray(nodes).map((node) => [node.id, node]));
}

/**
 * Run the per-compound identity checks and record a finding for each failure.
 *
 * @param {object} node - Compound node from the packet.
 * @param {object} packet - Whole packet; `packet.compounds` is searched for the parent compound.
 * @param {object} policy - Effective policy (not read by the current checks).
 * @param {Array<object>} findings - Accumulator; mutated.
 * @returns {{id: *, label: *, status: string, actions: string[]}} Node summary.
 *   `status` is "hold_identity_node", "curator_review" or "publishable".
 */
function evaluateChemicalNode(node, packet, policy, findings) {
  const refs = [node.id || "compound"];
  const subject = node.id || "Compound";
  const actions = [];
  // Records the finding and remembers its action for the node summary.
  const flag = (severity, code, message, action, findingRefs = refs) => {
    addFinding(findings, severity, code, message, findingRefs, action);
    actions.push(action);
  };

  if (!node.inchiKey || !INCHIKEY_PATTERN.test(String(node.inchiKey))) {
    flag(
      "high",
      "INCHIKEY_FORMAT_INVALID",
      `${subject} has missing or invalid InChIKey evidence.`,
      "attach_valid_inchikey_before_graph_merge"
    );
  }

  if (!node.isomericSmiles) {
    flag(
      "high",
      "ISOMERIC_SMILES_MISSING",
      `${subject} lacks an isomeric SMILES string.`,
      "attach_isomeric_smiles_before_recommendation"
    );
  }

  if (node.requiresStereochemistry === true && !hasStereoToken(node.isomericSmiles)) {
    flag(
      "critical",
      "STEREOCHEMISTRY_MISSING",
      `${subject} requires stereochemistry but the graph node has no stereochemical markers.`,
      "split_or_hold_compound_until_stereochemistry_is_explicit"
    );
  }

  if (node.form && node.parentCompoundId && normalize(node.form) !== "freebase") {
    const parent = asArray(packet.compounds).find((candidate) => candidate.id === node.parentCompoundId);
    const sharesParentSkeleton =
      parent &&
      inchiSkeleton(parent.inchiKey) === inchiSkeleton(node.inchiKey) &&
      normalize(parent.form) === "freebase";
    if (sharesParentSkeleton) {
      flag(
        "medium",
        "SALT_FORM_NEEDS_SEPARATE_NODE",
        `${subject} is a ${node.form} form that should not be silently merged with the freebase node.`,
        "retain_salt_or_hydrate_form_as_explicit_graph_node",
        refs.concat([parent.id])
      );
    }
  }

  if (node.isotopeLabeled === true && !node.isotopeLabel) {
    flag(
      "high",
      "ISOTOPE_LABEL_MISSING",
      `${subject} is isotope-labeled but has no isotope label metadata.`,
      "attach_isotope_label_before_assay_edge_publication"
    );
  }

  if (!evidenceOk(node.evidence)) {
    flag(
      "high",
      "CHEMICAL_IDENTITY_EVIDENCE_MISSING",
      `${subject} lacks DOI-backed identity evidence.`,
      "attach_doi_backed_identity_evidence"
    );
  }

  // Count how often each synonym key occurs on this node.
  const synonymCounts = new Map();
  for (const synonym of asArray(node.synonyms)) {
    const key = synonymKey(synonym);
    if (!key) {
      continue;
    }
    synonymCounts.set(key, (synonymCounts.get(key) || 0) + 1);
  }
  for (const [key, count] of synonymCounts) {
    if (count > 1) {
      flag(
        "low",
        "DUPLICATE_NODE_SYNONYM",
        `${subject} repeats synonym key ${key}.`,
        "deduplicate_compound_synonyms"
      );
    }
  }

  // The status is derived from the action identifiers: any action naming
  // "hold" or "split" holds the node; any other action routes it to review.
  const mustHold = actions.some((action) => action.includes("hold") || action.includes("split"));
  let status = "publishable";
  if (mustHold) {
    status = "hold_identity_node";
  } else if (actions.length > 0) {
    status = "curator_review";
  }

  return {
    id: node.id,
    label: node.label || node.id,
    status,
    actions: [...new Set(actions)]
  };
}

/**
 * Flag synonym keys that are claimed by compounds with different InChIKey
 * skeletons.
 *
 * Each compound is recorded once per synonym key, so a node that lists two
 * spellings of the same synonym appears only once in the finding's refs.
 *
 * @param {Array<object>} compounds
 * @param {Array<object>} findings - Accumulator; mutated.
 * @returns {void}
 */
function evaluateSynonymCollisions(compounds, findings) {
  const owners = new Map();
  for (const node of asArray(compounds)) {
    for (const synonym of asArray(node.synonyms)) {
      const key = synonymKey(synonym);
      if (!key) {
        continue;
      }
      if (!owners.has(key)) {
        owners.set(key, new Set());
      }
      owners.get(key).add(node);
    }
  }

  for (const [key, ownerSet] of owners) {
    const nodes = [...ownerSet];
    // Nodes without any InChIKey contribute no skeleton to the comparison.
    const skeletons = new Set(nodes.map((node) => inchiSkeleton(node.inchiKey)).filter(Boolean));
    if (nodes.length > 1 && skeletons.size > 1) {
      addFinding(
        findings,
        "critical",
        "SYNONYM_COLLISION_ACROSS_COMPOUNDS",
        `Synonym ${key} resolves to multiple chemical skeletons.`,
        nodes.map((node) => node.id),
        "quarantine_synonym_until_curator_disambiguates_compounds"
      );
    }
  }
}

/**
 * Run the per-edge checks and record a finding for each failure.
 *
 * @param {object} edge - Graph edge from the packet.
 * @param {Map<string, object>} nodes - Compound nodes indexed by id.
 * @param {object} policy - Effective policy; `mergeSaltForms` is read.
 * @param {Array<object>} findings - Accumulator; mutated.
 * @returns {{id: *, relationship: *, status: string, actions: string[]}} Edge
 *   summary. `status` is "hold_graph_edge", "curator_review" or "publishable".
 */
function evaluateGraphEdge(edge, nodes, policy, findings) {
  const refs = [edge.id || "edge", edge.source, edge.target].filter(Boolean);
  const subject = edge.id || "Edge";
  const source = nodes.get(edge.source);
  const target = nodes.get(edge.target);
  const actions = [];
  // Records the finding and remembers its action for the edge summary.
  const flag = (severity, code, message, action) => {
    addFinding(findings, severity, code, message, refs, action);
    actions.push(action);
  };

  // The target of a tested_in_assay edge is not looked up as a compound, so
  // only the source has to resolve for that relationship.
  if (!source || (!target && edge.relationship !== "tested_in_assay")) {
    flag(
      "critical",
      "CHEMICAL_EDGE_NODE_MISSING",
      `${subject} references a missing chemical node.`,
      "block_edge_until_both_chemical_nodes_exist"
    );
    // Same shape as the regular summary below, including `relationship`.
    return {
      id: edge.id,
      relationship: edge.relationship,
      status: "hold_graph_edge",
      actions
    };
  }

  if (edge.relationship === "same_as") {
    if (inchiSkeleton(source.inchiKey) !== inchiSkeleton(target.inchiKey)) {
      flag(
        "critical",
        "SAME_AS_SKELETON_MISMATCH",
        `${subject} marks two different chemical skeletons as same_as.`,
        "split_same_as_edge_into_distinct_compound_nodes"
      );
    }
    if (normalize(source.form) !== normalize(target.form) && policy.mergeSaltForms !== true) {
      flag(
        "medium",
        "SAME_AS_FORM_CONFLATION",
        `${subject} conflates ${source.form || "unspecified"} and ${target.form || "unspecified"} forms.`,
        "model_compound_form_relationship_instead_of_same_as"
      );
    }
    const stereoSensitive = source.requiresStereochemistry || target.requiresStereochemistry;
    if (stereoSensitive && source.isomericSmiles !== target.isomericSmiles) {
      flag(
        "high",
        "SAME_AS_STEREOCHEMISTRY_MISMATCH",
        `${subject} links stereochemically distinct compound records.`,
        "hold_same_as_edge_for_stereochemistry_review"
      );
    }
  }

  if (edge.relationship === "tested_in_assay") {
    const context = edge.assayContext;
    if (!context || !context.matrix || !context.concentrationUnit) {
      flag(
        "high",
        "ASSAY_CONTEXT_INCOMPLETE",
        `${subject} lacks assay matrix or concentration-unit context.`,
        "attach_assay_context_before_recommendation"
      );
    }
    if (context && context.compatibleForms) {
      const compatibleForms = asArray(context.compatibleForms).map(normalize);
      // A source without an explicit form is compared as "freebase".
      if (!compatibleForms.includes(normalize(source.form || "freebase"))) {
        flag(
          "medium",
          "ASSAY_FORM_CONTEXT_MISMATCH",
          `${subject} uses a compound form that is incompatible with the assay context.`,
          "suppress_assay_recommendation_until_form_context_matches"
        );
      }
    }
  }

  if (!evidenceOk(edge.evidence)) {
    flag(
      "high",
      "EDGE_EVIDENCE_DOI_MISSING",
      `${subject} lacks DOI-backed relationship evidence.`,
      "attach_doi_backed_edge_evidence"
    );
  }

  const mustHold = actions.some(
    (action) => action.includes("block") || action.includes("hold") || action.includes("split")
  );
  let status = "publishable";
  if (mustHold) {
    status = "hold_graph_edge";
  } else if (actions.length > 0) {
    status = "curator_review";
  }

  return {
    id: edge.id,
    relationship: edge.relationship,
    status,
    actions: [...new Set(actions)]
  };
}

/**
 * Evaluate a chemical knowledge-graph packet and decide whether it can be
 * published.
 *
 * The decision is "hold_chemical_graph" when any finding is critical or any
 * node or edge is held, "route_to_chemical_curator" when the worst finding
 * is high, and "publish_chemical_graph" otherwise.
 *
 * @param {{graphId?: string, policy?: object, compounds?: Array<object>, edges?: Array<object>}} packet
 * @returns {{summary: object, compounds: Array<object>, edges: Array<object>, findings: Array<object>}}
 *   Findings are sorted by descending severity, then by code.
 * @throws {TypeError} When `packet` is null or undefined.
 */
function evaluateChemicalIdentityGraph(packet) {
  if (packet == null) {
    throw new TypeError("evaluateChemicalIdentityGraph expects a packet object");
  }

  const findings = [];
  const policy = {
    mergeSaltForms: false,
    ...(packet.policy || {})
  };
  const nodeMap = nodeById(packet.compounds);

  // Order matters for the pre-sort finding list: nodes, then cross-node
  // synonym collisions, then edges.
  const compoundSummaries = asArray(packet.compounds).map((node) =>
    evaluateChemicalNode(node, packet, policy, findings)
  );
  evaluateSynonymCollisions(packet.compounds, findings);
  const edgeSummaries = asArray(packet.edges).map((edge) =>
    evaluateGraphEdge(edge, nodeMap, policy, findings)
  );

  const highRank = severityRank("high");
  const criticalFindings = findings.filter((finding) => finding.severity === "critical").length;
  const highOrCriticalFindings = findings.filter((finding) => severityRank(finding.severity) >= highRank).length;
  const heldNodes = compoundSummaries.filter((node) => node.status === "hold_identity_node").length;
  const heldEdges = edgeSummaries.filter((edge) => edge.status === "hold_graph_edge").length;

  let decision = "publish_chemical_graph";
  if (criticalFindings > 0 || heldNodes > 0 || heldEdges > 0) {
    decision = "hold_chemical_graph";
  } else if (highOrCriticalFindings > 0) {
    decision = "route_to_chemical_curator";
  }

  // The digest covers the policy, the per-node and per-edge outcomes and the
  // sorted finding codes; finding messages and refs are not part of it.
  const auditSubject = {
    graphId: packet.graphId,
    policy,
    compoundSummaries,
    edgeSummaries,
    findingCodes: findings.map((finding) => finding.code).sort()
  };

  return {
    summary: {
      decision,
      graphId: packet.graphId || "chemical-graph",
      compoundsReviewed: compoundSummaries.length,
      edgesReviewed: edgeSummaries.length,
      heldNodes,
      heldEdges,
      findingCount: findings.length,
      criticalFindings,
      highOrCriticalFindings,
      auditDigest: `sha256:${sha256(auditSubject)}`
    },
    compounds: compoundSummaries,
    edges: edgeSummaries,
    findings: findings.sort(
      (a, b) => severityRank(b.severity) - severityRank(a.severity) || a.code.localeCompare(b.code)
    )
  };
}
// Render every frame; progress runs from 0 on the first frame to 1 on the last.
for (let index = 0; index < frames; index += 1) {
  writeFrame(index, index / (frames - 1));
}

// Encode the numbered PPM frames into an MP4. The ffmpeg binary is taken from
// FFMPEG_PATH when set, otherwise it is resolved from PATH.
const output = path.join(reportsDir, "demo.mp4");
const ffmpegBinary = process.env.FFMPEG_PATH || "ffmpeg";
const result = spawnSync(ffmpegBinary, [
  "-y",
  "-framerate",
  String(fps),
  "-i",
  path.join(framesDir, "frame-%03d.ppm"),
  "-pix_fmt",
  "yuv420p",
  "-movflags",
  "+faststart",
  output
], { stdio: "inherit" });

// The intermediate frames are removed whether or not encoding succeeded.
fs.rmSync(framesDir, { recursive: true, force: true });

// spawnSync reports a process that could not be started through `error`
// (status is then null), so say why instead of exiting silently.
if (result.error) {
  console.error(`Could not run ${ffmpegBinary}: ${result.error.message}`);
  process.exit(1);
}

if (result.status !== 0) {
  if (result.signal) {
    console.error(`${ffmpegBinary} was terminated by signal ${result.signal}`);
  }
  process.exit(result.status || 1);
}

console.log(`Wrote ${output}`);
+{ + "name": "chemical-identity-stereochemistry-guard", + "version": "1.0.0", + "description": "Dependency-free chemical identity and stereochemistry graph guard for SCIBASE knowledge graph workflows.", + "main": "index.js", + "scripts": { + "check": "node test.js", + "test": "node test.js", + "demo": "node demo.js", + "make-demo-video": "node make-demo-video.js", + "verify-video": "node verify-video.js" + }, + "license": "MIT", + "private": true +} diff --git a/chemical-identity-stereochemistry-guard/reports/chemical-identity-dashboard.svg b/chemical-identity-stereochemistry-guard/reports/chemical-identity-dashboard.svg new file mode 100644 index 00000000..c330dbbc --- /dev/null +++ b/chemical-identity-stereochemistry-guard/reports/chemical-identity-dashboard.svg @@ -0,0 +1,30 @@ + + + + + Chemical identity stereochemistry guard + Checks compound graph nodes before merge or recommendation publication. + Clean graph findings + + + 0 findings + Risky critical findings + + + 4 critical + Held nodes and edges + + + 3 held + + Top blockers + CHEMICAL_EDGE_NODE_MISSING +SAME_AS_SKELETON_MISMATCH +STEREOCHEMISTRY_MISSING +SYNONYM_COLLISION_ACROSS_COMPOUNDS +ASSAY_CONTEXT_INCOMPLETE +CHEMICAL_IDENTITY_EVIDENCE_MISSING +EDGE_EVIDENCE_DOI_MISSING +INCHIKEY_FORMAT_INVALID + Decision: hold_chemical_graph | sha256:0128a62f30867cfec310d... 
+ diff --git a/chemical-identity-stereochemistry-guard/reports/clean-chemical-identity-report.json b/chemical-identity-stereochemistry-guard/reports/clean-chemical-identity-report.json new file mode 100644 index 00000000..f952b2d0 --- /dev/null +++ b/chemical-identity-stereochemistry-guard/reports/clean-chemical-identity-report.json @@ -0,0 +1,43 @@ +{ + "summary": { + "decision": "publish_chemical_graph", + "graphId": "KG-CHEM-CLEAN", + "compoundsReviewed": 2, + "edgesReviewed": 2, + "heldNodes": 0, + "heldEdges": 0, + "findingCount": 0, + "criticalFindings": 0, + "highOrCriticalFindings": 0, + "auditDigest": "sha256:3c544a96d9020ddb66cd94e0c2e4a6b810605af9484ae41cd68d022d7667aafe" + }, + "compounds": [ + { + "id": "cmpd-warfarin-r", + "label": "R-warfarin", + "status": "publishable", + "actions": [] + }, + { + "id": "cmpd-warfarin-sodium", + "label": "Warfarin sodium", + "status": "publishable", + "actions": [] + } + ], + "edges": [ + { + "id": "edge-warfarin-form", + "relationship": "form_of", + "status": "publishable", + "actions": [] + }, + { + "id": "edge-warfarin-assay", + "relationship": "tested_in_assay", + "status": "publishable", + "actions": [] + } + ], + "findings": [] +} diff --git a/chemical-identity-stereochemistry-guard/reports/demo.mp4 b/chemical-identity-stereochemistry-guard/reports/demo.mp4 new file mode 100644 index 00000000..c83d547a Binary files /dev/null and b/chemical-identity-stereochemistry-guard/reports/demo.mp4 differ diff --git a/chemical-identity-stereochemistry-guard/reports/risky-chemical-identity-handoff.md b/chemical-identity-stereochemistry-guard/reports/risky-chemical-identity-handoff.md new file mode 100644 index 00000000..3cd529f0 --- /dev/null +++ b/chemical-identity-stereochemistry-guard/reports/risky-chemical-identity-handoff.md @@ -0,0 +1,45 @@ +# Chemical Identity Graph Handoff + +Decision: hold_chemical_graph +Compounds reviewed: 5 +Edges reviewed: 4 +Held nodes: 1 +Held edges: 2 +Audit digest: 
sha256:0128a62f30867cfec310d4ff265ff6e88ad056a911cbc36f61046aa3e030e7a0 + +## Priority Findings + +| Severity | Code | Remediation | +| --- | --- | --- | +| critical | CHEMICAL_EDGE_NODE_MISSING | block_edge_until_both_chemical_nodes_exist | +| critical | SAME_AS_SKELETON_MISMATCH | split_same_as_edge_into_distinct_compound_nodes | +| critical | STEREOCHEMISTRY_MISSING | split_or_hold_compound_until_stereochemistry_is_explicit | +| critical | SYNONYM_COLLISION_ACROSS_COMPOUNDS | quarantine_synonym_until_curator_disambiguates_compounds | +| high | ASSAY_CONTEXT_INCOMPLETE | attach_assay_context_before_recommendation | +| high | CHEMICAL_IDENTITY_EVIDENCE_MISSING | attach_doi_backed_identity_evidence | +| high | EDGE_EVIDENCE_DOI_MISSING | attach_doi_backed_edge_evidence | +| high | INCHIKEY_FORMAT_INVALID | attach_valid_inchikey_before_graph_merge | +| high | ISOMERIC_SMILES_MISSING | attach_isomeric_smiles_before_recommendation | +| high | ISOTOPE_LABEL_MISSING | attach_isotope_label_before_assay_edge_publication | +| high | SAME_AS_STEREOCHEMISTRY_MISMATCH | hold_same_as_edge_for_stereochemistry_review | +| medium | ASSAY_FORM_CONTEXT_MISMATCH | suppress_assay_recommendation_until_form_context_matches | + +## Node Actions + +| Compound | Status | Actions | +| --- | --- | --- | +| cmpd-thalidomide-ambiguous | hold_identity_node | split_or_hold_compound_until_stereochemistry_is_explicit, attach_doi_backed_identity_evidence, deduplicate_compound_synonyms | +| cmpd-alpha-other | publishable | none | +| cmpd-caffeine-citrate | curator_review | retain_salt_or_hydrate_form_as_explicit_graph_node | +| cmpd-caffeine-freebase | publishable | none | +| cmpd-tracer | curator_review | attach_valid_inchikey_before_graph_merge, attach_isomeric_smiles_before_recommendation, attach_isotope_label_before_assay_edge_publication | + +## Edge Actions + +| Edge | Status | Actions | +| --- | --- | --- | +| edge-bad-same-as | hold_graph_edge | 
split_same_as_edge_into_distinct_compound_nodes, hold_same_as_edge_for_stereochemistry_review | +| edge-form-conflation | curator_review | model_compound_form_relationship_instead_of_same_as | +| edge-assay-incomplete | curator_review | attach_assay_context_before_recommendation, suppress_assay_recommendation_until_form_context_matches, attach_doi_backed_edge_evidence | +| edge-missing-target | hold_graph_edge | block_edge_until_both_chemical_nodes_exist | + diff --git a/chemical-identity-stereochemistry-guard/reports/risky-chemical-identity-report.json b/chemical-identity-stereochemistry-guard/reports/risky-chemical-identity-report.json new file mode 100644 index 00000000..c501892c --- /dev/null +++ b/chemical-identity-stereochemistry-guard/reports/risky-chemical-identity-report.json @@ -0,0 +1,246 @@ +{ + "summary": { + "decision": "hold_chemical_graph", + "graphId": "KG-CHEM-RISK", + "compoundsReviewed": 5, + "edgesReviewed": 4, + "heldNodes": 1, + "heldEdges": 2, + "findingCount": 15, + "criticalFindings": 4, + "highOrCriticalFindings": 11, + "auditDigest": "sha256:0128a62f30867cfec310d4ff265ff6e88ad056a911cbc36f61046aa3e030e7a0" + }, + "compounds": [ + { + "id": "cmpd-thalidomide-ambiguous", + "label": "Thalidomide", + "status": "hold_identity_node", + "actions": [ + "split_or_hold_compound_until_stereochemistry_is_explicit", + "attach_doi_backed_identity_evidence", + "deduplicate_compound_synonyms" + ] + }, + { + "id": "cmpd-alpha-other", + "label": "Alpha unrelated scaffold", + "status": "publishable", + "actions": [] + }, + { + "id": "cmpd-caffeine-citrate", + "label": "Caffeine citrate", + "status": "curator_review", + "actions": [ + "retain_salt_or_hydrate_form_as_explicit_graph_node" + ] + }, + { + "id": "cmpd-caffeine-freebase", + "label": "Caffeine", + "status": "publishable", + "actions": [] + }, + { + "id": "cmpd-tracer", + "label": "Tracer X", + "status": "curator_review", + "actions": [ + "attach_valid_inchikey_before_graph_merge", + 
"attach_isomeric_smiles_before_recommendation", + "attach_isotope_label_before_assay_edge_publication" + ] + } + ], + "edges": [ + { + "id": "edge-bad-same-as", + "relationship": "same_as", + "status": "hold_graph_edge", + "actions": [ + "split_same_as_edge_into_distinct_compound_nodes", + "hold_same_as_edge_for_stereochemistry_review" + ] + }, + { + "id": "edge-form-conflation", + "relationship": "same_as", + "status": "curator_review", + "actions": [ + "model_compound_form_relationship_instead_of_same_as" + ] + }, + { + "id": "edge-assay-incomplete", + "relationship": "tested_in_assay", + "status": "curator_review", + "actions": [ + "attach_assay_context_before_recommendation", + "suppress_assay_recommendation_until_form_context_matches", + "attach_doi_backed_edge_evidence" + ] + }, + { + "id": "edge-missing-target", + "status": "hold_graph_edge", + "actions": [ + "block_edge_until_both_chemical_nodes_exist" + ] + } + ], + "findings": [ + { + "severity": "critical", + "code": "CHEMICAL_EDGE_NODE_MISSING", + "message": "edge-missing-target references a missing chemical node.", + "refs": [ + "edge-missing-target", + "cmpd-tracer", + "cmpd-missing" + ], + "action": "block_edge_until_both_chemical_nodes_exist" + }, + { + "severity": "critical", + "code": "SAME_AS_SKELETON_MISMATCH", + "message": "edge-bad-same-as marks two different chemical skeletons as same_as.", + "refs": [ + "edge-bad-same-as", + "cmpd-thalidomide-ambiguous", + "cmpd-alpha-other" + ], + "action": "split_same_as_edge_into_distinct_compound_nodes" + }, + { + "severity": "critical", + "code": "STEREOCHEMISTRY_MISSING", + "message": "cmpd-thalidomide-ambiguous requires stereochemistry but the graph node has no stereochemical markers.", + "refs": [ + "cmpd-thalidomide-ambiguous" + ], + "action": "split_or_hold_compound_until_stereochemistry_is_explicit" + }, + { + "severity": "critical", + "code": "SYNONYM_COLLISION_ACROSS_COMPOUNDS", + "message": "Synonym alphacompound resolves to multiple chemical 
skeletons.", + "refs": [ + "cmpd-thalidomide-ambiguous", + "cmpd-thalidomide-ambiguous", + "cmpd-alpha-other" + ], + "action": "quarantine_synonym_until_curator_disambiguates_compounds" + }, + { + "severity": "high", + "code": "ASSAY_CONTEXT_INCOMPLETE", + "message": "edge-assay-incomplete lacks assay matrix or concentration-unit context.", + "refs": [ + "edge-assay-incomplete", + "cmpd-caffeine-citrate", + "assay-dose-response-7" + ], + "action": "attach_assay_context_before_recommendation" + }, + { + "severity": "high", + "code": "CHEMICAL_IDENTITY_EVIDENCE_MISSING", + "message": "cmpd-thalidomide-ambiguous lacks DOI-backed identity evidence.", + "refs": [ + "cmpd-thalidomide-ambiguous" + ], + "action": "attach_doi_backed_identity_evidence" + }, + { + "severity": "high", + "code": "EDGE_EVIDENCE_DOI_MISSING", + "message": "edge-assay-incomplete lacks DOI-backed relationship evidence.", + "refs": [ + "edge-assay-incomplete", + "cmpd-caffeine-citrate", + "assay-dose-response-7" + ], + "action": "attach_doi_backed_edge_evidence" + }, + { + "severity": "high", + "code": "INCHIKEY_FORMAT_INVALID", + "message": "cmpd-tracer has missing or invalid InChIKey evidence.", + "refs": [ + "cmpd-tracer" + ], + "action": "attach_valid_inchikey_before_graph_merge" + }, + { + "severity": "high", + "code": "ISOMERIC_SMILES_MISSING", + "message": "cmpd-tracer lacks an isomeric SMILES string.", + "refs": [ + "cmpd-tracer" + ], + "action": "attach_isomeric_smiles_before_recommendation" + }, + { + "severity": "high", + "code": "ISOTOPE_LABEL_MISSING", + "message": "cmpd-tracer is isotope-labeled but has no isotope label metadata.", + "refs": [ + "cmpd-tracer" + ], + "action": "attach_isotope_label_before_assay_edge_publication" + }, + { + "severity": "high", + "code": "SAME_AS_STEREOCHEMISTRY_MISMATCH", + "message": "edge-bad-same-as links stereochemically distinct compound records.", + "refs": [ + "edge-bad-same-as", + "cmpd-thalidomide-ambiguous", + "cmpd-alpha-other" + ], + 
"action": "hold_same_as_edge_for_stereochemistry_review" + }, + { + "severity": "medium", + "code": "ASSAY_FORM_CONTEXT_MISMATCH", + "message": "edge-assay-incomplete uses a compound form that is incompatible with the assay context.", + "refs": [ + "edge-assay-incomplete", + "cmpd-caffeine-citrate", + "assay-dose-response-7" + ], + "action": "suppress_assay_recommendation_until_form_context_matches" + }, + { + "severity": "medium", + "code": "SALT_FORM_NEEDS_SEPARATE_NODE", + "message": "cmpd-caffeine-citrate is a citrate salt form that should not be silently merged with the freebase node.", + "refs": [ + "cmpd-caffeine-citrate", + "cmpd-caffeine-freebase" + ], + "action": "retain_salt_or_hydrate_form_as_explicit_graph_node" + }, + { + "severity": "medium", + "code": "SAME_AS_FORM_CONFLATION", + "message": "edge-form-conflation conflates citrate salt and freebase forms.", + "refs": [ + "edge-form-conflation", + "cmpd-caffeine-citrate", + "cmpd-caffeine-freebase" + ], + "action": "model_compound_form_relationship_instead_of_same_as" + }, + { + "severity": "low", + "code": "DUPLICATE_NODE_SYNONYM", + "message": "cmpd-thalidomide-ambiguous repeats synonym key alphacompound.", + "refs": [ + "cmpd-thalidomide-ambiguous" + ], + "action": "deduplicate_compound_synonyms" + } + ] +} diff --git a/chemical-identity-stereochemistry-guard/sample-data.js b/chemical-identity-stereochemistry-guard/sample-data.js new file mode 100644 index 00000000..7edd8b70 --- /dev/null +++ b/chemical-identity-stereochemistry-guard/sample-data.js @@ -0,0 +1,188 @@ +const cleanPacket = { + graphId: "KG-CHEM-CLEAN", + policy: { + mergeSaltForms: false + }, + compounds: [ + { + id: "cmpd-warfarin-r", + label: "R-warfarin", + inchiKey: "PJVWKTKQMONHTI-HNNXBMFYSA-N", + isomericSmiles: "CC(=O)CC(C1=CC=CC=C1)C2=C(C=CC(=C2O)O)C(=O)O[C@H]3CCCO3", + requiresStereochemistry: true, + form: "freebase", + synonyms: ["R-warfarin"], + evidence: { + doi: "10.1000/warfarin.identity", + sourceType: 
"curated-chemistry" + } + }, + { + id: "cmpd-warfarin-sodium", + label: "Warfarin sodium", + inchiKey: "PJVWKTKQMONHTI-HNNXBMFYSA-N", + isomericSmiles: "CC(=O)CC(C1=CC=CC=C1)C2=C(C=CC(=C2O)O)C(=O)O[C@H]3CCCO3.[Na+]", + requiresStereochemistry: true, + form: "sodium salt", + synonyms: ["warfarin sodium"], + evidence: { + doi: "10.1000/warfarin.sodium", + sourceType: "curated-chemistry" + } + } + ], + edges: [ + { + id: "edge-warfarin-form", + source: "cmpd-warfarin-sodium", + target: "cmpd-warfarin-r", + relationship: "form_of", + evidence: { + doi: "10.1000/warfarin.sodium", + sourceType: "curated-chemistry" + } + }, + { + id: "edge-warfarin-assay", + source: "cmpd-warfarin-r", + target: "assay-anticoagulation-1", + relationship: "tested_in_assay", + assayContext: { + matrix: "plasma", + concentrationUnit: "uM", + compatibleForms: ["freebase"] + }, + evidence: { + doi: "10.1000/warfarin.assay", + sourceType: "assay-publication" + } + } + ] +}; + +const riskyPacket = { + graphId: "KG-CHEM-RISK", + policy: { + mergeSaltForms: false + }, + compounds: [ + { + id: "cmpd-thalidomide-ambiguous", + label: "Thalidomide", + inchiKey: "UEJJHQNACJXSKW-UHFFFAOYSA-N", + isomericSmiles: "O=C1NC(=O)C(c2ccccc2)N1", + requiresStereochemistry: true, + form: "freebase", + synonyms: ["thalidomide", "alpha compound", "alpha-compound"], + evidence: { + doi: "", + sourceType: "" + } + }, + { + id: "cmpd-alpha-other", + label: "Alpha unrelated scaffold", + inchiKey: "BSYNRYMUTXBXSQ-UHFFFAOYSA-N", + isomericSmiles: "CC(C)C1=CC=CC=C1", + requiresStereochemistry: false, + form: "freebase", + synonyms: ["Alpha Compound"], + evidence: { + doi: "10.1000/alpha.identity", + sourceType: "curated-chemistry" + } + }, + { + id: "cmpd-caffeine-citrate", + label: "Caffeine citrate", + parentCompoundId: "cmpd-caffeine-freebase", + inchiKey: "RYYVLZVUVIJVGH-UHFFFAOYSA-N", + isomericSmiles: "Cn1cnc2n(C)c(=O)n(C)c(=O)c12.O=C(O)CC(O)(CC(=O)O)C(=O)O", + requiresStereochemistry: false, + form: "citrate salt", 
+ synonyms: ["caffeine citrate"], + evidence: { + doi: "10.1000/caffeine.citrate", + sourceType: "curated-chemistry" + } + }, + { + id: "cmpd-caffeine-freebase", + label: "Caffeine", + inchiKey: "RYYVLZVUVIJVGH-UHFFFAOYSA-N", + isomericSmiles: "Cn1cnc2n(C)c(=O)n(C)c(=O)c12", + requiresStereochemistry: false, + form: "freebase", + synonyms: ["caffeine"], + evidence: { + doi: "10.1000/caffeine.freebase", + sourceType: "curated-chemistry" + } + }, + { + id: "cmpd-tracer", + label: "Tracer X", + inchiKey: "BADKEY", + isomericSmiles: "", + requiresStereochemistry: false, + isotopeLabeled: true, + isotopeLabel: "", + form: "freebase", + synonyms: ["Tracer X"], + evidence: { + doi: "10.1000/tracer.identity", + sourceType: "curated-chemistry" + } + } + ], + edges: [ + { + id: "edge-bad-same-as", + source: "cmpd-thalidomide-ambiguous", + target: "cmpd-alpha-other", + relationship: "same_as", + evidence: { + doi: "10.1000/sameas.bad", + sourceType: "imported-graph" + } + }, + { + id: "edge-form-conflation", + source: "cmpd-caffeine-citrate", + target: "cmpd-caffeine-freebase", + relationship: "same_as", + evidence: { + doi: "10.1000/caffeine.citrate", + sourceType: "imported-graph" + } + }, + { + id: "edge-assay-incomplete", + source: "cmpd-caffeine-citrate", + target: "assay-dose-response-7", + relationship: "tested_in_assay", + assayContext: { + compatibleForms: ["freebase"] + }, + evidence: { + doi: "", + sourceType: "" + } + }, + { + id: "edge-missing-target", + source: "cmpd-tracer", + target: "cmpd-missing", + relationship: "same_as", + evidence: { + doi: "10.1000/tracer.edge", + sourceType: "imported-graph" + } + } + ] +}; + +module.exports = { + cleanPacket, + riskyPacket +}; diff --git a/chemical-identity-stereochemistry-guard/test.js b/chemical-identity-stereochemistry-guard/test.js new file mode 100644 index 00000000..cbc199df --- /dev/null +++ b/chemical-identity-stereochemistry-guard/test.js @@ -0,0 +1,41 @@ +const assert = require("node:assert/strict"); +const 
{ evaluateChemicalIdentityGraph, sha256 } = require("./index");
+const { cleanPacket, riskyPacket } = require("./sample-data");
+// Clean packet: must be publishable with zero findings across its 2 compounds and 2 edges.
+const clean = evaluateChemicalIdentityGraph(cleanPacket);
+assert.equal(clean.summary.decision, "publish_chemical_graph");
+assert.equal(clean.summary.findingCount, 0);
+assert.equal(clean.summary.compoundsReviewed, 2);
+assert.equal(clean.summary.edgesReviewed, 2);
+assert.ok(clean.summary.auditDigest.startsWith("sha256:"));
+// Risky packet: must be held. Reviewed counts are exact; held and finding counts are lower bounds.
+const risky = evaluateChemicalIdentityGraph(riskyPacket);
+assert.equal(risky.summary.decision, "hold_chemical_graph");
+assert.equal(risky.summary.compoundsReviewed, 5);
+assert.equal(risky.summary.edgesReviewed, 4);
+assert.ok(risky.summary.heldNodes >= 1);
+assert.ok(risky.summary.heldEdges >= 2);
+assert.ok(risky.summary.findingCount >= 13);
+assert.ok(risky.summary.criticalFindings >= 4);
+assert.ok(risky.summary.highOrCriticalFindings >= 9);
+// Each code asserted below must appear at least once among the risky findings.
+const findingCodes = new Set(risky.findings.map((finding) => finding.code));
+assert.ok(findingCodes.has("STEREOCHEMISTRY_MISSING"));
+assert.ok(findingCodes.has("CHEMICAL_IDENTITY_EVIDENCE_MISSING"));
+assert.ok(findingCodes.has("SYNONYM_COLLISION_ACROSS_COMPOUNDS"));
+assert.ok(findingCodes.has("SAME_AS_SKELETON_MISMATCH"));
+assert.ok(findingCodes.has("SAME_AS_FORM_CONFLATION"));
+assert.ok(findingCodes.has("ASSAY_CONTEXT_INCOMPLETE"));
+assert.ok(findingCodes.has("ASSAY_FORM_CONTEXT_MISMATCH"));
+assert.ok(findingCodes.has("EDGE_EVIDENCE_DOI_MISSING"));
+assert.ok(findingCodes.has("CHEMICAL_EDGE_NODE_MISSING"));
+assert.ok(findingCodes.has("INCHIKEY_FORMAT_INVALID"));
+assert.ok(findingCodes.has("ISOMERIC_SMILES_MISSING"));
+assert.ok(findingCodes.has("ISOTOPE_LABEL_MISSING"));
+// Determinism: evaluating the same packet twice gives the same digest, and sha256 ignores object key order.
+const firstDigest = evaluateChemicalIdentityGraph(riskyPacket).summary.auditDigest;
+const secondDigest = evaluateChemicalIdentityGraph(riskyPacket).summary.auditDigest;
+assert.equal(firstDigest, secondDigest);
+assert.equal(sha256({ b: 2, a: 1 }), sha256({ a: 1, b: 2 }));
+
+console.log("chemical identity stereochemistry guard tests passed");
diff --git a/chemical-identity-stereochemistry-guard/verify-video.js b/chemical-identity-stereochemistry-guard/verify-video.js
new file mode 100644
index 00000000..39af983c
--- /dev/null
+++ b/chemical-identity-stereochemistry-guard/verify-video.js
@@ -0,0 +1,50 @@
+const assert = require("node:assert/strict");
+const fs = require("node:fs");
+const path = require("node:path");
+const { spawnSync } = require("node:child_process");
+
+// Sanity-checks the generated demo video: it must exist, be larger than 5000 bytes, and
+// report the expected codec, frame size, frame rate, and duration when probed with ffprobe.
+const videoPath = path.join(__dirname, "reports", "demo.mp4");
+assert.ok(fs.existsSync(videoPath), "reports/demo.mp4 must exist");
+assert.ok(fs.statSync(videoPath).size > 5000, "reports/demo.mp4 should not be empty");
+
+// FFPROBE_PATH overrides the binary; otherwise "ffprobe" is resolved by the OS.
+const ffprobePath = process.env.FFPROBE_PATH || "ffprobe";
+const probe = spawnSync(ffprobePath, [
+  "-v",
+  "error",
+  "-select_streams",
+  "v:0",
+  "-show_entries",
+  "stream=codec_name,width,height,r_frame_rate:format=duration",
+  "-of",
+  "json",
+  videoPath
+], { encoding: "utf8" });
+
+// A launch failure (for example ffprobe not installed) is reported through `error`
+// with a null status, so surface its message instead of a bare "ffprobe failed".
+if (probe.error) {
+  process.stderr.write(`unable to run ${ffprobePath}: ${probe.error.message}\n`);
+  process.exit(1);
+}
+
+if (probe.status !== 0) {
+  process.stderr.write(probe.stderr || "ffprobe failed\n");
+  process.exit(probe.status || 1);
+}
+
+const metadata = JSON.parse(probe.stdout);
+const stream = metadata.streams && metadata.streams[0];
+// Fail with a readable assertion rather than a TypeError when no video stream is reported.
+assert.ok(stream, "ffprobe reported no video stream for reports/demo.mp4");
+assert.equal(stream.codec_name, "h264");
+assert.equal(stream.width, 960);
+assert.equal(stream.height, 540);
+assert.equal(stream.r_frame_rate, "18/1");
+
+const duration = Number(metadata.format && metadata.format.duration);
+assert.ok(duration >= 3.9 && duration <= 4.2, `unexpected duration ${duration}`);
+
+console.log(`demo.mp4 verified: ${stream.codec_name}, ${stream.width}x${stream.height}, ${duration.toFixed(3)}s, ${stream.r_frame_rate}`);