diff --git a/spreadsheet-formula-provenance-guard/README.md b/spreadsheet-formula-provenance-guard/README.md new file mode 100644 index 00000000..466ada07 --- /dev/null +++ b/spreadsheet-formula-provenance-guard/README.md @@ -0,0 +1,17 @@ +# Spreadsheet Formula Provenance Guard + +Self-contained SCIBASE Scientific/Engineering Data & Code Hosting slice for issue #14. The guard checks whether uploaded spreadsheet datasets are reproducible and safe to publish before metadata-aware previews, normalized exports, or DOI-ready research packets are generated. + +## Why this slice is distinct + +Existing #14 submissions cover broad FAIR manifests, artifact package integrity, preview cache/version drift, raw-instrument previews, notebook previews, retention/tombstones, model-card lineage, license metadata, sensitive redaction, schema evolution, data dictionaries, persistent identifiers, SBOM/advisory checks, upload checkpoints, replica consistency, column sensitivity, malware/archive quarantine, executable sandbox egress policy, and supplementary media accessibility previews. This module focuses only on spreadsheet formula provenance: formula cells, volatile functions, stale calculated values, hidden sheets, external workbook links, macro-enabled files, normalized export checksums, and reviewer-ready transformation evidence. + +## Run + +```bash +npm test +npm run demo +npm run demo:video +``` + +Demo artifacts are written to `reports/`, including JSON, Markdown, SVG, GIF, and MP4 files. diff --git a/spreadsheet-formula-provenance-guard/demo.js b/spreadsheet-formula-provenance-guard/demo.js new file mode 100644 index 00000000..b0d166b3 --- /dev/null +++ b/spreadsheet-formula-provenance-guard/demo.js @@ -0,0 +1,61 @@ +const fs = require("fs"); +const path = require("path"); + +const { assessSpreadsheetProvenance } = require("./index"); +const { cleanSpreadsheet, riskySpreadsheet } = require("./sample-data"); + +const reportsDir = path.join(__dirname, "reports"); +fs.mkdirSync(reportsDir, { recursive: true }); + +function markdownReport(name, report) { + const findings = report.findings.length + ? report.findings + .map((item) => `- ${item.severity.toUpperCase()} ${item.code}: ${item.message}`) + .join("\n") + : "- No spreadsheet provenance findings."; + return `# Spreadsheet Formula Provenance Guard + +Scenario: ${name} + +Dataset: ${report.datasetId} +File: ${report.filename} +Decision: ${report.decision.toUpperCase()} + +Reviewed ${report.summary.sheetsReviewed} sheets, ${report.summary.formulaCellsReviewed} formula cells, and ${report.summary.exportsReviewed} exports. + +## Findings + +${findings} + +## Release Criteria + +${report.releaseCriteria.map((item) => `- ${item}`).join("\n")} +`; +} + +function svgReport(report) { + const color = report.decision === "hold" ? "#b91c1c" : report.decision === "revise" ? "#c2410c" : "#15803d"; + return ` + + Spreadsheet Formula Provenance Guard + ${report.datasetId} / ${report.filename} + + ${report.decision.toUpperCase()} + Formula cells: ${report.summary.formulaCellsReviewed} + High: ${report.summary.high} + Medium: ${report.summary.medium} + Low: ${report.summary.low} + Synthetic spreadsheet packets only. No private research data or external services. +`; +} + +for (const [name, spreadsheet] of [ + ["clean-spreadsheet", cleanSpreadsheet], + ["risky-spreadsheet", riskySpreadsheet], +]) { + const report = assessSpreadsheetProvenance(spreadsheet); + fs.writeFileSync(path.join(reportsDir, `${name}.json`), JSON.stringify(report, null, 2)); + fs.writeFileSync(path.join(reportsDir, `${name}.md`), markdownReport(name, report)); + fs.writeFileSync(path.join(reportsDir, `${name}.svg`), svgReport(report)); + console.log(`${name}: ${report.decision} (${report.summary.findings} findings)`); +} diff --git a/spreadsheet-formula-provenance-guard/demo_video.py b/spreadsheet-formula-provenance-guard/demo_video.py new file mode 100644 index 00000000..9b525849 --- /dev/null +++ b/spreadsheet-formula-provenance-guard/demo_video.py @@ -0,0 +1,46 @@ +from pathlib import Path + +import imageio.v3 as iio +import numpy as np +from PIL import Image, ImageDraw, ImageFont + + +ROOT = Path(__file__).resolve().parent +REPORTS = ROOT / "reports" +REPORTS.mkdir(exist_ok=True) + + +def font(size): + for name in ("arial.ttf", "segoeui.ttf"): + try: + return ImageFont.truetype(name, size) + except OSError: + pass + return ImageFont.load_default() + + +slides = [ + ("Spreadsheet Formula Guard", "Scientific/Engineering Data & Code Hosting #14"), + ("Checks", "formula cells, volatile functions, stale cached values"), + ("Checks", "external workbook links, hidden sheets, macro-enabled files"), + ("Decision", "hold publication until spreadsheet datasets are reproducible"), +] + +frames = [] +for index, (title, subtitle) in enumerate(slides, start=1): + image = Image.new("RGB", (960, 544), "#10201b") + draw = ImageDraw.Draw(image) + draw.rectangle((44, 52, 916, 492), outline="#34d399", width=3) + draw.text((80, 124), title, fill="#f8fafc", font=font(40)) + draw.text((80, 206), subtitle, fill="#d1fae5", font=font(25)) + draw.rectangle((80, 326, 818, 382), fill="#065f46") + draw.text((104, 342), "spreadsheet previews must be reproducible outside Excel", fill="#ecfdf5", font=font(22)) + draw.text((80, 438), f"Slide {index}/4 - synthetic reviewer artifact", fill="#cbd5e1", font=font(20)) + frames.extend([image] * 14) + +gif_path = REPORTS / "demo.gif" +mp4_path = REPORTS / "demo.mp4" +frames[0].save(gif_path, save_all=True, append_images=frames[1:], duration=120, loop=0) +iio.imwrite(mp4_path, [np.asarray(frame) for frame in frames], fps=8, codec="libx264") +print(f"wrote {gif_path}") +print(f"wrote {mp4_path}") diff --git a/spreadsheet-formula-provenance-guard/index.js b/spreadsheet-formula-provenance-guard/index.js new file mode 100644 index 00000000..7d103f02 --- /dev/null +++ b/spreadsheet-formula-provenance-guard/index.js @@ -0,0 +1,246 @@ +const HIGH = "high"; +const MEDIUM = "medium"; +const LOW = "low"; + +const VOLATILE_FUNCTIONS = ["NOW", "TODAY", "RAND", "RANDBETWEEN", "OFFSET", "INDIRECT"]; + +function requiredString(value, field) { + if (typeof value !== "string" || value.trim() === "") { + throw new TypeError(`${field} must be a non-empty string`); + } + return value.trim(); +} + +function array(value, field) { + if (!Array.isArray(value)) { + throw new TypeError(`${field} must be an array`); + } + return value; +} + +function unique(values) { + return [...new Set(values.map(String))]; +} + +function isSha256(value) { + return typeof value === "string" && /^[a-f0-9]{64}$/.test(value); +} + +function normalizeCell(raw, sheetName, index) { + return { + address: requiredString(raw.address, `${sheetName}.cells[${index}].address`), + formula: raw.formula ? String(raw.formula) : "", + cachedValue: raw.cachedValue === undefined ? null : raw.cachedValue, + recalculatedAt: raw.recalculatedAt ? String(raw.recalculatedAt) : "", + sourceRefs: unique(raw.sourceRefs || []), + }; +} + +function normalizeSheet(raw, index) { + const name = requiredString(raw.name, `sheets[${index}].name`); + return { + name, + hidden: Boolean(raw.hidden), + purpose: raw.purpose ? String(raw.purpose) : "", + cells: array(raw.cells || [], `${name}.cells`).map((cell, cellIndex) => normalizeCell(cell, name, cellIndex)), + }; +} + +function normalizeSpreadsheet(raw) { + return { + datasetId: requiredString(raw.datasetId, "datasetId"), + filename: requiredString(raw.filename, "filename"), + uploadedAt: requiredString(raw.uploadedAt, "uploadedAt"), + sha256: requiredString(raw.sha256, "sha256").toLowerCase(), + macroEnabled: Boolean(raw.macroEnabled), + externalLinks: unique(raw.externalLinks || []), + exports: array(raw.exports || [], "exports").map((item, index) => ({ + format: requiredString(item.format, `exports[${index}].format`), + path: requiredString(item.path, `exports[${index}].path`), + sha256: requiredString(item.sha256, `exports[${index}].sha256`).toLowerCase(), + })), + sheets: array(raw.sheets || [], "sheets").map(normalizeSheet), + }; +} + +function finding(code, severity, sourceId, message, remediation) { + return { code, severity, sourceId, message, remediation }; +} + +function formulaFunctions(formula) { + const matches = String(formula).toUpperCase().match(/[A-Z][A-Z0-9_.]*\s*\(/g) || []; + return unique(matches.map((item) => item.replace(/\s*\($/, ""))); +} + +function hasExternalReference(formula) { + return /\[[^\]]+\]|\bhttps?:\/\//i.test(String(formula)); +} + +function assessSpreadsheetProvenance(rawSpreadsheet) { + const spreadsheet = normalizeSpreadsheet(rawSpreadsheet); + const findings = []; + const formulaCells = spreadsheet.sheets.flatMap((sheet) => + sheet.cells + .filter((cell) => cell.formula) + .map((cell) => ({ ...cell, sheet: sheet.name, sourceId: `${sheet.name}!${cell.address}` })) + ); + + if (!isSha256(spreadsheet.sha256)) { + findings.push( + finding( + "INVALID_SPREADSHEET_DIGEST", + HIGH, + spreadsheet.filename, + `${spreadsheet.filename} does not have a valid SHA-256 upload digest.`, + "Record the original spreadsheet hash before accepting it as a citable dataset artifact." + ) + ); + } + + if (spreadsheet.macroEnabled) { + findings.push( + finding( + "MACRO_ENABLED_DATASET", + HIGH, + spreadsheet.filename, + `${spreadsheet.filename} is macro-enabled and cannot be treated as inert tabular data.`, + "Quarantine macro-enabled files or publish a macro-free normalized export with reviewed transformation notes." + ) + ); + } + + if (spreadsheet.externalLinks.length > 0) { + findings.push( + finding( + "EXTERNAL_WORKBOOK_LINKS", + HIGH, + spreadsheet.filename, + `${spreadsheet.filename} depends on external workbook links: ${spreadsheet.externalLinks.join(", ")}.`, + "Bundle source workbooks or replace formulas with reproducible local references before publication." + ) + ); + } + + for (const sheet of spreadsheet.sheets) { + if (sheet.hidden && sheet.cells.length > 0) { + findings.push( + finding( + "HIDDEN_SHEET_WITH_DATA", + MEDIUM, + sheet.name, + `${sheet.name} is hidden but contains data or formulas.`, + "Expose the sheet, document its purpose, or exclude it from the release with an audit note." + ) + ); + } + } + + for (const cell of formulaCells) { + const functions = formulaFunctions(cell.formula); + const volatile = functions.filter((name) => VOLATILE_FUNCTIONS.includes(name)); + if (volatile.length > 0) { + findings.push( + finding( + "VOLATILE_FORMULA", + HIGH, + cell.sourceId, + `${cell.sourceId} uses volatile functions: ${volatile.join(", ")}.`, + "Replace volatile formulas with fixed values or a scripted, versioned transformation." + ) + ); + } + + if (hasExternalReference(cell.formula)) { + findings.push( + finding( + "FORMULA_EXTERNAL_REFERENCE", + HIGH, + cell.sourceId, + `${cell.sourceId} references an external workbook or URL.`, + "Bundle the dependency and record its checksum, or normalize the formula into a local reproducible dataset." + ) + ); + } + + if (cell.cachedValue === null || cell.recalculatedAt === "") { + findings.push( + finding( + "STALE_OR_MISSING_CALCULATION_EVIDENCE", + MEDIUM, + cell.sourceId, + `${cell.sourceId} lacks cached value or recalculation timestamp evidence.`, + "Recalculate the workbook in a controlled environment and record cached values before preview/export." + ) + ); + } + + if (cell.sourceRefs.length === 0) { + findings.push( + finding( + "FORMULA_WITHOUT_PROVENANCE", + MEDIUM, + cell.sourceId, + `${cell.sourceId} has no provenance references for its inputs.`, + "Attach source ranges, instruments, upstream datasets, or transformation tickets for formula-derived values." + ) + ); + } + } + + const exportFormats = new Set(spreadsheet.exports.map((item) => item.format)); + if (!exportFormats.has("csv") && !exportFormats.has("parquet")) { + findings.push( + finding( + "MISSING_NORMALIZED_EXPORT", + HIGH, + "exports", + "Spreadsheet dataset has no CSV or parquet normalized export.", + "Publish a normalized tabular export so reviewers can diff and re-use the dataset outside spreadsheet software." + ) + ); + } + + for (const exported of spreadsheet.exports) { + if (!isSha256(exported.sha256)) { + findings.push( + finding( + "INVALID_EXPORT_DIGEST", + HIGH, + exported.path, + `${exported.path} does not have a valid SHA-256 digest.`, + "Record a digest for every normalized export before publication." + ) + ); + } + } + + const high = findings.filter((item) => item.severity === HIGH).length; + const medium = findings.filter((item) => item.severity === MEDIUM).length; + return { + datasetId: spreadsheet.datasetId, + filename: spreadsheet.filename, + decision: high > 0 ? "hold" : medium > 0 ? "revise" : "release", + summary: { + sheetsReviewed: spreadsheet.sheets.length, + formulaCellsReviewed: formulaCells.length, + exportsReviewed: spreadsheet.exports.length, + findings: findings.length, + high, + medium, + low: findings.filter((item) => item.severity === LOW).length, + }, + findings, + releaseCriteria: [ + "Original spreadsheet and normalized exports have stable SHA-256 digests.", + "Formula-derived cells avoid volatile functions and external workbook references.", + "Formula cells carry cached value, recalculation timestamp, and provenance evidence.", + "Hidden sheets, macros, and external links are disclosed or blocked before publication.", + "At least one normalized CSV or parquet export is available for reproducible reuse.", + ], + }; +} + +module.exports = { + assessSpreadsheetProvenance, + normalizeSpreadsheet, +}; diff --git a/spreadsheet-formula-provenance-guard/package.json b/spreadsheet-formula-provenance-guard/package.json new file mode 100644 index 00000000..dfbb7ebf --- /dev/null +++ b/spreadsheet-formula-provenance-guard/package.json @@ -0,0 +1,13 @@ +{ + "name": "spreadsheet-formula-provenance-guard", + "version": "1.0.0", + "description": "Spreadsheet formula provenance guard for SCIBASE scientific data hosting", + "main": "index.js", + "type": "commonjs", + "scripts": { + "test": "node test.js", + "demo": "node demo.js", + "demo:video": "python demo_video.py" + }, + "license": "MIT" +} diff --git a/spreadsheet-formula-provenance-guard/reports/clean-spreadsheet.json b/spreadsheet-formula-provenance-guard/reports/clean-spreadsheet.json new file mode 100644 index 00000000..ab52a18e --- /dev/null +++ b/spreadsheet-formula-provenance-guard/reports/clean-spreadsheet.json @@ -0,0 +1,22 @@ +{ + "datasetId": "dataset-climate-yield-001", + "filename": "climate-yield-model.xlsx", + "decision": "release", + "summary": { + "sheetsReviewed": 2, + "formulaCellsReviewed": 1, + "exportsReviewed": 2, + "findings": 0, + "high": 0, + "medium": 0, + "low": 0 + }, + "findings": [], + "releaseCriteria": [ + "Original spreadsheet and normalized exports have stable SHA-256 digests.", + "Formula-derived cells avoid volatile functions and external workbook references.", + "Formula cells carry cached value, recalculation timestamp, and provenance evidence.", + "Hidden sheets, macros, and external links are disclosed or blocked before publication.", + "At least one normalized CSV or parquet export is available for reproducible reuse." + ] +} \ No newline at end of file diff --git a/spreadsheet-formula-provenance-guard/reports/clean-spreadsheet.md b/spreadsheet-formula-provenance-guard/reports/clean-spreadsheet.md new file mode 100644 index 00000000..7db3f7a1 --- /dev/null +++ b/spreadsheet-formula-provenance-guard/reports/clean-spreadsheet.md @@ -0,0 +1,21 @@ +# Spreadsheet Formula Provenance Guard + +Scenario: clean-spreadsheet + +Dataset: dataset-climate-yield-001 +File: climate-yield-model.xlsx +Decision: RELEASE + +Reviewed 2 sheets, 1 formula cells, and 2 exports. + +## Findings + +- No spreadsheet provenance findings. + +## Release Criteria + +- Original spreadsheet and normalized exports have stable SHA-256 digests. +- Formula-derived cells avoid volatile functions and external workbook references. +- Formula cells carry cached value, recalculation timestamp, and provenance evidence. +- Hidden sheets, macros, and external links are disclosed or blocked before publication. +- At least one normalized CSV or parquet export is available for reproducible reuse. diff --git a/spreadsheet-formula-provenance-guard/reports/clean-spreadsheet.svg b/spreadsheet-formula-provenance-guard/reports/clean-spreadsheet.svg new file mode 100644 index 00000000..00b0b893 --- /dev/null +++ b/spreadsheet-formula-provenance-guard/reports/clean-spreadsheet.svg @@ -0,0 +1,12 @@ + + + Spreadsheet Formula Provenance Guard + dataset-climate-yield-001 / climate-yield-model.xlsx + + RELEASE + Formula cells: 1 + High: 0 + Medium: 0 + Low: 0 + Synthetic spreadsheet packets only. No private research data or external services. + \ No newline at end of file diff --git a/spreadsheet-formula-provenance-guard/reports/demo.gif b/spreadsheet-formula-provenance-guard/reports/demo.gif new file mode 100644 index 00000000..6f3bd928 Binary files /dev/null and b/spreadsheet-formula-provenance-guard/reports/demo.gif differ diff --git a/spreadsheet-formula-provenance-guard/reports/demo.mp4 b/spreadsheet-formula-provenance-guard/reports/demo.mp4 new file mode 100644 index 00000000..67bcab77 Binary files /dev/null and b/spreadsheet-formula-provenance-guard/reports/demo.mp4 differ diff --git a/spreadsheet-formula-provenance-guard/reports/risky-spreadsheet.json b/spreadsheet-formula-provenance-guard/reports/risky-spreadsheet.json new file mode 100644 index 00000000..3a202983 --- /dev/null +++ b/spreadsheet-formula-provenance-guard/reports/risky-spreadsheet.json @@ -0,0 +1,114 @@ +{ + "datasetId": "dataset-fragile-xlsx-002", + "filename": "fragile-field-results.xlsm", + "decision": "hold", + "summary": { + "sheetsReviewed": 2, + "formulaCellsReviewed": 2, + "exportsReviewed": 1, + "findings": 13, + "high": 8, + "medium": 5, + "low": 0 + }, + "findings": [ + { + "code": "INVALID_SPREADSHEET_DIGEST", + "severity": "high", + "sourceId": "fragile-field-results.xlsm", + "message": "fragile-field-results.xlsm does not have a valid SHA-256 upload digest.", + "remediation": "Record the original spreadsheet hash before accepting it as a citable dataset artifact." + }, + { + "code": "MACRO_ENABLED_DATASET", + "severity": "high", + "sourceId": "fragile-field-results.xlsm", + "message": "fragile-field-results.xlsm is macro-enabled and cannot be treated as inert tabular data.", + "remediation": "Quarantine macro-enabled files or publish a macro-free normalized export with reviewed transformation notes." + }, + { + "code": "EXTERNAL_WORKBOOK_LINKS", + "severity": "high", + "sourceId": "fragile-field-results.xlsm", + "message": "fragile-field-results.xlsm depends on external workbook links: ../lab-shared/master-calibration.xlsx.", + "remediation": "Bundle source workbooks or replace formulas with reproducible local references before publication." + }, + { + "code": "HIDDEN_SHEET_WITH_DATA", + "severity": "medium", + "sourceId": "hidden-calibration", + "message": "hidden-calibration is hidden but contains data or formulas.", + "remediation": "Expose the sheet, document its purpose, or exclude it from the release with an audit note." + }, + { + "code": "VOLATILE_FORMULA", + "severity": "high", + "sourceId": "hidden-calibration!C7", + "message": "hidden-calibration!C7 uses volatile functions: RAND.", + "remediation": "Replace volatile formulas with fixed values or a scripted, versioned transformation." + }, + { + "code": "FORMULA_EXTERNAL_REFERENCE", + "severity": "high", + "sourceId": "hidden-calibration!C7", + "message": "hidden-calibration!C7 references an external workbook or URL.", + "remediation": "Bundle the dependency and record its checksum, or normalize the formula into a local reproducible dataset." + }, + { + "code": "STALE_OR_MISSING_CALCULATION_EVIDENCE", + "severity": "medium", + "sourceId": "hidden-calibration!C7", + "message": "hidden-calibration!C7 lacks cached value or recalculation timestamp evidence.", + "remediation": "Recalculate the workbook in a controlled environment and record cached values before preview/export." + }, + { + "code": "FORMULA_WITHOUT_PROVENANCE", + "severity": "medium", + "sourceId": "hidden-calibration!C7", + "message": "hidden-calibration!C7 has no provenance references for its inputs.", + "remediation": "Attach source ranges, instruments, upstream datasets, or transformation tickets for formula-derived values." + }, + { + "code": "FORMULA_EXTERNAL_REFERENCE", + "severity": "high", + "sourceId": "hidden-calibration!C8", + "message": "hidden-calibration!C8 references an external workbook or URL.", + "remediation": "Bundle the dependency and record its checksum, or normalize the formula into a local reproducible dataset." + }, + { + "code": "STALE_OR_MISSING_CALCULATION_EVIDENCE", + "severity": "medium", + "sourceId": "hidden-calibration!C8", + "message": "hidden-calibration!C8 lacks cached value or recalculation timestamp evidence.", + "remediation": "Recalculate the workbook in a controlled environment and record cached values before preview/export." + }, + { + "code": "FORMULA_WITHOUT_PROVENANCE", + "severity": "medium", + "sourceId": "hidden-calibration!C8", + "message": "hidden-calibration!C8 has no provenance references for its inputs.", + "remediation": "Attach source ranges, instruments, upstream datasets, or transformation tickets for formula-derived values." + }, + { + "code": "MISSING_NORMALIZED_EXPORT", + "severity": "high", + "sourceId": "exports", + "message": "Spreadsheet dataset has no CSV or parquet normalized export.", + "remediation": "Publish a normalized tabular export so reviewers can diff and re-use the dataset outside spreadsheet software." + }, + { + "code": "INVALID_EXPORT_DIGEST", + "severity": "high", + "sourceId": "exports/fragile-field-results.xlsx", + "message": "exports/fragile-field-results.xlsx does not have a valid SHA-256 digest.", + "remediation": "Record a digest for every normalized export before publication." + } + ], + "releaseCriteria": [ + "Original spreadsheet and normalized exports have stable SHA-256 digests.", + "Formula-derived cells avoid volatile functions and external workbook references.", + "Formula cells carry cached value, recalculation timestamp, and provenance evidence.", + "Hidden sheets, macros, and external links are disclosed or blocked before publication.", + "At least one normalized CSV or parquet export is available for reproducible reuse." + ] +} \ No newline at end of file diff --git a/spreadsheet-formula-provenance-guard/reports/risky-spreadsheet.md b/spreadsheet-formula-provenance-guard/reports/risky-spreadsheet.md new file mode 100644 index 00000000..9eb71ddf --- /dev/null +++ b/spreadsheet-formula-provenance-guard/reports/risky-spreadsheet.md @@ -0,0 +1,33 @@ +# Spreadsheet Formula Provenance Guard + +Scenario: risky-spreadsheet + +Dataset: dataset-fragile-xlsx-002 +File: fragile-field-results.xlsm +Decision: HOLD + +Reviewed 2 sheets, 2 formula cells, and 1 exports. + +## Findings + +- HIGH INVALID_SPREADSHEET_DIGEST: fragile-field-results.xlsm does not have a valid SHA-256 upload digest. +- HIGH MACRO_ENABLED_DATASET: fragile-field-results.xlsm is macro-enabled and cannot be treated as inert tabular data. +- HIGH EXTERNAL_WORKBOOK_LINKS: fragile-field-results.xlsm depends on external workbook links: ../lab-shared/master-calibration.xlsx. +- MEDIUM HIDDEN_SHEET_WITH_DATA: hidden-calibration is hidden but contains data or formulas. +- HIGH VOLATILE_FORMULA: hidden-calibration!C7 uses volatile functions: RAND. +- HIGH FORMULA_EXTERNAL_REFERENCE: hidden-calibration!C7 references an external workbook or URL. +- MEDIUM STALE_OR_MISSING_CALCULATION_EVIDENCE: hidden-calibration!C7 lacks cached value or recalculation timestamp evidence. +- MEDIUM FORMULA_WITHOUT_PROVENANCE: hidden-calibration!C7 has no provenance references for its inputs. +- HIGH FORMULA_EXTERNAL_REFERENCE: hidden-calibration!C8 references an external workbook or URL. +- MEDIUM STALE_OR_MISSING_CALCULATION_EVIDENCE: hidden-calibration!C8 lacks cached value or recalculation timestamp evidence. +- MEDIUM FORMULA_WITHOUT_PROVENANCE: hidden-calibration!C8 has no provenance references for its inputs. +- HIGH MISSING_NORMALIZED_EXPORT: Spreadsheet dataset has no CSV or parquet normalized export. +- HIGH INVALID_EXPORT_DIGEST: exports/fragile-field-results.xlsx does not have a valid SHA-256 digest. + +## Release Criteria + +- Original spreadsheet and normalized exports have stable SHA-256 digests. +- Formula-derived cells avoid volatile functions and external workbook references. +- Formula cells carry cached value, recalculation timestamp, and provenance evidence. +- Hidden sheets, macros, and external links are disclosed or blocked before publication. +- At least one normalized CSV or parquet export is available for reproducible reuse. diff --git a/spreadsheet-formula-provenance-guard/reports/risky-spreadsheet.svg b/spreadsheet-formula-provenance-guard/reports/risky-spreadsheet.svg new file mode 100644 index 00000000..7a597b1b --- /dev/null +++ b/spreadsheet-formula-provenance-guard/reports/risky-spreadsheet.svg @@ -0,0 +1,12 @@ + + + Spreadsheet Formula Provenance Guard + dataset-fragile-xlsx-002 / fragile-field-results.xlsm + + HOLD + Formula cells: 2 + High: 8 + Medium: 5 + Low: 0 + Synthetic spreadsheet packets only. No private research data or external services. + \ No newline at end of file diff --git a/spreadsheet-formula-provenance-guard/requirements-map.md b/spreadsheet-formula-provenance-guard/requirements-map.md new file mode 100644 index 00000000..7450d321 --- /dev/null +++ b/spreadsheet-formula-provenance-guard/requirements-map.md @@ -0,0 +1,13 @@ +# Requirements Map + +Issue #14 asks for scientific data and code hosting with major file-type support, metadata-aware previews, upload versioning and diffing, FAIR-oriented metadata, executable/reusable artifacts, and machine-discoverable exports. + +This slice covers a focused spreadsheet dataset gate: + +- Major file types: targets `.xlsx`/`.xlsm` spreadsheet datasets before they are accepted as research artifacts. +- Metadata-aware previews: blocks previews when formula-derived values lack recalculation evidence or provenance. +- Upload versioning and diffing: requires stable SHA-256 digests for the original workbook and normalized exports. +- Reusable data: requires CSV or parquet normalized exports so reviewers can inspect data without spreadsheet software. +- FAIR and provenance: requires formula source references for derived values and flags hidden sheets, volatile functions, macros, and external workbook links. + +Out of scope by design: broad repository manifests, executable sandboxes, model cards, data dictionaries, malware/archive scans, access embargoes, and accessibility previews, because those are already covered by separate same-issue slices. diff --git a/spreadsheet-formula-provenance-guard/sample-data.js b/spreadsheet-formula-provenance-guard/sample-data.js new file mode 100644 index 00000000..7fa860ab --- /dev/null +++ b/spreadsheet-formula-provenance-guard/sample-data.js @@ -0,0 +1,89 @@ +const digestA = "a".repeat(64); +const digestB = "b".repeat(64); +const digestC = "c".repeat(64); + +const cleanSpreadsheet = { + datasetId: "dataset-climate-yield-001", + filename: "climate-yield-model.xlsx", + uploadedAt: "2026-06-01T19:00:00Z", + sha256: digestA, + macroEnabled: false, + externalLinks: [], + exports: [ + { format: "csv", path: "exports/climate-yield-model.csv", sha256: digestB }, + { format: "parquet", path: "exports/climate-yield-model.parquet", sha256: digestC }, + ], + sheets: [ + { + name: "observations", + hidden: false, + purpose: "Raw normalized observations", + cells: [ + { address: "A2", cachedValue: "MX-001", sourceRefs: ["instrument:station-mx-001"] }, + { address: "B2", cachedValue: 18.2, sourceRefs: ["instrument:station-mx-001"] }, + ], + }, + { + name: "model", + hidden: false, + purpose: "Derived yield score", + cells: [ + { + address: "D2", + formula: "=ROUND(observations!B2*1.8,2)", + cachedValue: 32.76, + recalculatedAt: "2026-06-01T19:05:00Z", + sourceRefs: ["observations!B2", "protocols/yield-transform-v3.md"], + }, + ], + }, + ], +}; + +const riskySpreadsheet = { + datasetId: "dataset-fragile-xlsx-002", + filename: "fragile-field-results.xlsm", + uploadedAt: "2026-06-01T19:00:00Z", + sha256: "not-a-digest", + macroEnabled: true, + externalLinks: ["../lab-shared/master-calibration.xlsx"], + exports: [ + { format: "xlsx", path: "exports/fragile-field-results.xlsx", sha256: "bad-export-digest" }, + ], + sheets: [ + { + name: "raw", + hidden: false, + purpose: "Raw observations", + cells: [ + { address: "A2", cachedValue: "BR-204", sourceRefs: [] }, + ], + }, + { + name: "hidden-calibration", + hidden: true, + purpose: "Undocumented calibration", + cells: [ + { + address: "C7", + formula: "='[master-calibration.xlsx]Sheet1'!B4+RAND()", + cachedValue: null, + recalculatedAt: "", + sourceRefs: [], + }, + { + address: "C8", + formula: "=WEBSERVICE(\"https://example.invalid/current-factor\")", + cachedValue: null, + recalculatedAt: "", + sourceRefs: [], + }, + ], + }, + ], +}; + +module.exports = { + cleanSpreadsheet, + riskySpreadsheet, +}; diff --git a/spreadsheet-formula-provenance-guard/test.js b/spreadsheet-formula-provenance-guard/test.js new file mode 100644 index 00000000..bb06073f --- /dev/null +++ b/spreadsheet-formula-provenance-guard/test.js @@ -0,0 +1,47 @@ +const assert = require("assert"); + +const { assessSpreadsheetProvenance, normalizeSpreadsheet } = require("./index"); +const { cleanSpreadsheet, riskySpreadsheet } = require("./sample-data"); + +const clean = assessSpreadsheetProvenance(cleanSpreadsheet); +assert.strictEqual(clean.decision, "release"); +assert.strictEqual(clean.summary.findings, 0); +assert.strictEqual(clean.summary.formulaCellsReviewed, 1); + +const risky = assessSpreadsheetProvenance(riskySpreadsheet); +assert.strictEqual(risky.decision, "hold"); +for (const code of [ + "INVALID_SPREADSHEET_DIGEST", + "MACRO_ENABLED_DATASET", + "EXTERNAL_WORKBOOK_LINKS", + "HIDDEN_SHEET_WITH_DATA", + "VOLATILE_FORMULA", + "FORMULA_EXTERNAL_REFERENCE", + "STALE_OR_MISSING_CALCULATION_EVIDENCE", + "FORMULA_WITHOUT_PROVENANCE", + "MISSING_NORMALIZED_EXPORT", + "INVALID_EXPORT_DIGEST", +]) { + assert(risky.findings.some((finding) => finding.code === code), `missing ${code}`); +} + +const reviseOnly = assessSpreadsheetProvenance({ + ...cleanSpreadsheet, + sheets: cleanSpreadsheet.sheets.map((sheet) => + sheet.name === "model" + ? { + ...sheet, + cells: sheet.cells.map((cell) => ({ ...cell, recalculatedAt: "" })), + } + : sheet + ), +}); +assert.strictEqual(reviseOnly.decision, "revise"); +assert(reviseOnly.findings.some((finding) => finding.code === "STALE_OR_MISSING_CALCULATION_EVIDENCE")); + +assert.throws( + () => normalizeSpreadsheet({ ...cleanSpreadsheet, datasetId: "" }), + /datasetId must be a non-empty string/ +); + +console.log("spreadsheet formula provenance guard tests passed");