Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions spreadsheet-formula-provenance-guard/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Spreadsheet Formula Provenance Guard

Self-contained SCIBASE Scientific/Engineering Data & Code Hosting slice for issue #14. The guard checks whether uploaded spreadsheet datasets are reproducible and safe to publish before metadata-aware previews, normalized exports, or DOI-ready research packets are generated.

## Why this slice is distinct

Existing #14 submissions cover broad FAIR manifests, artifact package integrity, preview cache/version drift, raw-instrument previews, notebook previews, retention/tombstones, model-card lineage, license metadata, sensitive redaction, schema evolution, data dictionaries, persistent identifiers, SBOM/advisory checks, upload checkpoints, replica consistency, column sensitivity, malware/archive quarantine, executable sandbox egress policy, and supplementary media accessibility previews. This module focuses only on spreadsheet formula provenance: formula cells, volatile functions, stale calculated values, hidden sheets, external workbook links, macro-enabled files, normalized export checksums, and reviewer-ready transformation evidence.

## Run

```bash
npm test
npm run demo
npm run demo:video
```

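Demo artifacts are written to `reports/`: `npm run demo` produces a JSON, Markdown, and SVG report for each sample spreadsheet, and `npm run demo:video` produces `demo.gif` and `demo.mp4`. The video step runs `demo_video.py`, which requires Python with Pillow, NumPy, and imageio (with a backend that can encode `libx264`).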
61 changes: 61 additions & 0 deletions spreadsheet-formula-provenance-guard/demo.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
const fs = require("fs");
const path = require("path");

const { assessSpreadsheetProvenance } = require("./index");
const { cleanSpreadsheet, riskySpreadsheet } = require("./sample-data");

// Directory next to this script that receives every generated demo report.
const reportsDir = path.join(__dirname, "reports");
// Create it up front; with `recursive: true` an already-existing directory is not an error.
fs.mkdirSync(reportsDir, { recursive: true });

/**
 * Renders an assessment report as a Markdown document.
 *
 * @param {string} name - Scenario label printed on the "Scenario:" line.
 * @param {object} report - Assessment result; reads `datasetId`, `filename`, `decision`,
 *   `summary`, `findings`, and `releaseCriteria`.
 * @returns {string} Markdown text ending with a single trailing newline.
 */
function markdownReport(name, report) {
  // Placeholder bullet keeps the "Findings" section non-empty when nothing was reported.
  let findings = "- No spreadsheet provenance findings.";
  if (report.findings.length) {
    const bullets = [];
    for (const item of report.findings) {
      bullets.push(`- ${item.severity.toUpperCase()} ${item.code}: ${item.message}`);
    }
    findings = bullets.join("\n");
  }

  const { summary } = report;
  const lines = [
    "# Spreadsheet Formula Provenance Guard",
    "",
    `Scenario: ${name}`,
    "",
    `Dataset: ${report.datasetId}`,
    `File: ${report.filename}`,
    `Decision: ${report.decision.toUpperCase()}`,
    "",
    `Reviewed ${summary.sheetsReviewed} sheets, ${summary.formulaCellsReviewed} formula cells, and ${summary.exportsReviewed} exports.`,
    "",
    "## Findings",
    "",
    findings,
    "",
    "## Release Criteria",
    "",
    report.releaseCriteria.map((item) => `- ${item}`).join("\n"),
    // Empty final entry yields the trailing newline after the last criterion.
    "",
  ];
  return lines.join("\n");
}

/**
 * Renders a compact SVG status card for an assessment report.
 *
 * Every interpolated value is XML-escaped: dataset ids and filenames come from uploaded
 * metadata, and a raw `&` or `<` would otherwise produce a malformed (or injectable) SVG.
 *
 * @param {object} report - Assessment result; reads `datasetId`, `filename`, `decision`,
 *   and `summary.{formulaCellsReviewed, high, medium, low}`.
 * @returns {string} A standalone `<svg>` document.
 */
function svgReport(report) {
  // `&` must be replaced first so the entities produced below are not escaped again.
  const escapeXml = (value) =>
    String(value)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;")
      .replace(/'/g, "&apos;");

  // Badge colour: red for "hold", orange for "revise", green for anything else.
  const color = report.decision === "hold" ? "#b91c1c" : report.decision === "revise" ? "#c2410c" : "#15803d";
  const datasetId = escapeXml(report.datasetId);
  const filename = escapeXml(report.filename);
  const decision = escapeXml(report.decision.toUpperCase());
  const { summary } = report;

  return `<svg xmlns="http://www.w3.org/2000/svg" width="920" height="430" viewBox="0 0 920 430">
<rect width="920" height="430" fill="#10201b"/>
<text x="42" y="66" fill="#f8fafc" font-family="Arial" font-size="32">Spreadsheet Formula Provenance Guard</text>
<text x="42" y="112" fill="#cbd5e1" font-family="Arial" font-size="20">${datasetId} / ${filename}</text>
<rect x="42" y="150" width="210" height="80" rx="8" fill="${color}"/>
<text x="68" y="201" fill="#fff" font-family="Arial" font-size="30">${decision}</text>
<text x="42" y="280" fill="#e5e7eb" font-family="Arial" font-size="22">Formula cells: ${escapeXml(summary.formulaCellsReviewed)}</text>
<text x="42" y="320" fill="#fecaca" font-family="Arial" font-size="20">High: ${escapeXml(summary.high)}</text>
<text x="172" y="320" fill="#fed7aa" font-family="Arial" font-size="20">Medium: ${escapeXml(summary.medium)}</text>
<text x="342" y="320" fill="#bfdbfe" font-family="Arial" font-size="20">Low: ${escapeXml(summary.low)}</text>
<text x="42" y="374" fill="#94a3b8" font-family="Arial" font-size="18">Synthetic spreadsheet packets only. No private research data or external services.</text>
</svg>`;
}

// Assess each bundled sample workbook and write its JSON, Markdown, and SVG reports
// into the reports directory, logging a one-line summary per scenario.
const scenarios = new Map([
  ["clean-spreadsheet", cleanSpreadsheet],
  ["risky-spreadsheet", riskySpreadsheet],
]);

scenarios.forEach((spreadsheet, name) => {
  const report = assessSpreadsheetProvenance(spreadsheet);
  const target = (extension) => path.join(reportsDir, `${name}.${extension}`);

  fs.writeFileSync(target("json"), JSON.stringify(report, null, 2));
  fs.writeFileSync(target("md"), markdownReport(name, report));
  fs.writeFileSync(target("svg"), svgReport(report));
  console.log(`${name}: ${report.decision} (${report.summary.findings} findings)`);
});
46 changes: 46 additions & 0 deletions spreadsheet-formula-provenance-guard/demo_video.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from pathlib import Path

import imageio.v3 as iio
import numpy as np
from PIL import Image, ImageDraw, ImageFont


# Directory that contains this script; output paths are resolved relative to it.
ROOT = Path(__file__).resolve().parent
# Output directory for the generated demo artifacts.
REPORTS = ROOT / "reports"
# Create the output directory if needed; an existing directory is not an error.
REPORTS.mkdir(exist_ok=True)


def font(size):
    """Return a font for drawing slide text at the requested size.

    Tries the TrueType files "arial.ttf" and then "segoeui.ttf". If neither can be
    opened, falls back to Pillow's default font, which is loaded without the
    requested size.
    """
    candidates = ("arial.ttf", "segoeui.ttf")
    for candidate in candidates:
        try:
            loaded = ImageFont.truetype(candidate, size)
        except OSError:
            # Font file not available here; try the next candidate.
            continue
        return loaded
    return ImageFont.load_default()


# (title, subtitle) pairs; one slide is rendered per entry, in order.
slides = [
    ("Spreadsheet Formula Guard", "Scientific/Engineering Data & Code Hosting #14"),
    ("Checks", "formula cells, volatile functions, stale cached values"),
    ("Checks", "external workbook links, hidden sheets, macro-enabled files"),
    ("Decision", "hold publication until spreadsheet datasets are reproducible"),
]

# Resolve each font once rather than re-opening the font file for every slide.
title_font = font(40)
subtitle_font = font(25)
banner_font = font(22)
footer_font = font(20)

frames = []
for index, (title, subtitle) in enumerate(slides, start=1):
    image = Image.new("RGB", (960, 544), "#10201b")
    draw = ImageDraw.Draw(image)
    draw.rectangle((44, 52, 916, 492), outline="#34d399", width=3)
    draw.text((80, 124), title, fill="#f8fafc", font=title_font)
    draw.text((80, 206), subtitle, fill="#d1fae5", font=subtitle_font)
    draw.rectangle((80, 326, 818, 382), fill="#065f46")
    draw.text((104, 342), "spreadsheet previews must be reproducible outside Excel", fill="#ecfdf5", font=banner_font)
    # The total is derived from the slide list so the counter stays correct when slides change.
    draw.text((80, 438), f"Slide {index}/{len(slides)} - synthetic reviewer artifact", fill="#cbd5e1", font=footer_font)
    # Hold every slide for 14 identical frames.
    frames.extend([image] * 14)

gif_path = REPORTS / "demo.gif"
mp4_path = REPORTS / "demo.mp4"
# Write the same frame sequence twice: as a looping GIF and as an H.264 MP4.
frames[0].save(gif_path, save_all=True, append_images=frames[1:], duration=120, loop=0)
iio.imwrite(mp4_path, [np.asarray(frame) for frame in frames], fps=8, codec="libx264")
print(f"wrote {gif_path}")
print(f"wrote {mp4_path}")
246 changes: 246 additions & 0 deletions spreadsheet-formula-provenance-guard/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
// Severity labels attached to findings. The assessment decision is derived from the
// high and medium counts; low findings are only tallied in the summary.
const HIGH = "high";
const MEDIUM = "medium";
const LOW = "low";

// Upper-case spreadsheet function names treated as volatile; a formula that calls any of
// them is reported as VOLATILE_FORMULA.
// NOTE(review): other functions Excel documents as volatile (e.g. CELL, INFO) are not
// listed here — confirm whether that omission is intentional.
const VOLATILE_FUNCTIONS = ["NOW", "TODAY", "RAND", "RANDBETWEEN", "OFFSET", "INDIRECT"];

/**
 * Validates that a value is a string with non-whitespace content.
 *
 * @param {*} value - Candidate value.
 * @param {string} field - Field label used in the error message.
 * @returns {string} The value with surrounding whitespace removed.
 * @throws {TypeError} If the value is not a string or is blank after trimming.
 */
function requiredString(value, field) {
  // Non-strings collapse to "" so a single emptiness check covers both failure modes.
  const text = typeof value === "string" ? value.trim() : "";
  if (text.length === 0) {
    throw new TypeError(`${field} must be a non-empty string`);
  }
  return text;
}

/**
 * Validates that a value is an array and passes it through unchanged.
 *
 * @param {*} value - Candidate value.
 * @param {string} field - Field label used in the error message.
 * @returns {Array} The same array instance that was passed in.
 * @throws {TypeError} If the value is not an array.
 */
function array(value, field) {
  if (Array.isArray(value)) {
    return value;
  }
  throw new TypeError(`${field} must be an array`);
}

/**
 * Converts every entry to a string and drops duplicates, keeping first-seen order.
 *
 * @param {Array} values - Entries to stringify and deduplicate.
 * @returns {string[]} Distinct string forms of the entries.
 */
function unique(values) {
  const seen = new Set();
  for (const text of values.map(String)) {
    seen.add(text);
  }
  return Array.from(seen);
}

/**
 * Reports whether a value is a SHA-256 digest written as 64 lower-case hex characters.
 *
 * @param {*} value - Candidate digest.
 * @returns {boolean} True only for a string of exactly 64 characters from `a-f0-9`.
 */
function isSha256(value) {
  if (typeof value !== "string") {
    return false;
  }
  return /^[a-f0-9]{64}$/.test(value);
}

/**
 * Builds the canonical record for one raw cell.
 *
 * @param {object} raw - Raw cell; `address` is required, everything else is optional.
 * @param {string} sheetName - Name of the owning sheet, used in validation messages.
 * @param {number} index - Position of the cell in the sheet, used in validation messages.
 * @returns {{address: string, formula: string, cachedValue: *, recalculatedAt: string, sourceRefs: string[]}}
 * @throws {TypeError} If `address` is not a non-empty string.
 */
function normalizeCell(raw, sheetName, index) {
  // Start from the "absent" defaults and overwrite only the fields the raw cell supplies.
  const cell = {
    address: requiredString(raw.address, `${sheetName}.cells[${index}].address`),
    formula: "",
    cachedValue: null,
    recalculatedAt: "",
    sourceRefs: [],
  };
  if (raw.formula) {
    cell.formula = String(raw.formula);
  }
  // Only `undefined` counts as missing; falsy cached values such as 0 or "" are kept.
  if (raw.cachedValue !== undefined) {
    cell.cachedValue = raw.cachedValue;
  }
  if (raw.recalculatedAt) {
    cell.recalculatedAt = String(raw.recalculatedAt);
  }
  cell.sourceRefs = unique(raw.sourceRefs || []);
  return cell;
}

/**
 * Builds the canonical record for one raw sheet, including its normalized cells.
 *
 * @param {object} raw - Raw sheet; `name` is required, `hidden`, `purpose`, `cells` optional.
 * @param {number} index - Position of the sheet in the workbook, used in validation messages.
 * @returns {{name: string, hidden: boolean, purpose: string, cells: object[]}}
 * @throws {TypeError} If `name` is blank, `cells` is not an array, or a cell is invalid.
 */
function normalizeSheet(raw, index) {
  const name = requiredString(raw.name, `sheets[${index}].name`);
  const rawCells = array(raw.cells || [], `${name}.cells`);
  const cells = rawCells.map((cell, cellIndex) => normalizeCell(cell, name, cellIndex));

  return {
    name,
    hidden: Boolean(raw.hidden),
    // A missing or falsy purpose becomes the empty string.
    purpose: String(raw.purpose || ""),
    cells,
  };
}

/**
 * Validates a raw spreadsheet description and converts it to canonical form.
 *
 * Digests are lower-cased, external links are stringified and deduplicated, and the
 * optional collections (`externalLinks`, `exports`, `sheets`) default to empty.
 *
 * @param {object} raw - Raw spreadsheet description.
 * @returns {object} Normalized spreadsheet with `datasetId`, `filename`, `uploadedAt`,
 *   `sha256`, `macroEnabled`, `externalLinks`, `exports`, and `sheets`.
 * @throws {TypeError} If a required string is blank or a collection is not an array.
 */
function normalizeSpreadsheet(raw) {
  const datasetId = requiredString(raw.datasetId, "datasetId");
  const filename = requiredString(raw.filename, "filename");
  const uploadedAt = requiredString(raw.uploadedAt, "uploadedAt");
  const sha256 = requiredString(raw.sha256, "sha256").toLowerCase();
  const macroEnabled = Boolean(raw.macroEnabled);
  const externalLinks = unique(raw.externalLinks || []);

  const normalizeExport = (item, index) => {
    const prefix = `exports[${index}]`;
    return {
      format: requiredString(item.format, `${prefix}.format`),
      path: requiredString(item.path, `${prefix}.path`),
      sha256: requiredString(item.sha256, `${prefix}.sha256`).toLowerCase(),
    };
  };
  const normalizedExports = array(raw.exports || [], "exports").map(normalizeExport);
  const sheets = array(raw.sheets || [], "sheets").map((sheet, index) => normalizeSheet(sheet, index));

  return {
    datasetId,
    filename,
    uploadedAt,
    sha256,
    macroEnabled,
    externalLinks,
    exports: normalizedExports,
    sheets,
  };
}

/**
 * Creates a finding record.
 *
 * @param {string} code - Finding identifier.
 * @param {string} severity - Severity label.
 * @param {string} sourceId - What the finding refers to (file, sheet, cell, or export).
 * @param {string} message - Description of the problem.
 * @param {string} remediation - Suggested fix.
 * @returns {{code: string, severity: string, sourceId: string, message: string, remediation: string}}
 */
function finding(code, severity, sourceId, message, remediation) {
  const record = {};
  record.code = code;
  record.severity = severity;
  record.sourceId = sourceId;
  record.message = message;
  record.remediation = remediation;
  return record;
}

/**
 * Lists the distinct function names called in a spreadsheet formula.
 *
 * Names are upper-cased and returned in first-seen order. Text inside double-quoted
 * string literals is ignored, so a label such as "NOW(" is not mistaken for a call.
 *
 * @param {*} formula - Formula text; non-strings are converted with `String`.
 * @returns {string[]} Upper-case function names, without duplicates.
 */
function formulaFunctions(formula) {
  const upper = String(formula).toUpperCase();
  // Blank out string literals first; inside a literal, "" is an escaped quote.
  const withoutLiterals = upper.replace(/"(?:[^"]|"")*"/g, '""');
  // A call is an identifier immediately followed (optionally after spaces) by "(".
  const calls = withoutLiterals.match(/[A-Z][A-Z0-9_.]*\s*\(/g) || [];
  const names = calls.map((call) => call.replace(/\s*\($/, ""));
  return [...new Set(names)];
}

/**
 * Reports whether a formula contains a bracketed reference (`[...]`) or an
 * `http://` / `https://` URL.
 *
 * @param {*} formula - Formula text; non-strings are converted with `String`.
 * @returns {boolean} True when either pattern occurs anywhere in the formula.
 */
function hasExternalReference(formula) {
  const text = String(formula);
  const firstMatch = text.search(/\[[^\]]+\]|\bhttps?:\/\//i);
  return firstMatch !== -1;
}

/** Findings about the uploaded workbook file itself: digest, macros, external links. */
function workbookFindings(spreadsheet) {
  const { filename } = spreadsheet;
  const results = [];

  if (!isSha256(spreadsheet.sha256)) {
    results.push(
      finding(
        "INVALID_SPREADSHEET_DIGEST",
        HIGH,
        filename,
        `${filename} does not have a valid SHA-256 upload digest.`,
        "Record the original spreadsheet hash before accepting it as a citable dataset artifact."
      )
    );
  }

  if (spreadsheet.macroEnabled) {
    results.push(
      finding(
        "MACRO_ENABLED_DATASET",
        HIGH,
        filename,
        `${filename} is macro-enabled and cannot be treated as inert tabular data.`,
        "Quarantine macro-enabled files or publish a macro-free normalized export with reviewed transformation notes."
      )
    );
  }

  if (spreadsheet.externalLinks.length > 0) {
    results.push(
      finding(
        "EXTERNAL_WORKBOOK_LINKS",
        HIGH,
        filename,
        `${filename} depends on external workbook links: ${spreadsheet.externalLinks.join(", ")}.`,
        "Bundle source workbooks or replace formulas with reproducible local references before publication."
      )
    );
  }

  return results;
}

/** One finding per hidden sheet that still contains cells. */
function hiddenSheetFindings(sheets) {
  return sheets
    .filter((sheet) => sheet.hidden && sheet.cells.length > 0)
    .map((sheet) =>
      finding(
        "HIDDEN_SHEET_WITH_DATA",
        MEDIUM,
        sheet.name,
        `${sheet.name} is hidden but contains data or formulas.`,
        "Expose the sheet, document its purpose, or exclude it from the release with an audit note."
      )
    );
}

/** Findings for one formula cell: volatility, external references, missing evidence. */
function formulaCellFindings(cell) {
  const results = [];

  const volatile = formulaFunctions(cell.formula).filter((name) => VOLATILE_FUNCTIONS.includes(name));
  if (volatile.length > 0) {
    results.push(
      finding(
        "VOLATILE_FORMULA",
        HIGH,
        cell.sourceId,
        `${cell.sourceId} uses volatile functions: ${volatile.join(", ")}.`,
        "Replace volatile formulas with fixed values or a scripted, versioned transformation."
      )
    );
  }

  if (hasExternalReference(cell.formula)) {
    results.push(
      finding(
        "FORMULA_EXTERNAL_REFERENCE",
        HIGH,
        cell.sourceId,
        `${cell.sourceId} references an external workbook or URL.`,
        "Bundle the dependency and record its checksum, or normalize the formula into a local reproducible dataset."
      )
    );
  }

  // Only the presence of the evidence is checked here, not how recent it is.
  if (cell.cachedValue === null || cell.recalculatedAt === "") {
    results.push(
      finding(
        "STALE_OR_MISSING_CALCULATION_EVIDENCE",
        MEDIUM,
        cell.sourceId,
        `${cell.sourceId} lacks cached value or recalculation timestamp evidence.`,
        "Recalculate the workbook in a controlled environment and record cached values before preview/export."
      )
    );
  }

  if (cell.sourceRefs.length === 0) {
    results.push(
      finding(
        "FORMULA_WITHOUT_PROVENANCE",
        MEDIUM,
        cell.sourceId,
        `${cell.sourceId} has no provenance references for its inputs.`,
        "Attach source ranges, instruments, upstream datasets, or transformation tickets for formula-derived values."
      )
    );
  }

  return results;
}

/** Findings about normalized exports: at least one CSV/parquet, and valid digests. */
function exportFindings(normalizedExports) {
  const results = [];

  // Compare formats case-insensitively so "CSV" or "Parquet" count as normalized exports.
  const formats = new Set(normalizedExports.map((item) => item.format.toLowerCase()));
  if (!formats.has("csv") && !formats.has("parquet")) {
    results.push(
      finding(
        "MISSING_NORMALIZED_EXPORT",
        HIGH,
        "exports",
        "Spreadsheet dataset has no CSV or parquet normalized export.",
        "Publish a normalized tabular export so reviewers can diff and re-use the dataset outside spreadsheet software."
      )
    );
  }

  for (const exported of normalizedExports) {
    if (!isSha256(exported.sha256)) {
      results.push(
        finding(
          "INVALID_EXPORT_DIGEST",
          HIGH,
          exported.path,
          `${exported.path} does not have a valid SHA-256 digest.`,
          "Record a digest for every normalized export before publication."
        )
      );
    }
  }

  return results;
}

/** Counts the findings that carry the given severity label. */
function countBySeverity(findings, severity) {
  return findings.filter((item) => item.severity === severity).length;
}

/**
 * Assesses whether a spreadsheet dataset is reproducible and safe to publish.
 *
 * The input is normalized first, then checked at four levels: the workbook file, hidden
 * sheets, each formula cell, and the normalized exports. The decision is "hold" when any
 * high-severity finding exists, "revise" when only medium-severity findings exist, and
 * "release" otherwise.
 *
 * @param {object} rawSpreadsheet - Raw spreadsheet description accepted by `normalizeSpreadsheet`.
 * @returns {{datasetId: string, filename: string, decision: string, summary: object,
 *   findings: object[], releaseCriteria: string[]}} The assessment report.
 * @throws {TypeError} If normalization rejects the input (blank required string or a
 *   collection that is not an array).
 */
function assessSpreadsheetProvenance(rawSpreadsheet) {
  const spreadsheet = normalizeSpreadsheet(rawSpreadsheet);

  // Flatten every cell that has a formula and tag it with a "Sheet!Address" identifier.
  const formulaCells = spreadsheet.sheets.flatMap((sheet) =>
    sheet.cells
      .filter((cell) => cell.formula)
      .map((cell) => ({ ...cell, sheet: sheet.name, sourceId: `${sheet.name}!${cell.address}` }))
  );

  // Reports list findings in exactly this order: workbook, sheets, formula cells, exports.
  const findings = [
    ...workbookFindings(spreadsheet),
    ...hiddenSheetFindings(spreadsheet.sheets),
    ...formulaCells.flatMap((cell) => formulaCellFindings(cell)),
    ...exportFindings(spreadsheet.exports),
  ];

  const high = countBySeverity(findings, HIGH);
  const medium = countBySeverity(findings, MEDIUM);
  const low = countBySeverity(findings, LOW);

  let decision = "release";
  if (high > 0) {
    decision = "hold";
  } else if (medium > 0) {
    decision = "revise";
  }

  return {
    datasetId: spreadsheet.datasetId,
    filename: spreadsheet.filename,
    decision,
    summary: {
      sheetsReviewed: spreadsheet.sheets.length,
      formulaCellsReviewed: formulaCells.length,
      exportsReviewed: spreadsheet.exports.length,
      findings: findings.length,
      high,
      medium,
      low,
    },
    findings,
    releaseCriteria: [
      "Original spreadsheet and normalized exports have stable SHA-256 digests.",
      "Formula-derived cells avoid volatile functions and external workbook references.",
      "Formula cells carry cached value, recalculation timestamp, and provenance evidence.",
      "Hidden sheets, macros, and external links are disclosed or blocked before publication.",
      "At least one normalized CSV or parquet export is available for reproducible reuse.",
    ],
  };
}

// Public API: the assessment entry point and the input normalizer it applies first.
module.exports = {
assessSpreadsheetProvenance,
normalizeSpreadsheet,
};
13 changes: 13 additions & 0 deletions spreadsheet-formula-provenance-guard/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"name": "spreadsheet-formula-provenance-guard",
"version": "1.0.0",
"description": "Spreadsheet formula provenance guard for SCIBASE scientific data hosting",
"main": "index.js",
"type": "commonjs",
"scripts": {
"test": "node test.js",
"demo": "node demo.js",
"demo:video": "python demo_video.py"
},
"license": "MIT"
}
Loading