Output aggregate checksums by default, accept multiple file args

bede · bede · commit 071dfe601cf1 · 2026-03-03T01:45:28.000Z
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "seqsum"
-version = "0.3.0"
+version = "0.4.0"
 edition = "2024"
 description = "Robust sequence checksums for FASTA/FASTQ"
 license = "MIT"
diff --git a/README.md b/README.md
@@ -5,9 +5,9 @@
 > [!WARNING]  
 > Seqsum was rewritten in Rust in 0.3.0. The original Python version of seqsum, along with its usage instructions, is archived in the [`python`](https://github.com/bede/seqsum/tree/python) branch. It remains available on PyPI.
 
-Robust checksums for nucleotide sequences. Accepts input from either standard input or `fast[a|q][.gz|.zst]` files. Generates *individual* checksums for each sequence, plus an *aggregate* checksum for a collection. Warnings are shown for duplicate sequences and within-collection checksum collisions at the selected bit depth. Sequences are uppercased before hashing with [RapidHash](https://github.com/Nicoshev/rapidhash) (v3) and may be normalised (with `-n`) to use only `ACGTN-`. Read IDs and FASTQ base quality scores do not inform the checksum. Output is tab-delimited text to stdout.
+Robust checksums for nucleotide sequences. Accepts one or more `fast[a|q][.gz|.zst]` files or standard input. Generates an *aggregate* checksum for each input file by default, similar to `md5sum`/`sha256sum`. Warnings are shown for duplicate sequences and within-collection checksum collisions at the selected bit depth. Sequences are uppercased before hashing with [RapidHash](https://github.com/Nicoshev/rapidhash) (v3) and may be normalised (with `-n`) to use only `ACGTN-`. Read IDs and FASTQ base quality scores do not inform the checksum. Output is tab-delimited text to stdout.
 
-By default, seqsum outputs individual checksums and, when there is more than one sequence, an aggregate checksum. This can be modified with `--individual` (`-i`) or `--aggregate` (`-a`).
+By default, seqsum outputs one aggregate checksum per file. Use `--individual` (`-i`) for per-record checksums, or `--all` (`-a`) for both individual and aggregate checksums. These flags are mutually exclusive.
 
 ## Install
 
@@ -26,23 +26,29 @@ cargo test
 ## Command line usage
 
 ```bash
-# Fasta with one record
+# Default: aggregate checksum per file
 $ seqsum tests/data/MN908947.fasta
-33ba13564e0a63e3	MN908947.3
+33ba13564e0a63e3	tests/data/MN908947.fasta
 
-# Fasta with two records
-$ seqsum tests/data/MN908947-BA_2_86_1.fasta
-33ba13564e0a63e3	MN908947.3
-9fef3b61d54d8902	BA.2.86.1
-d3a94eb82357ece5	aggregate
+# Multiple files
+$ seqsum tests/data/MN908947.fasta tests/data/MN908947-BA_2_86_1.fasta
+33ba13564e0a63e3	tests/data/MN908947.fasta
+d3a94eb82357ece5	tests/data/MN908947-BA_2_86_1.fasta
 
-# Fasta with two records, only show aggregate checksum
-$ seqsum tests/data/MN908947-BA_2_86_1.fasta --aggregate
-d3a94eb82357ece5	aggregate
+# Stdin
+$ cat tests/data/MN908947.fasta | seqsum
+33ba13564e0a63e3	-
 
-# Fasta via stdin
-$ cat tests/data/MN908947.fasta | seqsum -
+# Individual per-record checksums
+$ seqsum -i tests/data/MN908947-BA_2_86_1.fasta
 33ba13564e0a63e3	MN908947.3
+9fef3b61d54d8902	BA.2.86.1
+
+# All: individual checksums + aggregate
+$ seqsum -a tests/data/MN908947-BA_2_86_1.fasta
+33ba13564e0a63e3	MN908947.3	tests/data/MN908947-BA_2_86_1.fasta
+9fef3b61d54d8902	BA.2.86.1	tests/data/MN908947-BA_2_86_1.fasta
+d3a94eb82357ece5	sum	tests/data/MN908947-BA_2_86_1.fasta
 
 ```
 
diff --git a/src/lib.rs b/src/lib.rs
@@ -167,7 +167,7 @@ where
     let checksum_collisions = unique_truncated_hashes.len() < first_name_by_hash.len();
 
     Ok(SeqsumResult {
-        aggregate: (record_count > 1).then_some(aggregate_hash),
+        aggregate: (record_count > 0).then_some(aggregate_hash),
         duplicate_sequences,
         duplicate_sequence_names,
         checksum_collisions,
diff --git a/src/main.rs b/src/main.rs
@@ -9,29 +9,29 @@ use seqsum::{DEFAULT_BITS, SeqsumConfig, format_hash, sum_nt};
 #[derive(Debug, Parser)]
 #[command(author, version, about, long_about = None)]
 struct Cli {
-    /// Path to FASTA/FASTQ input, or - for stdin
+    /// Path(s) to FASTA/FASTQ input, or - for stdin
     #[arg(default_value = "-")]
-    input: String,
+    input: Vec<String>,
 
-    /// Replace U with T, and non-ACGT- characters with N before hashing
-    #[arg(short = 'n', long)]
-    normalise: bool,
+    /// Output individual record checksums
+    #[arg(short = 'i', long, conflicts_with = "all")]
+    individual: bool,
 
-    /// Require IUPAC ambiguous DNA alphabet ABCDGHKMNRSTVWY-
-    #[arg(short = 's', long)]
-    strict: bool,
+    /// Output both individual record and aggregate checksums
+    #[arg(short = 'a', long)]
+    all: bool,
 
     /// Displayed hash length in bits (4..64, multiple of 4)
     #[arg(short = 'b', long, default_value_t = DEFAULT_BITS)]
     bits: u8,
 
-    /// Output only per-record checksums
-    #[arg(short = 'i', long)]
-    individual: bool,
+    /// Replace U with T, and non-ACGT- characters with N before hashing
+    #[arg(short = 'n', long)]
+    normalise: bool,
 
-    /// Output only the aggregate checksum
-    #[arg(short = 'a', long)]
-    aggregate: bool,
+    /// Require IUPAC ambiguous DNA alphabet ABCDGHKMNRSTVWY-
+    #[arg(short = 's', long)]
+    strict: bool,
 
     /// Suppress warning messages
     #[arg(short = 'q', long)]
@@ -56,55 +56,60 @@ fn main() -> Result<()> {
         .format(|buf, record| writeln!(buf, "[{}] {}", record.level(), record.args()))
         .init();
 
-    let config = SeqsumConfig {
-        input: cli.input.clone(),
-        normalise: cli.normalise,
-        strict: cli.strict,
-        bits: cli.bits,
-    };
-
-    let show_individual = cli.individual || !cli.aggregate;
-
     let stdout = std::io::stdout();
     let mut out = BufWriter::new(stdout.lock());
 
-    let result = sum_nt(&config, |id, hash| {
-        if show_individual {
-            writeln!(out, "{}\t{id}", format_hash(hash, cli.bits))?;
+    for input in &cli.input {
+        let config = SeqsumConfig {
+            input: input.clone(),
+            normalise: cli.normalise,
+            strict: cli.strict,
+            bits: cli.bits,
+        };
+
+        let filename = input.as_str();
+
+        let result = sum_nt(&config, |id, hash| {
+            if cli.individual {
+                writeln!(out, "{}\t{id}", format_hash(hash, cli.bits))?;
+            } else if cli.all {
+                writeln!(out, "{}\t{id}\t{filename}", format_hash(hash, cli.bits))?;
+            }
+            Ok(())
+        })?;
+
+        if cli.all {
+            let aggregate = result
+                .aggregate
+                .ok_or_else(|| anyhow!("aggregate checksum unavailable"))?;
+            writeln!(
+                out,
+                "{}\tsum\t{filename}",
+                format_hash(aggregate, cli.bits)
+            )?;
+        } else if !cli.individual {
+            let aggregate = result
+                .aggregate
+                .ok_or_else(|| anyhow!("aggregate checksum unavailable"))?;
+            writeln!(out, "{}\t{filename}", format_hash(aggregate, cli.bits))?;
         }
-        Ok(())
-    })?;
 
-    let show_aggregate = if cli.aggregate {
-        true
-    } else if cli.individual {
-        false
-    } else {
-        result.record_count > 1
-    };
-
-    if show_aggregate {
-        let aggregate = result
-            .aggregate
-            .ok_or_else(|| anyhow!("aggregate checksum unavailable"))?;
-        writeln!(out, "{}\taggregate", format_hash(aggregate, cli.bits))?;
-    }
-
-    out.flush()?;
-
-    if result.duplicate_sequences {
-        if cli.verbose {
-            info!("Found duplicate sequences:");
-            for name in &result.duplicate_sequence_names {
-                info!("  {name}");
+        if result.duplicate_sequences {
+            if cli.verbose {
+                info!("Found duplicate sequences:");
+                for name in &result.duplicate_sequence_names {
+                    info!("  {name}");
+                }
+            } else {
+                warn!("Found duplicate sequences");
             }
-        } else {
-            warn!("Found duplicate sequences");
+        }
+        if result.checksum_collisions {
+            warn!("Found checksum collisions, consider increasing --bits");
         }
     }
-    if result.checksum_collisions {
-        warn!("Found checksum collisions, consider increasing --bits");
-    }
+
+    out.flush()?;
 
     Ok(())
 }
diff --git a/tests/cli.rs b/tests/cli.rs