Skip to content

Commit 071dfe6

Browse files
committed
Output aggregate checksums by default, accept multiple file args
1 parent ffcb905 commit 071dfe6

6 files changed

Lines changed: 203 additions & 111 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "seqsum"
3-
version = "0.3.0"
3+
version = "0.4.0"
44
edition = "2024"
55
description = "Robust sequence checksums for FASTA/FASTQ"
66
license = "MIT"

README.md

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
> [!WARNING]
66
> Seqsum was rewritten in Rust in 0.3.0. The original Python version of seqsum, along with its usage instructions, is archived in the [`python`](https://github.com/bede/seqsum/tree/python) branch. The Python version remains available on PyPI.
77
8-
Robust checksums for nucleotide sequences. Accepts input from either standard input or `fast[a|q][.gz|.zst]` files. Generates *individual* checksums for each sequence, plus an *aggregate* checksum for a collection. Warnings are shown for duplicate sequences and within-collection checksum collisions at the selected bit depth. Sequences are uppercased before hashing with [RapidHash](https://github.com/Nicoshev/rapidhash) (v3) and may be normalised (with `-n`) to use only `ACGTN-`. Read IDs and FASTQ base quality scores do not inform the checksum. Output is tab-delimited text to stdout.
8+
Robust checksums for nucleotide sequences. Accepts one or more `fast[a|q][.gz|.zst]` files or standard input. Generates an *aggregate* checksum for each input file by default, similar to `md5sum`/`sha256sum`. Warnings are shown for duplicate sequences and within-collection checksum collisions at the selected bit depth. Sequences are uppercased before hashing with [RapidHash](https://github.com/Nicoshev/rapidhash) (v3) and may be normalised (with `-n`) to use only `ACGTN-`. Read IDs and FASTQ base quality scores do not inform the checksum. Output is tab-delimited text to stdout.
99

10-
By default, seqsum outputs individual checksums and, when there is more than one sequence, an aggregate checksum. This can be modified with `--individual` (`-i`) or `--aggregate` (`-a`).
10+
By default, seqsum outputs one aggregate checksum per file. Use `--individual` (`-i`) for per-record checksums, or `--all` (`-a`) for both individual and aggregate checksums. These flags are mutually exclusive.
1111

1212
## Install
1313

@@ -26,23 +26,29 @@ cargo test
2626
## Command line usage
2727

2828
```bash
29-
# Fasta with one record
29+
# Default: aggregate checksum per file
3030
$ seqsum tests/data/MN908947.fasta
31-
33ba13564e0a63e3 MN908947.3
31+
33ba13564e0a63e3 tests/data/MN908947.fasta
3232

33-
# Fasta with two records
34-
$ seqsum tests/data/MN908947-BA_2_86_1.fasta
35-
33ba13564e0a63e3 MN908947.3
36-
9fef3b61d54d8902 BA.2.86.1
37-
d3a94eb82357ece5 aggregate
33+
# Multiple files
34+
$ seqsum tests/data/MN908947.fasta tests/data/MN908947-BA_2_86_1.fasta
35+
33ba13564e0a63e3 tests/data/MN908947.fasta
36+
d3a94eb82357ece5 tests/data/MN908947-BA_2_86_1.fasta
3837

39-
# Fasta with two records, only show aggregate checksum
40-
$ seqsum tests/data/MN908947-BA_2_86_1.fasta --aggregate
41-
d3a94eb82357ece5 aggregate
38+
# Stdin
39+
$ cat tests/data/MN908947.fasta | seqsum
40+
33ba13564e0a63e3 -
4241

43-
# Fasta via stdin
44-
$ cat tests/data/MN908947.fasta | seqsum -
42+
# Individual per-record checksums
43+
$ seqsum -i tests/data/MN908947-BA_2_86_1.fasta
4544
33ba13564e0a63e3 MN908947.3
45+
9fef3b61d54d8902 BA.2.86.1
46+
47+
# All: individual checksums + aggregate
48+
$ seqsum -a tests/data/MN908947-BA_2_86_1.fasta
49+
33ba13564e0a63e3 MN908947.3 tests/data/MN908947-BA_2_86_1.fasta
50+
9fef3b61d54d8902 BA.2.86.1 tests/data/MN908947-BA_2_86_1.fasta
51+
d3a94eb82357ece5 sum tests/data/MN908947-BA_2_86_1.fasta
4652

4753
```
4854

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ where
167167
let checksum_collisions = unique_truncated_hashes.len() < first_name_by_hash.len();
168168

169169
Ok(SeqsumResult {
170-
aggregate: (record_count > 1).then_some(aggregate_hash),
170+
aggregate: (record_count > 0).then_some(aggregate_hash),
171171
duplicate_sequences,
172172
duplicate_sequence_names,
173173
checksum_collisions,

src/main.rs

Lines changed: 60 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -9,29 +9,29 @@ use seqsum::{DEFAULT_BITS, SeqsumConfig, format_hash, sum_nt};
99
#[derive(Debug, Parser)]
1010
#[command(author, version, about, long_about = None)]
1111
struct Cli {
12-
/// Path to FASTA/FASTQ input, or - for stdin
12+
/// Path(s) to FASTA/FASTQ input, or - for stdin
1313
#[arg(default_value = "-")]
14-
input: String,
14+
input: Vec<String>,
1515

16-
/// Replace U with T, and non-ACGT- characters with N before hashing
17-
#[arg(short = 'n', long)]
18-
normalise: bool,
16+
/// Output individual record checksums
17+
#[arg(short = 'i', long, conflicts_with = "all")]
18+
individual: bool,
1919

20-
/// Require IUPAC ambiguous DNA alphabet ABCDGHKMNRSTVWY-
21-
#[arg(short = 's', long)]
22-
strict: bool,
20+
/// Output both individual record and aggregate checksums
21+
#[arg(short = 'a', long)]
22+
all: bool,
2323

2424
/// Displayed hash length in bits (4..64, multiple of 4)
2525
#[arg(short = 'b', long, default_value_t = DEFAULT_BITS)]
2626
bits: u8,
2727

28-
/// Output only per-record checksums
29-
#[arg(short = 'i', long)]
30-
individual: bool,
28+
/// Replace U with T, and non-ACGT- characters with N before hashing
29+
#[arg(short = 'n', long)]
30+
normalise: bool,
3131

32-
/// Output only the aggregate checksum
33-
#[arg(short = 'a', long)]
34-
aggregate: bool,
32+
/// Require IUPAC ambiguous DNA alphabet ABCDGHKMNRSTVWY-
33+
#[arg(short = 's', long)]
34+
strict: bool,
3535

3636
/// Suppress warning messages
3737
#[arg(short = 'q', long)]
@@ -56,55 +56,60 @@ fn main() -> Result<()> {
5656
.format(|buf, record| writeln!(buf, "[{}] {}", record.level(), record.args()))
5757
.init();
5858

59-
let config = SeqsumConfig {
60-
input: cli.input.clone(),
61-
normalise: cli.normalise,
62-
strict: cli.strict,
63-
bits: cli.bits,
64-
};
65-
66-
let show_individual = cli.individual || !cli.aggregate;
67-
6859
let stdout = std::io::stdout();
6960
let mut out = BufWriter::new(stdout.lock());
7061

71-
let result = sum_nt(&config, |id, hash| {
72-
if show_individual {
73-
writeln!(out, "{}\t{id}", format_hash(hash, cli.bits))?;
62+
for input in &cli.input {
63+
let config = SeqsumConfig {
64+
input: input.clone(),
65+
normalise: cli.normalise,
66+
strict: cli.strict,
67+
bits: cli.bits,
68+
};
69+
70+
let filename = input.as_str();
71+
72+
let result = sum_nt(&config, |id, hash| {
73+
if cli.individual {
74+
writeln!(out, "{}\t{id}", format_hash(hash, cli.bits))?;
75+
} else if cli.all {
76+
writeln!(out, "{}\t{id}\t{filename}", format_hash(hash, cli.bits))?;
77+
}
78+
Ok(())
79+
})?;
80+
81+
if cli.all {
82+
let aggregate = result
83+
.aggregate
84+
.ok_or_else(|| anyhow!("aggregate checksum unavailable"))?;
85+
writeln!(
86+
out,
87+
"{}\tsum\t{filename}",
88+
format_hash(aggregate, cli.bits)
89+
)?;
90+
} else if !cli.individual {
91+
let aggregate = result
92+
.aggregate
93+
.ok_or_else(|| anyhow!("aggregate checksum unavailable"))?;
94+
writeln!(out, "{}\t{filename}", format_hash(aggregate, cli.bits))?;
7495
}
75-
Ok(())
76-
})?;
7796

78-
let show_aggregate = if cli.aggregate {
79-
true
80-
} else if cli.individual {
81-
false
82-
} else {
83-
result.record_count > 1
84-
};
85-
86-
if show_aggregate {
87-
let aggregate = result
88-
.aggregate
89-
.ok_or_else(|| anyhow!("aggregate checksum unavailable"))?;
90-
writeln!(out, "{}\taggregate", format_hash(aggregate, cli.bits))?;
91-
}
92-
93-
out.flush()?;
94-
95-
if result.duplicate_sequences {
96-
if cli.verbose {
97-
info!("Found duplicate sequences:");
98-
for name in &result.duplicate_sequence_names {
99-
info!(" {name}");
97+
if result.duplicate_sequences {
98+
if cli.verbose {
99+
info!("Found duplicate sequences:");
100+
for name in &result.duplicate_sequence_names {
101+
info!(" {name}");
102+
}
103+
} else {
104+
warn!("Found duplicate sequences");
100105
}
101-
} else {
102-
warn!("Found duplicate sequences");
106+
}
107+
if result.checksum_collisions {
108+
warn!("Found checksum collisions, consider increasing --bits");
103109
}
104110
}
105-
if result.checksum_collisions {
106-
warn!("Found checksum collisions, consider increasing --bits");
107-
}
111+
112+
out.flush()?;
108113

109114
Ok(())
110115
}

0 commit comments

Comments
 (0)