Skip to content

Commit e33cccf

Browse files
authored
Validation for missing input (#115)
* chore: if neither the sample nor the fastq filepath is provided, show an error
* chore: reduce the number of inputs required for rule all to simplify rule graph generation
* refactor: do not use `lambda wildcards:` when not necessary
1 parent 22e64cd commit e33cccf

2 files changed

Lines changed: 22 additions & 68 deletions

File tree

Snakefile

Lines changed: 19 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
import sys
22
from typing import Literal
33

4+
import pandas as pd
5+
from snakemake.io import Namedlist, Wildcards
6+
from snakemake.rules import RuleProxy
7+
48
from utils.parse import Config, SampleData, print_key_value_table
59

610
configfile: "config.yaml"
@@ -24,72 +28,21 @@ onstart:
2428
rule all:
2529
input:
2630
expand(f"{cfg.data_root}/{{tissue}}/{{tissue}}_config.yaml", tissue=set(data.tissues)),
27-
f"{cfg.genome.species_dir}/{cfg.species_name}_{cfg.genome.ensembl_release}_{cfg.genome.type}.fa",
28-
f"{cfg.genome.contaminants_dir}/.download_complete",
29-
f"{cfg.genome.species_dir}/star/Log.out",
30-
f"{cfg.genome.species_dir}/transcriptome.fa",
31-
32-
expand(f"{cfg.data_root}/{{tissue}}/layouts/{{tissue}}_{{tag}}_layout.txt", zip, tissue=data.tissues, tag=data.tags),
33-
expand(f"{cfg.data_root}/{{tissue}}/prepMethods/{{tissue}}_{{tag}}_prep_method.txt",zip,tissue=data.tissues,tag=data.tags),
34-
expand(f"{cfg.data_root}/{{tissue}}/align/{{tag}}/{{tissue}}_{{tag}}.bam",zip,tissue=data.tissues,tag=data.tags),
35-
expand(f"{cfg.como_root}/{{tissue}}/geneCounts/{{study}}/{{tissue}}_{{tag}}.tab",zip,tissue=data.tissues,study=data.studies,tag=data.tags),
3631
expand(f"{cfg.data_root}/{{tissue}}/multiqc/{cfg.sample_filepath.stem}/{cfg.sample_filepath.stem}_multiqc_report.html",tissue=set(data.tissues)),
37-
branch(
38-
cfg.perform.dump_fastq,
39-
then=[
40-
expand(f"{cfg.data_root}/{{tissue}}/raw/{{tissue}}_{{tag}}_{{end}}.fastq.gz",zip,tissue=data.tissues_paired,tag=data.tags_paired,end=data.ends_paired),
41-
expand(
42-
f"{cfg.data_root}/{{tissue}}/fastqc/raw/raw_{{tissue}}_{{tag}}_{{end}}_fastqc.zip",
43-
zip,
44-
tissue=data.tissues_paired,
45-
tag=data.tags_paired,
46-
end=data.ends_paired
47-
),
48-
],
49-
otherwise=[],
50-
),
51-
branch(
52-
cfg.perform.trim,
53-
then=[
54-
expand(f"{cfg.data_root}/{{tissue}}/trim/{{tissue}}_{{tag}}_{{end}}.fastq.gz",zip,tissue=data.tissues_paired,tag=data.tags_paired,end=data.ends_paired),
55-
expand(
56-
f"{cfg.data_root}/{{tissue}}/fastqc/trimmed/trimmed_{{tissue}}_{{tag}}_{{end}}_fastqc.zip",
57-
zip,
58-
tissue=data.tissues_paired,
59-
tag=data.tags_paired,
60-
end=data.ends_paired
61-
),
62-
],
63-
otherwise=[],
64-
),
65-
branch(
66-
cfg.perform.contaminant_screen,
67-
then=f"{cfg.genome.contaminants_dir}/fastq_screen.conf",
68-
otherwise=[],
69-
),
32+
expand(f"{cfg.como_root}/{{tissue}}/geneCounts/{{study}}/{{tissue}}_{{tag}}.tab",zip,tissue=data.tissues,study=data.studies,tag=data.tags),
7033
branch(
7134
cfg.perform.fragment_size,
72-
then=[
73-
expand(f"{cfg.data_root}/{{tissue}}/fragmentSizes/{{tissue}}_{{tag}}_fragment_size.txt", zip, tissue=data.tissues, tag=data.tags),
74-
expand(f"{cfg.como_root}/{{tissue}}/fragmentSizes/{{study}}/{{tissue}}_{{tag}}_fragment_size.txt",zip,tissue=data.tissues,study=data.studies,tag=data.tags)
75-
],
35+
then=[expand(f"{cfg.como_root}/{{tissue}}/fragmentSizes/{{study}}/{{tissue}}_{{tag}}_fragment_size.txt",zip,tissue=data.tissues,study=data.studies,tag=data.tags)],
7636
otherwise=[]
7737
),
7838
branch(
7939
cfg.perform.rnaseq_metrics,
80-
then=[
81-
expand(f"{cfg.data_root}/{{tissue}}/picard/rnaseq/{{tissue}}_{{tag}}_rnaseq.txt", zip, tissue=data.tissues, tag=data.tags),
82-
expand(f"{cfg.como_root}/{{tissue}}/strandedness/{{study}}/{{tissue}}_{{tag}}_strandedness.txt",zip,tissue=data.tissues,tag=data.tags,study=data.studies),
83-
],
40+
then=[expand(f"{cfg.como_root}/{{tissue}}/strandedness/{{study}}/{{tissue}}_{{tag}}_strandedness.txt",zip,tissue=data.tissues,tag=data.tags,study=data.studies)],
8441
otherwise=[],
8542
),
8643
branch(
8744
cfg.perform.insert_size,
88-
then=[
89-
expand(f"{cfg.data_root}/{{tissue}}/picard/insert/{{tissue}}_{{tag}}_insert_size.txt",zip,tissue=data.tissues,tag=data.tags),
90-
expand(f"{cfg.data_root}/{{tissue}}/picard/hist/{{tissue}}_{{tag}}_insert_size_histo.pdf",zip,tissue=data.tissues,tag=data.tags),
91-
expand(f"{cfg.como_root}/{{tissue}}/insertSizeMetrics/{{study}}/{{tissue}}_{{tag}}_insert_size.txt",zip,tissue=data.tissues,tag=data.tags,study=data.studies),
92-
],
45+
then=[expand(f"{cfg.como_root}/{{tissue}}/insertSizeMetrics/{{study}}/{{tissue}}_{{tag}}_insert_size.txt",zip,tissue=data.tissues,tag=data.tags,study=data.studies)],
9346
otherwise=[]
9447
)
9548

@@ -324,7 +277,7 @@ rule fastq_dump_paired:
324277
sra_cache="$tmpdir/sra_cache"
325278
fastq_cache="$tmpdir/fastq_cache"
326279
mkdir -p "$sra_cache" "$fastq_cache"
327-
280+
328281
prefetch --max-size u --progress --log-level info --force ALL --output-directory "$sra_cache" {params.srr} 1>{log} 2>&1
329282
330283
sra_temp="$sra_cache/{params.srr}.sra"
@@ -366,7 +319,7 @@ rule fastq_dump_single:
366319
mkdir -p "$sra_cache" "$fastq_cache"
367320
368321
prefetch --max-size u --progress --log-level info --force ALL --output-directory "$sra_cache" {params.srr} 1>>{log} 2>&1
369-
322+
370323
sra_file="$sra_cache/{params.srr}/{params.srr}.sra"
371324
fastq_file="$fastq_cache/{params.srr}.fastq"
372325
fasterq-dump --force --concatenate-reads --progress --threads {threads} --temp "$fastq_cache" --outdir "$fastq_cache" "$sra_file" 1>>{log} 2>&1
@@ -450,6 +403,8 @@ rule qc_raw_fastq_single:
450403
mv --verbose "$tmpdir/{wildcards.tissue}_{wildcards.tag}_S_fastqc.zip" "{output.s_zip}" 1>>{log} 2>&1
451404
mv --verbose "$tmpdir/{wildcards.tissue}_{wildcards.tag}_S_fastqc.html" "{output.s_html}" 1>>{log} 2>&1
452405
"""
406+
407+
453408
def trim_paired_input(wildcards) -> dict[Literal["r1"] | Literal["r2"], str | list[str]]:
454409
if cfg.perform.dump_fastq:
455410
return {"r1": rules.fastq_dump_paired.output.r1, "r2": rules.fastq_dump_paired.output.r2}
@@ -982,17 +937,15 @@ def multiqc_contamination_input(wildcards) -> list[str]:
982937
files += expand(rules.contaminant_screen_single.output.S, zip, tissue=tissues, tag=tags)
983938
return files
984939

985-
986940
rule multiqc:
987941
input:
988-
raw_fastq=lambda wildcards: [] if not cfg.perform.dump_fastq else expand(f"{cfg.data_root}/{{tissue}}/raw/{{tissue}}_{{tag}}_{{end}}.fastq.gz",zip,tissue=data.tissues_paired,tag=data.tags_paired,end=data.ends_paired),
989-
trimmed_fastq=lambda wildcards: [] if not cfg.perform.trim else expand(f"{cfg.data_root}/{{tissue}}/trim/{{tissue}}_{{tag}}_{{end}}.fastq.gz",zip,tissue=data.tissues_paired,tag=data.tags_paired,end=data.ends_paired),
990-
aligned_fastq=expand(rules.align.output.bam_file, zip, tissue=data.tissues, tag=data.tags),
991-
contaminantion=multiqc_contamination_input,
992-
insert_sizes=lambda wildcards: [] if not cfg.perform.insert_size else expand(rules.insert_size.output.txt,zip,tissue=data.tissues,tag=data.tags),
993-
rnaseq_metrics=lambda wildcards: [] if not cfg.perform.rnaseq_metrics else expand(rules.rnaseq_metrics.output.metrics, zip, tissue=data.tissues, tag=data.tags),
994-
fragment_sizes=lambda wildcards: [] if not cfg.perform.fragment_size else expand(rules.fragment_size.output, zip, tissue=data.tissues, tag=data.tags),
995-
salmon_quant=expand(rules.salmon_quantification.output.quant, zip, tissue=data.tissues, tag=data.tags),
942+
raw_qc=expand(f"{cfg.data_root}/{{tissue}}/fastqc/raw/raw_{{tissue}}_{{tag}}_{{end}}_fastqc.zip",zip,tissue=data.tissues_paired,tag=data.tags_paired,end=data.ends_paired) if cfg.perform.dump_fastq else [],
943+
trim_qc=expand(f"{cfg.data_root}/{{tissue}}/fastqc/trimmed/trimmed_{{tissue}}_{{tag}}_{{end}}_fastqc.zip",zip,tissue=data.tissues_paired,tag=data.tags_paired,end=data.ends_paired) if cfg.perform.trim else [],
944+
contaminantion=expand(f"{cfg.data_root}/{{tissue}}/fq_screen/{{tissue}}_{{tag}}_{{end}}_screen.txt",zip,tissue=data.tissues_paired,tag=data.tags_paired,end=data.ends_paired) if cfg.perform.contaminant_screen else [],
945+
insert_sizes=expand(rules.insert_size.output.txt,zip,tissue=data.tissues,tag=data.tags) if cfg.perform.insert_size else [],
946+
rnaseq_metrics=expand(rules.rnaseq_metrics.output.metrics,zip,tissue=data.tissues,tag=data.tags) if cfg.perform.rnaseq_metrics else [],
947+
fragment_sizes=expand(rules.fragment_size.output,zip,tissue=data.tissues,tag=data.tags) if cfg.perform.fragment_size else [],
948+
salmon_quant=expand(rules.salmon_quantification.output.meta,zip,tissue=data.tissues,tag=data.tags),
996949
output:
997950
output_file=f"{cfg.data_root}/{{tissue}}/multiqc/{cfg.sample_filepath.stem}/{cfg.sample_filepath.stem}_multiqc_report.html",
998951
params:

utils/parse.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,11 @@ class Config:
119119
def _validate_config(config: dict[str, Any]): # noqa: C901
120120
if config["MASTER_CONTROL"] == "":
121121
raise ValueError("MASTER_CONTROL cannot be an empty string.")
122+
if config["MASTER_CONTROL"] == config["LOCAL_FASTQ_FILES"] == "":
123+
raise ValueError("Either MASTER_CONTROL or LOCAL_FASTQ_FILES must be provided.")
122124
if not Path(config["MASTER_CONTROL"]).exists():
123125
raise FileNotFoundError(f"MASTER_CONTROL path does not exist: {config['MASTER_CONTROL']}")
124-
126+
125127
if not str(config["BENCHMARK_TIMES"]).isdigit() or int(config["BENCHMARK_TIMES"]) < 0:
126128
raise ValueError("BENCHMARK_TIMES must be a non-negative integer.")
127129

@@ -162,7 +164,6 @@ def create(cls, config: dict[str, Any]) -> "Config":
162164
taxon_id: int = int(config["GENOME"]["TAXONOMY_ID"])
163165
species_name: str = species_from_taxon(taxon_id=taxon_id)
164166

165-
166167
return cls(
167168
sample_filepath=Path(config["MASTER_CONTROL"]),
168169
root=root,

0 commit comments

Comments
 (0)