11import sys
22from typing import Literal
33
4+ import pandas as pd
5+ from snakemake .io import Namedlist , Wildcards
6+ from snakemake .rules import RuleProxy
7+
48from utils .parse import Config , SampleData , print_key_value_table
59
610configfile : "config.yaml"
@@ -24,72 +28,21 @@ onstart:
# NOTE(review): this span looks like a rendered diff hunk of `rule all`, not plain Snakemake
# source. Each line starts with fused old/new line numbers; lines tagged `-` are presumably the
# removed side of the change and lines tagged `+` the added side — confirm before reusing it.
#
# `rule all` only declares `input:` targets in the lines shown here:
#   * one `{tissue}_config.yaml` per unique tissue (`set(data.tissues)`) under cfg.data_root;
#   * one MultiQC HTML report per unique tissue, named after cfg.sample_filepath.stem;
#   * geneCounts `.tab` files under cfg.como_root, expanded with `zip` over
#     data.tissues / data.studies / data.tags;
#   * optional targets wrapped in branch(...), each keyed on a cfg.perform.* flag
#     (fragment_size, rnaseq_metrics, insert_size on the untagged/`+` lines), with
#     `otherwise = []` so nothing is requested when the flag is off.
# On the `+` side every optional target lives under cfg.como_root; the cfg.data_root
# intermediates (layouts, BAMs, raw/trimmed FASTQ, picard outputs) and the genome artifacts
# appear only on `-` lines.
2428rule all :
2529 input :
2630 expand (f"{ cfg .data_root } /{{tissue}}/{{tissue}}_config.yaml" , tissue = set (data .tissues )),
27- f"{ cfg .genome .species_dir } /{ cfg .species_name } _{ cfg .genome .ensembl_release } _{ cfg .genome .type } .fa" ,
28- f"{ cfg .genome .contaminants_dir } /.download_complete" ,
29- f"{ cfg .genome .species_dir } /star/Log.out" ,
30- f"{ cfg .genome .species_dir } /transcriptome.fa" ,
31-
32- expand (f"{ cfg .data_root } /{{tissue}}/layouts/{{tissue}}_{{tag}}_layout.txt" , zip , tissue = data .tissues , tag = data .tags ),
33- expand (f"{ cfg .data_root } /{{tissue}}/prepMethods/{{tissue}}_{{tag}}_prep_method.txt" ,zip ,tissue = data .tissues ,tag = data .tags ),
34- expand (f"{ cfg .data_root } /{{tissue}}/align/{{tag}}/{{tissue}}_{{tag}}.bam" ,zip ,tissue = data .tissues ,tag = data .tags ),
35- expand (f"{ cfg .como_root } /{{tissue}}/geneCounts/{{study}}/{{tissue}}_{{tag}}.tab" ,zip ,tissue = data .tissues ,study = data .studies ,tag = data .tags ),
3631 expand (f"{ cfg .data_root } /{{tissue}}/multiqc/{ cfg .sample_filepath .stem } /{ cfg .sample_filepath .stem } _multiqc_report.html" ,tissue = set (data .tissues )),
37- branch (
38- cfg .perform .dump_fastq ,
39- then = [
40- expand (f"{ cfg .data_root } /{{tissue}}/raw/{{tissue}}_{{tag}}_{{end}}.fastq.gz" ,zip ,tissue = data .tissues_paired ,tag = data .tags_paired ,end = data .ends_paired ),
41- expand (
42- f"{ cfg .data_root } /{{tissue}}/fastqc/raw/raw_{{tissue}}_{{tag}}_{{end}}_fastqc.zip" ,
43- zip ,
44- tissue = data .tissues_paired ,
45- tag = data .tags_paired ,
46- end = data .ends_paired
47- ),
48- ],
49- otherwise = [],
50- ),
51- branch (
52- cfg .perform .trim ,
53- then = [
54- expand (f"{ cfg .data_root } /{{tissue}}/trim/{{tissue}}_{{tag}}_{{end}}.fastq.gz" ,zip ,tissue = data .tissues_paired ,tag = data .tags_paired ,end = data .ends_paired ),
55- expand (
56- f"{ cfg .data_root } /{{tissue}}/fastqc/trimmed/trimmed_{{tissue}}_{{tag}}_{{end}}_fastqc.zip" ,
57- zip ,
58- tissue = data .tissues_paired ,
59- tag = data .tags_paired ,
60- end = data .ends_paired
61- ),
62- ],
63- otherwise = [],
64- ),
65- branch (
66- cfg .perform .contaminant_screen ,
67- then = f"{ cfg .genome .contaminants_dir } /fastq_screen.conf" ,
68- otherwise = [],
69- ),
32+ expand (f"{ cfg .como_root } /{{tissue}}/geneCounts/{{study}}/{{tissue}}_{{tag}}.tab" ,zip ,tissue = data .tissues ,study = data .studies ,tag = data .tags ),
7033 branch (
7134 cfg .perform .fragment_size ,
72- then = [
73- expand (f"{ cfg .data_root } /{{tissue}}/fragmentSizes/{{tissue}}_{{tag}}_fragment_size.txt" , zip , tissue = data .tissues , tag = data .tags ),
74- expand (f"{ cfg .como_root } /{{tissue}}/fragmentSizes/{{study}}/{{tissue}}_{{tag}}_fragment_size.txt" ,zip ,tissue = data .tissues ,study = data .studies ,tag = data .tags )
75- ],
35+ then = [expand (f"{ cfg .como_root } /{{tissue}}/fragmentSizes/{{study}}/{{tissue}}_{{tag}}_fragment_size.txt" ,zip ,tissue = data .tissues ,study = data .studies ,tag = data .tags )],
7636 otherwise = []
7737 ),
7838 branch (
7939 cfg .perform .rnaseq_metrics ,
80- then = [
81- expand (f"{ cfg .data_root } /{{tissue}}/picard/rnaseq/{{tissue}}_{{tag}}_rnaseq.txt" , zip , tissue = data .tissues , tag = data .tags ),
82- expand (f"{ cfg .como_root } /{{tissue}}/strandedness/{{study}}/{{tissue}}_{{tag}}_strandedness.txt" ,zip ,tissue = data .tissues ,tag = data .tags ,study = data .studies ),
83- ],
40+ then = [expand (f"{ cfg .como_root } /{{tissue}}/strandedness/{{study}}/{{tissue}}_{{tag}}_strandedness.txt" ,zip ,tissue = data .tissues ,tag = data .tags ,study = data .studies )],
8441 otherwise = [],
8542 ),
8643 branch (
8744 cfg .perform .insert_size ,
88- then = [
89- expand (f"{ cfg .data_root } /{{tissue}}/picard/insert/{{tissue}}_{{tag}}_insert_size.txt" ,zip ,tissue = data .tissues ,tag = data .tags ),
90- expand (f"{ cfg .data_root } /{{tissue}}/picard/hist/{{tissue}}_{{tag}}_insert_size_histo.pdf" ,zip ,tissue = data .tissues ,tag = data .tags ),
91- expand (f"{ cfg .como_root } /{{tissue}}/insertSizeMetrics/{{study}}/{{tissue}}_{{tag}}_insert_size.txt" ,zip ,tissue = data .tissues ,tag = data .tags ,study = data .studies ),
92- ],
45+ then = [expand (f"{ cfg .como_root } /{{tissue}}/insertSizeMetrics/{{study}}/{{tissue}}_{{tag}}_insert_size.txt" ,zip ,tissue = data .tissues ,tag = data .tags ,study = data .studies )],
9346 otherwise = []
9447 )
9548
@@ -324,7 +277,7 @@ rule fastq_dump_paired:
324277 sra_cache="$tmpdir/sra_cache"
325278 fastq_cache="$tmpdir/fastq_cache"
326279 mkdir -p "$sra_cache" "$fastq_cache"
327-
280+
328281 prefetch --max-size u --progress --log-level info --force ALL --output-directory "$sra_cache" {params.srr} 1>{log} 2>&1
329282
330283 sra_temp="$sra_cache/{params.srr}.sra"
@@ -366,7 +319,7 @@ rule fastq_dump_single:
366319 mkdir -p "$sra_cache" "$fastq_cache"
367320
368321 prefetch --max-size u --progress --log-level info --force ALL --output-directory "$sra_cache" {params.srr} 1>>{log} 2>&1
369-
322+
370323 sra_file="$sra_cache/{params.srr}/{params.srr}.sra"
371324 fastq_file="$fastq_cache/{params.srr}.fastq"
372325 fasterq-dump --force --concatenate-reads --progress --threads {threads} --temp "$fastq_cache" --outdir "$fastq_cache" "$sra_file" 1>>{log} 2>&1
@@ -450,6 +403,8 @@ rule qc_raw_fastq_single:
450403 mv --verbose "$tmpdir/{wildcards.tissue}_{wildcards.tag}_S_fastqc.zip" "{output.s_zip}" 1>>{log} 2>&1
451404 mv --verbose "$tmpdir/{wildcards.tissue}_{wildcards.tag}_S_fastqc.html" "{output.s_html}" 1>>{log} 2>&1
452405 """
406+
407+
453408def trim_paired_input (wildcards ) - > dict [Literal ["r1" ] | Literal ["r2" ], str | list [str ]]:
454409 if cfg .perform .dump_fastq :
455410 return {"r1" : rules .fastq_dump_paired .output .r1 , "r2" : rules .fastq_dump_paired .output .r2 }
@@ -982,17 +937,15 @@ def multiqc_contamination_input(wildcards) -> list[str]:
982937 files += expand (rules .contaminant_screen_single .output .S , zip , tissue = tissues , tag = tags )
983938 return files
984939
985-
986940rule multiqc :
987941 input :
988- raw_fastq = lambda wildcards : [] if not cfg .perform .dump_fastq else expand (f"{ cfg .data_root } /{{tissue}}/raw/{{tissue}}_{{tag}}_{{end}}.fastq.gz" ,zip ,tissue = data .tissues_paired ,tag = data .tags_paired ,end = data .ends_paired ),
989- trimmed_fastq = lambda wildcards : [] if not cfg .perform .trim else expand (f"{ cfg .data_root } /{{tissue}}/trim/{{tissue}}_{{tag}}_{{end}}.fastq.gz" ,zip ,tissue = data .tissues_paired ,tag = data .tags_paired ,end = data .ends_paired ),
990- aligned_fastq = expand (rules .align .output .bam_file , zip , tissue = data .tissues , tag = data .tags ),
991- contaminantion = multiqc_contamination_input ,
992- insert_sizes = lambda wildcards : [] if not cfg .perform .insert_size else expand (rules .insert_size .output .txt ,zip ,tissue = data .tissues ,tag = data .tags ),
993- rnaseq_metrics = lambda wildcards : [] if not cfg .perform .rnaseq_metrics else expand (rules .rnaseq_metrics .output .metrics , zip , tissue = data .tissues , tag = data .tags ),
994- fragment_sizes = lambda wildcards : [] if not cfg .perform .fragment_size else expand (rules .fragment_size .output , zip , tissue = data .tissues , tag = data .tags ),
995- salmon_quant = expand (rules .salmon_quantification .output .quant , zip , tissue = data .tissues , tag = data .tags ),
942+ raw_qc = expand (f"{ cfg .data_root } /{{tissue}}/fastqc/raw/raw_{{tissue}}_{{tag}}_{{end}}_fastqc.zip" ,zip ,tissue = data .tissues_paired ,tag = data .tags_paired ,end = data .ends_paired ) if cfg .perform .dump_fastq else [],
943+ trim_qc = expand (f"{ cfg .data_root } /{{tissue}}/fastqc/trimmed/trimmed_{{tissue}}_{{tag}}_{{end}}_fastqc.zip" ,zip ,tissue = data .tissues_paired ,tag = data .tags_paired ,end = data .ends_paired ) if cfg .perform .trim else [],
944+ contaminantion = expand (f"{ cfg .data_root } /{{tissue}}/fq_screen/{{tissue}}_{{tag}}_{{end}}_screen.txt" ,zip ,tissue = data .tissues_paired ,tag = data .tags_paired ,end = data .ends_paired ) if cfg .perform .contaminant_screen else [],
945+ insert_sizes = expand (rules .insert_size .output .txt ,zip ,tissue = data .tissues ,tag = data .tags ) if cfg .perform .insert_size else [],
946+ rnaseq_metrics = expand (rules .rnaseq_metrics .output .metrics ,zip ,tissue = data .tissues ,tag = data .tags ) if cfg .perform .rnaseq_metrics else [],
947+ fragment_sizes = expand (rules .fragment_size .output ,zip ,tissue = data .tissues ,tag = data .tags ) if cfg .perform .fragment_size else [],
948+ salmon_quant = expand (rules .salmon_quantification .output .meta ,zip ,tissue = data .tissues ,tag = data .tags ),
996949 output :
997950 output_file = f"{ cfg .data_root } /{{tissue}}/multiqc/{ cfg .sample_filepath .stem } /{ cfg .sample_filepath .stem } _multiqc_report.html" ,
998951 params :
0 commit comments