Skip to content

Commit 4221fec

Browse files
authored
Add MultiQC support (#63)
* add sample name parameter * update output to json format * update readme for multiqc
1 parent 9e0d2c4 commit 4221fec

16 files changed

Lines changed: 505 additions & 302 deletions

Makefile

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@ INCL_DIR := $(CURDIR)/include
22
SRC_DIR := $(CURDIR)/src
33
LIB_DIR := $(CURDIR)/lib
44

5+
VERSION := $(shell git describe --tags --always)
6+
VERSION_HEADER := $(INCL_DIR)/version.h
7+
58
# Set the library paths for the compiler
69
CONDA_PREFIX ?= $(shell echo $$CONDA_PREFIX)
710
LIBRARY_PATHS := -L$(LIB_DIR) -L$(CONDA_PREFIX)/lib
@@ -10,6 +13,11 @@ INCLUDE_PATHS := -I$(INCL_DIR) -I$(CONDA_PREFIX)/include
1013
# All targets
1114
all: swig_build compile
1215

16+
# Rule to generate version.h
17+
$(VERSION_HEADER):
18+
@echo "#pragma once" > $@
19+
@echo "#define VERSION \"$(VERSION)\"" >> $@
20+
1321
# Generate the SWIG Python/C++ wrappers
1422
swig_build:
1523
mkdir -p $(LIB_DIR)

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ LongReadSum supports FASTA, FASTQ, BAM, FAST5, and sequencing_summary.txt file f
1010
- [Installation using Anaconda (recommended)](#installation-using-anaconda)
1111
- [Installation using Docker](#installation-using-anaconda)
1212
- [Building from source](#building-from-source)
13+
- [MultiQC support](#multiqc-support)
1314
- General usage for common filetypes:
1415
- [Common parameters](#common-parameters)
1516
- [WGS BAM](#wgs-bam)
@@ -78,6 +79,19 @@ conda activate longreadsum
7879
make
7980
```
8081

82+
# MultiQC support
83+
[MultiQC](https://seqera.io/multiqc/) is a widely used open-source tool for
84+
aggregating bioinformatics analyses results from many tools across samples.
85+
86+
To run MultiQC, input the LongReadSum directory containing the output JSON
87+
summary file, and specify the _longreadsum_ module:
88+
89+
```
90+
multiqc $INPUT_DIRECTORY --module longreadsum --outdir $OUTPUT_DIRECTORY/multiqc
91+
```
92+
93+
Example report:
94+
<img width="1707" height="761" alt="image" src="https://github.com/user-attachments/assets/adbcacf7-44f8-48bd-9135-9293379d65d2" />
8195

8296
## Running
8397
Activate the conda environment and then run with arguments:

include/input_parameters.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ class Input_Para{
2121
// Parameters
2222
int threads;
2323
size_t num_input_files;
24-
std::string out_prefix;
2524
int64_t other_flags;
2625
int32_t user_defined_fastq_base_qual_offset;
2726
std::string output_folder; // Output folder
@@ -36,10 +35,14 @@ class Input_Para{
3635
bool mod_analysis; // Perform base modification analysis on BAM file
3736
int tin_sample_size; // Number of equally spaced samples for TIN calculation
3837
int tin_min_coverage; // Minimum coverage for TIN calculation
38+
std::string sample_name; // Sample name
39+
std::string version_str = ""; // Version string for the program
3940

4041
// Functions
4142
std::string add_input_file(const std::string& input_filepath);
4243

44+
const std::string &getVersion() const;
45+
4346
Input_Para();
4447

4548
~Input_Para();

include/tin.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ typedef std::unordered_map<std::string, std::tuple<std::string, int, int, double
1515

1616
// Calculate the TIN score for each transcript in the gene BED file
1717
// (Reference: https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-016-0922-z#Sec11)
18-
void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& output_folder, int thread_count);
18+
void calculateTIN(TINStats& tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& sample_name, const std::string& output_folder, int thread_count);
1919

2020
std::unordered_map<int, int> getReadDepths(htsFile* bam_file, hts_idx_t* idx, bam_hdr_t* header, std::string chr, int start, int end, bool transcript_strand);
2121

include/version.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
#pragma once
2+
#define VERSION "v1.4.0-5-g9e0d2c4"

src/bam_module.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
8080
std::cout << "Calculating TIN scores for file: " << filepath << std::endl;
8181

8282
TINStats tin_stats;
83-
calculateTIN(&tin_stats, gene_bed, input_params.input_files[i], min_cov, sample_size, input_params.output_folder, input_params.threads);
83+
calculateTIN(tin_stats, gene_bed, input_params.input_files[i], min_cov, sample_size, input_params.sample_name, input_params.output_folder, input_params.threads);
8484

8585
// Print the TIN stats
8686
std::cout << "Number of transcripts: " << tin_stats.num_transcripts << std::endl;
@@ -154,8 +154,6 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
154154

155155
while (reader.hasNextRecord()){
156156
// Read the next batch of records
157-
// std::cout << "Generating " << thread_count << " thread(s)..." <<
158-
// std::endl;
159157
printMessage("Generating " + std::to_string(thread_count) + " thread(s)...");
160158
std::vector<std::thread> thread_vector;
161159
for (int thread_index=0; thread_index<thread_count; thread_index++){
@@ -165,7 +163,6 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
165163
}
166164

167165
// Join the threads in thread_vector
168-
// std::cout<<"Joining threads..."<<std::endl;
169166
int thread_index = 0;
170167
for (auto& t : thread_vector){
171168
if (t.joinable()){
@@ -222,11 +219,14 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
222219
std::cout << "Saving summary statistics to file..." << std::endl;
223220

224221
// If in RRMS mode, append RRMS accepted/rejected to the output prefix
225-
std::string output_prefix = "bam";
222+
// std::string output_prefix = "bam";
223+
std::string output_prefix = input_params.sample_name;
226224
if (input_params.rrms_csv != ""){
227-
output_prefix += input_params.rrms_filter ? "_rrms_accepted" : "_rrms_rejected";
225+
output_prefix += input_params.rrms_filter ? "_accepted" : "_rejected";
228226
}
229-
std::string summary_filepath = input_params.output_folder + "/" + output_prefix + "_summary.txt";
227+
228+
std::string summary_filepath = input_params.output_folder + "/" + output_prefix + "_summary.bam.json";
229+
230230
final_output.save_summary(summary_filepath, input_params, final_output);
231231
std::cout << "Saved file: " << summary_filepath << std::endl;
232232

src/cli.py

Lines changed: 22 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,6 @@ def get_common_param(margs):
6565
pat_split = margs.pattern.split("*")
6666
param_dict["input_files"].extend(
6767
glob(os.path.join("*".join(pat_split[:-1]), "*" + pat_split[-1])))
68-
69-
# read_count = int(margs.read_count)
70-
# param_dict["read_count"] = read_count
7168

7269
if len(param_dict["input_files"]) == 0:
7370
parsing_error_msg += "No input file(s) were provided.\n"
@@ -88,8 +85,6 @@ def get_common_param(margs):
8885
except OSError:
8986
parsing_error_msg += "Cannot create folder for " + param_dict["output_folder"] + " \n"
9087

91-
param_dict["out_prefix"] = margs.outprefix
92-
9388
# Set up logging to file and stdout
9489
if margs.log is None or margs.log == "":
9590
parsing_error_msg += "No log file was provided.\n"
@@ -113,6 +108,7 @@ def get_common_param(margs):
113108
param_dict["log_level"] = margs.log_level
114109

115110
param_dict["threads"] = margs.threads
111+
param_dict["sample_name"] = margs.sample
116112

117113
# Reset the param_dict if there are parsing errors
118114
if parsing_error_msg != "":
@@ -133,17 +129,15 @@ def fq_module(margs):
133129

134130
else:
135131
logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"]))
136-
param_dict["out_prefix"] += "fastq"
137132

138133
# Import the SWIG Python wrapper for our C++ module
139134
input_para = lrst.Input_Para()
140135
input_para.threads = param_dict["threads"]
141136

142137
input_para.other_flags = 0
143138
input_para.user_defined_fastq_base_qual_offset = margs.udqual
144-
145139
input_para.output_folder = str(param_dict["output_folder"])
146-
input_para.out_prefix = str(param_dict["out_prefix"])
140+
input_para.sample_name = param_dict["sample_name"]
147141

148142
for _ipf in param_dict["input_files"]:
149143
input_para.add_input_file(str(_ipf))
@@ -157,7 +151,7 @@ def fq_module(margs):
157151
fq_html_gen = generate_html.ST_HTML_Generator(
158152
[["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality",
159153
"read_avg_base_quality"], "FASTQ QC", param_dict], plot_filepaths, static=False)
160-
fq_html_gen.generate_html()
154+
fq_html_gen.generate_html(input_para.sample_name, "fastq")
161155

162156
logging.info("Done. Output files are in %s", param_dict["output_folder"])
163157
else:
@@ -176,12 +170,11 @@ def fa_module(margs):
176170
else:
177171
# If there are no parse errors, run the filetype-specific module
178172
logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"]))
179-
param_dict["out_prefix"] += "fasta"
180173
input_para = lrst.Input_Para()
181174
input_para.threads = param_dict["threads"]
182175
input_para.other_flags = 0
183176
input_para.output_folder = str(param_dict["output_folder"])
184-
input_para.out_prefix = str(param_dict["out_prefix"])
177+
input_para.sample_name = param_dict["sample_name"]
185178

186179
for _ipf in param_dict["input_files"]:
187180
input_para.add_input_file(str(_ipf))
@@ -195,7 +188,7 @@ def fa_module(margs):
195188
fa_html_gen = generate_html.ST_HTML_Generator(
196189
[["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts"], "FASTA QC",
197190
param_dict], plot_filepaths, static=True)
198-
fa_html_gen.generate_html()
191+
fa_html_gen.generate_html(input_para.sample_name, "fasta")
199192
logging.info("Done. Output files are in %s", param_dict["output_folder"])
200193

201194
else:
@@ -211,11 +204,10 @@ def bam_module(margs):
211204

212205
else:
213206
logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"]))
214-
param_dict["out_prefix"] += "bam";
215207
input_para = lrst.Input_Para()
216208
input_para.threads = param_dict["threads"]
217209
input_para.output_folder = str(param_dict["output_folder"])
218-
input_para.out_prefix = str(param_dict["out_prefix"])
210+
input_para.sample_name = param_dict["sample_name"]
219211

220212
# Set the reference genome file and base modification threshold
221213
ref_genome = margs.ref if margs.ref != "" or margs.ref is not None else ""
@@ -269,7 +261,7 @@ def bam_module(margs):
269261
# Generate the HTML report
270262
bam_html_gen = generate_html.ST_HTML_Generator(
271263
[qc_info_list, "BAM QC", param_dict], plot_filepaths, static=False)
272-
bam_html_gen.generate_html()
264+
bam_html_gen.generate_html(input_para.sample_name, "bam")
273265
logging.info("Done. Output files are in %s", param_dict["output_folder"])
274266

275267
else:
@@ -288,26 +280,19 @@ def rrms_module(margs):
288280
input_para = lrst.Input_Para()
289281
input_para.threads = param_dict["threads"]
290282
input_para.output_folder = str(param_dict["output_folder"])
291-
input_para.out_prefix = str(param_dict["out_prefix"])
283+
input_para.sample_name = param_dict["sample_name"]
284+
292285
for _ipf in param_dict["input_files"]:
293286
input_para.add_input_file(str(_ipf))
294287

295288
# Set the RRMS input CSV file
296289
input_para.rrms_csv = margs.csv
297290
logging.info("RRMS CSV file is " + input_para.rrms_csv)
298291

299-
# Get the output prefix
300-
output_prefix = param_dict["out_prefix"]
301-
302292
# Run QC for both accepted and rejected reads
303293
rrms_filter = [True, False]
304294
for filter_type in rrms_filter:
305-
306-
# Set the RRMS filter type
307-
input_para.rrms_filter = filter_type
308-
309-
# Set the output prefix
310-
param_dict["out_prefix"] = output_prefix + "rrms_" + ("accepted" if filter_type else "rejected")
295+
input_para.rrms_filter = filter_type # True for accepted reads, False for rejected reads
311296
param_dict["mod"] = input_para.mod_analysis = False # Disable base modification analysis for RRMS (use BAM module for this)
312297

313298
# Run the QC module
@@ -332,7 +317,7 @@ def rrms_module(margs):
332317
# Generate the HTML report
333318
bam_html_gen = generate_html.ST_HTML_Generator(
334319
[qc_info_list, "BAM QC", param_dict], plot_filepaths, static=False)
335-
bam_html_gen.generate_html()
320+
bam_html_gen.generate_html(input_para.sample_name, "rrms_bam_" + ("accepted" if filter_type else "rejected"))
336321
logging.info("Done. Output files are in %s", param_dict["output_folder"])
337322

338323
else:
@@ -349,11 +334,10 @@ def seqtxt_module(margs):
349334

350335
else:
351336
logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"]))
352-
param_dict["out_prefix"] += "seqtxt"
353337
input_para = lrst.Input_Para()
354338
input_para.threads = param_dict["threads"]
355339
input_para.output_folder = str(param_dict["output_folder"])
356-
input_para.out_prefix = str(param_dict["out_prefix"])
340+
input_para.sample_name = param_dict["sample_name"]
357341

358342
for _ipf in param_dict["input_files"]:
359343
input_para.add_input_file(str(_ipf))
@@ -370,7 +354,7 @@ def seqtxt_module(margs):
370354
[["basic_st", "read_length_bar", "read_length_hist"],
371355
report_title, param_dict], plot_filepaths, static=False)
372356

373-
seqtxt_html_gen.generate_html()
357+
seqtxt_html_gen.generate_html(input_para.sample_name, "basecall_summary")
374358
logging.info("Done. Output files are in %s", param_dict["output_folder"])
375359
else:
376360
logging.error("QC did not generate.")
@@ -386,11 +370,10 @@ def fast5_module(margs):
386370

387371
else:
388372
# logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"]))
389-
param_dict["out_prefix"] += "FAST5"
390373
input_para = lrst.Input_Para()
391374
input_para.threads = param_dict["threads"]
392375
input_para.output_folder = str(param_dict["output_folder"])
393-
input_para.out_prefix = str(param_dict["out_prefix"])
376+
input_para.sample_name = param_dict["sample_name"]
394377
input_para.other_flags = 0 # 0 for normal QC, 1 for signal statistics output
395378

396379
for _ipf in param_dict["input_files"]:
@@ -405,7 +388,7 @@ def fast5_module(margs):
405388
fast5_html_obj = generate_html.ST_HTML_Generator(
406389
[["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality"],
407390
"FAST5 QC", param_dict], plot_filepaths, static=False)
408-
fast5_html_obj.generate_html()
391+
fast5_html_obj.generate_html(input_para.sample_name, "fast5")
409392
logging.info("Done. Output files are in %s", param_dict["output_folder"])
410393

411394
else:
@@ -421,11 +404,11 @@ def fast5_signal_module(margs):
421404

422405
else:
423406
# logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"]))
424-
param_dict["out_prefix"] += "fast5_signal"
425407
input_para = lrst.Input_Para()
426408
input_para.threads = param_dict["threads"]
427409
input_para.output_folder = str(param_dict["output_folder"])
428-
input_para.out_prefix = str(param_dict["out_prefix"])
410+
input_para.sample_name = param_dict["sample_name"]
411+
429412
input_para.other_flags = 1 # 0 for normal QC, 1 for signal statistics output
430413

431414
# Get the read count if specified
@@ -450,7 +433,7 @@ def fast5_signal_module(margs):
450433
plot_filepaths = plot(fast5_output, param_dict, 'FAST5s')
451434
fast5_html_obj = generate_html.ST_HTML_Generator(
452435
[["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "ont_signal"], "FAST5 QC", param_dict], plot_filepaths, static=False)
453-
fast5_html_obj.generate_html(signal_plots=True)
436+
fast5_html_obj.generate_html(input_para.sample_name, "fast5_signal", signal_plots=True)
454437
logging.info("Done. Output files are in %s", param_dict["output_folder"])
455438

456439
else:
@@ -467,11 +450,10 @@ def pod5_module(margs):
467450

468451
else:
469452
# logging.info('Input file(s) are:\n%s', '\n'.join(param_dict["input_files"]))
470-
param_dict["out_prefix"] += "POD5"
471453
input_para = {}
472454
input_para['threads'] = param_dict["threads"]
473455
input_para['output_folder'] = str(param_dict["output_folder"])
474-
input_para['out_prefix'] = str(param_dict["out_prefix"])
456+
input_para['sample_name'] = param_dict["sample_name"]
475457
input_para['other_flags'] = 0 # 0 for normal QC, 1 for signal statistics output
476458
input_para['input_files'] = []
477459
for input_file in param_dict["input_files"]:
@@ -496,7 +478,6 @@ def pod5_module(margs):
496478
basecalls_input = lrst.Input_Para()
497479
basecalls_input.threads = param_dict["threads"]
498480
basecalls_input.output_folder = str(param_dict["output_folder"])
499-
basecalls_input.out_prefix = str(param_dict["out_prefix"])
500481
basecalls_input.add_input_file(basecalls)
501482
bam_output = lrst.Output_BAM()
502483
exit_code = lrst.callBAMModule(basecalls_input, bam_output)
@@ -518,7 +499,7 @@ def pod5_module(margs):
518499
webpage_title = "POD5 QC"
519500
fast5_html_obj = generate_html.ST_HTML_Generator(
520501
[["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "ont_signal"], webpage_title, param_dict], plot_filepaths, static=False)
521-
fast5_html_obj.generate_html(signal_plots=True)
502+
fast5_html_obj.generate_html(input_para.sample_name, "pod5_signal", signal_plots=True)
522503
logging.info("Done. Output files are in %s", param_dict["output_folder"])
523504

524505
else:
@@ -543,6 +524,8 @@ def set_file_parser_defaults(file_parser):
543524
help="The number of threads used. Default: 1.")
544525
file_parser.add_argument("-Q", "--outprefix", type=str, default="QC_",
545526
help="The prefix for output filenames. Default: `QC_`.")
527+
file_parser.add_argument("-s", "--sample", type=str, default="Sample",
528+
help="Sample name. Default: `Sample`")
546529

547530

548531
# Set up the argument parser

0 commit comments

Comments
 (0)