Skip to content

Commit 8705df0

Browse files
guessprotocol: improvements for speed and gathering all summary info into one struct
1 parent bd1fec5 commit 8705df0

1 file changed

Lines changed: 102 additions & 60 deletions

File tree

src/utils/guessprotocol.cpp

Lines changed: 102 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/* guessprotocol: a program for guessing whether a wgbs protocol is
22
* original, pbat or random pbat
33
*
4-
* Copyright (C) 2019-2022
4+
* Copyright (C) 2019-2023
55
*
66
* Authors: Andrew D. Smith
77
*
@@ -23,6 +23,8 @@
2323
#include <iterator>
2424
#include <algorithm>
2525
#include <stdexcept>
26+
#include <filesystem>
27+
#include <sstream>
2628

2729
#include "OptionParser.hpp"
2830
#include "smithlab_utils.hpp"
@@ -35,6 +37,43 @@ using std::cerr;
3537
using std::endl;
3638
using std::min;
3739
using std::runtime_error;
40+
using std::array;
41+
42+
static string
43+
guess_protocol(const double fraction_t_rich) {
44+
if (fraction_t_rich >= 0.8) {
45+
return "original";
46+
}
47+
if (fraction_t_rich <= 0.2) {
48+
return "pbat";
49+
}
50+
if (fraction_t_rich >= 0.4 && fraction_t_rich <= 0.6) {
51+
return "random";
52+
}
53+
return "inconclusive";
54+
}
55+
56+
struct guessprotocol_summary {
57+
string protocol;
58+
string layout;
59+
uint64_t n_t_rich_reads{};
60+
uint64_t n_reads{};
61+
double fraction_t_rich_reads{};
62+
63+
void evaluate() {
64+
fraction_t_rich_reads = static_cast<double>(n_t_rich_reads)/n_reads;
65+
protocol = guess_protocol(fraction_t_rich_reads);
66+
}
67+
68+
string tostring() const {
69+
std::ostringstream oss;
70+
oss << "protocol: " << protocol << '\n'
71+
<< "fraction_t_rich_reads: " << fraction_t_rich_reads << '\n'
72+
<< "t_rich_reads: " << n_t_rich_reads << '\n'
73+
<< "reads_examined: " << n_reads;
74+
return oss.str();
75+
}
76+
};
3877

3978
// store each read from one end
4079
struct FASTQRecord {
@@ -47,64 +86,73 @@ struct FASTQRecord {
4786
static bool
4887
mates(const size_t to_ignore_at_end, // in case names have #0/1 name ends
4988
const FASTQRecord &a, const FASTQRecord &b) {
50-
assert(to_ignore_at_end < a.name.length());
51-
return equal(begin(a.name), end(a.name) - to_ignore_at_end, begin(b.name));
89+
assert(to_ignore_at_end < std::size(a.name));
90+
return equal(cbegin(a.name), cend(a.name) - to_ignore_at_end,
91+
cbegin(b.name));
5292
}
5393

5494
// Read 4 lines one time from fastq and fill in the FASTQRecord structure
5595
static std::istream&
5696
operator>>(std::istream& s, FASTQRecord &r) {
57-
if (getline(s, r.name)) {
97+
constexpr auto n_error_codes = 5u;
98+
enum err_code { none, bad_name, bad_seq, bad_plus, bad_qual };
99+
static const array<runtime_error, n_error_codes> error_msg = {
100+
runtime_error(""),
101+
runtime_error("failed to parse fastq name line"),
102+
runtime_error("failed to parse fastq sequence line"),
103+
runtime_error("failed to parse fastq plus line"),
104+
runtime_error("failed to parse fastq qual line")
105+
};
58106

59-
if (r.name.empty() || r.name[0] != '@')
60-
throw std::runtime_error("bad name line: " + r.name);
107+
err_code ec = err_code::none;
61108

62-
r.name = r.name.substr(1, r.name.find_first_of(' '));
109+
getline(s, r.name);
63110

64-
if (!getline(s, r.seq))
65-
throw runtime_error("failed to read expected seq line");
111+
if (r.name.empty() || r.name[0] != '@')
112+
ec = err_code::bad_name;
66113

67-
string tmp;
68-
if (!getline(s, tmp))
69-
throw runtime_error("failed to read expected + line");
114+
r.name.resize(r.name.find_first_of(' '));
115+
const auto nm_sz = r.name.find_first_of(' ');
116+
copy(cbegin(r.name) + 1, cbegin(r.name) + nm_sz, begin(r.name));
70117

71-
if (!getline(s, tmp))
72-
throw runtime_error("failed to read expected score line");
73-
}
74-
return s;
75-
}
118+
if (!getline(s, r.seq))
119+
ec = err_code::bad_seq;
76120

121+
string tmp;
122+
if (!getline(s, tmp))
123+
ec = err_code::bad_plus;
77124

78-
static string
79-
guess_protocol(const double fraction_t_rich) {
80-
if (fraction_t_rich >= 0.8) {
81-
return "original";
82-
}
83-
if (fraction_t_rich <= 0.2) {
84-
return "pbat";
85-
}
86-
if (fraction_t_rich >= 0.4 && fraction_t_rich <= 0.6) {
87-
return "random";
88-
}
89-
return "inconclusive";
125+
if (!getline(s, tmp))
126+
ec = err_code::bad_qual;
127+
128+
if (ec != err_code::none)
129+
throw error_msg[ec];
130+
131+
return s;
90132
}
91133

92134
int
93135
main_guessprotocol(int argc, const char **argv) {
94136

95137
try {
96138

139+
constexpr auto description = "guess bisulfite protocol for a library";
140+
141+
string outfile;
97142
size_t reads_to_check = 1000000;
98143
size_t name_suffix_len = 0;
99144

145+
namespace fs = std::filesystem;
146+
const string cmd_name = std::filesystem::path(argv[0]).filename();
147+
100148
/****************** COMMAND LINE OPTIONS ********************/
101-
OptionParser opt_parse(strip_path(argv[0]),
102-
"guess whether protocol is ordinary, pbat or random",
149+
OptionParser opt_parse(cmd_name, description,
103150
"<end1-fastq> [<end2-fastq>]");
104151
opt_parse.add_opt("nreads", 'n', "number of reads in initial check",
105152
false, reads_to_check);
106153
opt_parse.add_opt("ignore", 'i', "length of read name suffix "
107154
"to ignore when matching", false, name_suffix_len);
155+
opt_parse.add_opt("output", 'o', "output file name", false, outfile);
108156
vector<string> leftover_args;
109157
opt_parse.parse(argc, argv, leftover_args);
110158
if (argc == 1 || opt_parse.help_requested()) {
@@ -123,8 +171,11 @@ main_guessprotocol(int argc, const char **argv) {
123171
const vector<string> reads_files(leftover_args);
124172
/****************** END COMMAND LINE OPTIONS *****************/
125173

174+
guessprotocol_summary summary;
126175
if (reads_files.size() == 2) {
127-
// Input: paired-end reads with end1 and end2
176+
summary.layout = "paired";
177+
178+
// input: paired-end reads with end1 and end2
128179
std::ifstream in1(reads_files.front());
129180
if (!in1)
130181
throw runtime_error("cannot open input file: " + reads_files.front());
@@ -133,12 +184,10 @@ main_guessprotocol(int argc, const char **argv) {
133184
if (!in2)
134185
throw runtime_error("cannot open input file: " + reads_files.back());
135186

136-
size_t n_pairs = 0;
137-
size_t t_rich_pairs = 0;
138-
139187
FASTQRecord end_one, end_two;
140-
while (in1 >> end_one && in2 >> end_two && n_pairs < reads_to_check) {
141-
++n_pairs;
188+
while (in1 >> end_one && in2 >> end_two &&
189+
summary.n_reads < reads_to_check) {
190+
summary.n_reads++;
142191

143192
// two reads should be in paired-ends
144193
if (!mates(name_suffix_len, end_one, end_two))
@@ -162,46 +211,39 @@ main_guessprotocol(int argc, const char **argv) {
162211
const double t_rich_count = (end_one_t + end_two_a);
163212
const double pbat_count = (end_one_a + end_two_t);
164213

165-
t_rich_pairs += (t_rich_count > pbat_count);
214+
summary.n_t_rich_reads += (t_rich_count > pbat_count);
166215
}
167-
const double fraction_t_rich = static_cast<double>(t_rich_pairs)/n_pairs;
168-
cout << guess_protocol(fraction_t_rich) << '\t'
169-
<< "fraction_t_rich=" << fraction_t_rich << '\t'
170-
<< "t_rich_pairs=" << t_rich_pairs << '\t'
171-
<< "pairs_examined=" << n_pairs << endl;
172216
}
173-
else { // if (reads_files.size() == 1)
174-
// Input: single-end reads
217+
else {
218+
summary.layout = "single";
219+
220+
// input: single-end reads
175221
std::ifstream in(reads_files.front());
176222
if (!in)
177223
throw runtime_error("cannot open input file: " + reads_files.front());
178224

179-
size_t n_reads = 0;
180-
size_t t_rich_reads = 0;
181-
182225
FASTQRecord r;
183-
while (in >> r && n_reads < reads_to_check) {
184-
++n_reads;
226+
while (in >> r && summary.n_reads < reads_to_check) {
227+
summary.n_reads++;
185228
const double a = (count(begin(r.seq), end(r.seq), 'A') +
186229
count(begin(r.seq), end(r.seq), 'C'));
187230
const double t = (count(begin(r.seq), end(r.seq), 'T') +
188231
count(begin(r.seq), end(r.seq), 'G'));
189-
t_rich_reads += (t > a);
232+
summary.n_t_rich_reads += (t > a);
190233
}
191-
const double fraction_t_rich = static_cast<double>(t_rich_reads)/n_reads;
192-
cout << guess_protocol(fraction_t_rich) << '\t'
193-
<< "fraction_t_rich=" << fraction_t_rich << '\t'
194-
<< "t_rich_reads=" << t_rich_reads << '\t'
195-
<< "reads_examined=" << n_reads << endl;
196234
}
235+
236+
summary.evaluate();
237+
if (!outfile.empty()) {
238+
std::ofstream out(outfile);
239+
if (!out) throw runtime_error("failed to open output file: " + outfile);
240+
out << summary.tostring() << endl;
241+
}
242+
else cout << summary.tostring() << endl;
197243
}
198244
catch (const runtime_error &e) {
199245
cerr << e.what() << endl;
200246
return EXIT_FAILURE;
201247
}
202-
catch (std::bad_alloc &ba) {
203-
cerr << "ERROR: could not allocate memory" << endl;
204-
return EXIT_FAILURE;
205-
}
206248
return EXIT_SUCCESS;
207249
}

0 commit comments

Comments
 (0)