Skip to content

Commit dfe7d65

Browse files
guessprotocol: major updates adding a probability model to the criteria
1 parent 8705df0 commit dfe7d65

1 file changed

Lines changed: 148 additions & 87 deletions

File tree

src/utils/guessprotocol.cpp

Lines changed: 148 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -16,61 +16,111 @@
1616
* General Public License for more details.
1717
*/
1818

19-
#include <string>
20-
#include <vector>
21-
#include <iostream>
22-
#include <fstream>
23-
#include <iterator>
19+
#include <bamxx.hpp>
20+
2421
#include <algorithm>
25-
#include <stdexcept>
2622
#include <filesystem>
23+
#include <fstream>
24+
#include <iostream>
25+
#include <iterator>
2726
#include <sstream>
27+
#include <stdexcept>
28+
#include <string>
29+
#include <vector>
2830

2931
#include "OptionParser.hpp"
30-
#include "smithlab_utils.hpp"
32+
#include "numerical_utils.hpp"
3133
#include "smithlab_os.hpp"
34+
#include "smithlab_utils.hpp"
3235

33-
using std::string;
34-
using std::vector;
35-
using std::cout;
36+
using std::array;
3637
using std::cerr;
38+
using std::cout;
3739
using std::endl;
3840
using std::min;
3941
using std::runtime_error;
40-
using std::array;
42+
using std::string;
43+
using std::vector;
4144

42-
static string
43-
guess_protocol(const double fraction_t_rich) {
44-
if (fraction_t_rich >= 0.8) {
45-
return "original";
46-
}
47-
if (fraction_t_rich <= 0.2) {
48-
return "pbat";
45+
using bamxx::bgzf_file;
46+
47+
constexpr int nuc_to_idx[] = {
48+
/* 0*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
49+
/* 16*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
50+
/* 32*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
51+
/* 48*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
52+
/* 64*/ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
53+
/* 80*/ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
54+
/* 96*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
55+
/*112*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
56+
/*128*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
57+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
58+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
59+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
60+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
61+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
62+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
63+
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
64+
};
65+
66+
struct nucleotide_model {
67+
vector<double> pr{};
68+
vector<double> lpr{};
69+
double bisulfite_conversion_rate{};
70+
bool is_t_rich{};
71+
72+
nucleotide_model(const vector<double> &bc, const double conv_rate,
73+
const bool itr)
74+
: pr{bc}, bisulfite_conversion_rate{conv_rate}, is_t_rich{itr} {
75+
auto nuc_from = is_t_rich ? 1 : 2;
76+
auto nuc_to = is_t_rich ? 3 : 0;
77+
pr[nuc_to] += bisulfite_conversion_rate * pr[nuc_from];
78+
pr[nuc_from] *= (1.0 - bisulfite_conversion_rate);
79+
assert(reduce(cbegin(pr), cend(pr), 0.0) == 1.0);
80+
lpr.resize(std::size(pr));
81+
transform(cbegin(pr), cend(pr), begin(lpr),
82+
[](const double x) { return log(x); });
4983
}
50-
if (fraction_t_rich >= 0.4 && fraction_t_rich <= 0.6) {
51-
return "random";
84+
85+
double operator()(const string &s) const {
86+
return accumulate(
87+
cbegin(s), cend(s), 0.0,
88+
[&](const double x, const char c) { return x + lpr[nuc_to_idx[c]]; });
89+
};
90+
91+
string tostring() const {
92+
std::ostringstream oss;
93+
oss << "pr:\n";
94+
for (auto i : pr) oss << i << '\n';
95+
oss << "log pr:\n";
96+
for (auto i : lpr) oss << i << '\n';
97+
oss << bisulfite_conversion_rate << '\n' << is_t_rich;
98+
return oss.str();
5299
}
53-
return "inconclusive";
54-
}
100+
};
55101

56102
struct guessprotocol_summary {
57103
string protocol;
58104
string layout;
59-
uint64_t n_t_rich_reads{};
105+
double n_reads_wgbs{};
60106
uint64_t n_reads{};
61-
double fraction_t_rich_reads{};
107+
double wgbs_fraction{};
62108

63109
void evaluate() {
64-
fraction_t_rich_reads = static_cast<double>(n_t_rich_reads)/n_reads;
65-
protocol = guess_protocol(fraction_t_rich_reads);
110+
const auto frac = n_reads_wgbs / n_reads;
111+
protocol = "inconclusive";
112+
if (frac > 0.8) protocol = "wgbs";
113+
if (frac < 0.2) protocol = "pbat";
114+
if (frac > 0.4 && frac < 0.6) protocol = "rpbat";
115+
wgbs_fraction = frac;
66116
}
67117

68118
string tostring() const {
69119
std::ostringstream oss;
70120
oss << "protocol: " << protocol << '\n'
71-
<< "fraction_t_rich_reads: " << fraction_t_rich_reads << '\n'
72-
<< "t_rich_reads: " << n_t_rich_reads << '\n'
73-
<< "reads_examined: " << n_reads;
121+
<< "wgbs_fraction: " << wgbs_fraction << '\n'
122+
<< "n_reads_wgbs: " << n_reads_wgbs << '\n'
123+
<< "n_reads: " << n_reads;
74124
return oss.str();
75125
}
76126
};
@@ -92,41 +142,37 @@ mates(const size_t to_ignore_at_end, // in case names have #0/1 name ends
92142
}
93143

94144
// Read 4 lines one time from fastq and fill in the FASTQRecord structure
95-
static std::istream&
96-
operator>>(std::istream& s, FASTQRecord &r) {
145+
static bgzf_file &
146+
operator>>(bgzf_file &s, FASTQRecord &r) {
97147
constexpr auto n_error_codes = 5u;
148+
98149
enum err_code { none, bad_name, bad_seq, bad_plus, bad_qual };
150+
99151
static const array<runtime_error, n_error_codes> error_msg = {
100-
runtime_error(""),
101-
runtime_error("failed to parse fastq name line"),
152+
runtime_error(""), runtime_error("failed to parse fastq name line"),
102153
runtime_error("failed to parse fastq sequence line"),
103154
runtime_error("failed to parse fastq plus line"),
104155
runtime_error("failed to parse fastq qual line")
105156
};
106157

107158
err_code ec = err_code::none;
108159

109-
getline(s, r.name);
160+
if (!getline(s, r.name)) return s;
110161

111-
if (r.name.empty() || r.name[0] != '@')
112-
ec = err_code::bad_name;
162+
if (r.name.empty() || r.name[0] != '@') ec = err_code::bad_name;
113163

114-
r.name.resize(r.name.find_first_of(' '));
115-
const auto nm_sz = r.name.find_first_of(' ');
116-
copy(cbegin(r.name) + 1, cbegin(r.name) + nm_sz, begin(r.name));
164+
const auto nm_end = r.name.find_first_of(" \t");
165+
const auto nm_sz = (nm_end == string::npos ? r.name.size() : nm_end) - 1;
166+
r.name.erase(copy_n(cbegin(r.name) + 1, nm_sz, begin(r.name)), cend(r.name));
117167

118-
if (!getline(s, r.seq))
119-
ec = err_code::bad_seq;
168+
if (!getline(s, r.seq)) ec = err_code::bad_seq;
120169

121170
string tmp;
122-
if (!getline(s, tmp))
123-
ec = err_code::bad_plus;
171+
if (!getline(s, tmp)) ec = err_code::bad_plus;
124172

125-
if (!getline(s, tmp))
126-
ec = err_code::bad_qual;
173+
if (!getline(s, tmp)) ec = err_code::bad_qual;
127174

128-
if (ec != err_code::none)
129-
throw error_msg[ec];
175+
if (ec != err_code::none) throw error_msg[ec];
130176

131177
return s;
132178
}
@@ -136,11 +182,16 @@ main_guessprotocol(int argc, const char **argv) {
136182

137183
try {
138184

185+
static const vector<double> human_base_comp = {0.2, 0.3, 0.3, 0.2};
186+
static const vector<double> flat_base_comp = {0.25, 0.25, 0.25, 0.25};
187+
139188
constexpr auto description = "guess bisulfite protocol for a library";
140189

190+
bool verbose;
141191
string outfile;
142192
size_t reads_to_check = 1000000;
143193
size_t name_suffix_len = 0;
194+
double bisulfite_conversion_rate = 0.98;
144195

145196
namespace fs = std::filesystem;
146197
const string cmd_name = std::filesystem::path(argv[0]).filename();
@@ -152,7 +203,12 @@ main_guessprotocol(int argc, const char **argv) {
152203
false, reads_to_check);
153204
opt_parse.add_opt("ignore", 'i', "length of read name suffix "
154205
"to ignore when matching", false, name_suffix_len);
206+
opt_parse.add_opt("bisulfite", 'b', "bisulfite conversion rate",
207+
false, bisulfite_conversion_rate);
155208
opt_parse.add_opt("output", 'o', "output file name", false, outfile);
209+
opt_parse.add_opt("verbose", 'v',
210+
"report available information during the run",
211+
false, verbose);
156212
vector<string> leftover_args;
157213
opt_parse.parse(argc, argv, leftover_args);
158214
if (argc == 1 || opt_parse.help_requested()) {
@@ -171,72 +227,77 @@ main_guessprotocol(int argc, const char **argv) {
171227
const vector<string> reads_files(leftover_args);
172228
/****************** END COMMAND LINE OPTIONS *****************/
173229

230+
auto base_comp = human_base_comp;
231+
nucleotide_model t_rich_model(base_comp, bisulfite_conversion_rate, true);
232+
nucleotide_model a_rich_model(base_comp, bisulfite_conversion_rate, false);
233+
174234
guessprotocol_summary summary;
235+
summary.layout = reads_files.size() == 2 ? "paired" : "single";
236+
237+
if (verbose) {
238+
if (reads_files.size() == 2)
239+
cerr << "data layout: "
240+
<< "paired" << '\n'
241+
<< "read1 file: " << reads_files.front() << '\n'
242+
<< "read2 file: " << reads_files.back() << '\n';
243+
else
244+
cerr << "data layout: "
245+
<< "single" << '\n'
246+
<< "read file: " << reads_files.front() << '\n';
247+
cerr << "reads to check: " << reads_to_check << '\n'
248+
<< "read name suffix length: " << name_suffix_len << '\n'
249+
<< "bisulfite conversion: " << bisulfite_conversion_rate << '\n';
250+
}
251+
175252
if (reads_files.size() == 2) {
176-
summary.layout = "paired";
177253

178254
// input: paired-end reads with end1 and end2
179-
std::ifstream in1(reads_files.front());
255+
bgzf_file in1(reads_files.front(), "r");
180256
if (!in1)
181-
throw runtime_error("cannot open input file: " + reads_files.front());
257+
throw runtime_error("cannot open file: " + reads_files.front());
182258

183-
std::ifstream in2(reads_files.back());
259+
bgzf_file in2(reads_files.back(), "r");
184260
if (!in2)
185-
throw runtime_error("cannot open input file: " + reads_files.back());
261+
throw runtime_error("cannot open file: " + reads_files.back());
186262

187-
FASTQRecord end_one, end_two;
188-
while (in1 >> end_one && in2 >> end_two &&
189-
summary.n_reads < reads_to_check) {
263+
FASTQRecord r1, r2;
264+
while (in1 >> r1 && in2 >> r2 && summary.n_reads < reads_to_check) {
190265
summary.n_reads++;
191266

192-
// two reads should be in paired-ends
193-
if (!mates(name_suffix_len, end_one, end_two))
194-
throw runtime_error("expected mates, got: " +
195-
end_one.name + " and " + end_two.name);
196-
197-
const double end_one_a =
198-
count(begin(end_one.seq), end(end_one.seq), 'A') +
199-
count(begin(end_one.seq), end(end_one.seq), 'C');
200-
const double end_one_t =
201-
count(begin(end_one.seq), end(end_one.seq), 'T') +
202-
count(begin(end_one.seq), end(end_one.seq), 'G');
203-
204-
const double end_two_a =
205-
count(begin(end_two.seq), end(end_two.seq), 'A') +
206-
count(begin(end_two.seq), end(end_two.seq), 'C');
207-
const double end_two_t =
208-
count(begin(end_two.seq), end(end_two.seq), 'T') +
209-
count(begin(end_two.seq), end(end_two.seq), 'G');
210-
211-
const double t_rich_count = (end_one_t + end_two_a);
212-
const double pbat_count = (end_one_a + end_two_t);
213-
214-
summary.n_t_rich_reads += (t_rich_count > pbat_count);
267+
if (!mates(name_suffix_len, r1, r2))
268+
throw runtime_error("expected mates: " + r1.name + ", " + r2.name);
269+
270+
const double ta = t_rich_model(r1.seq) + a_rich_model(r2.seq);
271+
const double at = a_rich_model(r1.seq) + t_rich_model(r2.seq);
272+
273+
const auto prob_read1_t_rich = exp(ta - log_sum_log(ta, at));
274+
summary.n_reads_wgbs += prob_read1_t_rich;
215275
}
216276
}
217277
else {
218-
summary.layout = "single";
219278

220279
// input: single-end reads
221-
std::ifstream in(reads_files.front());
280+
bgzf_file in(reads_files.front(), "r");
222281
if (!in)
223-
throw runtime_error("cannot open input file: " + reads_files.front());
282+
throw runtime_error("cannot open file: " + reads_files.front());
224283

225284
FASTQRecord r;
226285
while (in >> r && summary.n_reads < reads_to_check) {
227286
summary.n_reads++;
228-
const double a = (count(begin(r.seq), end(r.seq), 'A') +
229-
count(begin(r.seq), end(r.seq), 'C'));
230-
const double t = (count(begin(r.seq), end(r.seq), 'T') +
231-
count(begin(r.seq), end(r.seq), 'G'));
232-
summary.n_t_rich_reads += (t > a);
287+
288+
const double t = t_rich_model(r.seq);
289+
const double a = a_rich_model(r.seq);
290+
291+
const auto prob_t_rich = exp(t - log_sum_log(t, a));
292+
summary.n_reads_wgbs += prob_t_rich;
233293
}
234294
}
235295

236296
summary.evaluate();
297+
237298
if (!outfile.empty()) {
238299
std::ofstream out(outfile);
239-
if (!out) throw runtime_error("failed to open output file: " + outfile);
300+
if (!out) throw runtime_error("failed to open: " + outfile);
240301
out << summary.tostring() << endl;
241302
}
242303
else cout << summary.tostring() << endl;

0 commit comments

Comments
 (0)