1616 * General Public License for more details.
1717 */
1818
19- #include < string>
20- #include < vector>
21- #include < iostream>
22- #include < fstream>
23- #include < iterator>
19+ #include < bamxx.hpp>
20+
2421#include < algorithm>
25- #include < stdexcept>
2622#include < filesystem>
23+ #include < fstream>
24+ #include < iostream>
25+ #include < iterator>
2726#include < sstream>
27+ #include < stdexcept>
28+ #include < string>
29+ #include < vector>
2830
2931#include " OptionParser.hpp"
30- #include " smithlab_utils .hpp"
32+ #include " numerical_utils .hpp"
3133#include " smithlab_os.hpp"
34+ #include " smithlab_utils.hpp"
3235
33- using std::string;
34- using std::vector;
35- using std::cout;
36+ using std::array;
3637using std::cerr;
38+ using std::cout;
3739using std::endl;
3840using std::min;
3941using std::runtime_error;
40- using std::array;
42+ using std::string;
43+ using std::vector;
4144
42- static string
43- guess_protocol (const double fraction_t_rich) {
44- if (fraction_t_rich >= 0.8 ) {
45- return " original" ;
46- }
47- if (fraction_t_rich <= 0.2 ) {
48- return " pbat" ;
45+ using bamxx::bgzf_file;
46+
47+ constexpr int nuc_to_idx[] = {
48+ /* 0*/ 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ,
49+ /* 16*/ 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ,
50+ /* 32*/ 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ,
51+ /* 48*/ 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ,
52+ /* 64*/ 4 , 0 , 4 , 1 , 4 , 4 , 4 , 2 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ,
53+ /* 80*/ 4 , 4 , 4 , 4 , 3 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ,
54+ /* 96*/ 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ,
55+ /* 112*/ 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ,
56+ /* 128*/ 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ,
57+ 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ,
58+ 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ,
59+ 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ,
60+ 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ,
61+ 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ,
62+ 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ,
63+ 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 ,
64+ };
65+
66+ struct nucleotide_model {
67+ vector<double > pr{};
68+ vector<double > lpr{};
69+ double bisulfite_conversion_rate{};
70+ bool is_t_rich{};
71+
72+ nucleotide_model (const vector<double > &bc, const double conv_rate,
73+ const bool itr)
74+ : pr{bc}, bisulfite_conversion_rate{conv_rate}, is_t_rich{itr} {
75+ auto nuc_from = is_t_rich ? 1 : 2 ;
76+ auto nuc_to = is_t_rich ? 3 : 0 ;
77+ pr[nuc_to] += bisulfite_conversion_rate * pr[nuc_from];
78+ pr[nuc_from] *= (1.0 - bisulfite_conversion_rate);
79+ assert (reduce (cbegin (pr), cend (pr), 0.0 ) == 1.0 );
80+ lpr.resize (std::size (pr));
81+ transform (cbegin (pr), cend (pr), begin (lpr),
82+ [](const double x) { return log (x); });
4983 }
50- if (fraction_t_rich >= 0.4 && fraction_t_rich <= 0.6 ) {
51- return " random" ;
84+
85+ double operator ()(const string &s) const {
86+ return accumulate (
87+ cbegin (s), cend (s), 0.0 ,
88+ [&](const double x, const char c) { return x + lpr[nuc_to_idx[c]]; });
89+ };
90+
91+ string tostring () const {
92+ std::ostringstream oss;
93+ oss << " pr:\n " ;
94+ for (auto i : pr) oss << i << ' \n ' ;
95+ oss << " log pr:\n " ;
96+ for (auto i : lpr) oss << i << ' \n ' ;
97+ oss << bisulfite_conversion_rate << ' \n ' << is_t_rich;
98+ return oss.str ();
5299 }
53- return " inconclusive" ;
54- }
100+ };
55101
56102struct guessprotocol_summary {
57103 string protocol;
58104 string layout;
59- uint64_t n_t_rich_reads {};
105+ double n_reads_wgbs {};
60106 uint64_t n_reads{};
61- double fraction_t_rich_reads {};
107+ double wgbs_fraction {};
62108
63109 void evaluate () {
64- fraction_t_rich_reads = static_cast <double >(n_t_rich_reads)/n_reads;
65- protocol = guess_protocol (fraction_t_rich_reads);
110+ const auto frac = n_reads_wgbs / n_reads;
111+ protocol = " inconclusive" ;
112+ if (frac > 0.8 ) protocol = " wgbs" ;
113+ if (frac < 0.2 ) protocol = " pbat" ;
114+ if (frac > 0.4 && frac < 0.6 ) protocol = " rpbat" ;
115+ wgbs_fraction = frac;
66116 }
67117
68118 string tostring () const {
69119 std::ostringstream oss;
70120 oss << " protocol: " << protocol << ' \n '
71- << " fraction_t_rich_reads : " << fraction_t_rich_reads << ' \n '
72- << " t_rich_reads : " << n_t_rich_reads << ' \n '
73- << " reads_examined : " << n_reads;
121+ << " wgbs_fraction : " << wgbs_fraction << ' \n '
122+ << " n_reads_wgbs : " << n_reads_wgbs << ' \n '
123+ << " n_reads : " << n_reads;
74124 return oss.str ();
75125 }
76126};
@@ -92,41 +142,37 @@ mates(const size_t to_ignore_at_end, // in case names have #0/1 name ends
92142}
93143
94144// Read 4 lines one time from fastq and fill in the FASTQRecord structure
95- static std::istream &
96- operator >>(std::istream& s, FASTQRecord &r) {
145+ static bgzf_file &
146+ operator >>(bgzf_file & s, FASTQRecord &r) {
97147 constexpr auto n_error_codes = 5u ;
148+
98149 enum err_code { none, bad_name, bad_seq, bad_plus, bad_qual };
150+
99151 static const array<runtime_error, n_error_codes> error_msg = {
100- runtime_error (" " ),
101- runtime_error (" failed to parse fastq name line" ),
152+ runtime_error (" " ), runtime_error (" failed to parse fastq name line" ),
102153 runtime_error (" failed to parse fastq sequence line" ),
103154 runtime_error (" failed to parse fastq plus line" ),
104155 runtime_error (" failed to parse fastq qual line" )
105156 };
106157
107158 err_code ec = err_code::none;
108159
109- getline (s, r.name );
160+ if (! getline (s, r.name )) return s ;
110161
111- if (r.name .empty () || r.name [0 ] != ' @' )
112- ec = err_code::bad_name;
162+ if (r.name .empty () || r.name [0 ] != ' @' ) ec = err_code::bad_name;
113163
114- r.name .resize (r. name . find_first_of (' ' ) );
115- const auto nm_sz = r.name .find_first_of ( ' ' ) ;
116- copy ( cbegin (r.name ) + 1 , cbegin (r.name ) + nm_sz, begin (r.name ));
164+ const auto nm_end = r.name .find_first_of (" \t " );
165+ const auto nm_sz = (nm_end == string::npos ? r.name .size () : nm_end) - 1 ;
166+ r. name . erase ( copy_n ( cbegin (r.name ) + 1 , nm_sz, begin (r.name )), cend (r.name ));
117167
118- if (!getline (s, r.seq ))
119- ec = err_code::bad_seq;
168+ if (!getline (s, r.seq )) ec = err_code::bad_seq;
120169
121170 string tmp;
122- if (!getline (s, tmp))
123- ec = err_code::bad_plus;
171+ if (!getline (s, tmp)) ec = err_code::bad_plus;
124172
125- if (!getline (s, tmp))
126- ec = err_code::bad_qual;
173+ if (!getline (s, tmp)) ec = err_code::bad_qual;
127174
128- if (ec != err_code::none)
129- throw error_msg[ec];
175+ if (ec != err_code::none) throw error_msg[ec];
130176
131177 return s;
132178}
@@ -136,11 +182,16 @@ main_guessprotocol(int argc, const char **argv) {
136182
137183 try {
138184
185+ static const vector<double > human_base_comp = {0.2 , 0.3 , 0.3 , 0.2 };
186+ static const vector<double > flat_base_comp = {0.25 , 0.25 , 0.25 , 0.25 };
187+
139188 constexpr auto description = " guess bisulfite protocol for a library" ;
140189
190+ bool verbose;
141191 string outfile;
142192 size_t reads_to_check = 1000000 ;
143193 size_t name_suffix_len = 0 ;
194+ double bisulfite_conversion_rate = 0.98 ;
144195
145196 namespace fs = std::filesystem;
146197 const string cmd_name = std::filesystem::path (argv[0 ]).filename ();
@@ -152,7 +203,12 @@ main_guessprotocol(int argc, const char **argv) {
152203 false , reads_to_check);
153204 opt_parse.add_opt (" ignore" , ' i' , " length of read name suffix "
154205 " to ignore when matching" , false , name_suffix_len);
206+ opt_parse.add_opt (" bisulfite" , ' b' , " bisulfite conversion rate" ,
207+ false , bisulfite_conversion_rate);
155208 opt_parse.add_opt (" output" , ' o' , " output file name" , false , outfile);
209+ opt_parse.add_opt (" verbose" , ' v' ,
210+ " report available information during the run" ,
211+ false , verbose);
156212 vector<string> leftover_args;
157213 opt_parse.parse (argc, argv, leftover_args);
158214 if (argc == 1 || opt_parse.help_requested ()) {
@@ -171,72 +227,77 @@ main_guessprotocol(int argc, const char **argv) {
171227 const vector<string> reads_files (leftover_args);
172228 /* ***************** END COMMAND LINE OPTIONS *****************/
173229
230+ auto base_comp = human_base_comp;
231+ nucleotide_model t_rich_model (base_comp, bisulfite_conversion_rate, true );
232+ nucleotide_model a_rich_model (base_comp, bisulfite_conversion_rate, false );
233+
174234 guessprotocol_summary summary;
235+ summary.layout = reads_files.size () == 2 ? " paired" : " single" ;
236+
237+ if (verbose) {
238+ if (reads_files.size () == 2 )
239+ cerr << " data layout: "
240+ << " paired" << ' \n '
241+ << " read1 file: " << reads_files.front () << ' \n '
242+ << " read2 file: " << reads_files.back () << ' \n ' ;
243+ else
244+ cerr << " data layout: "
245+ << " single" << ' \n '
246+ << " read file: " << reads_files.front () << ' \n ' ;
247+ cerr << " reads to check: " << reads_to_check << ' \n '
248+ << " read name suffix length: " << name_suffix_len << ' \n '
249+ << " bisulfite conversion: " << bisulfite_conversion_rate << ' \n ' ;
250+ }
251+
175252 if (reads_files.size () == 2 ) {
176- summary.layout = " paired" ;
177253
178254 // input: paired-end reads with end1 and end2
179- std::ifstream in1 (reads_files.front ());
255+ bgzf_file in1 (reads_files.front (), " r " );
180256 if (!in1)
181- throw runtime_error (" cannot open input file: " + reads_files.front ());
257+ throw runtime_error (" cannot open file: " + reads_files.front ());
182258
183- std::ifstream in2 (reads_files.back ());
259+ bgzf_file in2 (reads_files.back (), " r " );
184260 if (!in2)
185- throw runtime_error (" cannot open input file: " + reads_files.back ());
261+ throw runtime_error (" cannot open file: " + reads_files.back ());
186262
187- FASTQRecord end_one, end_two;
188- while (in1 >> end_one && in2 >> end_two &&
189- summary.n_reads < reads_to_check) {
263+ FASTQRecord r1, r2;
264+ while (in1 >> r1 && in2 >> r2 && summary.n_reads < reads_to_check) {
190265 summary.n_reads ++;
191266
192- // two reads should be in paired-ends
193- if (!mates (name_suffix_len, end_one, end_two))
194- throw runtime_error (" expected mates, got: " +
195- end_one.name + " and " + end_two.name );
196-
197- const double end_one_a =
198- count (begin (end_one.seq ), end (end_one.seq ), ' A' ) +
199- count (begin (end_one.seq ), end (end_one.seq ), ' C' );
200- const double end_one_t =
201- count (begin (end_one.seq ), end (end_one.seq ), ' T' ) +
202- count (begin (end_one.seq ), end (end_one.seq ), ' G' );
203-
204- const double end_two_a =
205- count (begin (end_two.seq ), end (end_two.seq ), ' A' ) +
206- count (begin (end_two.seq ), end (end_two.seq ), ' C' );
207- const double end_two_t =
208- count (begin (end_two.seq ), end (end_two.seq ), ' T' ) +
209- count (begin (end_two.seq ), end (end_two.seq ), ' G' );
210-
211- const double t_rich_count = (end_one_t + end_two_a);
212- const double pbat_count = (end_one_a + end_two_t );
213-
214- summary.n_t_rich_reads += (t_rich_count > pbat_count);
267+ if (!mates (name_suffix_len, r1, r2))
268+ throw runtime_error (" expected mates: " + r1.name + " , " + r2.name );
269+
270+ const double ta = t_rich_model (r1.seq ) + a_rich_model (r2.seq );
271+ const double at = a_rich_model (r1.seq ) + t_rich_model (r2.seq );
272+
273+ const auto prob_read1_t_rich = exp (ta - log_sum_log (ta, at));
274+ summary.n_reads_wgbs += prob_read1_t_rich;
215275 }
216276 }
217277 else {
218- summary.layout = " single" ;
219278
220279 // input: single-end reads
221- std::ifstream in (reads_files.front ());
280+ bgzf_file in (reads_files.front (), " r " );
222281 if (!in)
223- throw runtime_error (" cannot open input file: " + reads_files.front ());
282+ throw runtime_error (" cannot open file: " + reads_files.front ());
224283
225284 FASTQRecord r;
226285 while (in >> r && summary.n_reads < reads_to_check) {
227286 summary.n_reads ++;
228- const double a = (count (begin (r.seq ), end (r.seq ), ' A' ) +
229- count (begin (r.seq ), end (r.seq ), ' C' ));
230- const double t = (count (begin (r.seq ), end (r.seq ), ' T' ) +
231- count (begin (r.seq ), end (r.seq ), ' G' ));
232- summary.n_t_rich_reads += (t > a);
287+
288+ const double t = t_rich_model (r.seq );
289+ const double a = a_rich_model (r.seq );
290+
291+ const auto prob_t_rich = exp (t - log_sum_log (t, a));
292+ summary.n_reads_wgbs += prob_t_rich;
233293 }
234294 }
235295
236296 summary.evaluate ();
297+
237298 if (!outfile.empty ()) {
238299 std::ofstream out (outfile);
239- if (!out) throw runtime_error (" failed to open output file : " + outfile);
300+ if (!out) throw runtime_error (" failed to open: " + outfile);
240301 out << summary.tostring () << endl;
241302 }
242303 else cout << summary.tostring () << endl;
0 commit comments