11/* guessprotocol: a program for guessing whether a wgbs protocol is
22 * original, pbat or random pbat
33 *
4- * Copyright (C) 2019-2022
4+ * Copyright (C) 2019-2023
55 *
66 * Authors: Andrew D. Smith
77 *
2323#include < iterator>
2424#include < algorithm>
2525#include < stdexcept>
26+ #include < filesystem>
27+ #include < sstream>
2628
2729#include " OptionParser.hpp"
2830#include " smithlab_utils.hpp"
@@ -35,6 +37,43 @@ using std::cerr;
3537using std::endl;
3638using std::min;
3739using std::runtime_error;
40+ using std::array;
41+
42+ static string
43+ guess_protocol (const double fraction_t_rich) {
44+ if (fraction_t_rich >= 0.8 ) {
45+ return " original" ;
46+ }
47+ if (fraction_t_rich <= 0.2 ) {
48+ return " pbat" ;
49+ }
50+ if (fraction_t_rich >= 0.4 && fraction_t_rich <= 0.6 ) {
51+ return " random" ;
52+ }
53+ return " inconclusive" ;
54+ }
55+
56+ struct guessprotocol_summary {
57+ string protocol;
58+ string layout;
59+ uint64_t n_t_rich_reads{};
60+ uint64_t n_reads{};
61+ double fraction_t_rich_reads{};
62+
63+ void evaluate () {
64+ fraction_t_rich_reads = static_cast <double >(n_t_rich_reads)/n_reads;
65+ protocol = guess_protocol (fraction_t_rich_reads);
66+ }
67+
68+ string tostring () const {
69+ std::ostringstream oss;
70+ oss << " protocol: " << protocol << ' \n '
71+ << " fraction_t_rich_reads: " << fraction_t_rich_reads << ' \n '
72+ << " t_rich_reads: " << n_t_rich_reads << ' \n '
73+ << " reads_examined: " << n_reads;
74+ return oss.str ();
75+ }
76+ };
3877
3978// store each read from one end
4079struct FASTQRecord {
@@ -47,64 +86,73 @@ struct FASTQRecord {
4786static bool
4887mates (const size_t to_ignore_at_end, // in case names have #0/1 name ends
4988 const FASTQRecord &a, const FASTQRecord &b) {
50- assert (to_ignore_at_end < a.name .length ());
51- return equal (begin (a.name ), end (a.name ) - to_ignore_at_end, begin (b.name ));
89+ assert (to_ignore_at_end < std::size (a.name ));
90+ return equal (cbegin (a.name ), cend (a.name ) - to_ignore_at_end,
91+ cbegin (b.name ));
5292}
5393
5494// Read 4 lines one time from fastq and fill in the FASTQRecord structure
5595static std::istream&
5696operator >>(std::istream& s, FASTQRecord &r) {
57- if (getline (s, r.name )) {
97+ constexpr auto n_error_codes = 5u ;
98+ enum err_code { none, bad_name, bad_seq, bad_plus, bad_qual };
99+ static const array<runtime_error, n_error_codes> error_msg = {
100+ runtime_error (" " ),
101+ runtime_error (" failed to parse fastq name line" ),
102+ runtime_error (" failed to parse fastq sequence line" ),
103+ runtime_error (" failed to parse fastq plus line" ),
104+ runtime_error (" failed to parse fastq qual line" )
105+ };
58106
59- if (r.name .empty () || r.name [0 ] != ' @' )
60- throw std::runtime_error (" bad name line: " + r.name );
107+ err_code ec = err_code::none;
61108
62- r. name = r. name . substr ( 1 , r.name . find_first_of ( ' ' ) );
109+ getline (s , r.name );
63110
64- if (! getline (s, r. seq ) )
65- throw runtime_error ( " failed to read expected seq line " ) ;
111+ if (r. name . empty () || r. name [ 0 ] != ' @ ' )
112+ ec = err_code::bad_name ;
66113
67- string tmp ;
68- if (! getline (s, tmp))
69- throw runtime_error ( " failed to read expected + line " );
114+ r. name . resize (r. name . find_first_of ( ' ' )) ;
115+ const auto nm_sz = r. name . find_first_of ( ' ' );
116+ copy ( cbegin (r. name ) + 1 , cbegin (r. name ) + nm_sz, begin (r. name ) );
70117
71- if (!getline (s, tmp))
72- throw runtime_error (" failed to read expected score line" );
73- }
74- return s;
75- }
118+ if (!getline (s, r.seq ))
119+ ec = err_code::bad_seq;
76120
121+ string tmp;
122+ if (!getline (s, tmp))
123+ ec = err_code::bad_plus;
77124
78- static string
79- guess_protocol (const double fraction_t_rich) {
80- if (fraction_t_rich >= 0.8 ) {
81- return " original" ;
82- }
83- if (fraction_t_rich <= 0.2 ) {
84- return " pbat" ;
85- }
86- if (fraction_t_rich >= 0.4 && fraction_t_rich <= 0.6 ) {
87- return " random" ;
88- }
89- return " inconclusive" ;
125+ if (!getline (s, tmp))
126+ ec = err_code::bad_qual;
127+
128+ if (ec != err_code::none)
129+ throw error_msg[ec];
130+
131+ return s;
90132}
91133
92134int
93135main_guessprotocol (int argc, const char **argv) {
94136
95137 try {
96138
139+ constexpr auto description = " guess bisulfite protocol for a library" ;
140+
141+ string outfile;
97142 size_t reads_to_check = 1000000 ;
98143 size_t name_suffix_len = 0 ;
99144
145+ namespace fs = std::filesystem;
146+ const string cmd_name = std::filesystem::path (argv[0 ]).filename ();
147+
100148 /* ***************** COMMAND LINE OPTIONS ********************/
101- OptionParser opt_parse (strip_path (argv[0 ]),
102- " guess whether protocol is ordinary, pbat or random" ,
149+ OptionParser opt_parse (cmd_name, description,
103150 " <end1-fastq> [<end2-fastq>]" );
104151 opt_parse.add_opt (" nreads" , ' n' , " number of reads in initial check" ,
105152 false , reads_to_check);
106153 opt_parse.add_opt (" ignore" , ' i' , " length of read name suffix "
107154 " to ignore when matching" , false , name_suffix_len);
155+ opt_parse.add_opt (" output" , ' o' , " output file name" , false , outfile);
108156 vector<string> leftover_args;
109157 opt_parse.parse (argc, argv, leftover_args);
110158 if (argc == 1 || opt_parse.help_requested ()) {
@@ -123,8 +171,11 @@ main_guessprotocol(int argc, const char **argv) {
123171 const vector<string> reads_files (leftover_args);
124172 /* ***************** END COMMAND LINE OPTIONS *****************/
125173
174+ guessprotocol_summary summary;
126175 if (reads_files.size () == 2 ) {
127- // Input: paired-end reads with end1 and end2
176+ summary.layout = " paired" ;
177+
178+ // input: paired-end reads with end1 and end2
128179 std::ifstream in1 (reads_files.front ());
129180 if (!in1)
130181 throw runtime_error (" cannot open input file: " + reads_files.front ());
@@ -133,12 +184,10 @@ main_guessprotocol(int argc, const char **argv) {
133184 if (!in2)
134185 throw runtime_error (" cannot open input file: " + reads_files.back ());
135186
136- size_t n_pairs = 0 ;
137- size_t t_rich_pairs = 0 ;
138-
139187 FASTQRecord end_one, end_two;
140- while (in1 >> end_one && in2 >> end_two && n_pairs < reads_to_check) {
141- ++n_pairs;
188+ while (in1 >> end_one && in2 >> end_two &&
189+ summary.n_reads < reads_to_check) {
190+ summary.n_reads ++;
142191
143192 // two reads should be in paired-ends
144193 if (!mates (name_suffix_len, end_one, end_two))
@@ -162,46 +211,39 @@ main_guessprotocol(int argc, const char **argv) {
162211 const double t_rich_count = (end_one_t + end_two_a);
163212 const double pbat_count = (end_one_a + end_two_t );
164213
165- t_rich_pairs += (t_rich_count > pbat_count);
214+ summary. n_t_rich_reads += (t_rich_count > pbat_count);
166215 }
167- const double fraction_t_rich = static_cast <double >(t_rich_pairs)/n_pairs;
168- cout << guess_protocol (fraction_t_rich) << ' \t '
169- << " fraction_t_rich=" << fraction_t_rich << ' \t '
170- << " t_rich_pairs=" << t_rich_pairs << ' \t '
171- << " pairs_examined=" << n_pairs << endl;
172216 }
173- else { // if (reads_files.size() == 1)
174- // Input: single-end reads
217+ else {
218+ summary.layout = " single" ;
219+
220+ // input: single-end reads
175221 std::ifstream in (reads_files.front ());
176222 if (!in)
177223 throw runtime_error (" cannot open input file: " + reads_files.front ());
178224
179- size_t n_reads = 0 ;
180- size_t t_rich_reads = 0 ;
181-
182225 FASTQRecord r;
183- while (in >> r && n_reads < reads_to_check) {
184- ++n_reads ;
226+ while (in >> r && summary. n_reads < reads_to_check) {
227+ summary. n_reads ++ ;
185228 const double a = (count (begin (r.seq ), end (r.seq ), ' A' ) +
186229 count (begin (r.seq ), end (r.seq ), ' C' ));
187230 const double t = (count (begin (r.seq ), end (r.seq ), ' T' ) +
188231 count (begin (r.seq ), end (r.seq ), ' G' ));
189- t_rich_reads += (t > a);
232+ summary. n_t_rich_reads += (t > a);
190233 }
191- const double fraction_t_rich = static_cast <double >(t_rich_reads)/n_reads;
192- cout << guess_protocol (fraction_t_rich) << ' \t '
193- << " fraction_t_rich=" << fraction_t_rich << ' \t '
194- << " t_rich_reads=" << t_rich_reads << ' \t '
195- << " reads_examined=" << n_reads << endl;
196234 }
235+
236+ summary.evaluate ();
237+ if (!outfile.empty ()) {
238+ std::ofstream out (outfile);
239+ if (!out) throw runtime_error (" failed to open output file: " + outfile);
240+ out << summary.tostring () << endl;
241+ }
242+ else cout << summary.tostring () << endl;
197243 }
198244 catch (const runtime_error &e) {
199245 cerr << e.what () << endl;
200246 return EXIT_FAILURE;
201247 }
202- catch (std::bad_alloc &ba) {
203- cerr << " ERROR: could not allocate memory" << endl;
204- return EXIT_FAILURE;
205- }
206248 return EXIT_SUCCESS;
207249}
0 commit comments