1717 * General Public License for more details.
1818 */
1919
20+ #include < bamxx.hpp>
21+
2022#include < algorithm>
2123#include < fstream>
2224#include < iostream>
3234#include " dnmt_error.hpp"
3335#include " smithlab_utils.hpp"
3436
35- #include < bamxx.hpp>
36-
3737using std::accumulate;
3838using std::cerr;
3939using std::cout;
4040using std::endl;
4141using std::max;
4242using std::numeric_limits;
43+ using std::pair;
4344using std::runtime_error;
4445using std::string;
4546using std::unordered_map;
@@ -48,10 +49,12 @@ using std::vector;
4849
4950using bamxx::bam_rec;
5051
51- static void
52+ static pair< uint32_t , uint32_t >
5253count_states_pos (const bool INCLUDE_CPGS, const string &chrom,
5354 const bam_rec &aln, vector<size_t > &unconv,
5455 vector<size_t > &conv, vector<size_t > &err, size_t &hanging) {
56+ uint32_t n_conv = 0 , n_uconv = 0 ;
57+
5558 /* iterate through reference, query/read and fragment */
5659 const auto seq = bam_get_seq (aln);
5760 const auto beg_cig = bam_get_cigar (aln);
@@ -73,10 +76,14 @@ count_states_pos(const bool INCLUDE_CPGS, const string &chrom,
7376 (rpos >= chrom_lim || !is_guanine (chrom[rpos + 1 ]) ||
7477 INCLUDE_CPGS)) {
7578 const auto qc = seq_nt16_str[bam_seqi (seq, qpos)];
76- if (qc == ' C' )
79+ if (qc == ' C' ) {
7780 ++unconv[fpos];
78- else if (qc == ' T' )
81+ ++n_uconv;
82+ }
83+ else if (qc == ' T' ) {
7984 ++conv[fpos];
85+ ++n_conv;
86+ }
8087 else if (qc != ' N' )
8188 ++err[fpos];
8289 }
@@ -90,12 +97,15 @@ count_states_pos(const bool INCLUDE_CPGS, const string &chrom,
9097 }
9198
9299 assert (qpos == get_l_qseq (aln));
100+ return {n_conv, n_conv + n_uconv};
93101}
94102
95- static void
103+ static pair< uint32_t , uint32_t >
96104count_states_neg (const bool INCLUDE_CPGS, const string &chrom,
97105 const bam_rec &aln, vector<size_t > &unconv,
98106 vector<size_t > &conv, vector<size_t > &err, size_t &hanging) {
107+ uint32_t n_conv = 0 , n_uconv = 0 ;
108+
99109 /* iterate backward over query/read positions but forward over
100110 reference and fragment positions */
101111 const auto seq = bam_get_seq (aln);
@@ -117,10 +127,14 @@ count_states_neg(const bool INCLUDE_CPGS, const string &chrom,
117127 if (is_guanine (chrom[rpos]) &&
118128 (rpos == 0 || !is_cytosine (chrom[rpos - 1 ]) || INCLUDE_CPGS)) {
119129 const auto qc = seq_nt16_str[bam_seqi (seq, qpos - 1 )];
120- if (qc == ' C' )
130+ if (qc == ' C' ) {
121131 ++unconv[fpos];
122- else if (qc == ' T' )
132+ ++n_uconv;
133+ }
134+ else if (qc == ' T' ) {
123135 ++conv[fpos];
136+ ++n_conv;
137+ }
124138 else if (qc != ' N' )
125139 ++err[fpos];
126140 }
@@ -133,6 +147,7 @@ count_states_neg(const bool INCLUDE_CPGS, const string &chrom,
133147 }
134148 }
135149 assert (qpos == 0 );
150+ return {n_conv, n_conv + n_uconv};
136151}
137152
138153static void
@@ -223,16 +238,65 @@ write_output(const string &outfile, const vector<size_t> &ucvt_count_p,
223238 }
224239}
225240
241+ template <typename T> static inline void
242+ update_per_read_stats (const pair<T, T> &x, vector<vector<T>> &tab) {
243+ if (x.second < tab.size ()) ++tab[x.second ][x.first ];
244+ }
245+
246+ static inline vector<double >
247+ format_histogram (const vector<vector<uint32_t >> &tab,
248+ const size_t n_hist_bins) {
249+ constexpr auto epsilon = 1e-6 ;
250+ vector<double > hist (n_hist_bins, 0.0 );
251+ for (size_t i = 1 ; i < tab.size (); ++i) {
252+ const double denom = i + epsilon;
253+ for (size_t j = 0 ; j <= i; ++j) {
254+ const double frac = j / denom;
255+ const auto bin_id = std::floor (frac * n_hist_bins);
256+ assert (bin_id < hist.size ());
257+ hist[bin_id] += tab[i][j];
258+ }
259+ }
260+ return hist;
261+ }
262+
263+ static void
264+ write_hanging_read_message (std::ostream &out, const size_t n_hanging) {
265+ out << " Warning: hanging reads detected at chromosome ends "
266+ << " (N=" << n_hanging << " )." << endl
267+ << " High numbers of hanging reads suggest inconsistent " << endl
268+ << " reference genomes between stages of analysis." << endl
269+ << " This is likely to result in analysis errors." << endl;
270+ }
271+
272+ template <typename T> static void
273+ write_per_read_histogram (const vector<vector<T>> &tab, const size_t n_hist_bins,
274+ std::ostream &out) {
275+ const auto hist = format_histogram (tab, n_hist_bins);
276+ out << std::fixed;
277+ for (auto i = 0.0 ; i < hist.size (); ++i)
278+ out << std::setprecision (3 ) << i / hist.size () << ' \t '
279+ << std::setprecision (3 ) << (i + 1 ) / hist.size () << ' \t '
280+ << std::setprecision (0 ) << hist[i] << endl;
281+ }
282+
226283int
227284main_bsrate (int argc, const char **argv) {
228285 try {
229- // ASSUMED MAXIMUM LENGTH OF A FRAGMENT
230- static const size_t output_size = 10000 ;
286+ // assumed maximum length of a fragment
287+ constexpr const size_t output_size = 10000 ;
288+
289+ // Assumed maximum cytosines per fragment. Currently the per-read
290+ // information is collected as counts in a 2D array, so that later
291+ // features may be able to fit distributions based these counts.
292+ constexpr const size_t max_cytosine_per_frag = 1000 ;
231293
232294 bool VERBOSE = false ;
233295 bool INCLUDE_CPGS = false ;
234296 bool reads_are_a_rich = false ;
297+ bool report_per_read = false ;
235298 size_t n_threads = 1 ;
299+ size_t n_hist_bins = 20 ;
236300
237301 string chroms_file;
238302 string outfile;
@@ -254,6 +318,10 @@ main_bsrate(int argc, const char **argv) {
254318 seq_to_use);
255319 opt_parse.add_opt (" a-rich" , ' A' , " reads are A-rich" , false ,
256320 reads_are_a_rich);
321+ opt_parse.add_opt (" per-read" , ' p' , " report per-read conversion to terminal" ,
322+ false , report_per_read);
323+ opt_parse.add_opt (" bins" , ' b' , " number of bins for per-read" , false ,
324+ n_hist_bins);
257325 opt_parse.add_opt (" threads" , ' t' , " number of threads" , false , n_threads);
258326 opt_parse.add_opt (" verbose" , ' v' , " print more run info" , false , VERBOSE);
259327 vector<string> leftover_args;
@@ -295,8 +363,7 @@ main_bsrate(int argc, const char **argv) {
295363 bamxx::bam_header hdr (hts);
296364 if (!hdr) throw dnmt_error (" failed to read header" );
297365
298- if (n_threads > 1 )
299- tp.set_io (hts);
366+ if (n_threads > 1 ) tp.set_io (hts);
300367
301368 // map the bam header index for each "target" to a sequence in the
302369 // reference genome
@@ -307,6 +374,9 @@ main_bsrate(int argc, const char **argv) {
307374 chrom_lookup.insert ({sam_hdr_name2tid (hdr.h , names[i].data ()), i});
308375 }
309376
377+ vector<vector<uint32_t >> per_read_counts (
378+ max_cytosine_per_frag, vector<uint32_t >(max_cytosine_per_frag, 0 ));
379+
310380 vector<size_t > unconv_count_pos (output_size, 0ul );
311381 vector<size_t > conv_count_pos (output_size, 0ul );
312382 vector<size_t > unconv_count_neg (output_size, 0ul );
@@ -324,7 +394,6 @@ main_bsrate(int argc, const char **argv) {
324394 unordered_set<int32_t > chroms_seen;
325395
326396 while (hts.read (hdr, aln)) {
327-
328397 if (reads_are_a_rich) flip_conversion (aln);
329398
330399 // get the correct chrom if it has changed
@@ -351,24 +420,25 @@ main_bsrate(int argc, const char **argv) {
351420 }
352421
353422 if (use_this_chrom) {
354- // do the work for this mapped read
355- if (bam_is_rev (aln))
356- count_states_neg (INCLUDE_CPGS, chroms[chrom_idx], aln,
357- unconv_count_neg, conv_count_neg, err_neg, hanging);
358- else
359- count_states_pos (INCLUDE_CPGS, chroms[chrom_idx], aln,
360- unconv_count_pos, conv_count_pos, err_pos, hanging);
423+ // do the work for the current mapped read
424+ const auto conv_result =
425+ bam_is_rev (aln) ? count_states_neg (INCLUDE_CPGS, chroms[chrom_idx],
426+ aln, unconv_count_neg,
427+ conv_count_neg, err_neg, hanging)
428+ : count_states_pos (INCLUDE_CPGS, chroms[chrom_idx],
429+ aln, unconv_count_pos,
430+ conv_count_pos, err_pos, hanging);
431+ if (report_per_read)
432+ update_per_read_stats (conv_result, per_read_counts);
361433 }
362434 }
363435 write_output (outfile, unconv_count_pos, conv_count_pos, unconv_count_neg,
364436 conv_count_neg, err_pos, err_neg);
365437
366- if (hanging > 0 ) // some overhanging reads
367- cerr << " Warning: hanging reads detected at chrom ends "
368- << " (N=" << hanging << " )" << endl
369- << " High numbers of hanging reads suggest mismatch "
370- << " between assembly provided here and that used for mapping"
371- << endl;
438+ if (hanging > 0 ) write_hanging_read_message (cerr, hanging);
439+
440+ if (report_per_read)
441+ write_per_read_histogram (per_read_counts, n_hist_bins, cout);
372442 }
373443 catch (const runtime_error &e) {
374444 cerr << e.what () << endl;
0 commit comments