1- /* dmr: computes DMRs based on HMRs and probability of differences
2- * at individual CpGs
1+ /* dmr: computes DMRs based on HMRs and probability of differences at
2+ * individual CpGs
33 *
4- * Copyright (C) 2012-2022 University of Southern California and
5- * Andrew D. Smith
4+ * Copyright (C) 2012-2023 University of Southern California and
5+ * Andrew D. Smith
66 *
7- * Authors: Andrew D. Smith
7+ * Authors: Andrew D. Smith
88 *
9- * This program is free software: you can redistribute it and/or modify
10- * it under the terms of the GNU General Public License as published by
11- * the Free Software Foundation, either version 3 of the License, or
12- * (at your option) any later version.
9+ * This program is free software: you can redistribute it and/or
10+ * modify it under the terms of the GNU General Public License as
11+ * published by the Free Software Foundation, either version 3 of the
12+ * License, or (at your option) any later version.
1313 *
14- * This program is distributed in the hope that it will be useful,
15- * but WITHOUT ANY WARRANTY; without even the implied warranty of
16- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17- * GNU General Public License for more details.
14+ * This program is distributed in the hope that it will be useful, but
15+ * WITHOUT ANY WARRANTY; without even the implied warranty of
16+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17+ * General Public License for more details.
1818 */
1919
2020#include < string>
2323#include < fstream>
2424#include < algorithm>
2525#include < stdexcept>
26+ #include < charconv>
2627
2728#include " OptionParser.hpp"
2829#include " smithlab_utils.hpp"
2930#include " smithlab_os.hpp"
3031#include " GenomicRegion.hpp"
32+ #include " MSite.hpp"
33+
34+ #include < bamxx.hpp>
3135
3236using std::string;
3337using std::vector;
@@ -38,32 +42,141 @@ using std::pair;
3842using std::max;
3943using std::ifstream;
4044using std::runtime_error;
45+ using std::from_chars;
46+ using std::find_if;
4147
48+ using bamxx::bgzf_file;
4249
43- static void
44- read_diffs_file (const bool VERBOSE, const string &diffs_file,
45- vector<GenomicRegion> &cpgs) {
4650
47- cpgs.clear ();
51+ static bool
52+ parse_methdiff_line (const char *c, const char *c_end,
53+ string &chrom, uint32_t &pos, char &strand,
54+ string &context, double &diffscore,
55+ uint32_t &n_meth_a, uint32_t &n_unmeth_a,
56+ uint32_t &n_meth_b, uint32_t &n_unmeth_b) {
57+ constexpr auto is_sep = [](const char x) { return x == ' ' || x == ' \t ' ; };
58+ constexpr auto not_sep = [](const char x) { return x != ' ' && x != ' \t ' ; };
59+
60+ auto field_s = c;
61+ auto field_e = find_if (field_s + 1 , c_end, is_sep);
62+ bool failed = field_e == c_end;
63+
64+ // chromosome name
65+ {
66+ const uint32_t d = std::distance (field_s, field_e);
67+ chrom = string{field_s, d};
68+ }
69+
70+ field_s = find_if (field_e + 1 , c_end, not_sep);
71+ field_e = find_if (field_s + 1 , c_end, is_sep);
72+ failed = failed || field_e == c_end;
73+
74+ // position
75+ {
76+ const auto [ptr, ec] = from_chars (field_s, field_e, pos);
77+ failed = failed || ec != std::errc ();
78+ }
79+
80+ field_s = find_if (field_e + 1 , c_end, not_sep);
81+ field_e = find_if (field_s + 1 , c_end, is_sep);
82+ // below because strand is 1 base wide
83+ failed = failed || field_e != field_s + 1 || field_e == c_end;
84+
85+ // strand
86+ strand = *field_s;
87+ failed = failed || (strand != ' -' && strand != ' +' );
88+
89+ field_s = find_if (field_e + 1 , c_end, not_sep);
90+ field_e = find_if (field_s + 1 , c_end, is_sep);
91+ failed = failed || field_e == c_end;
92+
93+ // context
94+ {
95+ const uint32_t d = std::distance (field_s, field_e);
96+ context = string{field_s, d};
97+ }
98+
99+ field_s = find_if (field_e + 1 , c_end, not_sep);
100+ field_e = find_if (field_s + 1 , c_end, is_sep);
101+ failed = failed || field_e == c_end;
102+
103+ // score for difference in methylation (contingency table p-value)
104+ {
105+ #ifdef __APPLE__
106+ const int ret = std::sscanf (field_s, " %lf" , &diffscore);
107+ failed = failed || ret < 1 ;
108+ #else
109+ const auto [ptr, ec] = from_chars (field_s, field_e, diffscore);
110+ failed = failed || ec != std::errc ();
111+ #endif
112+ }
113+
114+ field_s = find_if (field_e + 1 , c_end, not_sep);
115+ field_e = find_if (field_s + 1 , c_end, is_sep);
116+ failed = failed || (field_e == c_end);
117+
118+ // counts methylated in methylome "a"
119+ {
120+ const auto [ptr, ec] = from_chars (field_s, c_end, n_meth_a);
121+ failed = failed || ec != std::errc ();
122+ }
123+
124+ field_s = find_if (field_e + 1 , c_end, not_sep);
125+ field_e = find_if (field_s + 1 , c_end, is_sep);
126+ failed = failed || field_e == c_end;
127+
128+ // counts unmethylated in methylome "a"
129+ {
130+ const auto [ptr, ec] = from_chars (field_s, c_end, n_unmeth_a);
131+ failed = failed || ec != std::errc ();
132+ }
133+
134+ field_s = find_if (field_e + 1 , c_end, not_sep);
135+ field_e = find_if (field_s + 1 , c_end, is_sep);
136+ failed = failed || field_e == c_end;
137+
138+ // counts methylated in methylome "b"
139+ {
140+ const auto [ptr, ec] = from_chars (field_s, c_end, n_meth_b);
141+ failed = failed || ec != std::errc ();
142+ }
143+
144+ field_s = find_if (field_e + 1 , c_end, not_sep);
145+
146+ // counts unmethylated in methylome "a"
147+ {
148+ const auto [ptr, ec] = from_chars (field_s, c_end, n_unmeth_b);
149+ // final field needs to fail if we haven't reached the end
150+ failed = failed || ec != std::errc () || ptr != c_end;
151+ }
152+
153+ return !failed;
154+ }
155+
156+
157+ static vector<MSite>
158+ read_diffs_file (const string &diffs_file) {
159+
160+ bgzf_file in (diffs_file, " r" );
161+ if (!in) throw runtime_error (" could not open file: " + diffs_file);
162+
163+ string chrom, name;
164+ char strand{};
165+ double diffscore{};
166+ uint32_t pos{}, meth_a{}, unmeth_a{}, meth_b{}, unmeth_b{};
48167
49- std::ifstream in (diffs_file);
50- string chrom, strand, name;
51- double diffscore;
52- size_t pos, meth_a, unmeth_a, meth_b, unmeth_b;
168+ vector<MSite> cpgs;
53169 string line;
54170 while (getline (in, line)) {
55171
56- std::istringstream iss ( line);
57- if (!(iss >> chrom >> pos >> strand >> name >>
58- diffscore >> meth_a >> unmeth_a >> meth_b >> unmeth_b))
172+ if (! parse_methdiff_line ( line. data (), line. data () + size (line),
173+ chrom, pos, strand, name, diffscore,
174+ meth_a, unmeth_a, meth_b, unmeth_b))
59175 throw runtime_error (" bad methdiff line: " + line);
60176
61- cpgs.push_back (GenomicRegion (chrom, pos, pos + 1 ,
62- name, diffscore, strand[0 ]));
177+ cpgs.emplace_back (chrom, pos, strand, name, diffscore, 1 );
63178 }
64- if (VERBOSE)
65- cerr << " [read " << cpgs.size ()
66- << " sites from " + diffs_file << " ]" << endl;
179+ return cpgs;
67180}
68181
69182static void
@@ -80,30 +193,28 @@ complement_regions(const size_t max_end, const vector<GenomicRegion> &a,
80193 cmpl.back ().set_end (max_end);
81194}
82195
83-
84- static void
85- get_chrom_ends ( const vector<GenomicRegion> &r, vector<size_t > & ends) {
196+ static vector< size_t >
197+ get_chrom_ends ( const vector<GenomicRegion> &r) {
198+ vector<size_t > ends;
86199 for (size_t i = 0 ; i < r.size () - 1 ; ++i)
87200 if (!r[i].same_chrom (r[i+1 ]))
88201 ends.push_back (i+1 );
89202 ends.push_back (r.size ());
203+ return ends;
90204}
91205
92-
93- static void
94- complement_regions (const size_t max_end,
95- const vector<GenomicRegion> &r, vector<GenomicRegion> &r_cmpl) {
96-
97- vector<size_t > r_chroms;
98- get_chrom_ends (r, r_chroms);
206+ static vector<GenomicRegion>
207+ complement_regions (const size_t max_end, const vector<GenomicRegion> &r) {
208+ vector<size_t > r_chroms = get_chrom_ends (r);
209+ vector<GenomicRegion> r_cmpl;
99210 size_t t = 0 ;
100- for (size_t i = 0 ; i < r_chroms. size (); ++i) {
211+ for (size_t i = 0 ; i < size (r_chroms ); ++i) {
101212 complement_regions (max_end, r, t, r_chroms[i], r_cmpl);
102213 t = r_chroms[i];
103214 }
215+ return r_cmpl;
104216}
105217
106-
107218static bool
108219check_no_overlap (const vector<GenomicRegion> ®ions) {
109220 for (size_t i = 1 ; i < regions.size (); ++i)
@@ -114,49 +225,50 @@ check_no_overlap(const vector<GenomicRegion> ®ions) {
114225}
115226
116227
117- static void
118- separate_sites (const vector<GenomicRegion> &dmrs,
119- const vector<GenomicRegion> &sites,
120- vector<pair<size_t , size_t > > &sep_sites) {
121- const size_t n_dmrs = dmrs.size ();
122-
123- for (size_t i = 0 ; i < n_dmrs; ++i) {
124- GenomicRegion a (dmrs[i]);
125- a.set_end (a.get_start () + 1 );
126- GenomicRegion b (dmrs[i]);
127- b.set_start (b.get_end ());
128- b.set_end (b.get_end () + 1 );
129-
130- auto a_insert = lower_bound (begin (sites), end (sites), a);
131- auto b_insert = lower_bound (begin (sites), end (sites), b);
132-
133- sep_sites.push_back (std::make_pair (a_insert - begin (sites),
134- b_insert - begin (sites)));
135- }
228+ static inline MSite
229+ get_left_msite (const GenomicRegion &r) {
230+ return {r.get_chrom (), r.get_start (), r.get_strand (), r.get_name (), 0.0 , 1u };
231+ }
232+
233+
234+ static inline MSite
235+ get_right_msite (const GenomicRegion &r) {
236+ return {r.get_chrom (), r.get_end (), r.get_strand (), r.get_name (), 0.0 , 1u };
136237}
137238
138239
139- template <class T > bool
140- starts_before (const T &a, const T &b) {
141- return (a.get_chrom () < b.get_chrom ()) ||
142- (a.same_chrom (b) && a.get_start () < b.get_start ());
240+ static vector<pair<size_t , size_t > >
241+ separate_sites (const vector<GenomicRegion> &dmrs,
242+ const vector<MSite> &sites) {
243+ vector<pair<size_t , size_t >> sep_sites;
244+ for (const auto &dmr: dmrs) {
245+ const auto a = get_left_msite (dmr);
246+ const auto b = get_right_msite (dmr);
247+ const auto a_insert = lower_bound (cbegin (sites), cend (sites), a);
248+ const auto b_insert = lower_bound (cbegin (sites), cend (sites), b);
249+ sep_sites.emplace_back (distance (cbegin (sites), a_insert),
250+ distance (cbegin (sites), b_insert));
251+ }
252+ return sep_sites;
143253}
144254
145- template <class T > bool
146- same_start (const T &a, const T &b) {
147- return a.same_chrom (b) && a.get_start () == b.get_start ();
255+
256+ static inline double
257+ pval_from_msite (const MSite &s) {
258+ return s.meth ; // abused as a p-value here
148259}
149260
150261
151262static void
152263get_cpg_stats (const bool LOW_CUTOFF, const double sig_cutoff,
153- const vector<GenomicRegion > &cpgs,
264+ const vector<MSite > &cpgs,
154265 const size_t start_idx, const size_t end_idx,
155266 size_t &total_cpgs, size_t &total_sig) {
156267 total_cpgs = end_idx - start_idx;
157268 for (size_t i = start_idx; i < end_idx; ++i) {
158- if ((LOW_CUTOFF && (cpgs[i].get_score () < sig_cutoff)) ||
159- (!LOW_CUTOFF && (cpgs[i].get_score () > 1.0 - sig_cutoff)))
269+ const auto pval = pval_from_msite (cpgs[i]);
270+ if ((LOW_CUTOFF && (pval < sig_cutoff)) ||
271+ (!LOW_CUTOFF && (pval > 1.0 - sig_cutoff)))
160272 ++total_sig;
161273 }
162274}
@@ -232,16 +344,12 @@ main_dmr(int argc, const char **argv) {
232344 if (VERBOSE)
233345 cerr << " [COMPUTING SYMMETRIC DIFFERENCE]" << endl;
234346
235-
236347 size_t max_end = 0 ;
237- for (size_t i = 0 ; i < regions_a.size (); ++i)
238- max_end = max (max_end, regions_a[i].get_end ());
239- for (size_t i = 0 ; i < regions_b.size (); ++i)
240- max_end = max (max_end, regions_b[i].get_end ());
348+ for (const auto &r: regions_a) max_end = max (max_end, r.get_end ());
349+ for (const auto &r: regions_b) max_end = max (max_end, r.get_end ());
241350
242- vector<GenomicRegion> a_cmpl, b_cmpl;
243- complement_regions (max_end, regions_a, a_cmpl);
244- complement_regions (max_end, regions_b, b_cmpl);
351+ const auto a_cmpl = complement_regions (max_end, regions_a);
352+ const auto b_cmpl = complement_regions (max_end, regions_b);
245353
246354 vector<GenomicRegion> dmrs_a, dmrs_b;
247355 genomic_region_intersection_by_base (regions_a, b_cmpl, dmrs_a);
@@ -250,15 +358,17 @@ main_dmr(int argc, const char **argv) {
250358 // separate the regions by chrom and by desert
251359 if (VERBOSE)
252360 cerr << " [READING CPG METH DIFFS]" << endl;
253- vector<GenomicRegion> cpgs;
254- read_diffs_file (VERBOSE, diffs_file, cpgs);
361+ const auto cpgs = read_diffs_file (diffs_file);
362+ if (VERBOSE)
363+ cerr << " [read " << size (cpgs)
364+ << " sites from " + diffs_file << " ]" << endl;
365+
255366 if (!check_sorted (cpgs))
256367 throw runtime_error (" CpGs not sorted in: " + diffs_file);
257368 if (VERBOSE)
258369 cerr << " [TOTAL CPGS]: " << cpgs.size () << endl;
259370
260- vector<pair<size_t , size_t > > sep_sites;
261- separate_sites (dmrs_a, cpgs, sep_sites);
371+ auto sep_sites = separate_sites (dmrs_a, cpgs);
262372
263373 for (size_t i = 0 ; i < dmrs_a.size (); ++i) {
264374 size_t total_cpgs = 0 , total_sig = 0 ;
@@ -269,8 +379,7 @@ main_dmr(int argc, const char **argv) {
269379 dmrs_a[i].set_score (total_sig);
270380 }
271381
272- sep_sites.clear ();
273- separate_sites (dmrs_b, cpgs, sep_sites);
382+ sep_sites = separate_sites (dmrs_b, cpgs);
274383
275384 for (size_t i = 0 ; i < dmrs_b.size (); ++i) {
276385 size_t total_cpgs = 0 , total_sig = 0 ;
@@ -282,11 +391,11 @@ main_dmr(int argc, const char **argv) {
282391 }
283392
284393 std::ofstream out_a (outfile_a);
285- copy (begin (dmrs_a), end (dmrs_a),
394+ copy (cbegin (dmrs_a), cend (dmrs_a),
286395 std::ostream_iterator<GenomicRegion>(out_a, " \n " ));
287396
288397 std::ofstream out_b (outfile_b);
289- copy (begin (dmrs_b), end (dmrs_b),
398+ copy (cbegin (dmrs_b), cend (dmrs_b),
290399 std::ostream_iterator<GenomicRegion>(out_b, " \n " ));
291400
292401 if (VERBOSE)
0 commit comments