3636#include < string>
3737#include < vector>
3838
39- using std::cbegin;
40- using std::cend;
41- using std::count_if;
42- using std::min;
43- using std::runtime_error;
44- using std::string;
45- using std::to_string;
46- using std::uint32_t ;
47- using std::vector;
48-
49- // NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions)
50-
5139auto
5240pop_size::main (int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
5341 try {
54- static const std::size_t min_required_counts = 4 ;
55- static const string min_required_counts_error_message =
56- " max count before zero is less than min required count (" +
57- to_string (min_required_counts) + " ) duplicates removed" ;
42+ static constexpr auto max_iter_per_bootstrap = 100 ;
43+ static constexpr auto default_max_extrap = 1000000000ul ;
44+ static constexpr auto min_required_counts = 4ul ;
45+ static constexpr auto min_required_counts_error_message =
46+ " max count before zero is less than min required count (4)" ;
5847
59- string outfile;
60- string input_file_name;
61- string histogram_outfile;
48+ std:: string outfile;
49+ std:: string input_file_name;
50+ std:: string histogram_outfile;
6251
52+ // NOLINTBEGIN(*-avoid-magic-numbers)
6353 std::size_t orig_max_terms = 100 ;
64- double max_extrap = 0.0 ;
65- double step_size = 0.0 ;
54+ double max_extrap{};
6655 std::size_t n_desired_steps = 50 ;
6756 std::size_t n_bootstraps = 100 ;
6857 int diagonal = 0 ;
6958 double c_level = 0.95 ;
70- uint32_t seed = 408 ;
59+ std::uint32_t seed = 408 ;
60+ // NOLINTEND(*-avoid-magic-numbers)
7161
72- /* FLAGS */
73- bool verbose = false ;
74- bool VALS_INPUT = false ;
75- bool PAIRED_END = false ;
76- bool HIST_INPUT = false ;
77- bool SINGLE_ESTIMATE = false ;
78- bool allow_defects = false ;
62+ // flags
63+ bool verbose{ false } ;
64+ bool paired_end{ false } ;
65+ bool single_estimate{ false } ;
66+ bool allow_defects{ false } ;
67+
68+ std:: uint32_t n_threads{ 1 } ;
7969
80- #ifdef HAVE_HTSLIB
81- bool BAM_FORMAT_INPUT = false ;
82- std::size_t MAX_SEGMENT_LENGTH = 5000 ;
83- uint32_t n_threads{1 };
84- #endif
8570 CLI::App app{rlstrip (pop_size::about_msg)};
8671 argv = app.ensure_utf8 (argv);
8772 app.usage (" \n Usage: preseq pop_size [OPTIONS]" );
@@ -93,26 +78,18 @@ pop_size::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
9378 ->option_text (" FILE" )
9479 ->required ()
9580 ->check (CLI::ExistingFile);
96- app.add_option (" -o,--output" , outfile, " yield output file default: stdout" );
81+ app.add_option (" -o,--output" , outfile, " output file" )
82+ ->option_text (" FILE" );
9783 app.add_option (" -e,--extrap" , max_extrap, " maximum extrapolation" );
9884 app.add_option (" -s,--steps" , n_desired_steps, " number of steps" );
9985 app.add_option (" -n,--boots" , n_bootstraps, " number of bootstraps" );
100- app.add_option (" -c,--cval " , c_level, " level for confidence intervals" );
86+ app.add_option (" -c,--ci-level " , c_level, " level for confidence intervals" );
10187 app.add_option (" -x,--terms" , orig_max_terms, " maximum terms in estimator" );
102- #ifdef HAVE_HTSLIB
103- app.add_flag (" -B,--bam" , BAM_FORMAT_INPUT, " input is in BAM format" );
104- app.add_option (" -l,--seg_len" , MAX_SEGMENT_LENGTH,
105- " maximum segment length when merging paired end bam reads" );
106- #endif
107- app.add_flag (" -P,--pe" , PAIRED_END, " input is paired end read file" );
108- app.add_flag (" -V,--vals" , VALS_INPUT,
109- " input is a text file containing only the observed counts" );
110- app.add_flag (" -H,--hist" , HIST_INPUT,
111- " input is a text file containing the observed histogram" );
112- app.add_flag (" -Q,--quick" , SINGLE_ESTIMATE,
88+ app.add_option (" -r,--seed" , seed, " seed for random number generator" );
89+ app.add_flag (" -p,--paired-end" , paired_end, " input is paired end read file" );
90+ app.add_flag (" -Q,--quick" , single_estimate,
11391 " quick mode (no bootstraps) for confidence intervals" );
11492 app.add_flag (" -D,--defects" , allow_defects, " no testing for defects" );
115- app.add_option (" -r,--seed" , seed, " seed for random number generator" );
11693 app.add_flag (" -v,--verbose" , verbose, " print more info" );
11794 // clang-format on
11895
@@ -123,38 +100,32 @@ pop_size::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
123100 }
124101 CLI11_PARSE (app, argc, argv);
125102
103+ const auto input_format = get_input_format_type (input_file_name);
104+ if (is_unknown (input_format)) {
105+ std::cerr << " unknown input format\n " ;
106+ return EXIT_FAILURE;
107+ }
108+
109+ if (verbose) {
110+ std::cerr << " INPUT FORMAT: " << to_string (input_format) << ' \n ' ;
111+ if (is_bam (input_format) || is_bed (input_format))
112+ std::cerr << " PAIRED END: " << std::boolalpha << paired_end << ' \n ' ;
113+ }
114+
126115 const auto [n_reads, counts_hist] = [&] {
127- if (HIST_INPUT) {
128- if (verbose)
129- std::cerr << " HIST_INPUT\n " ;
116+ switch (input_format) {
117+ case input_format_type::hist:
130118 return load_histogram (input_file_name);
131- }
132- else if (VALS_INPUT) {
133- if (verbose)
134- std::cerr << " VALS_INPUT\n " ;
119+ case input_format_type::counts:
135120 return load_counts (input_file_name);
136- }
137121#ifdef HAVE_HTSLIB
138- else if (BAM_FORMAT_INPUT && PAIRED_END) {
139- if (verbose)
140- std::cerr << " PAIRED_END_BAM_INPUT\n " ;
141- return load_counts_BAM_pe (n_threads, input_file_name);
142- }
143- else if (BAM_FORMAT_INPUT) {
144- if (verbose)
145- std::cerr << " BAM_INPUT\n " ;
146- return load_counts_BAM_se (n_threads, input_file_name);
147- }
122+ case input_format_type::bam:
123+ return paired_end ? load_counts_BAM_pe (n_threads, input_file_name)
124+ : load_counts_BAM_se (n_threads, input_file_name);
148125#endif
149- else if (PAIRED_END) {
150- if (verbose)
151- std::cerr << " PAIRED_END_BED_INPUT\n " ;
152- return load_counts_bed_pe (input_file_name);
153- }
154- else { // default is single end bed file
155- if (verbose)
156- std::cerr << " BED_INPUT\n " ;
157- return load_counts_bed_se (input_file_name);
126+ default : // case input_format_type::vals:
127+ return paired_end ? load_counts_bed_pe (input_file_name)
128+ : load_counts_bed_se (input_file_name);
158129 }
159130 }();
160131
@@ -167,13 +138,14 @@ pop_size::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
167138 while (first_zero < std::size (counts_hist) && counts_hist[first_zero] > 0 )
168139 ++first_zero;
169140
170- orig_max_terms = min (orig_max_terms, first_zero - 1 );
141+ orig_max_terms = std:: min (orig_max_terms, first_zero - 1 );
171142 orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1 );
172143
173- if (max_extrap < 1.0 )
174- max_extrap = 1000000000 * distinct_reads;
175- if (step_size < 1.0 )
176- step_size = (max_extrap - distinct_reads) / n_desired_steps;
144+ if (max_extrap == 0.0 )
145+ max_extrap = default_max_extrap * distinct_reads;
146+
147+ const auto step_size =
148+ (max_extrap - distinct_reads) / static_cast <double >(n_desired_steps);
177149
178150 const std::size_t distinct_counts =
179151 std::count_if (std::cbegin (counts_hist), std::cend (counts_hist),
@@ -193,59 +165,52 @@ pop_size::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
193165 // check to make sure library is not overly saturated
194166 const double two_fold_extrap = GoodToulmin2xExtrap (counts_hist);
195167 if (two_fold_extrap < 0.0 )
196- throw runtime_error (" Saturation expected at double initial sample size."
197- " Unable to extrapolate" );
198-
199- // const std::size_t total_reads = get_counts_from_hist(counts_hist);
200-
201- // assert(total_reads == n_reads); // ADS: why commented out?
168+ throw std::runtime_error (
169+ " Saturation expected at double initial sample size."
170+ " Unable to extrapolate" );
202171
203172 // check that min required count is satisfied
204173 if (orig_max_terms < min_required_counts)
205- throw runtime_error (min_required_counts_error_message);
174+ throw std:: runtime_error (min_required_counts_error_message);
206175
207176 if (verbose)
208177 std::cerr << " [ESTIMATING YIELD CURVE]\n " ;
209178
210- vector<double > yield_estimates;
179+ std:: vector<double > yield_estimates;
211180
212- if (SINGLE_ESTIMATE ) {
213- const bool single_estimate_success = extrap_single_estimate (
181+ if (single_estimate ) {
182+ const bool success = extrap_single_estimate (
214183 verbose, allow_defects, counts_hist, orig_max_terms, diagonal,
215184 step_size, max_extrap, yield_estimates);
216- // IF FAILURE, EXIT
217- if (!single_estimate_success)
218- throw runtime_error (" single estimate failed, run "
219- " full mode for estimates" );
185+ if (!success)
186+ throw std::runtime_error (
187+ " single estimate failed, run full mode for estimates" );
220188
221189 std::ofstream of;
222190 if (!outfile.empty ())
223- of.open (outfile.c_str ());
191+ of.open (outfile.data ());
224192 std::ostream out (outfile.empty () ? std::cout.rdbuf () : of.rdbuf ());
225193
226194 out << " TOTAL_READS\t EXPECTED_DISTINCT\n " ;
227- out.setf (std::ios_base::fixed, std::ios_base::floatfield);
228- out.precision (1 );
229-
230195 out << 0 << ' \t ' << 0 << ' \n ' ;
231- for (std:: size_t i = 0 ; i < std::size (yield_estimates); ++i)
196+ for (auto i = 0u ; i < std::size (yield_estimates); ++i)
232197 out << (i + 1 ) * step_size << ' \t ' << yield_estimates[i] << ' \n ' ;
233198 }
234199 else {
235200 if (verbose)
236201 std::cerr << " [BOOTSTRAPPING HISTOGRAM]\n " ;
237202
238- const std::size_t max_iter = 100 * n_bootstraps;
203+ const std::size_t max_iter = max_iter_per_bootstrap * n_bootstraps;
239204
240- vector<vector<double >> bootstrap_estimates;
205+ std:: vector<std:: vector<double >> bootstrap_estimates;
241206 extrap_bootstrap (verbose, allow_defects, seed, counts_hist, n_bootstraps,
242207 orig_max_terms, diagonal, step_size, max_extrap,
243208 max_iter, bootstrap_estimates);
244209
245210 if (verbose)
246211 std::cerr << " [COMPUTING CONFIDENCE INTERVALS]\n " ;
247212 // yield ci
248- vector<double > yield_upper_ci_lognorm, yield_lower_ci_lognorm;
213+ std:: vector<double > yield_upper_ci_lognorm, yield_lower_ci_lognorm;
249214
250215 vector_median_and_ci (bootstrap_estimates, c_level, yield_estimates,
251216 yield_lower_ci_lognorm, yield_upper_ci_lognorm);
@@ -256,13 +221,13 @@ pop_size::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
256221 if (!outfile.empty ())
257222 of.open (outfile);
258223 std::ostream out (outfile.empty () ? std::cout.rdbuf () : of.rdbuf ());
259-
260- out.setf (std::ios_base::fixed, std::ios_base::floatfield);
261- out.precision (1 );
224+ if (!outfile.empty () && !out)
225+ throw std::runtime_error (" failed to open outfile file: " + outfile);
262226
263227 const std::size_t n_ests = std::size (yield_estimates) - 1 ;
264228 if (n_ests < 2 )
265- throw runtime_error (" problem with number of estimates in pop_size" );
229+ throw std::runtime_error (
230+ " problem with number of estimates in pop_size" );
266231
267232 const bool converged =
268233 (yield_estimates[n_ests] - yield_estimates[n_ests - 1 ] < 1.0 );
@@ -282,5 +247,3 @@ pop_size::main(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays)
282247 }
283248 return EXIT_SUCCESS;
284249}
285-
286- // NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions)
0 commit comments