Skip to content

Commit 59db7a3

Browse files
More options for output format and fixed a bug when reading input in sam format instead of bam format.
1 parent 9628132 commit 59db7a3

1 file changed

Lines changed: 33 additions & 19 deletions

File tree

src/utils/uniq.cpp

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ write_hist_output(const vector<size_t> &hist, const string &histfile) {
123123
static void
124124
process_inner_buffer(const vector<bam1_t*>::const_iterator it,
125125
const vector<bam1_t*>::const_iterator jt,
126-
sam_hdr_t *hdr, htsFile *out,
126+
sam_hdr_t *hdr, samFile *out,
127127
size_t &reads_out,
128128
size_t &good_bases_out,
129129
size_t &reads_with_dups,
@@ -146,7 +146,7 @@ process_inner_buffer(const vector<bam1_t*>::const_iterator it,
146146
static void
147147
process_buffer(size_t &reads_out, size_t &good_bases_out,
148148
size_t &reads_with_dups, vector<size_t> &hist,
149-
vector<bam1_t*> &buffer, sam_hdr_t *hdr, htsFile *out) {
149+
vector<bam1_t*> &buffer, sam_hdr_t *hdr, samFile *out) {
150150
sort(begin(buffer), end(buffer), precedes_by_end_and_strand);
151151
auto it(begin(buffer));
152152
auto jt = it + 1;
@@ -170,41 +170,42 @@ process_buffer(size_t &reads_out, size_t &good_bases_out,
170170

171171

172172
static bam1_t *
173-
get_read(htsFile *hts, bam_hdr_t *hdr) {
173+
get_read(samFile *hts, sam_hdr_t *hdr) {
174174
bam1_t *b = bam_init1();
175175
const int result = sam_read1(hts, hdr, b);
176-
if (result > 0) return b;
176+
if (result >= 0) return b;
177177

178178
if (result < -1)
179179
throw runtime_error("error reading file: " + string(hts->fn));
180-
else // means EOF, so we free this read
180+
else // -1 should mean EOF, so we free this read
181181
bam_destroy1(b);
182182
return 0;
183183
}
184184

185185

186186
static void
187187
uniq(const bool VERBOSE, const bool NO_SORT_TEST,
188-
const string &cmd, const string &infile,
189-
const string &statfile, const string &histfile,
190-
const string &outfile) {
188+
const string &cmd, const string &infile,
189+
const string &statfile, const string &histfile,
190+
const bool bam_format, const string &outfile) {
191191

192-
htsFile* hts = hts_open(infile.c_str(), "r");
192+
samFile* hts = hts_open(infile.c_str(), "r");
193193
if (!hts || errno)
194194
throw runtime_error("bad htslib file: " + infile);
195195

196196
if (hts_get_format(hts)->category != sequence_data)
197197
throw runtime_error("bad file format: " + infile);
198198

199-
bam_hdr_t *hdr = sam_hdr_read(hts);
199+
sam_hdr_t *hdr = sam_hdr_read(hts);
200200
if (!hdr)
201201
throw runtime_error("failed to read header: " + infile);
202202

203203
// open the output file
204-
htsFile *out = hts_open(outfile.c_str(), "wb");
204+
samFile *out = bam_format ? hts_open(outfile.c_str(), "wb") :
205+
hts_open(outfile.c_str(), "w");
205206

206207
// take care of the output file's header
207-
bam_hdr_t *hdr_out = bam_hdr_dup(hdr);
208+
sam_hdr_t *hdr_out = bam_hdr_dup(hdr);
208209
if (sam_hdr_add_line(hdr_out, "PG", "ID",
209210
"DNMTOOLS", "VN", VERSION, "CL", cmd.c_str(), NULL))
210211
throw runtime_error("failed to format header");
@@ -274,6 +275,9 @@ main_uniq(int argc, const char **argv) {
274275
bool VERBOSE = false;
275276
bool NO_SORT_TEST = false;
276277

278+
bool bam_format = false;
279+
bool use_stdout = false;
280+
277281
// ADS: Not recommended to change this seed. It shouldn't matter
278282
// at all, and we want results to behave as deterministic.
279283
size_t the_seed = 408;
@@ -288,8 +292,10 @@ main_uniq(int argc, const char **argv) {
288292
opt_parse.add_opt("stats", 'S', "statistics output file", false, statfile);
289293
opt_parse.add_opt("hist", '\0', "histogram output file for library"
290294
" complexity analysis", false, histfile);
291-
opt_parse.add_opt("disable", 'D', "disable sort test",
292-
false, NO_SORT_TEST);
295+
opt_parse.add_opt("bam", 'B', "output in BAM format", false, bam_format);
296+
opt_parse.add_opt("stdout", '\0',
297+
"write to standard output", false, use_stdout);
298+
opt_parse.add_opt("disable", 'D', "disable sort test", false, NO_SORT_TEST);
293299
opt_parse.add_opt("seed", 's', "random seed", false, the_seed);
294300
opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE);
295301
opt_parse.set_show_defaults();
@@ -308,14 +314,21 @@ main_uniq(int argc, const char **argv) {
308314
cerr << opt_parse.option_missing_message() << endl;
309315
return EXIT_SUCCESS;
310316
}
311-
if (leftover_args.size() != 1 && leftover_args.size() != 2) {
317+
if (leftover_args.size() == 1 && !use_stdout) {
318+
cerr << opt_parse.help_message() << endl
319+
<< opt_parse.about_message() << endl;
320+
return EXIT_SUCCESS;
321+
}
322+
if (leftover_args.size() == 2 && use_stdout) {
312323
cerr << opt_parse.help_message() << endl
313324
<< opt_parse.about_message() << endl;
314325
return EXIT_SUCCESS;
315326
}
316327
const string infile(leftover_args.front());
317-
if (leftover_args.size() == 2)
328+
if (leftover_args.size() == 2 && !use_stdout)
318329
outfile = leftover_args.back();
330+
else
331+
outfile = string("-"); // so htslib can write to stdout
319332
/****************** END COMMAND LINE OPTIONS *****************/
320333

321334
// ADS: Random here is because we choose randomly when keeping one
@@ -327,11 +340,12 @@ main_uniq(int argc, const char **argv) {
327340

328341
if (VERBOSE)
329342
cerr << "[output file: " << outfile << "]" << endl
343+
<< "[output format: " << (bam_format ? "B" : "S") << "AM]" << endl
330344
<< "[command line: \"" << cmd.str() << "\"]" << endl
331-
<< "[random number seed: " << the_seed << "]" << endl
332-
<< endl;
345+
<< "[random number seed: " << the_seed << "]" << endl;
333346

334-
uniq(VERBOSE, NO_SORT_TEST, cmd.str(), infile, statfile, histfile, outfile);
347+
uniq(VERBOSE, NO_SORT_TEST, cmd.str(),
348+
infile, statfile, histfile, bam_format, outfile);
335349
}
336350
catch (const runtime_error &e) {
337351
cerr << e.what() << endl;

0 commit comments

Comments
 (0)