Skip to content

Commit a132303

Browse files
authored
Option to print all scores including the ones below the threshold
1 parent 114eee1 commit a132303

21 files changed

Lines changed: 280 additions & 178 deletions

src/Aligner.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ Aligner::Aligner(DataGenerator *d, ITransformer *t, Block *a, string dlmIn,
6262
}
6363
featList.clear();
6464

65-
isLengthFilter = filter;
65+
canReportAll = filter;
6666

6767
ssPtr = new stringstream();
6868
}
@@ -136,7 +136,7 @@ void Aligner::processBlockHelper() {
136136
init = j + 1;
137137
}
138138
auto p1 = blockA->at(j);
139-
alignSeqVsBlock(p1.first, p1.second, blockB, kTable, monoTable,init);
139+
alignSeqVsBlock(p1.first, p1.second, blockB, kTable, monoTable, init);
140140
}
141141

142142
// Pop the block and free its memory
@@ -147,7 +147,7 @@ void Aligner::processBlockHelper() {
147147
template<class V>
148148
void Aligner::alignSeqVsBlock(std::string *info1, std::string *seq1,
149149
Block *blockB, KmerHistogram<uint64_t, V> &kTable,
150-
KmerHistogram<uint64_t, uint64_t> &monoTable,int init) {
150+
KmerHistogram<uint64_t, uint64_t> &monoTable, int init) {
151151

152152
V *h1 = kTable.build(seq1);
153153
uint64_t *mono1 = monoTable.build(seq1);
@@ -160,21 +160,23 @@ void Aligner::alignSeqVsBlock(std::string *info1, std::string *seq1,
160160
auto p2 = blockB->at(hani);
161161
string *seq2 = p2.second;
162162
int l2 = seq2->size();
163-
if (isLengthFilter
163+
164+
if (!canReportAll
164165
&& (std::min(l1, l2) / (double) std::max(l1, l2) < threshold)) {
165166
continue;
166167
}
167168

168169
V *h2 = kTable.build(seq2);
169170
uint64_t *mono2 = monoTable.build(seq2);
171+
170172
Statistician<V> s(histogramSize, k, h1, h2, mono1, mono2,
171173
compositionList, keyList);
172174

173175
s.calculate(funIndexArray, singleFeatNum, data);
174176

175177
double res = predictor.calculateIdentity(data);
176178

177-
if (res >= threshold - error) {
179+
if (canReportAll || res >= threshold - error) {
178180
canWrite = true;
179181

180182
if (res > 1.0) {

src/Aligner.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ class Aligner {
5959

6060
// If enabled aligner does not align two sequences if they
6161
// can achieve the minimum identity score.
62-
bool isLengthFilter;
62+
bool canReportAll;
6363

6464
double threshold;
6565

@@ -83,7 +83,6 @@ class Aligner {
8383
KmerHistogram<uint64_t, V> &kTable,
8484
KmerHistogram<uint64_t, uint64_t> &monoTable, /*uint8_t *keyList,*/
8585
int init);
86-
8786
public:
8887
Aligner(DataGenerator*, ITransformer*, Block*, string, bool, double,
8988
double e, uint8_t*);

src/AlignerParallel.cpp

Lines changed: 27 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ AlignerParallel<V>::AlignerParallel(int kmer, int hSize, double t,
3131
threshold = t;
3232
// error = e;
3333
relaxThreshold = t - e;
34-
isLengthFilter = f;
34+
canReportAll = f;
3535
compositionList = compList;
3636
dlm = d;
3737
threadNum = tNum;
@@ -128,16 +128,18 @@ void AlignerParallel<V>::setBlockA(Block *block, bool isAllVsAll) {
128128
((sizeA - i - 1) / threadNum) + 1);
129129

130130
#pragma omp parallel for schedule(static) num_threads(threadNum)
131-
for (int j = i + 1; j < sizeA - 1; j++) {
132-
double minimum = lenList[i];
133-
double maximum = lenList[j];
134-
if (maximum < minimum) {
135-
minimum = lenList[j];
136-
maximum = lenList[i];
137-
}
138-
139-
if (isLengthFilter && (minimum / maximum < threshold)) {
140-
continue;
131+
for (int j = i + 1; j < sizeA; j++) {
132+
133+
if (!canReportAll) {
134+
double minimum = lenList[i];
135+
double maximum = lenList[j];
136+
if (maximum < minimum) {
137+
minimum = lenList[j];
138+
maximum = lenList[i];
139+
}
140+
if ((minimum / maximum < threshold)) {
141+
continue;
142+
}
141143
}
142144

143145
Statistician < V
@@ -148,7 +150,7 @@ void AlignerParallel<V>::setBlockA(Block *block, bool isAllVsAll) {
148150
s.calculate(funIndexArray, singleFeatNum, data);
149151
double res = predictor.calculateIdentity(data);
150152

151-
if (res >= relaxThreshold) {
153+
if (canReportAll || res >= relaxThreshold) {
152154
printList->at(omp_get_thread_num())->push_back(
153155
std::make_pair(infoList[j], res));
154156
}
@@ -239,7 +241,7 @@ void AlignerParallel<V>::output(
239241
res = 0.0;
240242
}
241243

242-
out << *info1 << dlm << *p.first << dlm << std::setprecision(4)
244+
out << *info1 << dlm << *p.first << dlm << std::setprecision(8)
243245
<< res << std::endl;
244246
}
245247
delete v;
@@ -266,15 +268,16 @@ void AlignerParallel<V>::processBlockB(Block *block) {
266268
#pragma omp parallel for schedule(static) num_threads(threadNum)
267269
for (int h = 0; h < sizeB; h++) {
268270

269-
double minimum = lenList[i];
270-
double maximum = lenListB[h];
271-
if (maximum < minimum) {
272-
minimum = lenListB[h];
273-
maximum = lenList[i];
274-
}
275-
276-
if (isLengthFilter && (minimum / maximum < threshold)) {
277-
continue;
271+
if (!canReportAll) {
272+
double minimum = lenList[i];
273+
double maximum = lenListB[h];
274+
if (maximum < minimum) {
275+
minimum = lenListB[h];
276+
maximum = lenList[i];
277+
}
278+
if ((minimum / maximum < threshold)) {
279+
continue;
280+
}
278281
}
279282

280283
Statistician < V
@@ -285,7 +288,7 @@ void AlignerParallel<V>::processBlockB(Block *block) {
285288
s.calculate(funIndexArray, singleFeatNum, data);
286289
double res = predictor.calculateIdentity(data);
287290

288-
if (res >= relaxThreshold) {
291+
if (canReportAll || res >= relaxThreshold) {
289292
printList->at(omp_get_thread_num())->push_back(
290293
std::make_pair(infoListB[h], res));
291294
}
@@ -309,7 +312,7 @@ void AlignerParallel<V>::processBlockB(Block *block) {
309312
}
310313
}
311314

312-
// Wait for the last print case
315+
// Wait for the last print case
313316
if (printTask.valid()) {
314317
printTask.get();
315318
threadNum++;

src/AlignerParallel.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ class AlignerParallel {
5959
int featNum;
6060
int singleFeatNum;
6161

62-
bool isLengthFilter;
62+
bool canReportAll;
6363
double threshold;
6464
double relaxThreshold;
6565

src/BestFirst.cpp

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
/*
2-
Identity calculates DNA sequence identity scores rapidly without alignment.
2+
Identity calculates DNA sequence identity scores rapidly without alignment.
33
4-
Copyright (C) 2020 Hani Z. Girgis, PhD
4+
Copyright (C) 2020 Hani Z. Girgis, PhD
55
6-
Academic use: Affero General Public License version 1.
6+
Academic use: Affero General Public License version 1.
77
8-
Any restrictions to use for-profit or non-academics: Alternative commercial license is needed.
8+
Any restrictions to use for-profit or non-academics: Alternative commercial license is needed.
99
10-
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
11-
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
10+
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
11+
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
1212
13-
Please contact Dr. Hani Z. Girgis (hzgirgis@buffalo.edu) if you need more information.
14-
*/
13+
Please contact Dr. Hani Z. Girgis (hzgirgis@buffalo.edu) if you need more information.
14+
*/
1515

1616
/*
1717
* BestFirst.cpp
@@ -63,13 +63,22 @@ BestFirst<T>::BestFirst(const Matrix &f, const Matrix &l,
6363
best = v.first;
6464
maximum = v.second;
6565
eCount = 0;
66+
67+
// Print new features
68+
std::cout << "Better performance of: " << maximum << std::endl;
69+
const int *list = best.getList();
70+
int s = best.getSize();
71+
for (int a = 0; a < s; a++) {
72+
std::cout << "\t" << fList.at(list[a])->getName() << std::endl;
73+
}
6674
}
6775
// Step 5
6876
std::vector < Node > eList = v.first.expand(fNum);
6977
eCount++;
7078

7179
// Step 6
7280
int childNum = eList.size();
81+
// double resultList[childNum] { 0.0 };
7382
std::vector<double> resultList(childNum, 0.0);
7483
#pragma omp parallel for schedule(static) num_threads(threadNum)
7584
for (int i = 0; i < childNum; i++) {

src/DataGenerator.cpp

Lines changed: 21 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -25,27 +25,15 @@
2525
/**
2626
* This constructor should be used for all vs. all
2727
*/
28-
DataGenerator::DataGenerator(std::string fileName) {
29-
30-
FastaReader reader(fileName, Parameters::getBlockSize());
28+
DataGenerator::DataGenerator(std::string fileName, int blockSize) {
29+
FastaReader reader(fileName, blockSize);
3130
block = reader.read();
3231

33-
// Calculate average length
34-
double sum = 0.0;
35-
for (auto p : *block) {
36-
sum += p.second->size();
37-
}
38-
int seqNum = block->size();
39-
average = round(sum / seqNum);
40-
cout << "Average: " << average << endl;
41-
4232
calculateK();
43-
44-
// Calculate histogram size
45-
histogramSize = pow(4, k);
46-
cout << "Histogram size: " << histogramSize << endl;
33+
calculateHistSize();
4734

4835
// Find maximum length
36+
int seqNum = block->size();
4937
for (int h = 0; h < seqNum; h++) {
5038
uint64_t len = block->at(h).second->size();
5139
if (len > maxLength) {
@@ -128,34 +116,37 @@ DataGenerator::DataGenerator(std::string dbName, std::string qryFile,
128116
maxLength = 2 * maxLength;
129117
}
130118

131-
// Calculate average length
132-
double sum = 0.0;
133-
for (auto p : *block) {
134-
sum += p.second->size();
135-
}
136-
137-
average = round(sum / block->size());
138-
cout << "Average: " << average << endl;
139-
140119
calculateK();
141-
142-
// Calculate histogram size
143-
histogramSize = pow(4, k);
144-
cout << "Histogram size: " << histogramSize << endl;
120+
calculateHistSize();
145121

146122
// Free up memory
147123
delete qryBlock;
148124
}
149125

150126
void DataGenerator::calculateK() {
127+
// Calculate average length
128+
double sum = 0.0;
129+
for (auto p : *block) {
130+
sum += p.second->size();
131+
}
132+
133+
average = round(sum / block->size());
134+
std::cout << "Average: " << average << std::endl;
135+
// Calculate k
151136
k = ceil(log(average) / log(Parameters::getAlphabetSize()))
152137
- Parameters::getKRelax();
153138
if (k < 2) {
154139
std::cerr << "DataGenerator warning: K is too small. ";
155140
std::cerr << "K is set to 2." << std::endl;
156141
k = 2;
157142
}
158-
cout << "K: " << k << endl;
143+
std::cout << "K: " << k << std::endl;
144+
}
145+
146+
void DataGenerator::calculateHistSize() {
147+
// Calculate histogram size
148+
histogramSize = pow(4, k);
149+
std::cout << "Histogram size: " << histogramSize << std::endl;
159150
}
160151

161152
DataGenerator::~DataGenerator() {

src/DataGenerator.h

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
/*
2-
Identity calculates DNA sequence identity scores rapidly without alignment.
2+
Identity calculates DNA sequence identity scores rapidly without alignment.
33
4-
Copyright (C) 2020 Hani Z. Girgis, PhD
4+
Copyright (C) 2020 Hani Z. Girgis, PhD
55
6-
Academic use: Affero General Public License version 1.
6+
Academic use: Affero General Public License version 1.
77
8-
Any restrictions to use for-profit or non-academics: Alternative commercial license is needed.
8+
Any restrictions to use for-profit or non-academics: Alternative commercial license is needed.
99
10-
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
11-
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
10+
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
11+
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
1212
13-
Please contact Dr. Hani Z. Girgis (hzgirgis@buffalo.edu) if you need more information.
14-
*/
13+
Please contact Dr. Hani Z. Girgis (hzgirgis@buffalo.edu) if you need more information.
14+
*/
1515

1616
/*
1717
* DataGenerator.h
@@ -32,6 +32,7 @@
3232
class DataGenerator {
3333
private:
3434
void calculateK();
35+
void calculateHistSize();
3536

3637
protected:
3738
Block *block;
@@ -45,12 +46,12 @@ class DataGenerator {
4546
double *compositionList;
4647

4748
// Model of all data (single statistics only)
48-
Matrix *fTable;
49+
Matrix *fTable = nullptr;
4950
// Model of associated labels (identity scores)
50-
Matrix *lTable;
51+
Matrix *lTable = nullptr;
5152

5253
public:
53-
DataGenerator(std::string);
54+
DataGenerator(std::string, int blockSize = Parameters::getBlockSize());
5455
DataGenerator(std::string, std::string, double);
5556
virtual ~DataGenerator();
5657

src/FastaReader.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ FastaReader::FastaReader(std::string fileNameIn, int blockSizeIn,
7070
codeMap['B'] = 'T';
7171
codeMap['V'] = 'A';
7272
codeMap['D'] = 'T';
73+
// Added on 11/19/2020 to enable reading multi-sequence alignments
74+
codeMap['-'] = '-';
7375
}
7476

7577
FastaReader::~FastaReader() {

0 commit comments

Comments
 (0)