Skip to content

Commit b15a1d4

Browse files
authored
Added source files
1 parent d79be5c commit b15a1d4

57 files changed

Lines changed: 9336 additions & 0 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

src/Aligner.cpp

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
/*
2+
Identity calculates DNA sequence identity scores rapidly without alignment.
3+
4+
Copyright (C) 2020 Hani Z. Girgis, PhD
5+
6+
Academic use: Affero General Public License version 1.
7+
8+
Any restrictions to use for-profit or non-academics: Alternative commercial license is needed.
9+
10+
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
11+
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12+
13+
Please contact Dr. Hani Z. Girgis (hzgirgis@buffalo.edu) if you need more information.
14+
*/
15+
16+
/*
17+
* Aligner.cpp
18+
*
19+
* Created on: Nov 15, 2019
20+
* Author: Hani Zakaria Girgis, PhD
21+
* An instance of this class calculates the All-vs-All on a block of sequences.
22+
* It is thread safe designed for parallel execution.
23+
*
24+
*/
25+
26+
#include "Aligner.h"
27+
28+
/**
29+
* Block a will NOT be deleted here because it is
30+
* being processed by other threads as well.
31+
*/
32+
Aligner::Aligner(DataGenerator *d, ITransformer *t, Block *a, string dlmIn,
33+
bool filter, double cutoff, double e, uint8_t *keyListIn) {
34+
blockA = a;
35+
dlm = dlmIn;
36+
id = t;
37+
threshold = cutoff;
38+
error = e;
39+
keyList = keyListIn;
40+
41+
histogramSize = d->getHistogramSize();
42+
k = d->getK();
43+
maxLength = d->getMaxLength();
44+
compositionList = d->getCompositionList();
45+
46+
int alphaSize = Parameters::getAlphabetSize();
47+
48+
auto featList = t->getFeatureList();
49+
featNum = featList.size() - 1; // The bias has not been removed yet.
50+
for (auto f : featList) {
51+
if (f->getNumOfComp() == 0 && f->getName().compare("constant") != 0) {
52+
funIndexList.push_back(f->getFunIndex());
53+
}
54+
}
55+
singleFeatNum = funIndexList.size();
56+
funIndexArray = funIndexList.data();
57+
// This predictor removes the bias from the feature list
58+
predictor = GLMPredictor(featList, false);
59+
// featList is not needed beyond this point
60+
for (auto f : featList) {
61+
delete f;
62+
}
63+
featList.clear();
64+
65+
isLengthFilter = filter;
66+
67+
ssPtr = new stringstream();
68+
}
69+
70+
Aligner::~Aligner() {
71+
delete ssPtr;
72+
73+
if (buffer.size() > 0) {
74+
std::cerr << "Aligner error: Queue must be empty. " << std::endl;
75+
std::cerr << "Queue size is: " << buffer.size() << std::endl;
76+
}
77+
}
78+
79+
pair<bool, stringstream*> Aligner::start() {
80+
// Keep processing blocks as they are enqueued.
81+
while (true) {
82+
if (buffer.size() > 0) {
83+
processBlock();
84+
} else if (canStop && buffer.size() == 0) {
85+
break;
86+
}
87+
}
88+
89+
return std::make_pair(canWrite, ssPtr);
90+
}
91+
92+
/**
93+
* Thread safe
94+
* Note: This block and its contents will be deleted
95+
* after processing it.
96+
*/
97+
void Aligner::enqueueBlock(pair<Block*, bool> p) {
98+
buffer.push(p);
99+
}
100+
101+
void Aligner::processBlock() {
102+
// Determine histogram data type
103+
if (maxLength <= std::numeric_limits<int8_t>::max()) {
104+
processBlockHelper<int8_t>();
105+
} else if (maxLength <= std::numeric_limits<int16_t>::max()) {
106+
processBlockHelper<int16_t>();
107+
} else if (maxLength <= std::numeric_limits<int32_t>::max()) {
108+
processBlockHelper<int32_t>();
109+
} else if (maxLength <= std::numeric_limits<int64_t>::max()) {
110+
processBlockHelper<int64_t>();
111+
} else {
112+
std::cout << "Aligner warning: Overflow is possible however unlikely.";
113+
std::cout << std::endl;
114+
std::cout << "A histogram entry is 64 bits." << std::endl;
115+
processBlockHelper<int64_t>();
116+
}
117+
}
118+
119+
/**
120+
* Thread safe
121+
*/
122+
template<class V>
123+
void Aligner::processBlockHelper() {
124+
// Get the front of the the queue, but do not pop it yet.
125+
auto blockB = buffer.front().first;
126+
int sizeA = blockA->size();
127+
int sizeB = blockB->size();
128+
129+
static KmerHistogram<uint64_t, V> kTable(k);
130+
static KmerHistogram<uint64_t, uint64_t> monoTable(1);
131+
132+
for (int j = 0; j < sizeA; j++) {
133+
int init = 0;
134+
// If the two blocks have the same contents.
135+
if (buffer.front().second) {
136+
init = j + 1;
137+
}
138+
auto p1 = blockA->at(j);
139+
alignSeqVsBlock(p1.first, p1.second, blockB, kTable, monoTable,init);
140+
}
141+
142+
// Pop the block and free its memory
143+
FastaReader::deleteBlock(blockB);
144+
buffer.pop();
145+
}
146+
147+
template<class V>
148+
void Aligner::alignSeqVsBlock(std::string *info1, std::string *seq1,
149+
Block *blockB, KmerHistogram<uint64_t, V> &kTable,
150+
KmerHistogram<uint64_t, uint64_t> &monoTable,int init) {
151+
152+
V *h1 = kTable.build(seq1);
153+
uint64_t *mono1 = monoTable.build(seq1);
154+
155+
int l1 = seq1->size();
156+
int sizeB = blockB->size();
157+
double data[featNum];
158+
159+
for (int hani = init; hani < sizeB; hani++) {
160+
auto p2 = blockB->at(hani);
161+
string *seq2 = p2.second;
162+
int l2 = seq2->size();
163+
if (isLengthFilter
164+
&& (std::min(l1, l2) / (double) std::max(l1, l2) < threshold)) {
165+
continue;
166+
}
167+
168+
V *h2 = kTable.build(seq2);
169+
uint64_t *mono2 = monoTable.build(seq2);
170+
Statistician<V> s(histogramSize, k, h1, h2, mono1, mono2,
171+
compositionList, keyList);
172+
173+
s.calculate(funIndexArray, singleFeatNum, data);
174+
175+
double res = predictor.calculateIdentity(data);
176+
177+
if (res >= threshold - error) {
178+
canWrite = true;
179+
180+
if (res > 1.0) {
181+
res = 1.0;
182+
} else if (res < 0.0) {
183+
res = 0.0;
184+
}
185+
186+
(*ssPtr) << *info1 << dlm << *p2.first << dlm
187+
<< std::setprecision(4) << res << std::endl;
188+
}
189+
190+
delete[] h2;
191+
delete[] mono2;
192+
}
193+
194+
delete[] h1;
195+
delete[] mono1;
196+
}
197+
198+
int Aligner::getQueueSize() {
199+
return buffer.size();
200+
}
201+
202+
/**
203+
* Thread safe
204+
*/
205+
void Aligner::stop() {
206+
canStop = true;
207+
}

src/Aligner.h

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/*
2+
Identity calculates DNA sequence identity scores rapidly without alignment.
3+
4+
Copyright (C) 2020 Hani Z. Girgis, PhD
5+
6+
Academic use: Affero General Public License version 1.
7+
8+
Any restrictions to use for-profit or non-academics: Alternative commercial license is needed.
9+
10+
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
11+
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12+
13+
Please contact Dr. Hani Z. Girgis (hzgirgis@buffalo.edu) if you need more information.
14+
*/
15+
16+
/*
17+
* Aligner.h
18+
*
19+
* Created on: Nov 15, 2019
20+
* Author: Hani Z. Girgis, PhD
21+
*/
22+
23+
#ifndef ALIGNER_H_
24+
#define ALIGNER_H_
25+
26+
#include <iostream>
27+
#include <sstream>
28+
#include <vector>
29+
#include "Util.h"
30+
#include "FastaReader.h"
31+
#include "ITransformer.h"
32+
#include "DataGenerator.h"
33+
#include "KmerHistogram.h"
34+
#include "Statistician.h"
35+
#include "Feature.h"
36+
#include "LockFreeQueue.h"
37+
#include "GLMPredictor.h"
38+
39+
class Aligner {
40+
private:
41+
LockFreeQueue<pair<Block*, bool>, 500> buffer;
42+
bool canStop = false;
43+
44+
Block *blockA;
45+
std::string dlm;
46+
ITransformer *id;
47+
48+
// Needed for histograms and statisticians
49+
// Get them from data generator
50+
int histogramSize;
51+
int k;
52+
int64_t maxLength;
53+
double *compositionList;
54+
55+
std::vector<int> funIndexList;
56+
int *funIndexArray;
57+
int singleFeatNum;
58+
int featNum;
59+
60+
// If enabled aligner does not align two sequences if they
61+
// can achieve the minimum identity score.
62+
bool isLengthFilter;
63+
64+
double threshold;
65+
66+
// Used for relaxing the final filter as threshold - error
67+
double error = 0.0;
68+
69+
// Collect all results here
70+
stringstream *ssPtr;
71+
// If true write out the content of ssPtr
72+
bool canWrite = false;
73+
74+
GLMPredictor predictor;
75+
76+
uint8_t *keyList;
77+
78+
void processBlock();
79+
template<class V>
80+
void processBlockHelper();
81+
template<class V>
82+
void alignSeqVsBlock(std::string *info1, std::string *seq1, Block *blockB,
83+
KmerHistogram<uint64_t, V> &kTable,
84+
KmerHistogram<uint64_t, uint64_t> &monoTable, /*uint8_t *keyList,*/
85+
int init);
86+
87+
public:
88+
Aligner(DataGenerator*, ITransformer*, Block*, string, bool, double,
89+
double e, uint8_t*);
90+
virtual ~Aligner();
91+
void enqueueBlock(pair<Block*, bool>);
92+
pair<bool, stringstream*> start();
93+
void stop();
94+
int getQueueSize();
95+
};
96+
97+
#endif /* ALIGNER_H_ */

0 commit comments

Comments
 (0)