Skip to content

Commit 4e9b2e8

Browse files
authored
Merge pull request #20 from CEGRcode/revisions
BioRxiv submission update (BWA to Bowtie2)
2 parents f60907b + 9a2de36 commit 4e9b2e8

355 files changed

Lines changed: 1748716 additions & 3353 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitignore

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@ EpitopeID/sacCer3_EpiID/FASTA_genome/genome.fa.ann
55
EpitopeID/sacCer3_EpiID/FASTA_genome/genome.fa.bwt
66
EpitopeID/sacCer3_EpiID/FASTA_genome/genome.fa.pac
77
EpitopeID/sacCer3_EpiID/FASTA_genome/genome.fa.sa
8+
EpitopeID/sacCer3_EpiID/FASTA_genome/genome.fa.1.bt2
9+
EpitopeID/sacCer3_EpiID/FASTA_genome/genome.fa.2.bt2
10+
EpitopeID/sacCer3_EpiID/FASTA_genome/genome.fa.3.bt2
11+
EpitopeID/sacCer3_EpiID/FASTA_genome/genome.fa.4.bt2
12+
EpitopeID/sacCer3_EpiID/FASTA_genome/genome.fa.rev.1.bt2
13+
EpitopeID/sacCer3_EpiID/FASTA_genome/genome.fa.rev.2.bt2
814
EpitopeID/ecoli_EpiID/FASTA_genome/genome.fa.amb
915
EpitopeID/ecoli_EpiID/FASTA_genome/genome.fa.ann
1016
EpitopeID/ecoli_EpiID/FASTA_genome/genome.fa.bwt
@@ -16,3 +22,9 @@ EpitopeID/hg19_EpiID/FASTA_genome/genome.fa.ann
1622
EpitopeID/hg19_EpiID/FASTA_genome/genome.fa.bwt
1723
EpitopeID/hg19_EpiID/FASTA_genome/genome.fa.pac
1824
EpitopeID/hg19_EpiID/FASTA_genome/genome.fa.sa
25+
EpitopeID/hg19_EpiID/FASTA_genome/genome.fa.1.bt2
26+
EpitopeID/hg19_EpiID/FASTA_genome/genome.fa.2.bt2
27+
EpitopeID/hg19_EpiID/FASTA_genome/genome.fa.3.bt2
28+
EpitopeID/hg19_EpiID/FASTA_genome/genome.fa.4.bt2
29+
EpitopeID/hg19_EpiID/FASTA_genome/genome.fa.rev.1.bt2
30+
EpitopeID/hg19_EpiID/FASTA_genome/genome.fa.rev.2.bt2

DeletionID/delScripts/detect_deletion_BAM.py

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,15 @@ def calculateDeletion(PASS):
2828
if float(PASS[key]) == 0:
2929
SCORES.append((key, 'No Data Detected'))
3030
elif np.isnan(float(PASS[key])):
31-
SCORES.append((key, 'Region does not meet mappability threshold'))
31+
SCORES.append((key, 'Region does not meet mappability threshold'))
3232
else:
33-
SCORES.append((key, np.log(float(PASS[key]) / MEDIAN) / np.log(2)))
33+
SCORES.append((key, np.log(float(PASS[key]) / MEDIAN) / np.log(2)))
3434
return SCORES
3535

3636
def iterateBAM(bam, bed, READLENGTH, MAP, MAPTHRESH):
3737
# open BAM file
3838
samfile = pysam.AlignmentFile(bam, "rb")
39-
# open BED file
39+
# open BED file
4040
file = open(bed, "r")
4141

4242
# Counter of total tags mapping across all intervals
@@ -65,16 +65,16 @@ def iterateBAM(bam, bed, READLENGTH, MAP, MAPTHRESH):
6565
intervalCount[index] = intervalCount[index] + 1
6666
totalSize = totalSize + intervalSize
6767

68-
# Calculate avg tags per bp across the entire region
68+
# Calculate avg tags per bp across the entire region
6969
intervalAvg = list(map(lambda x : float(x) / float(intervalSize), intervalCount))
70-
71-
# Normalize avg reads per interval by mappability
70+
71+
# Normalize avg reads per interval by mappability
7272
for index in range(0, len(MAP[intervalID])):
73-
if float(MAP[intervalID][index]) >= MAPTHRESH:
74-
mapAvg[index] = float(intervalAvg[index] * intervalCount[index]) / float(MAP[intervalID][index])
75-
else:
76-
mapAvg[index] = float('NaN')
77-
73+
if float(MAP[intervalID][index]) >= MAPTHRESH:
74+
mapAvg[index] = float(intervalAvg[index] * intervalCount[index]) / float(MAP[intervalID][index])
75+
else:
76+
mapAvg[index] = float('NaN')
77+
7878
PASS[intervalID] = (mapAvg, intervalCount)
7979

8080
except (IndexError, ValueError):
@@ -90,7 +90,7 @@ def iterateBAM(bam, bed, READLENGTH, MAP, MAPTHRESH):
9090
if all(float(x) < MAPTHRESH for x in MAP[key]):
9191
normalizedScore = float('NaN')
9292
elif sum(PASS[key][1]) != 0:
93-
normalizedScore = np.nansum(PASS[key][0]) / sum(PASS[key][1])
93+
normalizedScore = np.nansum(PASS[key][0]) / sum(PASS[key][1])
9494
else:
9595
normalizedScore = 0
9696
SCORE[key] = normalizedScore
@@ -99,11 +99,11 @@ def iterateBAM(bam, bed, READLENGTH, MAP, MAPTHRESH):
9999
if float(totalSize) <= 0:
100100
print("ERROR!!!\tTotal size of all intervals surveyed is less than 1")
101101
sys.exit(-1)
102-
102+
103103
# Close files
104104
file.close()
105105
samfile.close()
106-
106+
107107
return SCORE,FAIL
108108

109109
def closestLength(READLENGTH, read):
@@ -120,7 +120,7 @@ def loadMap(MAP):
120120
file = open(MAP, "r")
121121
header = 0;
122122
MAP = {}
123-
# Iterate BED coord file, getting tag counts across interval
123+
# Iterate BED coord file, getting tag counts across interval
124124
for line in file:
125125
mapline = line.rstrip().split("\t")
126126
if header == 0:
@@ -140,8 +140,8 @@ def validateBAM(bam):
140140
print("BAM index not detected.\nAttempting to index now...\n")
141141
pysam.index(str(bam))
142142
if not os.path.isfile(bam + ".bai"):
143-
raise RuntimeError("BAM indexing failed, please check if BAM file is sorted")
144-
return False
143+
raise RuntimeError("BAM indexing failed, please check if BAM file is sorted")
144+
return False
145145
print("BAM index successfully generated.\n")
146146
return True
147147

@@ -150,9 +150,9 @@ def validateBAM(bam):
150150
if len(sys.argv) < 2 or not sys.argv[1].startswith("-"): sys.exit(usage)
151151
BAM = BED = MAP = OUT = ""
152152

153-
# Variable to set the mappability threshold so that we do not consider regions with mappability
154-
# below this number 0-1, Default to 0.25 meaning at least 25% of the region must be uniquely mappable
155-
# by at least one actively used readlength
153+
# Variable to set the mappability threshold so that we do not consider regions with mappability
154+
# below this number 0-1, Default to 0.25 meaning at least 25% of the region must be uniquely mappable
155+
# by at least one actively used readlength
156156
MAPTHRESH = 0.25
157157

158158
OUTPUTTHRESH = -3
@@ -173,11 +173,11 @@ def validateBAM(bam):
173173
print("No BAM file detected!!!")
174174
sys.exit(usage)
175175
elif BED == "":
176-
print("No BED Coordinate file detected!!!")
177-
sys.exit(usage)
176+
print("No BED Coordinate file detected!!!")
177+
sys.exit(usage)
178178
elif MAP == "":
179-
print("No Mappability file detected!!!")
180-
sys.exit(usage)
179+
print("No Mappability file detected!!!")
180+
sys.exit(usage)
181181
if OUT == "":
182182
OUT = os.path.splitext(os.path.basename(BAM))[0] + "_" + os.path.splitext(os.path.basename(BED))[0] + ".tab"
183183

@@ -189,11 +189,11 @@ def validateBAM(bam):
189189
print("Output file: ",OUT)
190190
print("Log2 output threshold: ",OUTPUTTHRESH)
191191

192-
# Validate BAM file
192+
# Validate BAM file
193193
if(not validateBAM(BAM)):
194-
print("ERROR!!!\tNo BAM index detected.\n")
195-
sys.exit(-1)
196-
194+
print("ERROR!!!\tNo BAM index detected.\n")
195+
sys.exit(-1)
196+
197197
# Load mappability file
198198
READLENGTH, REGIONMAP = loadMap(MAP)
199199
print("Mappability file loaded")
@@ -203,7 +203,7 @@ def validateBAM(bam):
203203
print("Genomic coordinate coverage calculated")
204204

205205
# Calculate log2 tag enrichment over median of mappability-normalized tag avg per region
206-
SCORE = calculateDeletion(PASS)
206+
SCORE = calculateDeletion(PASS)
207207
print("Depletion calculated")
208208

209209
# Output final data
@@ -215,7 +215,7 @@ def validateBAM(bam):
215215
for id,score in reversed(FINAL):
216216
try:
217217
if float(score) < OUTPUTTHRESH:
218-
output.write(id + "\t" + str(score) + "\n")
218+
output.write(id + "\t" + str(score) + "\n")
219219
else:
220220
break
221221
except(ValueError):

DeletionID/identify-Deletion.sh

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,39 @@
11
#!/bin/bash
22

33
# Required software:
4-
# python v2.15 with scipy
4+
# python3 with scipy
55

66
usage()
77
{
8-
echo 'identify-Deletion.sh -i /path/to/BAM -o /path/to/output -d /path/to/genome/database'
9-
echo 'eg: bash identify-Deletion.sh -i /input -o /output -d /sacCer3_Del'
10-
exit
8+
echo 'identify-Deletion.sh -i /path/to/BAM -o /path/to/output -d /path/to/genome/database'
9+
echo 'eg: bash identify-Deletion.sh -i /input -o /output -d /sacCer3_Del'
10+
exit
1111
}
1212

1313
if [ "$#" -ne 6 ]; then
14-
usage
14+
usage
1515
fi
1616

1717
while getopts ":i:o:d:" IN; do
18-
case "${IN}" in
19-
i)
20-
INPUT=${OPTARG}
21-
;;
22-
o)
23-
OUTPUT=${OPTARG}
24-
;;
25-
d)
26-
DATABASE=${OPTARG}
27-
;;
28-
*)
29-
usage
30-
;;
31-
esac
18+
case "${IN}" in
19+
i)
20+
INPUT=${OPTARG}
21+
;;
22+
o)
23+
OUTPUT=${OPTARG}
24+
;;
25+
d)
26+
DATABASE=${OPTARG}
27+
;;
28+
*)
29+
usage
30+
;;
31+
esac
3232
done
3333
shift $((OPTIND-1))
3434

3535
if [ -z "${INPUT}" ] || [ -z "${OUTPUT}" ] || [ -z "$DATABASE" ]; then
36-
usage
36+
usage
3737
fi
3838

3939
echo "Input folder = ${INPUT}"

EpitopeID/epiScripts/calculate_EpitopeSignificance.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,20 +32,20 @@
3232
print("No Pvalue input!!!")
3333
sys.exit(usage)
3434
elif COUNT == "":
35-
print("No Single-end epitope counts input!!!")
36-
sys.exit(usage)
35+
print("No Single-end epitope counts input!!!")
36+
sys.exit(usage)
3737
elif SIZE == "":
38-
print("No Genome-size input!!!")
39-
sys.exit(usage)
38+
print("No Genome-size input!!!")
39+
sys.exit(usage)
4040
if OUT == "":
41-
OUT = os.path.splitext(os.path.basename(BAM))[0] + ".tab"
41+
OUT = os.path.splitext(os.path.basename(BAM))[0] + ".tab"
4242

4343
# Minimum fold enrichment over background
4444
MINFOLD = 2;
4545

4646
# Open output file for writing
4747
output = open(OUT, "w")
48-
# open PE_table
48+
# open PE_table
4949
file = open(TABLE, "r")
5050

5151
TABLE = []

EpitopeID/epiScripts/count_raw_epitope.pl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828
open(OUT, ">$output") or die "Can't open $output for writing!\n";
2929
if($#SORT == -1) {
30-
print OUT "EpitopeID\tEpitopeCount\nNo Tag ID'd\n";
30+
print OUT "EpitopeID\tEpitopeCount\nNo Tag ID'd\n";
3131
} else {
3232
print OUT "EpitopeID\tEpitopeCount\n";
3333
for($x = 0; $x <= $#SORT; $x++) {

EpitopeID/epiScripts/sum_PE_epitope-alignment.pl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
chomp($line);
3737
next if((substr $line, 0, 1) eq "@");
3838
@array = split(/\t/, $line);
39-
39+
4040
# Set predicted terminus of epitope
4141
$LOC = "C-term";
4242
if($array[5] eq "+" && $array[18] eq "-") { $LOC = "N-term"; }
@@ -56,12 +56,12 @@
5656

5757
open(OUT, ">$output") or die "Can't open $output for writing!\n";
5858
if($#SORT == -1) {
59-
print OUT "Epitope could not be detected genomically\n";
59+
print OUT "Epitope could not be detected genomically\n";
6060
} else {
6161
for($x = 0; $x <= $#SORT; $x++) {
6262
@temparray = split(/\~/, $SORT[$x]{'id'});
6363
for($y = 0; $y <= $#temparray; $y++) { print OUT "$temparray[$y]\t" }
64-
print OUT "$SORT[$x]{'count'}\n";;
64+
print OUT "$SORT[$x]{'count'}\n";;
6565
}
6666
}
6767
close OUT;

0 commit comments

Comments
 (0)