-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsra2bam.sh
More file actions
136 lines (105 loc) · 3.91 KB
/
sra2bam.sh
File metadata and controls
136 lines (105 loc) · 3.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
###############################################################
# This script is design to obtain the BAM files from SRA files.
###############################################################
set -e
set -u
# <<ASSUMPTION>>: the relevant files have already been obtained from GEO:
# Dekker et al. = GSE18199
# Sofueva et al. = GSE49017
# Rickman et al. = GSE37752
dekker=(SRR027957.sra SRR027958.sra)
sofueva=(SRR941267.sra SRR941268.sra SRR941269.sra SRR941270.sra SRR941271.sra SRR941272.sra SRR941273.sra SRR941274.sra SRR941275.sra SRR941276.sra SRR941277.sra SRR941278.sra SRR941279.sra SRR941280.sra SRR941281.sra SRR941282.sra)
rickman=(SRR493818.sra SRR493819.sra SRR493820.sra SRR493821.sra SRR493822.sra SRR493823.sra SRR493824.sra SRR493825.sra SRR493826.sra SRR493827.sra)
# <<ASSUMPTION>>: Bowtie2 and cutadapt are installed.
bowcmd=bowtie2
cutcmd=cutadapt
# <<ASSUMPTION>>: hg19 and mm10 indices have been built.
# This can be done by making a FASTA file from a BSGenome object:
#
# > bs <- BSGenome.Mmusculus.UCSC.mm10
# > outfile <- "mm10.fa"
# > for (chr in seqnames(bs)) {
# + y <- getSeq(bs, names = chr, start = 1, end = length(bs[[chr]]))
# + y <- DNAStringSet(y)
# + names(y) <- chr
# + writeXStringSet(filepath = outfile, y, append = TRUE)
# + }
#
# ... and then running "bowtie2-build mm10.fa mm10_index/mm10" in the shell.
# Alternatively, you can use your own FASTA file.
hg19=hg19_index/hg19
mm10=mm10_index/mm10
# <<ASSUMPTION>>: FixMateInformation and MarkDuplicates are available from the Picard suite.
fixcmd=FixMateInformation
markcmd=MarkDuplicates
# <<ASSUMPTION>>: samtools has been installed.
samcmd=samtools
# <<ASSUMPTION>>: diffHic has been installed on the relevant R installation.
rcmd=R
# <<ASSUMPTION>>: fastq-dump (from NCBI's SRA toolkit) has been installed.
fqdcmd=fastq-dump
###############################################################
# We pull out the Hi-C mapping script from diffHic.
mapper=hicmap.py
echo "require(diffHic); file.copy(system.file('python', 'presplit_map.py', package='diffHic'), '$mapper', overwrite=TRUE)" | ${rcmd} --no-save
if [ ! -e $mapper ]; then
echo "Extraction of Hi-C mapping script failed."
exit 1
fi
vtmp=`mktemp -d --tmpdir=.`
tmpfile=blah.txt
###############################################################
# Running through each set of files.
ligation=AAGCTAGCTT # all of them use the same ligation signature.
for i in {1..3}
do
if [[ $i -eq 1 ]]; then
allfiles=(${dekker[@]})
genome=$hg19
phred=33
elif [[ $i -eq 2 ]]; then
allfiles=(${sofueva[@]})
genome=$mm10
phred=64
else
allfiles=(${rickman[@]})
genome=$hg19
phred=33
fi
for sra in "${allfiles[@]}"
do
${fqdcmd} --split-files $sra
prefix=`echo $sra | sed "s/\.sra$//g"`
read1=${prefix}_1.fastq
read2=${prefix}_2.fastq
rawbam=temp.bam
python $mapper -o $rawbam -G $genome -1 $read1 -2 $read2 --sig $ligation -P $phred --cmd "${bowcmd} -p 8" --cut $cutcmd
fixbam=fixed.bam
${fixcmd} I=$rawbam O=$fixbam TMP_DIR=$vtmp VALIDATION_STRINGENCY=SILENT SORT_ORDER=coordinate
markbam=marked.bam
${markcmd} I=$fixbam O=$markbam M=$tmpfile TMP_DIR=$vtmp AS=true REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=SILENT
${samcmd} sort -n $markbam $prefix
rm $rawbam $fixbam $markbam
rm $read1 $read2
done
done
###############################################################
# Mopping up.
rm $tmpfile
rm -r $vtmp
# Printing out a log file with version numbers.
ticket=success.log
if [[ -e $ticket ]]; then
rm $ticket
fi
set +e
$bowcmd --version | grep "bowtie2" >> $ticket
$cutcmd --version >> $ticket
stored=`$samcmd 2>&1 | grep -i "Version:"`
printf "Samtools $stored\n" >> $ticket
stored=`$markcmd --version 2>&1`
printf "MarkDuplicates version $stored\n" >> $ticket
stored=`$fixcmd --version 2>&1`
printf "FixMateInformation version $stored\n" >> $ticket
$fqdcmd --version | grep "." >> $ticket
###############################################################