-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathrnaseq-array-pipeline.sh
More file actions
241 lines (177 loc) · 7.86 KB
/
rnaseq-array-pipeline.sh
File metadata and controls
241 lines (177 loc) · 7.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#!/bin/bash
#SBATCH --array=0-3 ##specify number of samples
#SBATCH --output=%A_%a.o
#SBATCH --error=%A_%a.e
#SBATCH --job-name=%a_rnaseq-array-pipeline
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --mem=60G
#SBATCH --time=04:00:00
#SBATCH --mail-user=${USER}@nyu.edu
#SBATCH --mail-type=BEGIN,END,FAIL
module purge
module load trim_galore/0.4.4
module load cutadapt/intel/1.12
module load fastqc/0.11.5
module load hisat2/intel/2.0.5
module load samtools/intel/1.3.1
module load r/intel/3.3.2
module load umi_tools/0.4.2
module load qualimap/2.2.1
module load gffread/intel/0.9.8c
module load biopython/intel/1.70
#Configuration
analysisName="test_that_se_umi"
isPE=0 #0 if SE
isUMI=1 #1 if UMIs used
#File locations
#DIR='/scratch/cgsb/gresham/LABSHARE/Scripts/Pipelines/SE_UMI_RNAseq_gz/SE_UMI_RNAseq_head'
#DIR='/scratch/cgsb/gencore/out/Gresham/2014-09-24_HAJW7ADXX/umi/1'
#DIR='/scratch/cgsb/gencore/out/Gresham/2015-03-05_C572CACXX/umi/4'
DIR='/scratch/cgsb/gresham/LABSHARE/Scripts/Pipelines/testdata_SE_UMI'
hisat_ref='/scratch/work/cgsb/genomes/In_house/Fungi/Saccharomyces_cerevisiae/Gresham/GCF_000146045.2_R64_GreshamLabSpikes/GCF_000146045.2_R64_GreshamLabSpikes_genomic_hisat_index/genome'
#Below is the directory of the custom gff
gff='/scratch/work/cgsb/genomes/In_house/Fungi/Saccharomyces_cerevisiae/Gresham/GCF_000146045.2_R64_GreshamLabSpikes/GCF_000146045.2_R64_GreshamLabSpikes_genomic.gff'
#Below is the directory of the standard gff
#gff='/genomics/genomes/Public/Fungi/Saccharomyces_cerevisiae/NCBI/R64/GCF_000146045.2_R64_genomic.gff'
#Make a local directory and out-of-stream resource copies of this script for records
mkdir -p ${analysisName}
mkdir -p ${analysisName}/logs
cp rnaseq-array-pipeline.sh "${analysisName}/logs/rnaseq-array-pipeline.$(date +%F_%R).sh"
cp ${Read1_Adapter} ${analysisName}/logs
cp ${Read2_Adapter} ${analysisName}/logs
cd ${analysisName}
echo "Generating the gtf file from the gff, using gffread"
time gffread -E ${gff} -T -o- > temp.gtf
gtf1='temp.gtf'
## list of input files
#FILES=($(ls $DIR/*fastq.gz | perl -pe 's/^.+n0\d_(.+)\.fastq\.gz/$1/g' | sort | uniq))
FILES=($(ls $DIR/*fastq | perl -pe 's/^.+n0\d_(.+)\.fastq/$1/g' | sort | uniq))
echo "Processing these samples: ${FILES[*]}"
## $ID will hold the sample_id
ID=${FILES[$SLURM_ARRAY_TASK_ID]}
echo "This is the FILE I'm running"
echo ${FILES}
echo "This is it's ID"
echo ${ID}
echo "This is the ARRAY number I've been assigned"
echo ${SLURM_ARRAY_TASK_ID}
echo "I'm making this directory and jumping into it: ${ID}"
mkdir $ID
cd $ID
#Update gtf variable as it only exists once and is in analysis directory
gtf="../"${gtf1}
echo "I think my GTF is at ${gtf}"
#############
# Alignment #
#############
if [ "${isPE}" == 1 ] ; then
## full paths to paired mate files
INPUT_1=$(ls $DIR/*n01_$ID.fastq)
INPUT_2=$(ls $DIR/*n02_$ID.fastq)
if [ "${isUMI}" = 0 ] ; then
echo "Now I'm trimming the reads for adapters, quality?"
time trim_galore --paired --fastqc $INPUT_1 $INPUT_2
time hisat2 -p ${SLURM_CPUS_ON_NODE} -q -x ${hisat_ref} -1 *n01_${ID}_val_1.fq.gz -2 *n02_${ID}_val_2.fq.gz -S ${ID}.sam
elif [ "${isUMI}" = 1 ] ; then
echo "I assume you're using the trumiseq adapters, so trimming"
time python ../../scrape_adapter.py -u -i $INPUT_1 -o Read1_Adapter.fa
time python ../../scrape_adapter.py -p -i $INPUT_2 -o Read2_Adapter.fa
time cutadapt -a file:Read1_Adapter.fa -A file:Read2_Adapter.fa -o ${ID}_1.trim -p ${ID}_2.trim $INPUT_1 $INPUT_2
time fastqc -o ${ID}_postTrimming -f fastq ${ID}_1.trim ${ID}_2.trim
time hisat2 -p ${SLURM_CPUS_ON_NODE} -q -x ${hisat_ref} -1 ${ID}_1.trim -2 ${ID}_2.trim -S $ID.sam
fi
echo "Okay, now I sort and index that bam"
time samtools sort -o ${ID}.bam ${ID}.sam
time samtools index ${ID}.bam
echo "And delete that file, and fastqz files as well."
rm ${ID}.sam
if [ "${isUMI}" = 0 ] ; then
rm *n01_${ID}_val_1.fq.gz
rm *n02_${ID}_val_2.fq.gz
elif [ "${isUMI}" = 1 ] ; then
rm ${ID}_1.trim
rm ${ID}_2.trim
fi
if [ "${isUMI}" = 1 ] ; then
echo "You want me to dedup? Okay, deduping with umi_tools, then
sorting and deleting the unsorted bam."
time umi_tools dedup --umi-separator=: --method=directional --output-stats=${ID}.stats -I ${ID}.bam -S ${ID}.dedup.bam -L ${ID}.dedup.log
time samtools sort -o ${ID}.sorted.dedup.bam ${ID}.dedup.bam
rm ${ID}.dedup.bam
##Determine counts
echo "There were "$( samtools view ${ID}.bam | wc -l )" counts before umi-deduplication"
echo "There are "$( samtools view ${ID}.sorted.dedup.bam | wc -l )" counts after umi-deduplication"
echo "Deleting a lot of intermediate files, indexing."
rm ${ID}.bam
rm ${ID}.bam.bai
mv ${ID}.sorted.dedup.bam ${ID}.bam
time samtools index ${ID}.bam
fi
elif [ "${isPE}" = 0 ] ; then
## full paths to single end files
INPUT_1=$(ls $DIR/*n01_$ID.fastq)
if [ "${isUMI}" = 0 ] ; then
echo "Trimgaloring that ${INPUT_1}, then aligning"
time trim_galore --fastqc $INPUT_1
time hisat2 -q -x ${hisat_ref} -1 $INPUT_1 $ID.sam
elif [ "${isUMI}" = 1 ] ; then
echo "Trimming that ${INPUT_1} for adapters, quality? aligning"
echo ${ID}_1.trim
time python ../../scrape_adapter.py -u -i $INPUT_1 -o Read1_Adapter.fa
time cutadapt -a file:Read1_Adapter.fa -o ${ID}_1.trim $INPUT_1
time hisat2 -q -x ${hisat_ref} -U ${ID}_1.trim -S $ID.sam
fi
echo "sort and index"
time samtools sort -o ${ID}.bam ${ID}.sam
time samtools index ${ID}.bam
#Clean up .sam and .fastq files
rm ${ID}.sam
if [ "${isUMI}" = 1 ] ; then
rm ${ID}_1.trim
echo "and doing some deduping"
time umi_tools dedup --umi-separator=: --method=directional --output-stats=${ID}.stats -I ${ID}.bam -S ${ID}.dedup.bam -L ${ID}.dedup.log
time samtools sort -o ${ID}.sorted.dedup.bam ${ID}.dedup.bam
rm ${ID}.dedup.bam
##Determine counts
echo "There were "$( samtools view ${ID}.bam | wc -l )" counts before umi-deduplication"
echo "There are "$( samtools view ${ID}.sorted.dedup.bam | wc -l )" counts after umi-deduplication"
##Clean up non-deduplicated .bam file
rm ${ID}.bam
rm ${ID}.bam.bai
mv ${ID}.sorted.dedup.bam ${ID}.bam
samtools index ${ID}.bam
fi
else
echo "Define analysis as either a paired end or single end sequencing protocol"
fi
##############
# QC metrics #
##############
#Run Qualimap on individual BAM file
#http://qualimap.bioinfo.cipf.es/doc_html/command_line.html#command-line
#NB. for qualimap rnaseq set -pe if paired end sequencing used
#Java virtual machine uses DISPLAY environment variable to detect if the X11 system is available. Sometimes this variable is set incorrectly by the operating system or some applications. To make Qualimap work simply unset this variable:
echo "Now doing qualimaps"
unset DISPLAY
if [ "${isUMI}" = 0 ] ; then
time qualimap bamqc -bam ${ID}.bam -gff ${gff} -c -outdir ${ID}_stats -outfile ${ID}
time qualimap rnaseq -bam ${ID}.bam -gtf ${gtf} -pe -outdir rnaseq_qc_results -outfile ${ID}
elif [ "${isUMI}" = 1 ] ; then
time qualimap bamqc -bam ${ID}.bam -gff ${gff} -c
time qualimap rnaseq -bam ${ID}.bam -gtf ${gtf} -outdir rnaseq_qc_results
fi
############################
# Generate table of counts #
############################
#Call Rsubread
echo "and doing rsubread"
time Rscript --vanilla ../../feature_counts.R ${gtf1} ${isPE} > ../logs/rsubread.log
echo "Completed now moving the STDOUT and STDERR logs from sbatch
into the above directory."
#################################################
# Move output and error files to logs directory #
#################################################
mv ../../${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}.o ../logs
mv ../../${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}.e ../logs
exit 0;