Skip to content

Commit b41fa80

Browse files
authored
Merge pull request #13 from CEGRcode/datasets
Run EpitopeID and StrainID on HIV and CENPK ChIPseq samples
2 parents db11665 + 0f47b8b commit b41fa80

15 files changed

Lines changed: 428 additions & 8 deletions

File tree

paper/.gitignore

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
run.setup.err
2+
run.setup.out
13
input/hg19.fa*
24
input/sacCer3.fa*
35
db/
@@ -11,6 +13,10 @@ ENCODEdata-eGFP/logs/*.out-*
1113
ENCODEdata-eGFP/logs/*.err-*
1214
ENCODEdata-eGFP/results/FASTQ
1315
ENCODEdata-eGFP/results/ID
16+
HIV_samples/logs/*.err*
17+
HIV_samples/logs/*.out*
18+
HIV_samples/results/FASTQ
19+
HIV_samples/results/ID
1420
SyntheticDeletion/synthetic_genome/
1521
SyntheticDeletion/logs/*.err-*
1622
SyntheticDeletion/logs/*.out-*
@@ -29,3 +35,8 @@ SyntheticStrain/results/hg19*
2935
ENCODE_CellLines/results/BAM
3036
ENCODE_CellLines/results/BAM-nospike
3137
ENCODE_CellLines/results/ID
38+
CENPK-chipseq/logs/*.out
39+
CENPK-chipseq/logs/*.err
40+
CENPK-chipseq/results/FASTQ
41+
CENPK-chipseq/results/BAM
42+
CENPK-chipseq/results/ID

paper/CENPK-chipseq/README

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Run StrainID on CENPK datasets to evaluate StrainID's ability to detect the variant-based strain background
2+
3+
# "Integration of multiple nutrient cues and regulation of lifespan by ribosomal transcription factor Ifh1"
4+
# (Cai et al, 2013)
5+
6+
# GEO accession: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE39147
7+
8+
# The default sacCer3 StrainID database is used
9+
# Download data by SRA accession using `job/00_download_data.pbs`
10+
# Align FASTQ files and process using `job/01_align_data.pbs`
11+
# Run StrainID on BAM inputs using `job/02_run_StrainID.pbs` to determine if StrainID can successfully identify the strain background
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=8
#PBS -l pmem=32gb
#PBS -l walltime=06:00:00
#PBS -A open
#PBS -o logs/download.data.log.out
#PBS -e logs/download.data.log.err

# Download the CENPK ChIP-seq FASTQ files from SRA with parallel-fastq-dump.
# Output is written to results/FASTQ under WRK.

# FIRST CHANGE PATH TO EXECUTE
WRK=/path/to/GenoPipe/paper/CENPK-chipseq
# Stop if WRK is wrong; otherwise the relative paths below would be created
# in whatever directory the job started in.
cd "$WRK" || exit 1

module load anaconda3
source activate ~/work/myconda/genopipe/

mkdir -p logs results/FASTQ

# Attempt every accession even if one fails, but surface any failure through
# the job's exit status instead of losing it.
status=0
for SRR in SRR518875 SRR518876 SRR518877 SRR518878; do
  if ! parallel-fastq-dump --gzip --split-files -t 8 -O results/FASTQ -s "$SRR"; then
    echo "ERROR: download failed for $SRR" >&2
    status=1
  fi
done
exit "$status"
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=8
#PBS -l pmem=32gb
#PBS -l walltime=06:00:00
#PBS -A open
#PBS -o logs/align.data.log.out
#PBS -e logs/align.data.log.err

# Align the downloaded CENPK FASTQ files to sacCer3 with bwa mem, then sort
# and index the alignments with samtools. BAM files go to results/BAM.

# FIRST CHANGE PATH TO EXECUTE
WRK=/path/to/GenoPipe/paper/CENPK-chipseq
# Stop if WRK is wrong rather than writing output relative to another directory.
cd "$WRK" || exit 1

module load gcc
module load samtools
module load bwa
module load anaconda3
source activate ~/work/myconda/genopipe/

mkdir -p logs results/BAM

# Without pipefail the pipeline's status is that of samtools alone, so a bwa
# failure would still leave an (empty or truncated) BAM that gets indexed.
set -o pipefail

YGENOME="$WRK/../input/sacCer3.fa"

for SRR in SRR518875 SRR518876 SRR518877 SRR518878; do
  FQ="$WRK/results/FASTQ/$SRR"
  BAM="$WRK/results/BAM/$SRR"
  # align (only the *_1.fastq.gz file of each accession is used)
  if ! bwa mem -t 8 "$YGENOME" "${FQ}_1.fastq.gz" \
    | samtools sort \
    > "$BAM.bam"; then
    echo "ERROR: alignment failed for $SRR" >&2
    # Do not leave a partial BAM behind for the StrainID step to pick up.
    rm -f -- "$BAM.bam"
    exit 1
  fi
  # index
  samtools index "$BAM.bam" || exit 1
done
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=4
#PBS -l pmem=16gb
#PBS -l walltime=03:00:00
#PBS -A open
#PBS -o logs/sid.cenpk.chip.log.out
#PBS -e logs/sid.cenpk.chip.log.err

# Run StrainID on the BAM files in results/BAM using the sacCer3 VCF database
# and write the output to results/ID.

# FIRST CHANGE PATH TO EXECUTE
WRK=/path/to/GenoPipe/paper/CENPK-chipseq
# Stop if WRK is wrong; every path below is derived from it.
cd "$WRK" || exit 1

module load gcc
module load samtools
module load bwa
module load anaconda3
source activate ~/work/myconda/genopipe

mkdir -p logs results/ID

DB="$WRK/../db/sacCer3_VCF"
GENOME="$WRK/../input/sacCer3.fa"

STRAINID="$WRK/../../StrainID"
# identify-Strain.sh is invoked by relative name, so the cd must succeed.
cd "$STRAINID" || exit 1
bash identify-Strain.sh -i "$WRK/results/BAM" -g "$GENOME" -o "$WRK/results/ID/" -v "$DB"

paper/CENPK-chipseq/logs/README

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# logfiles from STDERR and STDOUT of running job files go here

paper/CENPK-chipseq/results/README

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Downloaded FASTQ files, aligned BAM files, and StrainID results go here

paper/HIV_samples/README

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Run EpitopeID on HIV datasets to evaluate EpitopeID's ability to detect HIV genome insertions
2+
3+
# "Benzotriazoles Reactivate Latent HIV-1 through Inactivation of STAT5 SUMOylation"
4+
# (Bosque et al, 2017)
5+
6+
# GEO accession: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE84199
7+
# HIV genome: https://www.ncbi.nlm.nih.gov/nuccore/AF324493
8+
9+
# EpitopeID database with HIV genome as a tag and hg19 genome as genomic sequence is set up with `../setup.sh`
10+
# Download data by SRA accession using `job/00_download_data.pbs`
11+
# Run EpitopeID on FASTQ inputs using `job/01_run_EpitopeID.pbs` to determine if EpitopeID can localize HIV genome insertions
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=8
#PBS -l pmem=32gb
#PBS -l walltime=03:00:00
#PBS -A open
#PBS -o logs/download.data.log.out
#PBS -e logs/download.data.log.err

# Download the HIV sample FASTQ files from SRA into results/FASTQ.

# Requires
# parallel fastq dump v2.8.0

WRK=/path/to/GenoPipe/paper/HIV_samples
# Stop if WRK is wrong; otherwise the relative paths below would be created
# in whatever directory the job started in.
cd "$WRK" || exit 1

module load anaconda3
source activate ~/work/myconda/genopipe/

mkdir -p results/FASTQ logs

# Attempt every accession even if one fails, but surface any failure through
# the job's exit status.
# NOTE(review): -t 4 uses half of the 8 cores requested above - confirm intent.
status=0
for SRR in SRR3812124 SRR3812125 SRR3812126 SRR3812127 SRR3812128 SRR3812129; do
  if ! parallel-fastq-dump --gzip --split-files -t 4 -O results/FASTQ -s "$SRR"; then
    echo "ERROR: download failed for $SRR" >&2
    status=1
  fi
done

# -O is expected to place the output in results/FASTQ already, in which case
# nothing matches here and a bare `mv *.fastq.gz` would fail on the literal
# pattern. Only move files that actually ended up in WRK.
shopt -s nullglob
stray=(*.fastq.gz)
shopt -u nullglob
if (( ${#stray[@]} > 0 )); then
  mv -- "${stray[@]}" results/FASTQ/ || status=1
fi
exit "$status"
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=4
#PBS -l pmem=16gb
#PBS -l walltime=03:00:00
#PBS -A open
#PBS -o logs/eid.hiv.log.out
#PBS -e logs/eid.hiv.log.err

# Run EpitopeID on the FASTQ files in results/FASTQ using the hiv_EpiDB
# database and write the output to results/ID.

module load gcc
module load samtools
module load bwa
module load bedtools
module load anaconda3
# Activate by prefix path, matching job/00_download_data.pbs; a bare
# environment name only resolves if the env lives in conda's envs directory.
source activate ~/work/myconda/genopipe/

WRK=/path/to/GenoPipe/paper/HIV_samples
# Stop if WRK is wrong; every path below is derived from it.
cd "$WRK" || exit 1

mkdir -p logs results/ID

DB="$WRK/../db/hiv_EpiDB"

EPITOPEID="$WRK/../../EpitopeID"
# identify-Epitope.sh is invoked by relative name, so the cd must succeed.
cd "$EPITOPEID" || exit 1
bash identify-Epitope.sh -i "$WRK/results/FASTQ/" -o "$WRK/results/ID/" -d "$DB" -t 4

0 commit comments

Comments
 (0)