Merge pull request #13 from CEGRcode/datasets

owlang · web-flow · commit b41fa802cd14 · 2021-10-16T22:45:23.000-04:00
Run EpitopeID and StrainID on HIV and CENPK ChIPseq samples
diff --git a/paper/.gitignore b/paper/.gitignore
@@ -1,3 +1,5 @@
+run.setup.err
+run.setup.out
 input/hg19.fa*
 input/sacCer3.fa*
 db/
@@ -11,6 +13,10 @@ ENCODEdata-eGFP/logs/*.out-*
 ENCODEdata-eGFP/logs/*.err-*
 ENCODEdata-eGFP/results/FASTQ
 ENCODEdata-eGFP/results/ID
+HIV_samples/logs/*.err*
+HIV_samples/logs/*.out*
+HIV_samples/results/FASTQ
+HIV_samples/results/ID
 SyntheticDeletion/synthetic_genome/
 SyntheticDeletion/logs/*.err-*
 SyntheticDeletion/logs/*.out-*
@@ -29,3 +35,8 @@ SyntheticStrain/results/hg19*
 ENCODE_CellLines/results/BAM
 ENCODE_CellLines/results/BAM-nospike
 ENCODE_CellLines/results/ID
+CENPK-chipseq/logs/*.out
+CENPK-chipseq/logs/*.err
+CENPK-chipseq/results/FASTQ
+CENPK-chipseq/results/BAM
+CENPK-chipseq/results/ID
diff --git a/paper/CENPK-chipseq/README b/paper/CENPK-chipseq/README
@@ -0,0 +1,11 @@
+# Run StrainID on CENPK datasets to evaluate StrainID's ability to detect the variant-based strain background
+
+# "Integration of multiple nutrient cues and regulation of lifespan by ribosomal transcription factor Ifh1"
+# (Cai et al, 2013)
+
+# GEO accession: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE39147
+
+# The default sacCer3 StrainID database is used
+# Download data by SRA accession with `job/00_download_data.pbs`
+# Align FASTQ files and process using `job/01_align_data.pbs`
+# Run StrainID on BAM inputs using `job/02_run_StrainID.pbs` to determine if StrainID can successfully identify the strain background
diff --git a/paper/CENPK-chipseq/job/00_download_data.pbs b/paper/CENPK-chipseq/job/00_download_data.pbs
@@ -0,0 +1,27 @@
+#!/bin/bash
+#PBS -l nodes=1:ppn=8
+#PBS -l pmem=32gb
+#PBS -l walltime=06:00:00
+#PBS -A open
+#PBS -o logs/download.data.log.out
+#PBS -e logs/download.data.log.err
+
+# Download the FASTQ files for SRA runs SRR518875-SRR518878 into
+# results/FASTQ with parallel-fastq-dump.
+
+# FIRST CHANGE PATH TO EXECUTE
+WRK=/path/to/GenoPipe/paper/CENPK-chipseq
+# Stop here if WRK was not updated; otherwise everything below would run in
+# whatever directory the job happened to start in.
+cd "$WRK" || exit 1
+
+module load anaconda3
+source activate ~/work/myconda/genopipe/
+
+[ -d logs ] || mkdir logs
+[ -d results/FASTQ ] || mkdir -p results/FASTQ
+
+# One download per SRA run accession; -t matches the 8 cores requested above.
+for SRR in SRR518875 SRR518876 SRR518877 SRR518878; do
+	parallel-fastq-dump --gzip --split-files -t 8 -O results/FASTQ -s "$SRR"
+done
diff --git a/paper/CENPK-chipseq/job/01_align_data.pbs b/paper/CENPK-chipseq/job/01_align_data.pbs
@@ -0,0 +1,44 @@
+#!/bin/bash
+#PBS -l nodes=1:ppn=8
+#PBS -l pmem=32gb
+#PBS -l walltime=06:00:00
+#PBS -A open
+#PBS -o logs/align.data.log.out
+#PBS -e logs/align.data.log.err
+
+# Align read 1 of each downloaded FASTQ to sacCer3 with bwa mem, then sort
+# and index the result as results/BAM/<SRR>.bam.
+
+# FIRST CHANGE PATH TO EXECUTE
+WRK=/path/to/GenoPipe/paper/CENPK-chipseq
+# Stop here if WRK was not updated rather than writing output elsewhere.
+cd "$WRK" || exit 1
+
+module load gcc
+module load samtools
+module load bwa
+module load anaconda3
+source activate ~/work/myconda/genopipe/
+
+# Make a bwa failure fail the whole align pipeline instead of being hidden
+# behind the exit status of samtools sort.
+set -o pipefail
+
+[ -d logs ] || mkdir logs
+[ -d results/BAM ] || mkdir -p results/BAM
+
+YGENOME=$WRK/../input/sacCer3.fa
+
+for SRR in SRR518875 SRR518876 SRR518877 SRR518878; do
+	FQ=$WRK/results/FASTQ/$SRR
+	BAM=$WRK/results/BAM/$SRR
+	# align (options are given before the index and FASTQ operands)
+	if ! bwa mem -t 8 "$YGENOME" "${FQ}_1.fastq.gz" \
+		| samtools sort \
+		> "$BAM.bam"; then
+		echo "Alignment failed for $SRR" >&2
+		exit 1
+	fi
+	# index
+	samtools index "$BAM.bam" || exit 1
+done
diff --git a/paper/CENPK-chipseq/job/02_run_StrainID.pbs b/paper/CENPK-chipseq/job/02_run_StrainID.pbs
@@ -0,0 +1,32 @@
+#!/bin/bash
+#PBS -l nodes=1:ppn=4
+#PBS -l pmem=16gb
+#PBS -l walltime=03:00:00
+#PBS -A open
+#PBS -o logs/sid.cenpk.chip.log.out
+#PBS -e logs/sid.cenpk.chip.log.err
+
+# Run identify-Strain.sh with results/BAM as input, the sacCer3 genome and
+# VCF database, and results/ID as the output directory.
+
+# FIRST CHANGE PATH TO EXECUTE
+WRK=/path/to/GenoPipe/paper/CENPK-chipseq
+# Stop here if WRK was not updated rather than running from the wrong place.
+cd "$WRK" || exit 1
+
+module load gcc
+module load samtools
+module load bwa
+module load anaconda3
+source activate ~/work/myconda/genopipe
+
+[ -d logs ] || mkdir logs
+[ -d results/ID ] || mkdir -p results/ID
+
+DB=$WRK/../db/sacCer3_VCF
+GENOME=$WRK/../input/sacCer3.fa
+
+# identify-Strain.sh is launched from inside the StrainID directory.
+STRAINID=$WRK/../../StrainID
+cd "$STRAINID" || exit 1
+bash identify-Strain.sh -i "$WRK/results/BAM" -g "$GENOME" -o "$WRK/results/ID/" -v "$DB"
diff --git a/paper/CENPK-chipseq/logs/README b/paper/CENPK-chipseq/logs/README
@@ -0,0 +1 @@
+# logfiles from STDERR and STDOUT of running job files go here
diff --git a/paper/CENPK-chipseq/results/README b/paper/CENPK-chipseq/results/README
@@ -0,0 +1 @@
+# Downloaded FASTQ files, aligned BAM files, and StrainID results go here
diff --git a/paper/HIV_samples/README b/paper/HIV_samples/README
@@ -0,0 +1,11 @@
+# Run EpitopeID on HIV datasets to evaluate EpitopeID's ability to detect HIV genome insertions
+
+# "Benzotriazoles Reactivate Latent HIV-1 through Inactivation of STAT5 SUMOylation"
+# (Bosque et al, 2017)
+
+# GEO accession: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE84199
+# HIV genome: https://www.ncbi.nlm.nih.gov/nuccore/AF324493
+
+# EpitopeID database with HIV genome as a tag and hg19 genome as genomic sequence is set up with `../setup.sh`
+# Download data by SRA accession with `job/00_download_data.pbs`
+# Run EpitopeID on FASTQ inputs using `job/01_run_EpitopeID.pbs` to determine if EpitopeID can localize HIV genome insertions
diff --git a/paper/HIV_samples/job/00_download_data.pbs b/paper/HIV_samples/job/00_download_data.pbs
@@ -0,0 +1,36 @@
+#!/bin/bash
+#PBS -l nodes=1:ppn=8
+#PBS -l pmem=32gb
+#PBS -l walltime=03:00:00
+#PBS -A open
+#PBS -o logs/download.data.log.out
+#PBS -e logs/download.data.log.err
+
+# Requires
+# parallel fastq dump v2.8.0
+
+# Download the FASTQ files for SRA runs SRR3812124-SRR3812129 into
+# results/FASTQ with parallel-fastq-dump.
+
+# FIRST CHANGE PATH TO EXECUTE
+WRK=/path/to/GenoPipe/paper/HIV_samples
+# Stop here if WRK was not updated rather than downloading elsewhere.
+cd "$WRK" || exit 1
+
+module load anaconda3
+source activate ~/work/myconda/genopipe/
+
+[ -d results/FASTQ ] || mkdir -p results/FASTQ
+[ -d logs ] || mkdir logs
+
+for SRR in SRR3812124 SRR3812125 SRR3812126 SRR3812127 SRR3812128 SRR3812129; do
+	parallel-fastq-dump --gzip --split-files -t 4 -O results/FASTQ -s "$SRR"
+done
+
+# -O already places the output in results/FASTQ, so normally nothing is left
+# in $WRK. Only move stray files if any exist; a bare `mv *.fastq.gz` with no
+# match fails and makes the job exit non-zero.
+for FQ in *.fastq.gz; do
+	[ -e "$FQ" ] || continue
+	mv -- "$FQ" results/FASTQ/
+done
diff --git a/paper/HIV_samples/job/01_run_EpitopeID.pbs b/paper/HIV_samples/job/01_run_EpitopeID.pbs
@@ -0,0 +1,34 @@
+#!/bin/bash
+#PBS -l nodes=1:ppn=4
+#PBS -l pmem=16gb
+#PBS -l walltime=03:00:00
+#PBS -A open
+#PBS -o logs/eid.hiv.log.out
+#PBS -e logs/eid.hiv.log.err
+
+# Run identify-Epitope.sh with results/FASTQ as input, the HIV EpitopeID
+# database, and results/ID as the output directory.
+
+module load gcc
+module load samtools
+module load bwa
+module load bedtools
+module load anaconda3
+# NOTE(review): the other job scripts activate ~/work/myconda/genopipe by
+# path; confirm a conda env named "genopipe" exists where this job runs.
+source activate genopipe
+
+# FIRST CHANGE PATH TO EXECUTE
+WRK=/path/to/GenoPipe/paper/HIV_samples
+# Stop here if WRK was not updated rather than running from the wrong place.
+cd "$WRK" || exit 1
+
+[ -d logs ] || mkdir logs
+[ -d results/ID ] || mkdir -p results/ID
+
+DB=$WRK/../db/hiv_EpiDB
+
+# identify-Epitope.sh is launched from inside the EpitopeID directory.
+EPITOPEID=$WRK/../../EpitopeID
+cd "$EPITOPEID" || exit 1
+bash identify-Epitope.sh -i "$WRK/results/FASTQ/" -o "$WRK/results/ID/" -d "$DB" -t 4
diff --git a/paper/HIV_samples/logs/README b/paper/HIV_samples/logs/README
@@ -0,0 +1 @@
+# logfiles from STDERR and STDOUT of running job files go here
diff --git a/paper/HIV_samples/results/README b/paper/HIV_samples/results/README
@@ -0,0 +1 @@
+# Downloaded FASTQ files and EpitopeID results go here
diff --git a/paper/README b/paper/README
@@ -5,13 +5,18 @@
 # Directory Structure overview
 
 paper
-|--scripts
 |--setup.sh
+|--scripts
+|--input
+|--db
 |--SyntheticEpitope
 |--SyntheticStrain
 |--SyntheticDeletion
 |--ENCODEdata-eGFP
-|--ENCODEdata-TFChIPseq
+|--ENCODE_CellLines
+|--HIV_samples
+|--YKOC-wgs
+|--CENPK-chipseq
 
 
 ## setup.sh
@@ -24,6 +29,10 @@ also contains the general scripts that several of the higher directory scripts c
 
 ## input
 where setup.sh puts the reference genome and the aligner indexes
+also where other reference FASTA files are stored (i.e. R500.fa, 3xFLAG.fa, and the HIV genome sequence)
+
+## db
+where setup.sh builds the input database directories, with variations as appropriate for each GenoPipe module, species, and epitope set
 
 ## SyntheticEpitope
 contains the scripts and houses the results of simulations testing EpitopeID
@@ -37,6 +46,14 @@ contains the scripts and houses the results of simulations testing DeletionID
 ## ENCODEdata-eGFP
 contains the scripts and information for downloading ENCODE eGFP data to test EpitopeID
 
-## ENCODEdata-TFChIPseq
+## ENCODE_CellLines
 contains the scripts and information for downloading ENCODE transcription factor ChIP-seq data to test StrainID
 
+## HIV_samples
+contains the scripts and information for downloading, processing, and running EpitopeID on the Bosque et al, 2017 dataset for localizing HIV genome insertions
+
+## YKOC-wgs
+contains the scripts and information for downloading, processing, and running DeletionID on the Puddu et al, 2019 dataset for identifying deletions
+
+## CENPK-chipseq
+contains the scripts and information for downloading, processing, and running StrainID on the Cai et al, 2013 dataset for identifying the strain background
diff --git a/paper/input/AF324493.2_HIV-1_vector_pNL4-3.fa b/paper/input/AF324493.2_HIV-1_vector_pNL4-3.fa
diff --git a/paper/setup.sh b/paper/setup.sh

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+# logfiles from STDERR and STDOUT of running job files go here`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+# Downloaded FASTQ files and StrainID results go here`