Skip to content

Commit 0595656

Browse files
authored
Merge pull request #15 from CEGRcode/datasets
Run StrainID on BY4742 ChIPseq data
2 parents 908adc3 + 3d09e83 commit 0595656

10 files changed

Lines changed: 186 additions & 1 deletion

File tree

paper/.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ SyntheticStrain/results/hg19*
3535
ENCODE_CellLines/results/BAM
3636
ENCODE_CellLines/results/BAM-nospike
3737
ENCODE_CellLines/results/ID
38+
BY4742-chipseq/logs/*.out
39+
BY4742-chipseq/logs/*.err
40+
BY4742-chipseq/results/FASTQ
41+
BY4742-chipseq/results/BAM
42+
BY4742-chipseq/results/ID
3843
CENPK-chipseq/logs/*.out
3944
CENPK-chipseq/logs/*.err
4045
CENPK-chipseq/results/FASTQ

paper/BY4742-chipseq/README

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Run StrainID on BY4742 datasets to evaluate StrainID's ability to detect the variant-based strain background
2+
3+
# "Molecular mechanisms that distinguish TFIID housekeeping from regulatable SAGA promoters"
4+
# (de Jonge et al, 2017)
5+
6+
# GEO accession: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE81787
7+
8+
# The default sacCer3 StrainID database is used
9+
# Download data by SRA accession using `job/00_download_data.pbs`
10+
# Align FASTQ files and process using `job/01_align_data.pbs`
11+
# Run StrainID on BAM inputs using `job/02_run_StrainID.pbs` to determine if StrainID can successfully identify the strain background
12+
13+
# Specifically look into performance in distinguishing two closely related strains: BY4741 and BY4742
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
Run,ReleaseDate,LoadDate,spots,bases,spots_with_mates,avgLength,size_MB,AssemblyName,download_path,Experiment,LibraryName,LibraryStrategy,LibrarySelection,LibrarySource,LibraryLayout,InsertSize,InsertDev,Platform,Model,SRAStudy,BioProject,Study_Pubmed_id,ProjectID,Sample,BioSample,SampleType,TaxID,ScientificName,SampleName,g1k_pop_code,source,g1k_analysis_group,Subject_ID,Sex,Disease,Tumor,Affection_Status,Analyte_Type,Histological_Type,Body_Site,CenterName,Submission,dbgap_study_accession,Consent,RunHash,ReadHash
2+
SRR3497399,2017-05-11 00:00:55,2016-05-11 11:04:54,67640691,3281838684,0,48,1816,GCF_000146045.2,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-7/SRR3497399/SRR3497399.1,SRX1756249,Hsf1_ChIP,ChIP-Seq,ChIP,GENOMIC,SINGLE,0,0,ABI_SOLID,AB 5500xl-W Genetic Analysis System,SRP074822,PRJNA321111,,321111,SRS1432863,SAMN04966256,simple,4932,Saccharomyces cerevisiae,Hsf1_ChIP,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,F73F96964C383156F61A669AC0074AD9,443BEFEC858B5AF9C065249419528A69
3+
SRR3497410,2017-05-11 00:00:55,2016-05-11 11:08:21,80694279,3905823162,0,48,2028,GCF_000146045.2,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-7/SRR3497410/SRR3497410.1,SRX1756250,Hsf1_input,ChIP-Seq,unspecified,GENOMIC,SINGLE,0,0,ABI_SOLID,AB 5500xl-W Genetic Analysis System,SRP074822,PRJNA321111,,321111,SRS1432864,SAMN04966257,simple,4932,Saccharomyces cerevisiae,Hsf1_input,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,C76260FC0AED0743E40E038EEC232960,D96A93109D0C695D8263594CD89E9473
4+
SRR3497446,2017-05-11 00:00:55,2016-05-17 11:46:57,56996253,8537022864,56996253,149,3350,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497446/SRR3497446.1,SRX1756259,Hsf1_t0_0.05U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432872,SAMN04966258,simple,4932,Saccharomyces cerevisiae,Hsf1_t0_0.05U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,1D79B28E58E73793E3F2416BE52FC99B,A4C02F8943A763A0C413FF046BA716B5
5+
SRR3497452,2017-05-11 00:00:56,2016-05-11 11:24:59,62257597,9364630504,62257597,150,3611,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497452/SRR3497452.1,SRX1756270,Hsf1_t0_0.2U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432882,SAMN04966260,simple,4932,Saccharomyces cerevisiae,Hsf1_t0_0.2U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,103A6282A4032934D6050C6DD2B5E6BF,871F9BAEC203D09E57EB5726F7A84CD1
6+
SRR3497453,2017-05-11 00:00:56,2016-05-11 12:06:21,61425579,9217294875,61425579,150,3523,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497453/SRR3497453.1,SRX1756274,Hsf1_t0_0.8U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432886,SAMN04966262,simple,4932,Saccharomyces cerevisiae,Hsf1_t0_0.8U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,851EA8BF587B0C7F08802CD998CA5246,5BDD7B50B89F79F3CEC01DA4ABA31660
7+
SRR3497454,2017-05-11 00:00:56,2016-05-11 12:01:49,48284496,7235073133,48284496,149,2828,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497454/SRR3497454.1,SRX1756275,Hsf1_t0_3.0U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432887,SAMN04966264,simple,4932,Saccharomyces cerevisiae,Hsf1_t0_3.0U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,220B1999B9CB4227F5A35B51BCC72724,2F01C0E9269BD0FA1523461BC01D8198
8+
SRR3497449,2017-05-11 00:00:56,2016-05-11 13:19:10,53608235,8028655564,53608235,149,3153,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497449/SRR3497449.1,SRX1756263,Hsf1_t30_0.05U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432875,SAMN04966259,simple,4932,Saccharomyces cerevisiae,Hsf1_t30_0.05U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,A5A12C5FFFE69D3ABB4F0C42AE0FF7EA,5D151CA79CC01ACCE3C98CF5E646DD47
9+
SRR3497456,2017-05-11 00:00:56,2016-05-11 14:19:38,62404674,9388451136,62404674,150,3646,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos1/sra-pub-run-8/SRR3497456/SRR3497456.1,SRX1756283,Hsf1_t30_0.2U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432895,SAMN04966261,simple,4932,Saccharomyces cerevisiae,Hsf1_t30_0.2U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,4D316816A01301B7B30F9CC1381B5018,DDDB3A1D0A7A3F6255418B0C27D316E6
10+
SRR3497459,2017-05-11 00:00:56,2016-05-11 14:21:03,70846857,10638854137,70846857,150,4074,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497459/SRR3497459.1,SRX1756286,Hsf1_t30_0.8U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432898,SAMN04966263,simple,4932,Saccharomyces cerevisiae,Hsf1_t30_0.8U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,9C84D57EE58BF1B09AF37E3504851620,C6850B43795D5AF9504DF7A5F4625B13
11+
SRR3497461,2017-05-11 00:00:56,2016-05-11 15:19:34,52591831,7857035482,52591831,149,3032,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos1/sra-pub-run-8/SRR3497461/SRR3497461.1,SRX1756287,Hsf1_t30_3.0U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432899,SAMN04966265,simple,4932,Saccharomyces cerevisiae,Hsf1_t30_3.0U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,00F78D41D1508EDEFAC4DE473028A626,6EB7361E2E83CF786951734E6596C8F0
12+
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=6
#PBS -l pmem=24gb
#PBS -l walltime=03:00:00
#PBS -A open
#PBS -o logs/download.data.log.out
#PBS -e logs/download.data.log.err
#PBS -t 1-10

# Download the FASTQ files for one SRA run per array task.
# The run accession is read from column 1 of SraRunInfo.csv and the
# gzipped FASTQ files are written to results/FASTQ.
#
# Requires
# parallel fastq dump v2.8.0

# FIRST CHANGE PATH TO EXECUTE
WRK=/path/to/GenoPipe/paper/BY4742-chipseq
cd "$WRK" || { echo "ERROR: cannot cd to $WRK" >&2; exit 1; }

module load anaconda3
source activate ~/work/myconda/genopipe/

mkdir -p logs results/FASTQ

# Without an array index the arithmetic below would silently select the CSV header row.
: "${PBS_ARRAYID:?PBS_ARRAYID is not set; submit this script as an array job (-t)}"

# Line 1 of the metadata file is the column header, so array task N reads line N+1.
INDEX=$((PBS_ARRAYID + 1))

METADATA=SraRunInfo.csv
INFO=$(sed "${INDEX}q;d" "$METADATA")
SRR=$(printf '%s\n' "$INFO" | cut -d"," -f1)
#echo $INFO

if [ -z "$SRR" ]; then
	echo "ERROR: no run accession found on line $INDEX of $METADATA" >&2
	exit 1
fi

# FASTQ-DUMP
echo "($INDEX) Begin downloading $SRR FASTQ..."
# Thread count matches the ppn=6 cores requested above.
parallel-fastq-dump --gzip --split-files -t 6 -O results/FASTQ -s "$SRR" \
	|| { echo "ERROR: parallel-fastq-dump failed for $SRR" >&2; exit 1; }
echo "Complete"
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=4
#PBS -l pmem=16gb
#PBS -l walltime=02:00:00
#PBS -A open
#PBS -o logs/align.data.log.out
#PBS -e logs/align.data.log.err
#PBS -t 1-10

# Align the reads of one SRA run per array task and write a sorted, indexed
# BAM file to results/BAM/<LibraryName>.bam.
#   - ABI_SOLID runs: the _1 FASTQ is aligned with bowtie (-C) against the
#     color-space index.
#   - ILLUMINA runs: the _1/_2 FASTQ pair is aligned with bwa mem.
# Run accession, sample name and platform come from columns 1, 12 and 19 of
# SraRunInfo.csv.

# Make a failing aligner fail the whole "align | sort" pipeline.
set -o pipefail

module load gcc
module load samtools
module load bwa
module load anaconda3
source activate ~/work/myconda/genopipe/

# FIRST CHANGE PATH TO EXECUTE
WRK=/path/to/GenoPipe/paper/BY4742-chipseq
cd "$WRK" || { echo "ERROR: cannot cd to $WRK" >&2; exit 1; }

mkdir -p logs results/BAM results/uniq-BAM

YGENOME=$WRK/../input/sacCer3.fa
CSGENOME=$WRK/../input/sacCer3_index

# Without an array index the arithmetic below would silently select the CSV header row.
: "${PBS_ARRAYID:?PBS_ARRAYID is not set; submit this script as an array job (-t)}"

# Line 1 of the metadata file is the column header, so array task N reads line N+1.
INDEX=$((PBS_ARRAYID + 1))

METADATA=SraRunInfo.csv
INFO=$(sed "${INDEX}q;d" "$METADATA")
SRR=$(printf '%s\n' "$INFO" | cut -d"," -f1)
SAMPLE=$(printf '%s\n' "$INFO" | cut -d"," -f12)
PLATFORM=$(printf '%s\n' "$INFO" | cut -d"," -f19)
#PAIR=`echo $INFO | cut -d"," -f16`
#echo $INFO

if [ -z "$SRR" ] || [ -z "$SAMPLE" ]; then
	echo "ERROR: could not parse line $INDEX of $METADATA" >&2
	exit 1
fi

FQ=$WRK/results/FASTQ/$SRR
BAM=$WRK/results/BAM/$SAMPLE

echo "($PBS_ARRAYID) Aligned $SRR $PLATFORM reads > $BAM"
if [[ "$PLATFORM" == "ABI_SOLID" ]]; then
	# Only the reads are decompressed here; the reference is supplied through
	# the prebuilt color-space index and is not a gzip file.
	bowtie -C -S "$CSGENOME" <(gzip -dc "${FQ}_1.fastq.gz") \
		| samtools sort \
		> "$BAM.bam" \
		|| { echo "ERROR: bowtie alignment failed for $SRR" >&2; exit 1; }
	echo "($PBS_ARRAYID) $BAM single aligned (bowtie color space)"
elif [[ "$PLATFORM" == "ILLUMINA" ]]; then
	bwa mem -t 4 "$YGENOME" "${FQ}_1.fastq.gz" "${FQ}_2.fastq.gz" \
		| samtools sort \
		> "$BAM.bam" \
		|| { echo "ERROR: bwa alignment failed for $SRR" >&2; exit 1; }
	echo "($PBS_ARRAYID) $BAM pair aligned (BWA)"
else
	# No BAM was produced, so there is nothing to index below.
	echo "ERROR: unsupported platform '$PLATFORM' for $SRR" >&2
	exit 1
fi

#samtools view -b -F4 $BAM > $WRK/results/uniq-BAM/$SAMPLE.bam

echo "($PBS_ARRAYID) Indexing..."
samtools index "$BAM.bam" || { echo "ERROR: samtools index failed for $BAM.bam" >&2; exit 1; }
echo "($PBS_ARRAYID) Complete!"
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=6
#PBS -l pmem=24gb
#PBS -l walltime=03:00:00
#PBS -A open
#PBS -o logs/sid.log.out
#PBS -e logs/sid.log.err
#PBS -t 1-10

# Run StrainID on the BAM file of one sample per array task.
# The sample name is column 12 of SraRunInfo.csv. The BAM and its index are
# symlinked into a per-task temp directory that is passed to
# identify-Strain.sh as the input directory. StrainID stdout is saved to
# results/ID/<sample>.std and the `time` report to results/ID/<sample>.time.

module load anaconda3
# NOTE(review): the other job scripts activate ~/work/myconda/genopipe/ by path; confirm that this named env is the same one.
source activate genopipe

# FIRST CHANGE PATH TO EXECUTE
WRK=/path/to/GenoPipe/paper/BY4742-chipseq
cd "$WRK" || { echo "ERROR: cannot cd to $WRK" >&2; exit 1; }

mkdir -p logs results/ID

# Without an array index the arithmetic below would silently select the CSV header row.
: "${PBS_ARRAYID:?PBS_ARRAYID is not set; submit this script as an array job (-t)}"

# Line 1 of the metadata file is the column header, so array task N reads line N+1.
INDEX=$((PBS_ARRAYID + 1))

METADATA=SraRunInfo.csv
INFO=$(sed "${INDEX}q;d" "$METADATA")
SAMPLE=$(printf '%s\n' "$INFO" | cut -d"," -f12)
#echo $INFO

if [ -z "$SAMPLE" ]; then
	echo "ERROR: no sample name found on line $INDEX of $METADATA" >&2
	exit 1
fi

# Store directory paths
DATABASE=$WRK/../db/sacCer3_VCF
GENOME=$WRK/../input/sacCer3.fa
SEED=$PBS_ARRAYID
GENOPIPE=$WRK/../..

BAM=$WRK/results/BAM/$SAMPLE
#BAM=$WRK/results/uniq-BAM/$SAMPLE
ID=$WRK/results/ID/

if [ ! -f "$BAM.bam" ] || [ ! -f "$BAM.bam.bai" ]; then
	echo "ERROR: missing $BAM.bam or its .bai index; run the alignment job first" >&2
	exit 1
fi

# Set-up Temp directory
TEMP=$WRK/temp-$PBS_ARRAYID
mkdir -p "$TEMP" || { echo "ERROR: cannot create $TEMP" >&2; exit 1; }
# Remove the temp directory on every exit path, not only after a successful run.
trap 'rm -rf -- "${TEMP:?}"' EXIT
# -f replaces links left behind by an earlier, interrupted run of the same task.
ln -sf "$BAM.bam" "$BAM.bam.bai" "$TEMP"/ \
	|| { echo "ERROR: cannot link BAM files into $TEMP" >&2; exit 1; }

## Execute Single StrainID and record time
cd "$GENOPIPE/StrainID" || { echo "ERROR: cannot cd to $GENOPIPE/StrainID" >&2; exit 1; }
echo "**Begin executing StrainID for ${SAMPLE}..."
{ time bash identify-Strain.sh -i "$TEMP" -g "$GENOME" -v "$DATABASE" -s "$SEED" -o "$ID" > "$ID/$SAMPLE.std" ; } 2> "$ID/$SAMPLE.time"
echo "...single StrainID for ($PBS_ARRAYID) ${SAMPLE} finished."

## Clean-up is handled by the EXIT trap above.
52+

paper/BY4742-chipseq/logs/README

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# logfiles from STDERR and STDOUT of running job files go here
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Downloaded FASTQ files and StrainID results go here

paper/README

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ paper
1616
|--ENCODEdata-CellLines
1717
|--HIV_samples
1818
|--YKOC-wgs
19+
|--BY4742-chipseq
1920
|--CENPK-chipseq
2021

2122

@@ -55,5 +56,8 @@ contains the scripts and information for downloading, processing, and running Ep
5556
## YKOC-wgs
5657
contains the scripts and information for downloading, processing, and running DeletionID on the Puddu et al, 2019 dataset for identifying deletions
5758

59+
## BY4742-chipseq
60+
contains the scripts and information for downloading, processing, and running StrainID on the BAM files
61+
5862
## CENPK-chipseq
59-
contains the scripts and information for downloading, processing, and running StrainID on the
63+
contains the scripts and information for downloading, processing, and running StrainID on the BAM files

paper/setup.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,10 @@
1313

1414
# Required software:
1515
# wget
16+
# Python 3
1617
# Perl 5.18+
1718
# bwa v0.7.14+
19+
# bowtie v1.2.3
1820
#
1921
# Optional software:
2022
# twoBitToFa
@@ -149,3 +151,8 @@ cd $WRK/db
149151
ln -s ../../StrainID/sacCer3_VCF
150152
ln -s ../../StrainID/hg19_VCF
151153
cd $WRK
154+
155+
# Setup color-space index for yeast genome
156+
# (used by BY4742-chipseq)
157+
bowtie-build -C input/sacCer3.fa input/sacCer3_index
158+

0 commit comments

Comments
 (0)