Skip to content

Commit 0de893b

Browse files
authored
Merge pull request #7 from CEGRcode/validation
Validation of DelID using YKOC data
2 parents db8aa6f + e13d7b6 commit 0de893b

16 files changed

Lines changed: 9958 additions & 1 deletion

paper/.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ SyntheticStrain/logs/*.err-*
1515
SyntheticStrain/logs/*.out-*
1616
YKOC-wgs/logs/*.err-*
1717
YKOC-wgs/logs/*.out-*
18+
YKOC-wgs/results/FASTQ
19+
YKOC-wgs/results/BAM
20+
YKOC-wgs/results/ID
21+
YKOC-wgs/results/BedGraphs
1822
ENCODEdata-eGFP/logs/*.out-*
1923
ENCODEdata-eGFP/logs/*.err-*
2024
ENCODEdata-eGFP/results/FASTQ

paper/YKOC-wgs/210328_Puddu_2019_STable6_verifiedKO.txt

Lines changed: 8844 additions & 0 deletions
Large diffs are not rendered by default.

paper/YKOC-wgs/README

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
# Run DeletionID on WGS data of the YKOC
22

33
# Metadata with strain information, EBI accessions, and gene name maps.
4-
The `201213_Puddu_2019_STable1_del2srs.txt` file was pulled from Supplementary
4+
The `201213_Puddu_2019_STable1_del2ers.txt` file was pulled from Supplementary
55
Table 1 from Puddu et al 2019. Maps the EBI sample accessions (ERSXXXXXXX) to the
66
expected knockout strain and a sample ID used by the paper (SDXXXX). This table was
77
downloaded December 13, 2020.
88

9+
The `210328_Puddu_2019_STable6_verifiedKO.txt` file was pulled from Supplementary
10+
Table 6 from Puddu et al 2019. This file contains the list of verified KO samples
11+
keyed on their SD id and including associated scores with notes.
12+
913
The `210403_PRJEB27160_accessions.txt` metadata file was pulled to map EBI sample
1014
accessions (ERSXXXXXXX) to the run accessions (ERRXXXXXXX). The file also contains
1115
other sequencing information including ftp locations for FASTQ files, MD5 checksums,
@@ -14,6 +18,14 @@ April 3, 2021 to download this metadata.
1418

1519
https://www.ebi.ac.uk/ena/browser/view/PRJEB27160
1620

21+
Note: Four ERS sample accessions referenced in the paper are missing from the EBI
22+
metadata download:
23+
24+
ERS1041599
25+
ERS969339
26+
ERS969700
27+
ERS969701
28+
1729
# Download & Align YKOC data
1830
Use the EBI accessions to download the data to the `results/FASTQ` directory. The
1931
`job/00_download_data.pbs` PBS script uses the `wget` command but for the paper, these

paper/YKOC-wgs/fig3e_config.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
FLANKING 200
2+
LOCI APE3 VAC17 PIR3
3+
SAMPLES ERS838258 ERS903113 ERS969760 ERS1076728
4+
S_LABEL APE3 VAC17 PIR3 WT-1
5+
S_COLOR forestgreen firebrick salmon gray
6+
S_MAX_Y 200 150 10 150

paper/YKOC-wgs/fig3f_config.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
FLANKING 200
2+
LOCI SWT1 PNS1
3+
SAMPLES ERS1076684 ERS1076685 ERS1029415 ERS1029416 ERS1076728
4+
S_LABEL del_SWT1_rep1 del_SWT1_rep2 del_PNS1_rep1 del_PNS1_rep2 WT-1
5+
S_COLOR royalblue cornflowerblue goldenrod gold gray
6+
S_MAX_Y 150 150 150 150 150
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#!/bin/bash
2+
#PBS -l nodes=1:ppn=4
3+
#PBS -l pmem=16gb
4+
#PBS -l walltime=01:00:00
5+
#PBS -A open
6+
#PBS -o logs/align-picard.log.out
7+
#PBS -e logs/align-picard.log.err
8+
#PBS -t 1-9010
9+
10+
module load gcc/8.3.1
11+
module load samtools/1.5
12+
module load bwa/0.7.15
13+
module load picard/2.20.8
14+
15+
WRK=/path/to/GenoPipe/paper/YKOC-wgs
16+
cd $WRK
17+
18+
[ -d logs ] || mkdir logs
19+
[ -d results/BAM ] || mkdir results/BAM
20+
21+
GENOME=../input/sacCer3.fa
22+
METADATA=210403_PRJEB27160_accessions.txt
23+
INDEX=$(($PBS_ARRAYID+1))
24+
25+
INFO=`sed "${INDEX}q;d" $METADATA`
26+
ERR=`echo $INFO | awk '{print $4}'`
27+
ERS=`echo $INFO | awk '{print $2}'`
28+
FQ1=results/FASTQ/$ERR/$ERR\_1.fastq.gz
29+
FQ2=results/FASTQ/$ERR/$ERR\_2.fastq.gz
30+
BAM=$WRK/results/BAM/$ERS
31+
#echo $INFO
32+
33+
start=`date +%s`
34+
# Align with Pugh Lab standard alignment pipeline parameters
35+
# -T INT Don't output alignments with score lower than INT. This option affects output and occsaionally SAM flag 2.
36+
# -h INT If query has not more than INT hits with score higher than 880% of the best hit, output them all in the XA tag.
37+
# -M Mark shorter split hits as secondary (for Picard compatibility).
38+
echo "(${PBS_ARRAYID}) Aligning $ERR fastq files..."
39+
bwa mem -v 1 -T '30' -h '5' -t 4 -M $GENOME $FQ1 $FQ2 > $BAM.unsorted.bam
40+
# Sort
41+
echo "(${PBS_ARRAYID}) Sorting $ERR ..."
42+
samtools sort $BAM.unsorted.bam > $BAM.unmarked.bam
43+
# Mark duplicates
44+
echo "(${PBS_ARRAYID}) Marking duplicates $ERR ..."
45+
picard MarkDuplicates \
46+
INPUT=$BAM.unmarked.bam \
47+
OUTPUT=$BAM.marked.bam \
48+
METRICS_FILE=$BAM.metrics.txt \
49+
REMOVE_DUPLICATES='false' ASSUME_SORTED='true' \
50+
DUPLICATE_SCORING_STRATEGY='SUM_OF_BASE_QUALITIES' \
51+
#READ_NAME_REGEX='[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*.' \
52+
#OPTICAL_DUPLICATE_PIXEL_DISTANCE='100' \
53+
#VALIDATION_STRINGENCY='LENIENT' #VERBOSITY=ERROR
54+
# Filter duplicates
55+
echo "(${PBS_ARRAYID}) Filter duplicates $ERR ..."
56+
samtools view -h -b -f 0x1 -F 0x404 \
57+
-o $BAM.bam \
58+
$BAM.marked.bam
59+
# Index
60+
echo "(${PBS_ARRAYID}) Index $ERR ..."
61+
samtools index $BAM.bam
62+
end=`date +%s`
63+
runtime=$((end-start))
64+
echo "...alignment and indexing for ($PBS_ARRAYID) $ERR.fq > $ERS.bam finished in ${runtime}"
65+
# Clean-up
66+
rm $BAM.unsorted.bam $BAM.unmarked.bam $BAM.marked.bam
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/bin/bash
2+
#PBS -l nodes=1:ppn=4
3+
#PBS -l pmem=16gb
4+
#PBS -l walltime=00:10:00
5+
#PBS -A open
6+
#PBS -o logs/did-picard.log.out
7+
#PBS -e logs/did-picard.log.err
8+
#PBS -t 1-9010
9+
10+
module load gcc/8.3.1
11+
module load anaconda3
12+
source activate genopipe
13+
14+
# Python3.9.0 env with pysam
15+
16+
WRK=/path/to/GenoPipe/paper/YKOC-wgs
17+
cd $WRK
18+
19+
METADATA=210403_PRJEB27160_accessions.txt
20+
INDEX=$(($PBS_ARRAYID+1))
21+
22+
INFO=`sed "${INDEX}q;d" $METADATA`
23+
ERS=`echo $INFO | awk '{print $2}'`
24+
BAM=$WRK/results/BAM/$ERS
25+
ID=$WRK/results/ID
26+
TEMP=$WRK/temp$PBS_ARRAYID
27+
#echo $INFO
28+
29+
[ -d logs ] || mkdir logs
30+
[ -d $ID ] || mkdir $ID
31+
32+
# Check if ID already generated...
33+
#if [[ -f $ID/$ENCFF\_R1-ID.tab ]]; then
34+
# echo "ID already generated ($ENCFF). Exiting.."
35+
# exit
36+
#fi
37+
38+
[ -d $TEMP ] || mkdir $TEMP
39+
[ -d $ID ] || mkdir -p $ID
40+
41+
# Set-up Temp directory
42+
cd $TEMP
43+
ln -s $BAM.bam
44+
ln -s $BAM.bam.bai
45+
46+
DATABASE=$WRK/../db/sacCer3_Del
47+
GENOPIPE=$WRK/../../
48+
49+
## Execute Mass EpitopeID and record time
50+
cd $WRK
51+
cd $GENOPIPE/DeletionID
52+
echo "(2) Begin executing EpitopeID for $R1..."
53+
start=`date +%s`
54+
bash identify-Deletion.sh -i $TEMP -o $ID -d $DATABASE
55+
end=`date +%s`
56+
runtime=$((end-start))
57+
echo "...single DeletionID for ($PBS_ARRAYID) $R1 finished in ${runtime}"
58+
59+
rm -r $TEMP
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/bin/bash
2+
#PBS -l nodes=1:ppn=4
3+
#PBS -l pmem=16gb
4+
#PBS -l walltime=00:10:00
5+
#PBS -A open
6+
#PBS -o tally-results.out
7+
#PBS -e tally-results.err
8+
9+
module load anaconda3
10+
source activate genopipe
11+
#PyLib argparse
12+
#PyLib matplotlib
13+
#PyLib matplotlib-venn
14+
15+
# FIRST CHANGE PATH TO EXECUTE
16+
WRK=/path/to/GenoPipe/paper/YKOC-wgs
17+
cd $WRK
18+
19+
#wget http://sgd-archive.yeastgenome.org/curation/chromosomal_feature/SGD_features.tab
20+
wget http://sgd-archive.yeastgenome.org/curation/chromosomal_feature/deleted_merged_features.tab
21+
wget http://sgd-archive.yeastgenome.org/curation/chromosomal_feature/saccharomyces_cerevisiae.gff.gz
22+
gzip -d saccharomyces_cerevisiae.gff.gz
23+
24+
# Tally results to generate TableS2-ykoc-output-summary
25+
python scripts/analyze_ykoc_results.py -m 201213_Puddu_2019_STable1_del2ers.txt -i results/ID \
26+
-e 210403_PRJEB27160_accessions.txt -t 210328_Puddu_2019_STable6_verifiedKO.txt \
27+
-d deleted_merged_features.tab -n saccharomyces_cerevisiae.gff \
28+
> results/ykoc_output_summary.txt \
29+
2> results/ykoc_output_summary.err
30+
31+
grep 'PASS' results/ykoc_output_summary.txt | grep $'\tVerified' > results/ConfidentPass.txt
32+
grep 'PASS' results/ykoc_output_summary.txt | grep 'NotVerified' > results/TentativePass.txt
33+
grep 'FAIL' results/ykoc_output_summary.txt | grep $'\tVerified' > results/TentativeFails.txt
34+
grep 'FAIL' results/ykoc_output_summary.txt | grep 'NotVerified' > results/TrueFails.txt
35+
36+
python scripts/make_venn_diagram.py -s results/ykoc_output_summary.txt -t Figure3C
37+
38+
#Clean-up
39+
rm saccharomyces_cerevisiae.gff* deleted_merged_features.tab
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#!/bin/bash
2+
#PBS -l nodes=1:ppn=4
3+
#PBS -l pmem=16gb
4+
#PBS -l walltime=03:00:00
5+
#PBS -A open
6+
#PBS -o logs/pileup.samples.log.out
7+
#PBS -e logs/pileup.samples.log.err
8+
#PBS -t 1-9010
9+
10+
module load gcc/8.3.1
11+
module load samtools/1.5
12+
module load bedtools/2.27.1
13+
module load anaconda3
14+
source activate genopipe
15+
16+
WRK=/path/to/GenoPipe/paper/YKOC-wgs
17+
cd $WRK
18+
19+
[ -d logs ] || mkdir logs
20+
[ -d results/BedGraphs ] || mkdir results/BedGraphs
21+
22+
METADATA=results/ykoc_output_summary.txt
23+
INDEX=$(($PBS_ARRAYID+1))
24+
25+
INFO=`sed "${INDEX}q;d" $METADATA`
26+
ERS=`cut -f10 $METADATA |sed "${INDEX}q;d"`
27+
BAM=results/BAM/$ERS.bam
28+
BG=results/BedGraphs/$ERS
29+
#echo $INFO
30+
31+
# Pileup to BedGraph
32+
echo "($PBS_ARRAYID) Make raw bedgraph pileups for $ERS"
33+
bedtools genomecov -bga -scale 1.0 -ibam $BAM \
34+
| sort -k1,1 -k2,2n \
35+
> $BG.raw.bedgraph
36+
37+
# Normalize BedGraph
38+
echo "($PBS_ARRAYID) Normalize bedgraph for $ERS"
39+
NORMALIZE=scripts/normalize_BedGraph.py
40+
python $NORMALIZE -i $BG.raw.bedgraph > $BG.bedgraph
41+
42+
# Make CDT row centered on expected KO
43+
echo "($PBS_ARRAYID) Generate CDT slice for $ERS"
44+
ROWCDT=scripts/make_cdt_row.py
45+
echo $INFO
46+
python $ROWCDT -s <(head -n $INDEX $METADATA |tail -n 1) -i results/BedGraphs/ -g saccharomyces_cerevisiae.gff
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#!/bin/bash
2+
#PBS -l nodes=1:ppn=4
3+
#PBS -l pmem=16gb
4+
#PBS -l walltime=05:00:00
5+
#PBS -A open
6+
#PBS -o logs/make.figs.log.out
7+
#PBS -e logs/make.figs.log.err
8+
9+
#1-9010
10+
#bfp2_h_g_sc_default
11+
#bfp2_j_g_bc_default
12+
13+
module load gcc/8.3.1
14+
module load samtools/1.5
15+
module load bedtools/2.27.1
16+
module load anaconda3
17+
source activate genopipe
18+
19+
WRK=/path/to/GenoPipe/paper/YKOC-wgs
20+
WRK=/storage/home/owl5022/scratch/GenotypingProject/GenoPipe/paper/YKOC-wgs
21+
cd $WRK
22+
23+
[ -d logs ] || mkdir logs
24+
[ -d results/figs ] || mkdir results/figs
25+
26+
for PARTITION in "TentativePass" "TrueFails" "TentativeFails" "ConfidentPass";
27+
do
28+
BASE=results/$PARTITION
29+
rm $BASE.temp
30+
31+
METADATA=results/$PARTITION.txt
32+
LNCT=`wc -l $METADATA | awk '{print $1}'`
33+
for ((INDEX=1; INDEX<=$LNCT; INDEX++));
34+
do
35+
#echo $INDEX
36+
ERS=`cut -d $'\t' -f10 $METADATA | sed "${INDEX}q;d"`
37+
ORF=`cut -d $'\t' -f7 $METADATA | sed "${INDEX}q;d"`
38+
echo "($INDEX) Parsed out ERS=$ERS and ORF=$ORF"
39+
ROW=results/BedGraphs/$ERS\_$ORF\_6000bp.cdt
40+
cat $ROW >> $BASE.temp
41+
done
42+
43+
sort -nk3,3 $BASE.temp > $BASE.cdt
44+
45+
# Make Heatmap PNG
46+
##ScriptManager Two-color heatmap
47+
# Black, .95 threshold
48+
# $BASE.cdt > $BASE.png
49+
50+
# Make Trace CDT
51+
TRACE=scripts/make_trace.py
52+
python $TRACE -c $BASE.cdt -g saccharomyces_cerevisiae.gff -w 6000 > $BASE\_trace.cdt
53+
54+
rm $BASE.temp
55+
done
56+
57+
# Make trace (x4 lines)
58+
##Script to make trace cdt
59+
##ScriptManager Three-color heatmap
60+
# diff colors, set alpha channel, threshold 0<0<.95
61+
62+
SUBPLOTS=scripts/make_subplots.py
63+
64+
# Figure 3E SVG image
65+
python $SUBPLOTS -t Figure3E -c fig3e_config.txt -g saccharomyces_cerevisiae.gff
66+
67+
# Figure 3F SVG image
68+
python $SUBPLOTS -t Figure3F -c fig3f_config.txt -g saccharomyces_cerevisiae.gff
69+
70+
71+

0 commit comments

Comments
 (0)