Commit 0028167

add analysis to characterize StrainID scores

For Supplementary Figure 2 and Supplementary Table 11, we pulled 100 ENCODE datasets and ran StrainID on them using the full hg38_DepMap reference to build a table and histogram of StrainID scores, showing that "correct" StrainID scores separate cleanly from "incorrect" ones.

1 parent fbb8778 commit 0028167

8 files changed

Lines changed: 408 additions & 0 deletions


paper/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -37,6 +37,7 @@ ENCODEdata-CellLines/results/SupplementaryTable10.txt
 ENCODEdata-CellLines/results/BAM
 ENCODEdata-CellLines/results/BAM-nospike
 ENCODEdata-CellLines/results/ID
+ENCODEdata-CellLines/results/hg38_ID
 BY4742-chipseq/logs/*.out
 BY4742-chipseq/logs/*.err
 BY4742-chipseq/results/FASTQ

paper/ENCODEdata-CellLines/230612_hg38_100-TF-ChIP-seq.txt

Lines changed: 102 additions & 0 deletions
Large diffs are not rendered by default.

paper/ENCODEdata-CellLines/README.md

Lines changed: 20 additions & 0 deletions
@@ -5,7 +5,9 @@ Run StrainID on ENCODE data and evaluate StrainID's performance.
 | | |
 | :--: | -- |
 | Figure 6C | `ENCODEdata-CellLines/results/ID/` |
+| Supplementary Figure 2 | `ENCODEdata-CellLines/results/hg38_ID/` |
 | Supplementary Table 10 | `ENCODEdata-CellLines/results/SupplementaryTable10.txt.gz` |
+| Supplementary Table 11 | `ENCODEdata-CellLines/results/SupplementaryTable11.txt.gz` |
 
 # Reference files
 
@@ -14,6 +16,9 @@ ENCODE metadata was pulled on May 12, 2021 using the `scripts/get_metadata.py` s
 
 Command used: `python scripts/get_metadata.py > 210512_sample_metadata.txt`
 
+## 230612_hg38_100-TF-ChIP-seq.txt
+ENCODE metadata was pulled on June 12, 2023, according to the retrieval URL in the header. The filter criteria restricted files to BAM format, the hg38 genome build, the TF ChIP-seq assay, and a cell-line background from the core `hg19_VCF` set. Only the top 100 BAM files were saved here.
+
 # Download ENCODE CellLine data and run through StrainID
 Use the 210512_sample_metadata.txt file with ENCODE accessions to download and process the
 data to the `results` directory. Then run the data through StrainID.
@@ -47,3 +52,18 @@ python scripts/build_violinscatter.py -i results/SupplementaryTable10.txt -o res
 python scripts/build_violinscatter.py -i results/SupplementaryTable10.txt -o results/Figure6C_CAGE.png -a "CAGE"
 python scripts/build_violinscatter.py -i results/SupplementaryTable10.txt -o results/Figure6C_small-RNA-seq.png -a "small RNA-seq"
 ```
+
+# Characterize StrainID scores from 100 ChIP-seq datasets
+To better understand the distribution of scores StrainID outputs for "correct" and "incorrect" strain backgrounds, we ran StrainID against our largest reference set of over 1,000 cell lines (`hg38_DepMap`) on a sample of TF ChIP-seq datasets from ENCODE.
+
+## Download data & Run StrainID
+```
+qsub job/02_indexed_runSID.pbs
+```
+
+## Compile the results with the metadata information
+Compile the results into a table and plot the scores as two overlapping histograms.
+```
+python scripts/merge_sidscores.py -i results/hg38_ID/ -m 230612_hg38_100-TF-ChIP-seq.txt -o results/SupplementaryTable11.txt
+python scripts/build_sidscore_histogram.py -i results/hg38_ID/ -m 230612_hg38_100-TF-ChIP-seq.txt -o results/SupplementaryFigure2.png
+```
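The metadata pull described in the README above goes through the ENCODE search API; the exact retrieval URL is recorded in the header of `230612_hg38_100-TF-ChIP-seq.txt` and is not reproduced here. Below is a minimal sketch of that kind of query. The filter parameter names and values are illustrative assumptions, not the exact criteria used for the file.

```
#!/usr/bin/env python
# Hedged sketch of an ENCODE search-API query; the filter values below are
# illustrative assumptions, not the exact criteria recorded in the file header.
import requests

params = {
    "type": "File",                # search over File objects
    "file_format": "bam",          # BAM format only
    "assembly": "GRCh38",          # hg38 genome build
    "assay_title": "TF ChIP-seq",  # TF ChIP-seq assay
    "format": "json",
    "limit": "100",                # keep the top 100 hits
}
resp = requests.get("https://www.encodeproject.org/search/", params=params,
                    headers={"Accept": "application/json"})
resp.raise_for_status()
for f in resp.json()["@graph"]:
    print(f["accession"])
```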
paper/ENCODEdata-CellLines/job/02_indexed_runSID.pbs

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
#!/bin/bash
#PBS -l nodes=1:ppn=6
#PBS -l pmem=24gb
#PBS -l walltime=05:00:00
#PBS -A open
#PBS -o logs/download.data.log.out
#PBS -e logs/download.data.log.err
#PBS -t 1-100

module load samtools
#module load anaconda
#source activate my-genopipe-env

WRK=/path/to/GenoPipe/paper/ENCODE-CellLines

# Execute from /path/to/GenoPipe/paper/ENCODE-CellLines

[ -d logs ] || mkdir logs
[ -d results/hg38_BAM ] || mkdir -p results/hg38_BAM
[ -d results/hg38_BAM-nospike ] || mkdir -p results/hg38_BAM-nospike
[ -d results/hg38_ID ] || mkdir -p results/hg38_ID

# Pull this array task's metadata row (the metadata file carries two header
# lines: the retrieval URL and the column names)
METADATA=230612_hg38_100-TF-ChIP-seq.txt
INFO=$(sed '1,2d' $METADATA | sed "${PBS_ARRAYID}q;d")
ENCFF=$(echo "$INFO" | awk -F'\t' '{print $2}')
#echo $INFO

# Store directory paths
DATABASE=$WRK/../db/hg38_DepMap
GENOME=$WRK/../input/hg38.fa
SEED=$PBS_ARRAYID
GENOPIPE=$WRK/../..
BAM=$WRK/results/hg38_BAM/$ENCFF
NOSPIKE=$WRK/results/hg38_BAM-nospike/$ENCFF
ID=$WRK/results/hg38_ID

# ENCODE data download
HREF=/files/$ENCFF/@@download/$ENCFF.bam
echo "Fetching from https://www.encodeproject.org$HREF"
wget -c https://www.encodeproject.org$HREF -O $BAM.bam

# Checksum of resulting BAM
MD5SUM=$(echo "$INFO" | awk -F'\t' '{print $3}')
#if [[ $(md5sum $BAM.bam) =~ $MD5SUM ]]; then
#	echo "($PBS_ARRAYID) $BAM passed."
#else
#	echo "($PBS_ARRAYID) $BAM md5checksum failed!"
#	rm $BAM.bam
#	exit
#fi

# Index BAM file
[ -f $BAM.bam.bai ] || samtools index $BAM.bam

# Strip spike-in reads
python scripts/filter_spikein.py -b $BAM.bam -g ../input/hg38.fa -o $NOSPIKE.bam
samtools index $NOSPIKE.bam

# Count bases
[ -f $BAM.bam.bai ] && python scripts/estimate_bp_sequenced.py -b $BAM.bam > ${BAM}_bpcount.txt
[ -f $NOSPIKE.bam.bai ] && python scripts/estimate_bp_sequenced.py -b $NOSPIKE.bam > ${NOSPIKE}_bpcount.txt

# Check that the spike-stripped BAM file was generated first
if [ ! -f $NOSPIKE.bam ];
then
	echo "BAM input for $NOSPIKE/$ENCFF does not exist. Exiting."
	exit
fi

# Check that the BAM index file exists
if [ ! -f $NOSPIKE.bam.bai ];
then
	echo "BAI missing for $ENCFF. Exiting."
	exit
fi

# Set up a temp directory with symlinks so StrainID sees a single input BAM
TEMP=$WRK/temp-$PBS_ARRAYID
[ -d $TEMP ] || mkdir $TEMP
cd $TEMP
echo $TEMP
ln -s $NOSPIKE.bam
ln -s $NOSPIKE.bam.bai

## Execute single StrainID and record time
cd $GENOPIPE/StrainID
echo "**Begin executing StrainID for ${ENCFF}..."
{ time bash identify-Strain.sh -i $TEMP -g $GENOME -v $DATABASE -s $SEED -o $ID > $ID/$ENCFF.std ; } 2> $ID/$ENCFF.time
echo "...single StrainID for ($PBS_ARRAYID) ${ENCFF} finished."
cd $WRK

## Clean-up
rm -r $TEMP
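The md5 verification that the commented-out block above intends can also be run out-of-band after all array tasks finish. A minimal sketch follows; the header-line count and field positions (field 2 = accession, field 3 = md5sum) mirror the `sed`/`awk` calls in the PBS script, and the BAM path pattern mirrors its `wget -O` target.

```
#!/usr/bin/env python
# Hedged sketch: verify downloaded ENCODE BAMs against the metadata md5sum
# column. Assumes two header lines and the column order used by the PBS
# script above (field 2 = accession, field 3 = md5sum).
import hashlib

def md5_of(path, chunk=1 << 20):
    '''Stream the file so multi-GB BAMs are not loaded into memory'''
    h = hashlib.md5()
    with open(path, 'rb') as fh:
        for block in iter(lambda: fh.read(chunk), b''):
            h.update(block)
    return h.hexdigest()

with open("230612_hg38_100-TF-ChIP-seq.txt") as metadata:
    for line in metadata:
        if not line.startswith("/files/"):
            continue  # skip the header lines
        tokens = line.rstrip("\n").split("\t")
        encff, expected = tokens[1], tokens[2]
        observed = md5_of("results/hg38_BAM/%s.bam" % encff)
        print("%s %s" % (encff, "passed" if observed == expected else "FAILED"))
```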
Binary files changed (312 KB and 912 KB): not shown.
paper/ENCODEdata-CellLines/scripts/build_sidscore_histogram.py

Lines changed: 150 additions & 0 deletions
@@ -0,0 +1,150 @@
#!/usr/bin/env python
from os.path import isfile, join
import argparse
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Python 3 needed for encoding feature for UTF-8
# (ENCODE uses some capital delta chars in summary descriptions of GeneticModifications)

# Check Seaborn documentation: https://seaborn.pydata.org/generated/seaborn.histplot.html

# Map ENCODE biosample names to the VCF names used by the StrainID database
ENCODEtoStrainID = {
    "HeLa-S3":"HELA",
    "LNCAP":"LNCAPCLONEFGC",
    "MCF-7":"MCF7",
    "SK-N-SH":"SKNSH"
}

# ENCODE dataset counts per cell line:
# K562 2640
# A549 1189
# MCF-7 556
# SK-N-SH 210
# HeLa-S3 196
# HCT116 96

def getParams():
    '''Parse parameters from the command line'''
    parser = argparse.ArgumentParser(description='Build histogram characterization plot from hg38 ENCODE StrainID results.')
    parser.add_argument('-m','--metadata', metavar='metadata_fn', required=True, help='the metadata file downloaded with ENCODE dataset')
    parser.add_argument('-i','--input-dir', metavar='input_dir', required=True, help='the directory where all the StrainID output files were saved (*strain.tab)')
    parser.add_argument('-o','--output', metavar='png_fn', required=True, help='the output figure image')
    parser.add_argument('-a','--assay', metavar='assay_name', default=None, help='the ENCODE assay name to filter datasets by (default: no filter)')
    args = parser.parse_args()
    return(args)

# Example StrainID output (ENCFF000DZC.bam):
#LnCap.vcf	-5.082117812158647
#MCF7.vcf	-6.1143012059424935
#SKnSH.vcf	-5.7641601741217645
#HepG2.vcf	-5.595833186705702
#K562.vcf	1.8812984639660986
#A549.vcf	-6.059318584695944
#HCT116.vcf	-4.847450343904915
#HELA.vcf	-4.906670711358038
def parse_file(var_file, expected):
    '''Parse a *_strain.tab file and flag each strain as matching the expected cell line or not'''
    scores = pd.read_table(var_file, sep='\t', header=0, names=['Strain','Scores'])
    # Add filename info
    scores['Filename'] = var_file
    # Add match information
    scores['Match'] = scores['Strain']==expected
    # Return scores
    return(scores)

if __name__ == "__main__":
    '''Plot matched vs mismatched score histograms'''
    args = getParams()

    # Hardcoded filter of parental/related strain VCFs (e.g., P2URK562
    # derives from K562) so they do not count as "incorrect" scores
    strains2filter = ['P2URK562.vcf']
    strains2filter.extend(['HCT15.vcf', 'HCT8.vcf'])
    strains2filter.extend(['HEL9217.vcf', 'HEL.vcf'])
    strains2filter.extend(['HEP3B217.vcf'])
    strains2filter.extend(['MCF10A.vcf', 'MCF12A.vcf'])
    strains2filter.extend(['LN18.vcf', 'LN215.vcf', 'LN229.vcf', 'LN235.vcf', 'LN319.vcf', 'LN340.vcf', 'LN382.vcf', 'LN405.vcf', 'LN428.vcf', 'LN443.vcf', 'LN464.vcf', 'LNZTA3WT4.vcf', 'LNZ308.vcf'])
    strains2filter.extend(['SKN3.vcf', 'SKNAS.vcf', 'SKNBE2.vcf', 'SKNDZ.vcf', 'SKNEP1.vcf', 'SKNFI.vcf', 'SKNMC.vcf', 'SKNMM.vcf', 'SKNO1.vcf','SKN.vcf'])
    strains2filter.extend(['SKBR3.vcf', 'SKBR5.vcf', 'SKBR7.vcf', 'SKCO1.vcf', 'SKES1.vcf', 'SKGIIIA.vcf', 'SKGII.vcf', 'SKGI.vcf', 'SKGT2.vcf', 'SKGT4.vcf',
        'SKHEP1.vcf', 'SKLMS1.vcf', 'SKLU1.vcf', 'SKM1.vcf', 'SKMEL19.vcf', 'SKMEL1.vcf', 'SKMEL24.vcf', 'SKMEL28.vcf', 'SKMEL2.vcf', 'SKMEL30.vcf',
        'SKMEL31.vcf', 'SKMEL3.vcf', 'SKMEL5.vcf', 'SKMES1.vcf', 'SKMG1.vcf', 'SKMM2.vcf', 'SKOV3.vcf', 'SKPNDW.vcf', 'SKRC20.vcf', 'SKRC31.vcf', 'SKUT1.vcf'])

    # Parse metadata (row 2 of the file holds the column names)
    filedata = pd.read_csv(args.metadata, sep='\t', header=1)
    filedata['BIOSAMPLE_NAME'] = None
    df_list_scores = []

    # Loop through each sample
    for index, row in filedata.iterrows():
        # Map ENCODE-formatted strain name to StrainID-formatted name
        filedata.loc[index,'BIOSAMPLE_NAME'] = ENCODEtoStrainID.get(row['Biosample name'], row['Biosample name'])
        expected_vcfname = filedata.loc[index,'BIOSAMPLE_NAME'] + ".vcf"

        # Check file exists
        id_file = join(args.input_dir,"%s_strain.tab" % row['Accession'])
        if(not isfile(id_file)):
            continue

        # Parse ID file and add scores to final dataframe
        scores = parse_file(id_file, expected_vcfname)
        df_list_scores.append(scores)

    # Concatenate the scores together
    all_scores = pd.concat(df_list_scores)

    # Apply the hardcoded filter for parental/related strains
    for FCL in strains2filter:
        all_scores = all_scores[all_scores['Strain']!=FCL]

    # Split samples with all-NaN scores from samples with real values
    data_nans = pd.DataFrame(all_scores[pd.isnull(all_scores['Scores'])])
    data_value = pd.DataFrame(all_scores[~pd.isnull(all_scores['Scores'])])

    # Plot mismatched (cyan) and matched (orange) score histograms on twin axes
    fig, ax = plt.subplots()
    sns.histplot(ax=ax, x="Scores", binwidth=.1, data=data_value[~data_value['Match']], color='cyan')
    ax2 = ax.twinx()
    sns.histplot(ax=ax2, x="Scores", binwidth=.1, data=data_value[data_value['Match']], color='orange')

    # Format figure
    ax.set_xlabel("StrainID -log2 score")
    ax.set_ylabel("number of scores for every sample x other cell lines (cyan)")
    ax2.set_ylabel("number of scores for sample x matching cell line (orange)")
    fig.set_size_inches(12,8)
    plt.tight_layout()

    # Save figure
    plt.savefig(args.output, dpi=500)
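For a concrete sense of the `Match` flag that splits the two histograms, here is the example ENCFF000DZC output from the comments above run through the same logic (a toy sketch with abbreviated values):

```
# Toy sketch of the Match logic in build_sidscore_histogram.py, using the
# example ENCFF000DZC scores from the comments above (values abbreviated).
import pandas as pd

scores = pd.DataFrame({
    "Strain": ["LnCap.vcf", "MCF7.vcf", "K562.vcf", "A549.vcf"],
    "Scores": [-5.082, -6.114, 1.881, -6.059],
})
scores["Match"] = scores["Strain"] == "K562.vcf"  # expected strain for this sample
# The single Match row feeds the orange histogram; all others feed the cyan one.
print(scores)
```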
paper/ENCODEdata-CellLines/scripts/merge_sidscores.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
import argparse
import pandas as pd

def getParams():
    '''Parse parameters from the command line'''
    parser = argparse.ArgumentParser(description='Merge all tab files from StrainID results into a single table.')
    parser.add_argument('-m','--metadata', metavar='metadata_fn', required=True, help='the metadata file downloaded with ENCODE dataset')
    parser.add_argument('-i','--input-dir', metavar='input_dir', required=True, help='the directory where all the StrainID output files were saved (*strain.tab)')
    parser.add_argument('-o','--output', metavar='txt_fn', required=True, help='the output tab-delimited table path')

    args = parser.parse_args()
    return(args)

if __name__ == "__main__":
    '''Merge results'''

    args = getParams()

    merged_df = None
    # Loop through each metadata row and its StrainID results file
    reader = open(args.metadata, 'r')
    for line in reader:
        # Skip header lines; data rows begin with the '/files/' download href
        if(line.find('/files/')!=0):
            continue
        tokens = line.split('\t')
        # Load results and name the VCF filename column "CellLine"
        temp_df = pd.read_csv(args.input_dir + "/" + tokens[1] + "_strain.tab", sep='\t', header=0)
        temp_df.columns.values[0] = 'CellLine'
        # Record the expected cell line (the leading '0' sorts this row to the top)
        temp_df.loc[len(temp_df.index)] = ['0Correct_Strain', tokens[12].strip()]
        # Merge into merged_df
        if (merged_df is None):
            merged_df = temp_df
            continue
        merged_df = pd.merge(merged_df, temp_df, on='CellLine')
    reader.close()
    # Sort values before saving
    merged_df = merged_df.sort_values(by=['CellLine'])
    # Write merged_df to output file
    merged_df.to_csv(args.output, sep='\t')
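A quick way to sanity-check the merged output is to load it back and confirm the layout the script writes: one `CellLine` column, one score column per accession, and a `0Correct_Strain` row sorted to the top. A minimal sketch (assuming the default `to_csv` index column is read back with `index_col=0`):

```
# Hedged sketch: load SupplementaryTable11.txt and confirm the expected layout.
import pandas as pd

merged = pd.read_csv("results/SupplementaryTable11.txt", sep="\t", index_col=0)
# The first row (after sorting) carries the expected cell line per sample
assert merged.iloc[0]["CellLine"] == "0Correct_Strain"
print(merged.shape)   # (#strains + 1, #samples + 1)
print(merged.head())
```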
