update reformatted ENCODE StrainID results

owlang · owlang · commit 8c1d29bb8175 · 2023-06-15T17:20:49.000-04:00
-update README with more details about script execution
-add gzipped parsed text file results
-update gitignore with fixes to ENCODE StrainID directory typos
-change results parsing script to use pandas and include every Cell Line score instead of just the best one
diff --git a/paper/.gitignore b/paper/.gitignore
@@ -33,9 +33,9 @@ SyntheticStrain/logs/*.err-*
 SyntheticStrain/logs/*.out-*
 SyntheticStrain/results/sacCer3*
 SyntheticStrain/results/hg19*
-ENCODE_CellLines/results/BAM
-ENCODE_CellLines/results/BAM-nospike
-ENCODE_CellLines/results/ID
+ENCODEdata-CellLines/results/BAM
+ENCODEdata-CellLines/results/BAM-nospike
+ENCODEdata-CellLines/results/ID
 BY4742-chipseq/logs/*.out
 BY4742-chipseq/logs/*.err
 BY4742-chipseq/results/FASTQ
diff --git a/paper/ENCODEdata-CellLines/README.md b/paper/ENCODEdata-CellLines/README.md
@@ -2,14 +2,40 @@
 
 Run StrainID on ENCODE data and evaluate StrainID's performance.
 
+|  |  |
+| :--: | -- |
+| Figure 6C  | `ENCODEdata-CellLines/results/ID/` |
+| Supplementary Table 10 | `ENCODEdata-CellLines/results/SupplementaryTable10.txt.gz` |
+
 # Reference files
+
+## 210512_sample_metadata.txt
 ENCODE metadata was pulled on May 12, 2021 using the `scripts/get_metadata.py` script that pulls all Biosample accessions with classification="cell_line" and whose string matches one of the cell lines we have in our hg19_VCF database. These are used to pull File accessions with type=BAM and assembly=hg19. We did not filter by assay for this analysis. They are saved with all relevant metadata to the `210512_sample_metadata.txt` file.
 
 Command used: `python scripts/get_metadata.py > 210512_sample_metadata.txt`
 
 # Download ENCODE CellLine data and run through StrainID
-Use the 210512_sample_metadata.txt file with ENCODE accessions to download the
-data to the data directory. Then run the data through StrainID.
+Use the 210512_sample_metadata.txt file with ENCODE accessions to download the data and
+process it into the `results` directory. Then run the data through StrainID.
+
+## Download BAM files
+```
+qsub job/00_download_data.pbs
+```
+
+## Filter BAM files
+
+```
+qsub job/01_filter_and_count.pbs
+```
+
+## Run StrainID
+```
+qsub job/02_indexed_runSID.pbs
+```
 
-# Compare the results to the metadata information
-Evaluate the accuracy of StrainID on real data.
+## Compile the results with the metadata information
+Evaluate the accuracy of StrainID on real data by merging the metadata with the StrainID results.
+```
+python scripts/analyze_encode_results.py -i results/ID -m 210512_sample_metadata.txt -o results/SupplementaryTable10.txt
+```
diff --git a/paper/ENCODEdata-CellLines/results/SupplementaryTable10.txt.gz b/paper/ENCODEdata-CellLines/results/SupplementaryTable10.txt.gz
diff --git a/paper/ENCODEdata-CellLines/scripts/analyze_encode_results.py b/paper/ENCODEdata-CellLines/scripts/analyze_encode_results.py
@@ -2,15 +2,26 @@
 from os.path import isfile, join
 import sys
 import argparse
+import numpy as np
+import pandas as pd
 
 # Python 3 needed for encoding feature for UTF-8
 # (ENCODE uses some capital delta chars in summary descriptions of GeneticModifications)
 
+CL = ["LnCap", "MCF7", "SKnSH", "HepG2", "K562", "A549", "HCT116", "HELA"]
+ENCODEtoStrainID = {
+	"HeLa-S3":"HELA",
+	"LNCAP":"LnCap",
+	"MCF-7":"MCF7",
+	"SK-N-SH":"SKnSH"
+}
+
 def getParams():
 	'''Parse parameters from the command line'''
-	parser = argparse.ArgumentParser(description='Parse metadata file and GenoPipe output to check detection rates of the GenoPipe tool.')
+	parser = argparse.ArgumentParser(description='Parse metadata file and StrainID output to check per sample detection by StrainID scores.')
 	parser.add_argument('-m','--metadata', metavar='metadata_fn', required=True, help='the metadata file downloaded with ENCODE dataset that includes info like PE/SE, cell line, assay type, and read lengths/SE-PE')
 	parser.add_argument('-i','--input-dir', metavar='input_dir', required=True, help='the directory where all the EpitopeID output files were saved (*strain.tab)')
+	parser.add_argument('-o','--output', metavar='output_fn', required=True, help='the output filepath for final TSV with parsed StrainID scores')
 
 	args = parser.parse_args()
 	return(args)
@@ -25,22 +36,21 @@ def getParams():
 #HCT116.vcf	-4.847450343904915
 #HELA.vcf	-4.906670711358038
 def parse_file(var_file):
-	dict = {}
+	scores = []
 	reader = open(var_file,'r')
 	for line in reader:
 		tokens = line.split("\t")
 		if(tokens[0]==""):
 			continue
 		score = float(tokens[1].strip())
 		if(tokens[1].strip().lower()=="inf"):
-			score = 500000
+			score = np.Inf
 		elif(tokens[1].strip().lower()=="nan"):
-			score = -500000
-
+			score = np.NaN
 		# update dict
-		dict[tokens[0].split(".")[0]] = score
+		scores.append((score, tokens[0].split(".")[0]))
 	reader.close()
-	return(dict)
+	return(scores)
 
 #ENCFF364CPX	/files/ENCFF364CPX/@@download/ENCFF364CPX.bam	79651f67b1c4d564395c18be9cdff62f	HeLa-S3	ChIP-seq	single-ended	36	/files/ENCFF807MUK/|/files/ENCFF000BAO/	unfiltered alignments	/experiments/ENCSR000AOB/	/biosample-types/cell_line_EFO_0002791/	1884977463	released	2020-02-18T20:47:36.519163+00:00
 #ENCFF325UJS	/files/ENCFF325UJS/@@download/ENCFF325UJS.bam	042b20b3e149df6c1f4e5c95f83653ee	HepG2	ChIP-seq	single-ended	36	/files/ENCFF807MUK/|/files/ENCFF000BGR/	alignments	/experiments/ENCSR000AOM/	/biosample-types/cell_line_EFO_0001187/	978259147	released	2020-02-18T09:15:21.891603+00:00
@@ -50,27 +60,38 @@ def parse_file(var_file):
 	args = getParams()
 
 	# Parse metadata
-	reader = open(args.metadata, 'r', encoding='utf-8')
-	for mline in reader:
-		# Pull relevant info from metadata tokens
-		mtokens = mline.strip().split('\t')
-		encff = mtokens[0]
+	data = pd.read_csv(args.metadata, sep='\t', names=['File_Accession','Download_URL','MD5sum', 'ENCODE_strain', 'Assay', 'library_type', 'read_length', 'derived_from', 'bam_type', 'Experiment_Accession', 'Biosample_Accession', 'File_Size', 'Audit_status', 'date'])
+
+	# Initialize summary StrainID results columns
+	data['StrainID_strain'] = None
+	data['StrainID_success'] = None
+	data['Comment'] = None
 
+	# Initialize each strain's score to None
+	for c in CL:
+		data[c + "_score"] = None
+
+	# Loop through each sample
+	for index, row in data.iterrows():
+		# Map ENCODE-formatted strain to StrainID-formatted
+		data['ENCODE_strain'][index] = ENCODEtoStrainID.get(data['ENCODE_strain'][index], data['ENCODE_strain'][index])
 		# Check file exists
-		id_file = join(args.input_dir,"%s_strain.tab" % encff)
+		id_file = join(args.input_dir,"%s_strain.tab" % data['File_Accession'][index])
+		# print(data['File_Accession'][index])
 		if(not isfile(id_file)):
-			sys.stderr.write("%s: no results generated.\n" % (id_file))
+			data['StrainID_success'][index] = "Missing Results"
+			data['Comment'][index] = "Missing Results"
 			continue
-
-		# Initialize id file variables to save
-		called_strain = ""
 		# Parse id file for cell line score info and sort strains by score
 		strain_info = parse_file(id_file)
-		strain_sortbyscore = sorted( strain_info.keys(), key=lambda x: (strain_info[x]), reverse=True)
+		for s in strain_info:
+			data[s[1] + "_score"][index] = str(s[0])
+		# Sort scores to get the best one
+		strain_sortbyscore = sorted([s for s in strain_info if not np.isnan(s[0]) ], reverse=True)
 		# Assign strain with best score to called_strain
-		if(len(strain_info.keys())>1):
-			called_strain = strain_sortbyscore[0]
+		if(len(strain_sortbyscore)>0):
+			data['StrainID_strain'][index] = strain_sortbyscore[0][1]
+			data['StrainID_success'][index] = data['StrainID_strain'][index] == data['ENCODE_strain'][index]
 
-		# Write called strain with metadata
-		sys.stdout.write( "%s\t%s\t%s\t%s\n" % (encff, called_strain, strain_info.get(called_strain,"NaN"), "\t".join(mtokens[3:])) )
-	reader.close()
+	# Write final data frame
+	data.to_csv(args.output, sep="\t")