Skip to content

Commit 8c1d29b

Browse files
committed
update reformatted ENCODE StrainID results
-update README with more details about script execution -add gzipped parsed text file results -update gitignore with fixes to ENCODE StrainID directory typos -change results parsing script to use pandas and include every Cell Line score instead of just the best one
1 parent 97e0b79 commit 8c1d29b

4 files changed

Lines changed: 77 additions & 30 deletions

File tree

paper/.gitignore

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@ SyntheticStrain/logs/*.err-*
3333
SyntheticStrain/logs/*.out-*
3434
SyntheticStrain/results/sacCer3*
3535
SyntheticStrain/results/hg19*
36-
ENCODE_CellLines/results/BAM
37-
ENCODE_CellLines/results/BAM-nospike
38-
ENCODE_CellLines/results/ID
36+
ENCODEdata-CellLines/results/BAM
37+
ENCODEdata-CellLines/results/BAM-nospike
38+
ENCODEdata-CellLines/results/ID
3939
BY4742-chipseq/logs/*.out
4040
BY4742-chipseq/logs/*.err
4141
BY4742-chipseq/results/FASTQ

paper/ENCODEdata-CellLines/README.md

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,40 @@
22

33
Run StrainID on ENCODE data and evaluate StrainID's performance.
44

5+
| | |
6+
| :--: | -- |
7+
| Figure 6C | `ENCODEdata-CellLines/results/ID/` |
8+
| Supplementary Table 10 | `ENCODEdata-CellLines/results/SupplementaryTable10.txt.gz` |
9+
510
# Reference files
11+
12+
## 210512_sample_metadata.txt
613
ENCODE metadata was pulled on May 12, 2021 using the `scripts/get_metadata.py` script that pulls all Biosample accessions with classification="cell_line" and whose string matches one of the cell lines we have in our hg19_VCF database. These are used to pull File accessions with type=BAM and assembly=hg19. We did not filter by assay for this analysis. They are saved with all relevant metadata to the `210512_sample_metadata.txt` file.
714

815
Command used: `python scripts/get_metadata.py > 210512_sample_metadata.txt`
916

1017
# Download ENCODE CellLine data and run through StrainID
11-
Use the 210512_sample_metadata.txt file with ENCODE accessions to download the
12-
data to the data directory. Then run the data through StrainID.
18+
Use the 210512_sample_metadata.txt file with ENCODE accessions to download and process the
19+
data to the `results` directory. Then run the data through StrainID.
20+
21+
## Download BAM files
22+
```
23+
qsub job/00_download_data.pbs
24+
```
25+
26+
## Filter BAM files
27+
28+
```
29+
qsub job/01_filter_and_count.pbs
30+
```
31+
32+
## Run StrainID
33+
```
34+
qsub job/02_indexed_runSID.pbs
35+
```
1336

14-
# Compare the results to the metadata information
15-
Evaluate the accuracy of StrainID on real data.
37+
## Compile the results with the metadata information
38+
Evaluate the accuracy of StrainID on real data by merging the metadata with the StrainID results.
39+
```
40+
python scripts/analyze_encode_results.py -i results/ID -m 210512_sample_metadata.txt -o results/SupplementaryTable10.txt
41+
```
1.92 MB
Binary file not shown.

paper/ENCODEdata-CellLines/scripts/analyze_encode_results.py

Lines changed: 44 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,26 @@
22
from os.path import isfile, join
33
import sys
44
import argparse
5+
import numpy as np
6+
import pandas as pd
57

68
# Python 3 needed for encoding feature for UTF-8
79
# (ENCODE uses some capital delta chars in summary descriptions of GeneticModifications)
810

11+
CL = ["LnCap", "MCF7", "SKnSH", "HepG2", "K562", "A549", "HCT116", "HELA"]
12+
ENCODEtoStrainID = {
13+
"HeLa-S3":"HELA",
14+
"LNCAP":"LnCap",
15+
"MCF-7":"MCF7",
16+
"SK-N-SH":"SKnSH"
17+
}
18+
919
def getParams():
1020
'''Parse parameters from the command line'''
11-
parser = argparse.ArgumentParser(description='Parse metadata file and GenoPipe output to check detection rates of the GenoPipe tool.')
21+
parser = argparse.ArgumentParser(description='Parse metadata file and StrainID output to check per sample detection by StrainID scores.')
1222
parser.add_argument('-m','--metadata', metavar='metadata_fn', required=True, help='the metadata file downloaded with ENCODE dataset that includes info like PE/SE, cell line, assay type, and read lengths/SE-PE')
1323
parser.add_argument('-i','--input-dir', metavar='input_dir', required=True, help='the directory where all the EpitopeID output files were saved (*strain.tab)')
24+
parser.add_argument('-o','--output', metavar='output_fn', required=True, help='the output filepath for final TSV with parsed StrainID scores')
1425

1526
args = parser.parse_args()
1627
return(args)
@@ -25,22 +36,21 @@ def getParams():
2536
#HCT116.vcf -4.847450343904915
2637
#HELA.vcf -4.906670711358038
2738
def parse_file(var_file):
28-
dict = {}
39+
scores = []
2940
reader = open(var_file,'r')
3041
for line in reader:
3142
tokens = line.split("\t")
3243
if(tokens[0]==""):
3344
continue
3445
score = float(tokens[1].strip())
3546
if(tokens[1].strip().lower()=="inf"):
36-
score = 500000
47+
score = np.Inf
3748
elif(tokens[1].strip().lower()=="nan"):
38-
score = -500000
39-
49+
score = np.NaN
4050
# update dict
41-
dict[tokens[0].split(".")[0]] = score
51+
scores.append((score, tokens[0].split(".")[0]))
4252
reader.close()
43-
return(dict)
53+
return(scores)
4454

4555
#ENCFF364CPX /files/ENCFF364CPX/@@download/ENCFF364CPX.bam 79651f67b1c4d564395c18be9cdff62f HeLa-S3 ChIP-seq single-ended 36 /files/ENCFF807MUK/|/files/ENCFF000BAO/ unfiltered alignments /experiments/ENCSR000AOB/ /biosample-types/cell_line_EFO_0002791/ 1884977463 released 2020-02-18T20:47:36.519163+00:00
4656
#ENCFF325UJS /files/ENCFF325UJS/@@download/ENCFF325UJS.bam 042b20b3e149df6c1f4e5c95f83653ee HepG2 ChIP-seq single-ended 36 /files/ENCFF807MUK/|/files/ENCFF000BGR/ alignments /experiments/ENCSR000AOM/ /biosample-types/cell_line_EFO_0001187/ 978259147 released 2020-02-18T09:15:21.891603+00:00
@@ -50,27 +60,38 @@ def parse_file(var_file):
5060
args = getParams()
5161

5262
# Parse metadata
53-
reader = open(args.metadata, 'r', encoding='utf-8')
54-
for mline in reader:
55-
# Pull relevant info from metadata tokens
56-
mtokens = mline.strip().split('\t')
57-
encff = mtokens[0]
63+
data = pd.read_csv(args.metadata, sep='\t', names=['File_Accession','Download_URL','MD5sum', 'ENCODE_strain', 'Assay', 'library_type', 'read_length', 'derived_from', 'bam_type', 'Experiment_Accession', 'Biosample_Accession', 'File_Size', 'Audit_status', 'date'])
64+
65+
# Initialize summary StrainID results columnss
66+
data['StrainID_strain'] = None
67+
data['StrainID_success'] = None
68+
data['Comment'] = None
5869

70+
# Initialize each strain's score to Nones
71+
for c in CL:
72+
data[c + "_score"] = None
73+
74+
# Loop through each sample
75+
for index, row in data.iterrows():
76+
# Map ENCODE-formatted strain to StrainID-formatted
77+
data['ENCODE_strain'][index] = ENCODEtoStrainID.get(data['ENCODE_strain'][index], data['ENCODE_strain'][index])
5978
# Check file exists
60-
id_file = join(args.input_dir,"%s_strain.tab" % encff)
79+
id_file = join(args.input_dir,"%s_strain.tab" % data['File_Accession'][index])
80+
# print(data['File_Accession'][index])
6181
if(not isfile(id_file)):
62-
sys.stderr.write("%s: no results generated.\n" % (id_file))
82+
data['StrainID_success'][index] = "Missing Results"
83+
data['Comment'][index] = "Missing Results"
6384
continue
64-
65-
# Initialize id file variables to save
66-
called_strain = ""
6785
# Parse id file for cell line score info and sort strains by score
6886
strain_info = parse_file(id_file)
69-
strain_sortbyscore = sorted( strain_info.keys(), key=lambda x: (strain_info[x]), reverse=True)
87+
for s in strain_info:
88+
data[s[1] + "_score"][index] = str(s[0])
89+
# Sort scores to get the best one
90+
strain_sortbyscore = sorted([s for s in strain_info if not np.isnan(s[0]) ], reverse=True)
7091
# Assign strain with best score to called_strain
71-
if(len(strain_info.keys())>1):
72-
called_strain = strain_sortbyscore[0]
92+
if(len(strain_sortbyscore)>0):
93+
data['StrainID_strain'][index] = strain_sortbyscore[0][1]
94+
data['StrainID_success'][index] = data['StrainID_strain'][index] == data['ENCODE_strain'][index]
7395

74-
# Write called strain with metadata
75-
sys.stdout.write( "%s\t%s\t%s\t%s\n" % (encff, called_strain, strain_info.get(called_strain,"NaN"), "\t".join(mtokens[3:])) )
76-
reader.close()
96+
# Write final data frame
97+
data.to_csv(args.output, sep="\t")

0 commit comments

Comments
 (0)