Skip to content

Commit 263c46d

Browse files
committed
update ENCODE StrainID violin script and results
update script for parsing and building violin plots from ENCODE StrainID results. Add option to filter by various assays and include README lines and some highlighted violin PNG figures
1 parent 8c1d29b commit 263c46d

7 files changed

Lines changed: 86 additions & 0 deletions

File tree

paper/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ SyntheticStrain/logs/*.err-*
3333
SyntheticStrain/logs/*.out-*
3434
SyntheticStrain/results/sacCer3*
3535
SyntheticStrain/results/hg19*
36+
ENCODEdata-CellLines/results/SupplementaryTable10.txt
3637
ENCODEdata-CellLines/results/BAM
3738
ENCODEdata-CellLines/results/BAM-nospike
3839
ENCODEdata-CellLines/results/ID

paper/ENCODEdata-CellLines/README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,12 @@ qsub job/02_indexed_runSID.pbs
3838
Evaluate the accuracy of StrainID on real data by merging the metadata with the StrainID results.
3939
```
4040
python scripts/analyze_encode_results.py -i results/ID -m 210512_sample_metadata.txt -o results/SupplementaryTable10.txt
41+
python scripts/build_violinscatter.py -i results/SupplementaryTable10.txt -o results/Figure6C.png
42+
```
43+
44+
You can also restrict the visualization to a single assay by passing its ENCODE assay name with the `-a` option:
45+
```
46+
python scripts/build_violinscatter.py -i results/SupplementaryTable10.txt -o results/Figure6C_ChIP-seq.png -a "ChIP-seq"
47+
python scripts/build_violinscatter.py -i results/SupplementaryTable10.txt -o results/Figure6C_CAGE.png -a "CAGE"
48+
python scripts/build_violinscatter.py -i results/SupplementaryTable10.txt -o results/Figure6C_small-RNA-seq.png -a "small RNA-seq"
4149
```
614 KB
Loading
389 KB
Loading
467 KB
Loading
457 KB
Loading
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
#!/bin/python
2+
from os import listdir
3+
from os.path import isfile, join
4+
import sys
5+
import re
6+
import random
7+
import argparse
8+
import matplotlib.pyplot as plt
9+
import numpy as np
10+
import seaborn as sns
11+
import pandas as pd
12+
13+
# Python 3.6+
14+
# relies on dict insertion order (a language guarantee from Python 3.7; only a CPython implementation detail in 3.6)
15+
16+
# Check Seaborn documentation: https://seaborn.pydata.org/generated/seaborn.swarmplot.html
17+
18+
def getParams(argv=None):
    """Parse the command-line options for the violin-plot script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. The default (None) makes argparse read
            ``sys.argv[1:]``, which is what existing callers get.

    Returns:
        argparse.Namespace with three attributes: ``input`` (path of the
        tab-delimited results table), ``output`` (path of the figure to
        write) and ``assay`` (assay name to filter by, or None for no filter).
    """
    parser = argparse.ArgumentParser(description='Build violinplots from ENCODE StrainID results.')

    # The input table is the one written by scripts/analyze_encode_results.py.
    parser.add_argument('-i', '--input', metavar='input_fn', required=True, help='the output tab file from analyze_encode_results.py')
    parser.add_argument('-o', '--output', metavar='png_fn', required=True, help='the output figure image')

    parser.add_argument('-a', '--assay', metavar='assay_name', default=None, help='the ENCODE assay name to filter datasets by (default:No Filter)')

    return parser.parse_args(argv)
29+
30+
def main():
    """Plot StrainID best scores per ENCODE cell line and save the figure.

    Reads the results table named by ``-i``, optionally keeps only the rows
    whose ``Assay`` equals ``-a``, and draws on a single axes:
      * a violin plot of the rows marked as successful calls,
      * a strip plot of the rows marked as failed calls,
      * a table counting, per cell line, the rows with no success value.
    The figure is written to the path named by ``-o``.
    """
    args = getParams()

    # Fixed colour per cell line. The key order is also used as the column
    # order of the missing-value table below.
    palette = {
        "A549": "tab:blue",
        "HCT116": "tab:orange",
        "HELA": "tab:green",
        "HepG2": "tab:red",
        "K562": "tab:purple",
        "LnCap": "tab:olive",
        "MCF7": "tab:cyan",
        "SKnSH": "tab:pink",
    }
    strains = list(palette)

    # Parse data table results and get max StrainID scores.
    filedata = pd.read_table(args.input, sep='\t')
    # Label-based slice: takes every column located from 'LnCap_score'
    # through 'HELA_score' in the file, so it depends on the column order.
    justscores = filedata.loc[:, 'LnCap_score':'HELA_score']
    filedata['StrainID_bestscore'] = justscores.max(axis=1)
    filedata = filedata.sort_values(by='ENCODE_strain')

    # Filter by assay name if specified.
    if args.assay is not None:
        filedata = filedata[filedata['Assay'] == args.assay]

    # Separate success/fail sets for violin vs strip plots. Compare on the
    # string form so the split works whether pandas parsed the column as
    # booleans or left it as the text 'True'/'False'; missing values become
    # 'nan' and match neither.
    outcome = filedata['StrainID_success']
    outcome_text = outcome.astype(str)
    data_success = filedata[outcome_text == 'True']
    data_fails = filedata[outcome_text == 'False']

    # Count, per cell line, the samples with no success value. Reindexing
    # with fill_value=0 keeps the counts as integers for the table cells.
    nan_counts = filedata.loc[outcome.isnull(), 'ENCODE_strain'].value_counts()
    data_nans = nan_counts.reindex(strains, fill_value=0).to_frame().T

    # Plot violin, strip points, and table.
    fig, ax = plt.subplots()
    sns.violinplot(ax=ax, x="ENCODE_strain", y="StrainID_bestscore", hue="StrainID_strain", data=data_success, palette=palette)
    sns.stripplot(ax=ax, x="ENCODE_strain", y="StrainID_bestscore", hue="StrainID_strain", data=data_fails, palette=palette)
    plt.table(cellText=data_nans.values, rowLabels=data_nans.index, colLabels=data_nans.columns)

    # Format figure.
    ax.set_ylabel("StrainID -log2 score")
    ax.set_ylim(-8, 10)

    # Save figure.
    fig.set_size_inches(12, 8)
    plt.savefig(args.output, dpi=500)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)