|
| 1 | +#!/bin/python |
| 2 | +from os import listdir |
| 3 | +from os.path import isfile, join |
| 4 | +import sys |
| 5 | +import re |
| 6 | +import random |
| 7 | +import argparse |
| 8 | +import matplotlib.pyplot as plt |
| 9 | +import numpy as np |
| 10 | +import seaborn as sns |
| 11 | +import pandas as pd |
| 12 | + |
| 13 | +# Python 3.6+ |
| 14 | +# relies on dict insertion order |
| 15 | + |
| 16 | +# Check Seaborn documentation: https://seaborn.pydata.org/generated/seaborn.swarmplot.html |
| 17 | + |
def getParams(argv=None):
    """Parse the command-line options of the StrainID plotting script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            the behaviour existing callers rely on.

    Returns:
        argparse.Namespace with three attributes:
            input  -- path given with -i/--input (required)
            output -- path given with -o/--output (required)
            assay  -- value given with -a/--assay, or None when omitted

    Raises:
        SystemExit: raised by argparse when a required option is missing
            or an unknown option is supplied.
    """
    parser = argparse.ArgumentParser(description='Build violinplots from ENCODE StrainID results.')

    parser.add_argument('-i', '--input', metavar='input_fn', required=True,
                        help='the output tab file from analyzed_eid_results.py')
    parser.add_argument('-o', '--output', metavar='png_fn', required=True,
                        help='the output figure image')

    # Optional filter; None means "keep every assay".
    parser.add_argument('-a', '--assay', metavar='assay_name', default=None,
                        help='the ENCODE assay name to filter datasets by (default:No Filter)')

    return parser.parse_args(argv)
| 29 | + |
# Colour used for each strain label when it appears as a hue level.
# The key order doubles as the left-to-right order of the x axis and of the
# table columns, so it relies on dict insertion order.
PALETTE = {
    "A549": "tab:blue",
    "HCT116": "tab:orange",
    "HELA": "tab:green",
    "HepG2": "tab:red",
    "K562": "tab:purple",
    "LnCap": "tab:olive",
    "MCF7": "tab:cyan",
    "SKnSH": "tab:pink",
}

# Marker size of the strip-plot points (same value as seaborn's default).
SIZE = 5


def splitBySuccess(filedata):
    """Split the table into rows flagged as success and rows flagged as failure.

    The 'StrainID_success' column is compared through its string form so the
    split works whether the column holds the strings 'True'/'False' or real
    booleans (pandas may parse the text as bools when reading the file).
    Rows with a missing flag end up in neither subset.

    Args:
        filedata: DataFrame with a 'StrainID_success' column.

    Returns:
        Tuple ``(data_success, data_fails)`` of filtered DataFrames.
    """
    flag = filedata['StrainID_success'].astype(str)
    return filedata[flag == 'True'], filedata[flag == 'False']


def countUnscored(filedata, strains):
    """Count, per strain, the rows whose 'StrainID_success' value is missing.

    Args:
        filedata: DataFrame with 'StrainID_success' and 'ENCODE_strain' columns.
        strains: Strain labels to report, in the desired column order.

    Returns:
        One-row DataFrame whose columns are ``strains`` and whose integer
        values are the counts (0 for strains with no such rows).
    """
    unscored = filedata.loc[filedata['StrainID_success'].isnull(), 'ENCODE_strain']
    # reindex with fill_value keeps the counts as integers instead of
    # introducing NaN (and therefore floats) for absent strains.
    return unscored.value_counts().reindex(strains, fill_value=0).to_frame().T


def main():
    """Read the results table, draw the violin/strip/table figure, save it."""
    args = getParams()
    strains = list(PALETTE)

    # Parse data table results and get max StrainID scores.
    # NOTE: the score columns are taken as the contiguous label slice from
    # 'LnCap_score' through 'HELA_score', so this depends on the column order
    # of the input file.
    filedata = pd.read_table(args.input, sep='\t')
    justscores = filedata.loc[:, 'LnCap_score':'HELA_score']
    filedata['StrainID_bestscore'] = justscores.max(axis=1)
    filedata = filedata.sort_values(by='ENCODE_strain')

    # Filter by assay name if specified
    if args.assay is not None:
        filedata = filedata[filedata['Assay'] == args.assay]

    # Separate success/fail sets for violin vs strip layers
    data_success, data_fails = splitBySuccess(filedata)

    # Get counts for samples with no success flag and format for table
    data_nans = countUnscored(filedata, strains)

    # Plot violin, strip points, and table. An explicit category order is
    # passed to both layers so they share x positions with each other and
    # with the table columns even when a strain is absent from one subset.
    fig, ax = plt.subplots()
    sns.violinplot(ax=ax, x="ENCODE_strain", y="StrainID_bestscore", hue="StrainID_strain",
                   data=data_success, palette=PALETTE, order=strains)
    sns.stripplot(ax=ax, x="ENCODE_strain", y="StrainID_bestscore", hue="StrainID_strain",
                  data=data_fails, palette=PALETTE, order=strains, size=SIZE)
    plt.table(cellText=data_nans.values, rowLabels=data_nans.index, colLabels=data_nans.columns)

    # Format figure
    ax.set_ylabel("StrainID -log2 score")
    ax.set_ylim(-8, 10)

    # Save figure
    fig.set_size_inches(12, 8)
    fig.savefig(args.output, dpi=500)


if __name__ == "__main__":
    main()
0 commit comments