|
| 1 | +#!/bin/python |
| 2 | +from os import listdir |
| 3 | +from os.path import isfile, join |
| 4 | +import sys |
| 5 | +import re |
| 6 | +import random |
| 7 | +import argparse |
| 8 | +import matplotlib.pyplot as plt |
| 9 | +import numpy as np |
| 10 | +import seaborn as sns |
| 11 | +import pandas as pd |
| 12 | + |
| 13 | +# Python 3 is required for correct UTF-8 handling |
| 14 | +# (ENCODE uses capital delta characters in the summary descriptions of GeneticModifications) |
| 15 | + |
| 16 | +# Check Seaborn documentation: https://seaborn.pydata.org/generated/seaborn.swarmplot.html |
| 17 | + |
# ENCODE biosample name -> StrainID cell-line name.
# Names missing from this table are looked up with themselves as the fallback.
ENCODEtoStrainID = {
    "HeLa-S3": "HELA",
    "LNCAP": "LNCAPCLONEFGC",
    "MCF-7": "MCF7",
    "SK-N-SH": "SKNSH",
}
| 24 | + |
| 25 | +# K562 2640 |
| 26 | +# A549 1189 |
| 27 | +# MCF-7 556 |
| 28 | +# SK-N-SH 210 |
| 29 | +# HeLa-S3 196 |
| 30 | +# HCT116 96 |
| 31 | + |
| 32 | + |
def getParams(argv=None):
    '''Parse parameters from the command line.

    Parameters:
        argv: optional list of argument strings to parse instead of the
            process command line. When None (the default), argparse reads
            sys.argv[1:], so existing `getParams()` callers are unaffected.

    Returns:
        argparse.Namespace with attributes `metadata`, `input_dir`, `output`
        (all required) and `assay` (None when not given).

    Raises:
        SystemExit: raised by argparse when a required option is missing or
            an unknown option is supplied.
    '''
    parser = argparse.ArgumentParser(description='Build histogram characterization plot from hg38 ENCODE StrainID results.')
    parser.add_argument('-m','--metadata', metavar='metadata_fn', required=True, help='the metadata file downloaded with ENCODE dataset')
    parser.add_argument('-i','--input-dir', metavar='input_dir', required=True, help='the directory where all the StrainID output files were saved (*strain.tab)')

    parser.add_argument('-o','--output', metavar='png_fn', required=True, help='the output figure image')

    parser.add_argument('-a','--assay', metavar='assay_name', default=None, help='the ENCODE assay name to filter datasets by (default:No Filter)')

    return parser.parse_args(argv)
| 45 | + |
| 46 | +# ENCFF000DZC.bam |
| 47 | +#LnCap.vcf -5.082117812158647 |
| 48 | +#MCF7.vcf -6.1143012059424935 |
| 49 | +#SKnSH.vcf -5.7641601741217645 |
| 50 | +#HepG2.vcf -5.595833186705702 |
| 51 | +#K562.vcf 1.8812984639660986 |
| 52 | +#A549.vcf -6.059318584695944 |
| 53 | +#HCT116.vcf -4.847450343904915 |
| 54 | +#HELA.vcf -4.906670711358038 |
def parse_file(var_file, expected):
    '''Load one StrainID result table and annotate it.

    The file is read as tab-separated text. Its first line is consumed as a
    header and replaced by the column names 'Strain' and 'Scores'.

    Parameters:
        var_file: path of the StrainID output file to read.
        expected: strain name (as it appears in the 'Strain' column) that
            this sample is expected to match.

    Returns:
        DataFrame with columns 'Strain', 'Scores', 'Filename' (the value of
        `var_file` on every row) and 'Match' (True where 'Strain' equals
        `expected`).
    '''
    table = pd.read_table(var_file, sep='\t', header=0, names=['Strain','Scores'])
    return table.assign(
        # Remember which file each row came from
        Filename=var_file,
        # Flag rows whose strain is the expected one for this sample
        Match=lambda frame: frame['Strain'] == expected,
    )
| 64 | + |
def _strains_to_filter():
    '''Return the hardcoded strain VCF names that are dropped before plotting.'''
    return [
        'P2URK562.vcf',
        'HCT15.vcf', 'HCT8.vcf',
        'HEL9217.vcf', 'HEL.vcf',
        'HEP3B217.vcf',
        'MCF10A.vcf', 'MCF12A.vcf',
        'LN18.vcf', 'LN215.vcf', 'LN229.vcf', 'LN235.vcf', 'LN319.vcf', 'LN340.vcf', 'LN382.vcf',
        'LN405.vcf', 'LN428.vcf', 'LN443.vcf', 'LN464.vcf', 'LNZTA3WT4.vcf', 'LNZ308.vcf',
        'SKN3.vcf', 'SKNAS.vcf', 'SKNBE2.vcf', 'SKNDZ.vcf', 'SKNEP1.vcf', 'SKNFI.vcf', 'SKNMC.vcf',
        'SKNMM.vcf', 'SKNO1.vcf', 'SKN.vcf',
        'SKBR3.vcf', 'SKBR5.vcf', 'SKBR7.vcf', 'SKCO1.vcf', 'SKES1.vcf', 'SKGIIIA.vcf', 'SKGII.vcf',
        'SKGI.vcf', 'SKGT2.vcf', 'SKGT4.vcf', 'SKHEP1.vcf', 'SKLMS1.vcf', 'SKLU1.vcf', 'SKM1.vcf',
        'SKMEL19.vcf', 'SKMEL1.vcf', 'SKMEL24.vcf', 'SKMEL28.vcf', 'SKMEL2.vcf', 'SKMEL30.vcf',
        'SKMEL31.vcf', 'SKMEL3.vcf', 'SKMEL5.vcf', 'SKMES1.vcf', 'SKMG1.vcf', 'SKMM2.vcf',
        'SKOV3.vcf', 'SKPNDW.vcf', 'SKRC20.vcf', 'SKRC31.vcf', 'SKUT1.vcf',
    ]


def _load_scores(metadata_fn, input_dir):
    '''Collect the StrainID scores of every sample listed in the metadata.

    Parameters:
        metadata_fn: tab-separated metadata file; its second line is used as
            the header and it must provide the 'Accession' and
            'Biosample name' columns.
        input_dir: directory searched for '<Accession>_strain.tab' files.

    Returns:
        One DataFrame with the rows of every result file that was found
        (see parse_file for the columns).

    Exits with an error message when no result file exists, instead of
    letting pd.concat fail on an empty list.
    '''
    filedata = pd.read_csv(metadata_fn, sep='\t', header=1)

    # Map ENCODE-formatted strain to StrainID-formatted; unmapped names pass
    # through unchanged. Computed as a whole column rather than with chained
    # `df[col][idx] = ...` assignment, which has no effect under pandas
    # Copy-on-Write.
    biosample_names = filedata['Biosample name'].map(
        lambda name: ENCODEtoStrainID.get(name, name))

    df_list_scores = []
    for accession, biosample in zip(filedata['Accession'], biosample_names):
        # Samples without a StrainID result file are skipped
        id_file = join(input_dir, "%s_strain.tab" % accession)
        if not isfile(id_file):
            continue
        df_list_scores.append(parse_file(id_file, biosample + ".vcf"))

    if not df_list_scores:
        sys.exit("No *_strain.tab files found in '%s' for the accessions listed in '%s'"
                 % (input_dir, metadata_fn))

    # Each file restarts its row index at 0; renumber so the boolean masks
    # applied later index an unambiguous frame.
    return pd.concat(df_list_scores, ignore_index=True)


def _plot_histograms(data_value, output):
    '''Draw the non-matching and matching score histograms and save the figure.

    Parameters:
        data_value: DataFrame with a numeric 'Scores' column and a boolean
            'Match' column.
        output: filename of the image to write.
    '''
    # Size the figure up front so the layout is computed for the saved geometry
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.histplot(ax=ax, x="Scores", binwidth=.1, data=data_value[~data_value['Match']], color='cyan')
    # The matching scores get their own y axis
    ax2 = ax.twinx()
    sns.histplot(ax=ax2, x="Scores", binwidth=.1, data=data_value[data_value['Match']], color='orange')

    # Format figure
    ax.set_xlabel("StrainID -log2 score")
    ax.set_ylabel("number of scores for every sample x other cell lines (blue)")
    ax2.set_ylabel("number of scores for sample x matching cell line (orange)")

    # Lay out only after the labels exist so they are accounted for
    fig.tight_layout()
    # Save figure
    fig.savefig(output, dpi=500)


if __name__ == "__main__":
    # Plot score histograms of matching vs non-matching strains
    args = getParams()
    # NOTE(review): args.assay is parsed but not applied anywhere in this
    # block -- confirm whether assay filtering is still intended.

    # Hardcode some presets
    strains2filter = _strains_to_filter()

    # Parse metadata and gather the scores of every available sample
    all_scores = _load_scores(args.metadata, args.input_dir)

    # Apply a hardcoded filter for parental strains
    # (the names are still echoed to stdout, as before)
    for FCL in strains2filter:
        print(FCL)
    all_scores = all_scores[~all_scores['Strain'].isin(strains2filter)]

    # Only rows with an actual score can be plotted
    data_value = all_scores[all_scores['Scores'].notnull()]

    _plot_histograms(data_value, args.output)
0 commit comments