1616import org .apache .logging .log4j .LogManager ;
1717import org .apache .logging .log4j .Logger ;
1818import org .jetbrains .annotations .Nullable ;
19- import org .json .JSONArray ;
2019import org .json .JSONObject ;
2120import org .junit .Assert ;
2221import org .junit .Test ;
@@ -933,7 +932,7 @@ private void checkVcfAnnotationsAndSamples(File vcfInput, boolean skipAnnotation
933932
934933 if (!skipAnnotationChecks )
935934 {
936- for (String info : Arrays .asList ("CADD_PH" , "OMIMN " , "CLN_ALLELE" , "AF" , "mGAPV" ))
935+ for (String info : Arrays .asList ("CADD_PH" , "OMIM_PHENO " , "CLN_ALLELE" , "AF" , "mGAPV" ))
937936 {
938937 if (!header .hasInfoLine (info ))
939938 {
@@ -1080,16 +1079,16 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
10801079 }
10811080 }
10821081
1083- Set <String > omims = new LinkedHashSet <>();
1084- Set <String > omimds = new LinkedHashSet <>();
1085- if (vc .getAttribute ("OMIMN " ) != null )
1082+ Set <String > omimIds = new LinkedHashSet <>();
1083+ Set <String > omimPhenotypes = new LinkedHashSet <>();
1084+ if (vc .getAttribute ("MIMNUMBER " ) != null )
10861085 {
1087- omims .add (vc .getAttributeAsString ("OMIMN " , null ));
1086+ omimIds .add (vc .getAttributeAsString ("MIMNUMBER " , null ));
10881087 }
10891088
1090- if (vc .getAttribute ("OMIMD " ) != null )
1089+ if (vc .getAttribute ("OMIM_PHENO " ) != null )
10911090 {
1092- omimds .addAll (parseRawOmimd (vc , ctx .getLogger ()));
1091+ omimPhenotypes .addAll (parseRawOmimPheno (vc , ctx .getLogger ()));
10931092 }
10941093
10951094 Set <String > overlappingGenes = new TreeSet <>();
@@ -1131,6 +1130,11 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
11311130 geneName = translator .getGeneMap ().get (geneName ).get ("gene_name" );
11321131 }
11331132
1133+ if (tokens .length > 7 && !"protein_coding" .equals (tokens [7 ])) {
1134+ ctx .getLogger ().info ("skipping non protein_coding ANN: " + ann );
1135+ continue ;
1136+ }
1137+
11341138 overlappingGenes .add (geneName );
11351139 }
11361140 }
@@ -1162,7 +1166,7 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
11621166 String overlappingGenesJoin = StringUtils .join (overlappingGenes , "," );
11631167 if (!overlappingGenesReported .contains (overlappingGenesJoin ))
11641168 {
1165- maybeWriteVariantLine (queuedLines , vc , tokens [0 ], "SNPEff" , "Predicted High Impact" , description , overlappingGenes , omims , omimds , ctx .getLogger (), null );
1169+ maybeWriteVariantLine (queuedLines , vc , tokens [0 ], "SNPEff" , "Predicted High Impact" , description , overlappingGenes , omimIds , omimPhenotypes , ctx .getLogger (), null );
11661170
11671171 // NOTE: a given site could have multiple overlapping ORFs (usually different isoforms), so if we hit one allow this to tag that site and skip the remaining.
11681172 overlappingGenesReported .add (overlappingGenesJoin );
@@ -1173,14 +1177,24 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
11731177
11741178 if (vc .getAttribute ("CLN_SIG" ) != null )
11751179 {
1176- List <String > clnAlleles = vc .getAttributeAsStringList ("CLN_ALLELE" , "" );
11771180 List <String > clnSigs = vc .getAttributeAsStringList ("CLN_SIG" , "" );
1181+ if (clnSigs .size () != vc .getAlternateAlleles ().size ())
1182+ {
1183+ throw new IllegalStateException ("CLN_SIG and alt alleles were not the same length: " + vc .toStringWithoutGenotypes ());
1184+ }
1185+
11781186 List <String > clnDisease = vc .getAttributeAsStringList ("CLN_DN" , "" );
11791187 List <String > clnAlleleIds = vc .getAttributeAsStringList ("CLN_ALLELEID" , "" );
11801188 int i = -1 ;
11811189 for (String sigList : clnSigs )
11821190 {
11831191 i ++;
1192+ if (sigList .isEmpty ())
1193+ {
1194+ continue ;
1195+ }
1196+
1197+ Allele altAllele = vc .getAlternateAllele (i );
11841198
11851199 String [] sigSplit = sigList .split ("\\ |" );
11861200 List <String > diseaseSplit = Arrays .asList (clnDisease .get (i ).split ("\\ |" ));
@@ -1196,8 +1210,7 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
11961210
11971211 try
11981212 {
1199- String allele = clnAlleles .get (i );
1200- maybeWriteVariantLine (queuedLines , vc , allele , "ClinVar" , diseaseSplit .get (j ), description , overlappingGenes , omims , omimds , ctx .getLogger (), "ClinVar:" + clnAlleleIds .get (i ));
1213+ maybeWriteVariantLine (queuedLines , vc , altAllele .getBaseString (), "ClinVar" , diseaseSplit .get (j ), description , overlappingGenes , omimIds , omimPhenotypes , ctx .getLogger (), "ClinVar:" + clnAlleleIds .get (i ));
12011214
12021215 }
12031216 catch (IndexOutOfBoundsException e )
@@ -1214,36 +1227,57 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
12141227 }
12151228 }
12161229
1217- //NE: nsdb Polyphen2_HVAR_score : Polyphen2 score based on HumVar, i.e. hvar_prob. The score ranges from 0 to 1, and the corresponding prediction is 'probably damaging' if it is in [0.909,1], 'possibly damaging' if it is in [0.447,0.908], 'benign' if it is in [0,0.446]. Score cutoff for binary classification is 0.5, i.e. the prediction is 'neutral' if the score is smaller than 0.5 and 'deleterious' if the score is larger than 0.5. Multiple entries separated by
1218- //NF: nsdb Polyphen2_HVAR_pred: Polyphen2 prediction based on HumVar, 'D' ('probably damaging'),'P' ('possibly damaging') and 'B' ('benign'). Multiple entries separated by
1219- if (vc .getAttribute ("NF " ) != null && !"." .equals (vc .getAttribute ("NF " )))
1230+ //Polyphen2_HVAR_S : Polyphen2 score based on HumVar, i.e. hvar_prob. The score ranges from 0 to 1, and the corresponding prediction is 'probably damaging' if it is in [0.909,1], 'possibly damaging' if it is in [0.447,0.908], 'benign' if it is in [0,0.446]. Score cutoff for binary classification is 0.5, i.e. the prediction is 'neutral' if the score is smaller than 0.5 and 'deleterious' if the score is larger than 0.5. Multiple entries separated by
1231+ //Polyphen2_HVAR_pred: Polyphen2 prediction based on HumVar, 'D' ('probably damaging'),'P' ('possibly damaging') and 'B' ('benign'). Multiple entries separated by
1232+ if (vc .getAttribute ("Polyphen2_HVAR_pred " ) != null && !"." .equals (vc .getAttribute ("Polyphen2_HVAR_pred " )))
12201233 {
1221- Set <String > polyphenPredictions = new HashSet <>(vc .getAttributeAsStringList ("NF" , null ));
1222- polyphenPredictions .remove ("B" );
1223- polyphenPredictions .remove ("P" );
1234+ List <String > polyphenPredictions = vc .getAttributeAsStringList ("Polyphen2_HVAR_pred" , null );
1235+ List <Double > polyphenScores = vc .getAttributeAsDoubleList ("Polyphen2_HVAR_S" , 0.0 );
1236+
1237+ if (polyphenPredictions .size () != vc .getAlternateAlleles ().size ())
1238+ {
1239+ throw new IllegalStateException ("Polyphen2_HVAR_pred and alt alleles were not the same length: " + vc .toStringWithoutGenotypes ());
1240+ }
1241+
1242+ if (polyphenScores .size () != vc .getAlternateAlleles ().size ())
1243+ {
1244+ throw new IllegalStateException ("Polyphen2_HVAR_S and alt alleles were not the same length: " + vc .toStringWithoutGenotypes ());
1245+ }
12241246
1225- String description = null ;
1226- if (! polyphenPredictions . isEmpty ())
1247+ int alleleIdx = - 1 ;
1248+ for ( Allele alt : vc . getAlternateAlleles ())
12271249 {
1250+ alleleIdx ++;
1251+
1252+ String prediction = polyphenPredictions .get (alleleIdx );
1253+ if (StringUtils .isEmpty (prediction ) || "B" .equals (prediction ) || "P" .equals (prediction ) || "." .equals (prediction ))
1254+ {
1255+ continue ;
1256+ }
1257+
1258+ String description = null ;
12281259 try
12291260 {
1230- Double maxScore = Collections .max (vc .getAttributeAsDoubleList ("NE" , 0.0 ));
1231- description = StringUtils .join (new String []{
1232- "Score: " + maxScore
1233- }, "," );
1261+ Double maxScore = polyphenScores .get (alleleIdx );
1262+ if (maxScore == 0.0 )
1263+ {
1264+ ctx .getLogger ().error ("Suspicious values for Polyphen2_HVAR_S: " + maxScore + ", at position: " + vc .toStringWithoutGenotypes ());
1265+ }
1266+
1267+ description = "Score: " + maxScore ;
12341268 }
12351269 catch (NumberFormatException e )
12361270 {
1237- ctx .getLogger ().warn ("Unable to parse NE attribute decimal (" + vc .getAttribute ("NE " ) + ") for variant at position: " + vc .toStringWithoutGenotypes ());
1271+ ctx .getLogger ().error ("Unable to parse Polyphen2_HVAR_S attribute decimal (" + vc .getAttribute ("Polyphen2_HVAR_S " ) + ") for variant at position: " + vc .toStringWithoutGenotypes ());
12381272 }
12391273
1240- maybeWriteVariantLine (queuedLines , vc , null , "Polyphen2" , "Prediction: " + StringUtils . join ( polyphenPredictions , "," ), description , overlappingGenes , omims , omimds , ctx .getLogger (), null );
1274+ maybeWriteVariantLine (queuedLines , vc , alt . getBaseString () , "Polyphen2" , "Prediction: " + prediction , description , overlappingGenes , omimIds , omimPhenotypes , ctx .getLogger (), null );
12411275 }
12421276 }
12431277
12441278 for (List <String > line : queuedLines )
12451279 {
1246- writer .writeNext (line .toArray (new String [line . size () ]));
1280+ writer .writeNext (line .toArray (new String [0 ]));
12471281 }
12481282 }
12491283 }
@@ -1264,17 +1298,17 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
12641298 }
12651299 }
12661300
1267- public Collection <String > parseRawOmimd (VariantContext vc , Logger log )
1301+ public Collection <String > parseRawOmimPheno (VariantContext vc , Logger log )
12681302 {
12691303 //NOTE: because this field can have internal commas, this can be parsed incorrectly, so ignore this and re-join
12701304 String rawVal ;
1271- if (vc .getAttribute ("OMIMD " ) instanceof Collection )
1305+ if (vc .getAttribute ("OMIM_PHENO " ) instanceof Collection )
12721306 {
1273- rawVal = StringUtils .join (vc .getAttributeAsStringList ("OMIMD " , null ), "," );
1307+ rawVal = StringUtils .join (vc .getAttributeAsStringList ("OMIM_PHENO " , null ), "," );
12741308 }
12751309 else
12761310 {
1277- rawVal = vc .getAttributeAsString ("OMIMD " , null );
1311+ rawVal = vc .getAttributeAsString ("OMIM_PHENO " , null );
12781312 }
12791313
12801314 if (!StringUtils .isEmpty (rawVal ) && ! "." .equals (rawVal ))
@@ -1331,7 +1365,7 @@ private void generateSummaries(JobContext ctx, File vcf, ReferenceGenome genome,
13311365 generateSummary (ctx , variantsToTable , summaryTable , summaryTableByField , totalVariants , totalPrivateVariants , totalSubjects , typeCounts );
13321366 }
13331367
1334- private static final List <String > allowedFields = Arrays .asList ("CHROM" , "ANN" , "CLN_SIG" , "GRASP_PH " , "ENCTFBS_TF " , "FE" , "NF" , "ENCSEG_NM " );
1368+ private static final List <String > allowedFields = Arrays .asList ("CHROM" , "ANN" , "CLN_SIG" , "FANTOM5_TFBS " , "FANTOM_ENHANCER " , "Polyphen2_HVAR_pred " );
13351369
13361370 private class FieldTracker
13371371 {
@@ -1413,30 +1447,18 @@ else if ("CLN_SIG".equals(fieldName))
14131447 addForValue ("AnnotationSummary" , "ClinVar Overlap" );
14141448 }
14151449 }
1416- else if ("GRASP_PH" .equals (fieldName ))
1417- {
1418- addForValue ("AnnotationSummary" , "GWAS Associations (GRASP)" );
1419- }
1420- else if ("FE" .equals (fieldName ) && "Y" .equals (val ))
1450+ else if ("FANTOM_ENHANCER" .equals (fieldName ) && "Y" .equals (val ))
14211451 {
14221452 addForValue ("AnnotationSummary" , "Enhancer Region (FANTOM5)" );
14231453 }
1424- else if ("ENCTFBS_TF " .equals (fieldName ))
1454+ else if ("FANTOM5_TFBS " .equals (fieldName ))
14251455 {
1426- addForValue ("AnnotationSummary" , "Transcription Factor Binding (ENCODE )" );
1456+ addForValue ("AnnotationSummary" , "Transcription Factor Binding (FANTOM5 )" );
14271457 }
1428- else if ("NF " .equals (fieldName ) && val .contains ("D" ))
1458+ else if ("Polyphen2_HVAR_pred " .equals (fieldName ) && val .contains ("D" ))
14291459 {
14301460 addForValue ("AnnotationSummary" , "Damaging (Polyphen2)" );
14311461 }
1432- else if ("ENCSEG_NM" .equals (fieldName ))
1433- {
1434- List <String > values = Arrays .asList (val .split ("," ));
1435- if (values .contains ("E" ))
1436- {
1437- addForValue ("AnnotationSummary" , "Predicted Enhancer (ENCODE)" );
1438- }
1439- }
14401462 else if ("CHROM" .equals (fieldName ))
14411463 {
14421464 addForValue ("PerChromosome" , val );
@@ -1586,7 +1608,7 @@ private void generateSummary(JobContext ctx, File variantsToTable, File output,
15861608 }
15871609 }
15881610
1589- private void maybeWriteVariantLine (Set <List <String >> queuedLines , VariantContext vc , @ Nullable String allele , String source , String reason , String description , Collection <String > overlappingGenes , Collection <String > omims , Collection <String > omimds , Logger log , String identifier )
1611+ private void maybeWriteVariantLine (Set <List <String >> queuedLines , VariantContext vc , @ Nullable String allele , String source , String reason , String description , Collection <String > overlappingGenes , Collection <String > omimIds , Collection <String > omimPhenotypes , Logger log , String identifier )
15901612 {
15911613 if (allele == null )
15921614 {
@@ -1635,7 +1657,7 @@ else if (af != null)
16351657
16361658 Object cadd = vc .getAttribute ("CADD_PH" );
16371659
1638- queuedLines .add (Arrays .asList (vc .getContig (), String .valueOf (vc .getStart ()), vc .getReference ().getDisplayString (), allele , source , reason , (description == null ? "" : description ), StringUtils .join (overlappingGenes , ";" ), StringUtils .join (omims , ";" ), StringUtils .join (omimds , ";" ), af == null ? "" : af .toString (), identifier == null ? "" : identifier , cadd == null ? "" : cadd .toString ()));
1660+ queuedLines .add (Arrays .asList (vc .getContig (), String .valueOf (vc .getStart ()), vc .getReference ().getDisplayString (), allele , source , reason , (description == null ? "" : description ), StringUtils .join (overlappingGenes , ";" ), StringUtils .join (omimIds , ";" ), StringUtils .join (omimPhenotypes , ";" ), af == null ? "" : af .toString (), identifier == null ? "" : identifier , cadd == null ? "" : cadd .toString ()));
16391661 }
16401662
16411663 private boolean indexExists (File vcf )
@@ -1658,7 +1680,7 @@ public Set<String> parseOmim(String input, Logger log)
16581680 {
16591681 if (StringUtils .isEmpty (token ))
16601682 {
1661- log .warn ("OMIMD was empty: " + input + ", " + StringUtils .join (tokens , ";" ));
1683+ log .warn ("OMIM_PHENO was empty: " + input + ", " + StringUtils .join (tokens , ";" ));
16621684 continue ;
16631685 }
16641686
@@ -1830,13 +1852,6 @@ public void testOMIMParse() throws Exception
18301852 {
18311853 Set <String > ret = pr .parseOmim (pair .getLeft (), _log );
18321854 Assert .assertEquals (pair .getRight (), ret );
1833-
1834- //NOTE: since this requires an API key and queries OMIM not suitable to general testing, but this can be uncommented for local dev
1835- //for (String term : ret)
1836- //{
1837- // String updated = pr.updateOmimD(term, ContainerManager.getRoot(), _log);
1838- // Assert.assertNotNull(updated);
1839- //}
18401855 }
18411856 }
18421857 }
0 commit comments