Skip to content

Commit 12da49b

Browse files
committed
Refactor mGapReleaseGenerator to match new annotations
1 parent 6490e55 commit 12da49b

1 file changed

Lines changed: 73 additions & 58 deletions

File tree

mGAP/src/org/labkey/mgap/pipeline/mGapReleaseGenerator.java

Lines changed: 73 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
import org.apache.logging.log4j.LogManager;
1717
import org.apache.logging.log4j.Logger;
1818
import org.jetbrains.annotations.Nullable;
19-
import org.json.JSONArray;
2019
import org.json.JSONObject;
2120
import org.junit.Assert;
2221
import org.junit.Test;
@@ -933,7 +932,7 @@ private void checkVcfAnnotationsAndSamples(File vcfInput, boolean skipAnnotation
933932

934933
if (!skipAnnotationChecks)
935934
{
936-
for (String info : Arrays.asList("CADD_PH", "OMIMN", "CLN_ALLELE", "AF", "mGAPV"))
935+
for (String info : Arrays.asList("CADD_PH", "OMIM_PHENO", "CLN_ALLELE", "AF", "mGAPV"))
937936
{
938937
if (!header.hasInfoLine(info))
939938
{
@@ -1080,16 +1079,16 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
10801079
}
10811080
}
10821081

1083-
Set<String> omims = new LinkedHashSet<>();
1084-
Set<String> omimds = new LinkedHashSet<>();
1085-
if (vc.getAttribute("OMIMN") != null)
1082+
Set<String> omimIds = new LinkedHashSet<>();
1083+
Set<String> omimPhenotypes = new LinkedHashSet<>();
1084+
if (vc.getAttribute("MIMNUMBER") != null)
10861085
{
1087-
omims.add(vc.getAttributeAsString("OMIMN", null));
1086+
omimIds.add(vc.getAttributeAsString("MIMNUMBER", null));
10881087
}
10891088

1090-
if (vc.getAttribute("OMIMD") != null)
1089+
if (vc.getAttribute("OMIM_PHENO") != null)
10911090
{
1092-
omimds.addAll(parseRawOmimd(vc, ctx.getLogger()));
1091+
omimPhenotypes.addAll(parseRawOmimPheno(vc, ctx.getLogger()));
10931092
}
10941093

10951094
Set<String> overlappingGenes = new TreeSet<>();
@@ -1131,6 +1130,11 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
11311130
geneName = translator.getGeneMap().get(geneName).get("gene_name");
11321131
}
11331132

1133+
if (tokens.length > 7 && !"protein_coding".equals(tokens[7])) {
1134+
ctx.getLogger().info("skipping non protein_coding ANN: " + ann);
1135+
continue;
1136+
}
1137+
11341138
overlappingGenes.add(geneName);
11351139
}
11361140
}
@@ -1162,7 +1166,7 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
11621166
String overlappingGenesJoin = StringUtils.join(overlappingGenes, ",");
11631167
if (!overlappingGenesReported.contains(overlappingGenesJoin))
11641168
{
1165-
maybeWriteVariantLine(queuedLines, vc, tokens[0], "SNPEff", "Predicted High Impact", description, overlappingGenes, omims, omimds, ctx.getLogger(), null);
1169+
maybeWriteVariantLine(queuedLines, vc, tokens[0], "SNPEff", "Predicted High Impact", description, overlappingGenes, omimIds, omimPhenotypes, ctx.getLogger(), null);
11661170

11671171
// NOTE: a given site could have multiple overlapping ORFs (usually different isoforms), so if we hit one allow this to tag that site and skip the remaining.
11681172
overlappingGenesReported.add(overlappingGenesJoin);
@@ -1173,14 +1177,24 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
11731177

11741178
if (vc.getAttribute("CLN_SIG") != null)
11751179
{
1176-
List<String> clnAlleles = vc.getAttributeAsStringList("CLN_ALLELE", "");
11771180
List<String> clnSigs = vc.getAttributeAsStringList("CLN_SIG", "");
1181+
if (clnSigs.size() != vc.getAlternateAlleles().size())
1182+
{
1183+
throw new IllegalStateException("CLN_SIG and alt alleles were not the same length: " + vc.toStringWithoutGenotypes());
1184+
}
1185+
11781186
List<String> clnDisease = vc.getAttributeAsStringList("CLN_DN", "");
11791187
List<String> clnAlleleIds = vc.getAttributeAsStringList("CLN_ALLELEID", "");
11801188
int i = -1;
11811189
for (String sigList : clnSigs)
11821190
{
11831191
i++;
1192+
if (sigList.isEmpty())
1193+
{
1194+
continue;
1195+
}
1196+
1197+
Allele altAllele = vc.getAlternateAllele(i);
11841198

11851199
String[] sigSplit = sigList.split("\\|");
11861200
List<String> diseaseSplit = Arrays.asList(clnDisease.get(i).split("\\|"));
@@ -1196,8 +1210,7 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
11961210

11971211
try
11981212
{
1199-
String allele = clnAlleles.get(i);
1200-
maybeWriteVariantLine(queuedLines, vc, allele, "ClinVar", diseaseSplit.get(j), description, overlappingGenes, omims, omimds, ctx.getLogger(), "ClinVar:" + clnAlleleIds.get(i));
1213+
maybeWriteVariantLine(queuedLines, vc, altAllele.getBaseString(), "ClinVar", diseaseSplit.get(j), description, overlappingGenes, omimIds, omimPhenotypes, ctx.getLogger(), "ClinVar:" + clnAlleleIds.get(i));
12011214

12021215
}
12031216
catch (IndexOutOfBoundsException e)
@@ -1214,36 +1227,57 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
12141227
}
12151228
}
12161229

1217-
//NE: nsdb Polyphen2_HVAR_score: Polyphen2 score based on HumVar, i.e. hvar_prob. The score ranges from 0 to 1, and the corresponding prediction is 'probably damaging' if it is in [0.909,1], 'possibly damaging' if it is in [0.447,0.908], 'benign' if it is in [0,0.446]. Score cutoff for binary classification is 0.5, i.e. the prediction is 'neutral' if the score is smaller than 0.5 and 'deleterious' if the score is larger than 0.5. Multiple entries separated by
1218-
//NF: nsdb Polyphen2_HVAR_pred: Polyphen2 prediction based on HumVar, 'D' ('probably damaging'),'P' ('possibly damaging') and 'B' ('benign'). Multiple entries separated by
1219-
if (vc.getAttribute("NF") != null && !".".equals(vc.getAttribute("NF")))
1230+
//Polyphen2_HVAR_S: Polyphen2 score based on HumVar, i.e. hvar_prob. The score ranges from 0 to 1, and the corresponding prediction is 'probably damaging' if it is in [0.909,1], 'possibly damaging' if it is in [0.447,0.908], 'benign' if it is in [0,0.446]. Score cutoff for binary classification is 0.5, i.e. the prediction is 'neutral' if the score is smaller than 0.5 and 'deleterious' if the score is larger than 0.5. Multiple entries separated by
1231+
//Polyphen2_HVAR_pred: Polyphen2 prediction based on HumVar, 'D' ('probably damaging'),'P' ('possibly damaging') and 'B' ('benign'). Multiple entries separated by
1232+
if (vc.getAttribute("Polyphen2_HVAR_pred") != null && !".".equals(vc.getAttribute("Polyphen2_HVAR_pred")))
12201233
{
1221-
Set<String> polyphenPredictions = new HashSet<>(vc.getAttributeAsStringList("NF", null));
1222-
polyphenPredictions.remove("B");
1223-
polyphenPredictions.remove("P");
1234+
List<String> polyphenPredictions = vc.getAttributeAsStringList("Polyphen2_HVAR_pred", null);
1235+
List<Double> polyphenScores = vc.getAttributeAsDoubleList("Polyphen2_HVAR_S", 0.0);
1236+
1237+
if (polyphenPredictions.size() != vc.getAlternateAlleles().size())
1238+
{
1239+
throw new IllegalStateException("Polyphen2_HVAR_pred and alt alleles were not the same length: " + vc.toStringWithoutGenotypes());
1240+
}
1241+
1242+
if (polyphenScores.size() != vc.getAlternateAlleles().size())
1243+
{
1244+
throw new IllegalStateException("Polyphen2_HVAR_S and alt alleles were not the same length: " + vc.toStringWithoutGenotypes());
1245+
}
12241246

1225-
String description = null;
1226-
if (!polyphenPredictions.isEmpty())
1247+
int alleleIdx = -1;
1248+
for (Allele alt : vc.getAlternateAlleles())
12271249
{
1250+
alleleIdx++;
1251+
1252+
String prediction = polyphenPredictions.get(alleleIdx);
1253+
if (StringUtils.isEmpty(prediction) || "B".equals(prediction) || "P".equals(prediction) || ".".equals(prediction))
1254+
{
1255+
continue;
1256+
}
1257+
1258+
String description = null;
12281259
try
12291260
{
1230-
Double maxScore = Collections.max(vc.getAttributeAsDoubleList("NE", 0.0));
1231-
description = StringUtils.join(new String[]{
1232-
"Score: " + maxScore
1233-
}, ",");
1261+
Double maxScore = polyphenScores.get(alleleIdx);
1262+
if (maxScore == 0.0)
1263+
{
1264+
ctx.getLogger().error("Suspicious values for Polyphen2_HVAR_S: " + maxScore + ", at position: " + vc.toStringWithoutGenotypes());
1265+
}
1266+
1267+
description = "Score: " + maxScore;
12341268
}
12351269
catch (NumberFormatException e)
12361270
{
1237-
ctx.getLogger().warn("Unable to parse NE attribute decimal (" + vc.getAttribute("NE") + ") for variant at position: " + vc.toStringWithoutGenotypes());
1271+
ctx.getLogger().error("Unable to parse Polyphen2_HVAR_S attribute decimal (" + vc.getAttribute("Polyphen2_HVAR_S") + ") for variant at position: " + vc.toStringWithoutGenotypes());
12381272
}
12391273

1240-
maybeWriteVariantLine(queuedLines, vc, null, "Polyphen2", "Prediction: " + StringUtils.join(polyphenPredictions, ","), description, overlappingGenes, omims, omimds, ctx.getLogger(), null);
1274+
maybeWriteVariantLine(queuedLines, vc, alt.getBaseString(), "Polyphen2", "Prediction: " + prediction, description, overlappingGenes, omimIds, omimPhenotypes, ctx.getLogger(), null);
12411275
}
12421276
}
12431277

12441278
for (List<String> line : queuedLines)
12451279
{
1246-
writer.writeNext(line.toArray(new String[line.size()]));
1280+
writer.writeNext(line.toArray(new String[0]));
12471281
}
12481282
}
12491283
}
@@ -1264,17 +1298,17 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
12641298
}
12651299
}
12661300

1267-
public Collection<String> parseRawOmimd(VariantContext vc, Logger log)
1301+
public Collection<String> parseRawOmimPheno(VariantContext vc, Logger log)
12681302
{
12691303
//NOTE: because this field can have internal commas, this can be parsed incorrectly, so ignore this and re-join
12701304
String rawVal;
1271-
if (vc.getAttribute("OMIMD") instanceof Collection)
1305+
if (vc.getAttribute("OMIM_PHENO") instanceof Collection)
12721306
{
1273-
rawVal = StringUtils.join(vc.getAttributeAsStringList("OMIMD", null), ",");
1307+
rawVal = StringUtils.join(vc.getAttributeAsStringList("OMIM_PHENO", null), ",");
12741308
}
12751309
else
12761310
{
1277-
rawVal = vc.getAttributeAsString("OMIMD", null);
1311+
rawVal = vc.getAttributeAsString("OMIM_PHENO", null);
12781312
}
12791313

12801314
if (!StringUtils.isEmpty(rawVal) && ! ".".equals(rawVal))
@@ -1331,7 +1365,7 @@ private void generateSummaries(JobContext ctx, File vcf, ReferenceGenome genome,
13311365
generateSummary(ctx, variantsToTable, summaryTable, summaryTableByField, totalVariants, totalPrivateVariants, totalSubjects, typeCounts);
13321366
}
13331367

1334-
private static final List<String> allowedFields = Arrays.asList("CHROM", "ANN", "CLN_SIG", "GRASP_PH", "ENCTFBS_TF", "FE", "NF", "ENCSEG_NM");
1368+
private static final List<String> allowedFields = Arrays.asList("CHROM", "ANN", "CLN_SIG", "FANTOM5_TFBS", "FANTOM_ENHANCER", "Polyphen2_HVAR_pred");
13351369

13361370
private class FieldTracker
13371371
{
@@ -1413,30 +1447,18 @@ else if ("CLN_SIG".equals(fieldName))
14131447
addForValue("AnnotationSummary", "ClinVar Overlap");
14141448
}
14151449
}
1416-
else if ("GRASP_PH".equals(fieldName))
1417-
{
1418-
addForValue("AnnotationSummary", "GWAS Associations (GRASP)");
1419-
}
1420-
else if ("FE".equals(fieldName) && "Y".equals(val))
1450+
else if ("FANTOM_ENHANCER".equals(fieldName) && "Y".equals(val))
14211451
{
14221452
addForValue("AnnotationSummary", "Enhancer Region (FANTOM5)");
14231453
}
1424-
else if ("ENCTFBS_TF".equals(fieldName))
1454+
else if ("FANTOM5_TFBS".equals(fieldName))
14251455
{
1426-
addForValue("AnnotationSummary", "Transcription Factor Binding (ENCODE)");
1456+
addForValue("AnnotationSummary", "Transcription Factor Binding (FANTOM5)");
14271457
}
1428-
else if ("NF".equals(fieldName) && val.contains("D"))
1458+
else if ("Polyphen2_HVAR_pred".equals(fieldName) && val.contains("D"))
14291459
{
14301460
addForValue("AnnotationSummary", "Damaging (Polyphen2)");
14311461
}
1432-
else if ("ENCSEG_NM".equals(fieldName))
1433-
{
1434-
List<String> values = Arrays.asList(val.split(","));
1435-
if (values.contains("E"))
1436-
{
1437-
addForValue("AnnotationSummary", "Predicted Enhancer (ENCODE)");
1438-
}
1439-
}
14401462
else if ("CHROM".equals(fieldName))
14411463
{
14421464
addForValue("PerChromosome", val);
@@ -1586,7 +1608,7 @@ private void generateSummary(JobContext ctx, File variantsToTable, File output,
15861608
}
15871609
}
15881610

1589-
private void maybeWriteVariantLine(Set<List<String>> queuedLines, VariantContext vc, @Nullable String allele, String source, String reason, String description, Collection<String> overlappingGenes, Collection<String> omims, Collection<String> omimds, Logger log, String identifier)
1611+
private void maybeWriteVariantLine(Set<List<String>> queuedLines, VariantContext vc, @Nullable String allele, String source, String reason, String description, Collection<String> overlappingGenes, Collection<String> omimIds, Collection<String> omimPhenotypes, Logger log, String identifier)
15901612
{
15911613
if (allele == null)
15921614
{
@@ -1635,7 +1657,7 @@ else if (af != null)
16351657

16361658
Object cadd = vc.getAttribute("CADD_PH");
16371659

1638-
queuedLines.add(Arrays.asList(vc.getContig(), String.valueOf(vc.getStart()), vc.getReference().getDisplayString(), allele, source, reason, (description == null ? "" : description), StringUtils.join(overlappingGenes, ";"), StringUtils.join(omims, ";"), StringUtils.join(omimds, ";"), af == null ? "" : af.toString(), identifier == null ? "" : identifier, cadd == null ? "" : cadd.toString()));
1660+
queuedLines.add(Arrays.asList(vc.getContig(), String.valueOf(vc.getStart()), vc.getReference().getDisplayString(), allele, source, reason, (description == null ? "" : description), StringUtils.join(overlappingGenes, ";"), StringUtils.join(omimIds, ";"), StringUtils.join(omimPhenotypes, ";"), af == null ? "" : af.toString(), identifier == null ? "" : identifier, cadd == null ? "" : cadd.toString()));
16391661
}
16401662

16411663
private boolean indexExists(File vcf)
@@ -1658,7 +1680,7 @@ public Set<String> parseOmim(String input, Logger log)
16581680
{
16591681
if (StringUtils.isEmpty(token))
16601682
{
1661-
log.warn("OMIMD was empty: " + input + ", " + StringUtils.join(tokens, ";"));
1683+
log.warn("OMIM_PHENO was empty: " + input + ", " + StringUtils.join(tokens, ";"));
16621684
continue;
16631685
}
16641686

@@ -1830,13 +1852,6 @@ public void testOMIMParse() throws Exception
18301852
{
18311853
Set<String> ret = pr.parseOmim(pair.getLeft(), _log);
18321854
Assert.assertEquals(pair.getRight(), ret);
1833-
1834-
//NOTE: since this requires an API key and queries OMIM not suitable to general testing, but this can be uncommented for local dev
1835-
//for (String term : ret)
1836-
//{
1837-
// String updated = pr.updateOmimD(term, ContainerManager.getRoot(), _log);
1838-
// Assert.assertNotNull(updated);
1839-
//}
18401855
}
18411856
}
18421857
}

0 commit comments

Comments
 (0)