Skip to content

Commit 6333ae2

Browse files
committed
Add support for additional annotation sources
1 parent 3967c49 commit 6333ae2

3 files changed

Lines changed: 189 additions & 40 deletions

File tree

mGAP/src/org/labkey/mgap/pipeline/AnnotationStep.java

Lines changed: 109 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import org.labkey.api.exp.api.ExperimentService;
1717
import org.labkey.api.pipeline.PipelineJob;
1818
import org.labkey.api.pipeline.PipelineJobException;
19+
import org.labkey.api.pipeline.PipelineJobService;
1920
import org.labkey.api.query.FieldKey;
2021
import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
2122
import org.labkey.api.sequenceanalysis.SequenceOutputFile;
@@ -84,6 +85,14 @@ public Provider()
8485
put("valueField", "rowid");
8586
put("allowBlank", false);
8687
}}, null),
88+
ToolParameterDescriptor.create("useCassandra", "Use Cassandra", "If checked, Cassandra will be run.", "checkbox", new JSONObject()
89+
{{
90+
put("checked", true);
91+
}}, true),
92+
ToolParameterDescriptor.create("useFuncotator", "Use Funcotator", "If checked, Extended Funcotator will be run.", "checkbox", new JSONObject()
93+
{{
94+
put("checked", true);
95+
}}, true),
8796
ToolParameterDescriptor.create("dropFiltered", "Drop Filtered Sites", "If checked, filtered sites will be discarded, which can substantially improve speed.", "checkbox", new JSONObject()
8897
{{
8998
put("checked", true);
@@ -147,6 +156,13 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno
147156
throw new PipelineJobException("Unable to find file: " + dbnsfpFile.getPath());
148157
}
149158

159+
boolean useFuncotator = getProvider().getParameterByName("useFuncotator").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class, false);
160+
File funcotatorSourceDir = new File(PipelineJobService.get().getAppProperties().getToolsDirectory(), "funcotatorDataSource");
161+
if (useFuncotator && !funcotatorSourceDir.exists())
162+
{
163+
throw new PipelineJobException("Unable to find file: " + funcotatorSourceDir.getPath());
164+
}
165+
150166
getPipelineCtx().getLogger().info("processing file: " + inputVCF.getName());
151167

152168
ReferenceGenome originalGenome = getPipelineCtx().getSequenceSupport().getCachedGenome(genome.getGenomeId());
@@ -310,13 +326,29 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno
310326
output.addIntermediateFile(clinvarAnnotated);
311327
output.addIntermediateFile(new File(clinvarAnnotated.getPath() + ".tbi"));
312328

329+
//backport ClinVar
330+
getPipelineCtx().getLogger().info("backport ClinVar 2.0 to source genome");
331+
File clinvarAnnotatedBackport = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(clinvarAnnotated.getName()) + ".bp.vcf.gz");
332+
if (forceRecreate || !indexExists(clinvarAnnotatedBackport ))
333+
{
334+
BackportLiftedVcfRunner bpRunner = new BackportLiftedVcfRunner(getPipelineCtx().getLogger());
335+
bpRunner.execute(clinvarAnnotated, originalGenome.getWorkingFastaFile(), grch37Genome.getWorkingFastaFile(), clinvarAnnotatedBackport);
336+
}
337+
else
338+
{
339+
getPipelineCtx().getLogger().info("resuming with existing file: " + clinvarAnnotatedBackport.getPath());
340+
}
341+
output.addOutput(clinvarAnnotatedBackport, "VCF Annotated With Clinvar, Backported");
342+
output.addIntermediateFile(clinvarAnnotatedBackport);
343+
output.addIntermediateFile(new File(clinvarAnnotatedBackport.getPath() + ".tbi"));
344+
313345
//annotate with SnpSift
314-
getPipelineCtx().getLogger().info("annotating with SnpSift");
346+
getPipelineCtx().getLogger().info("annotating with SnpSift/dbnsfp");
315347
File snpSiftAnnotated = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(liftedToGRCh37.getName()) + ".snpSift.vcf.gz");
316348
if (forceRecreate || !indexExists(snpSiftAnnotated))
317349
{
318350
SnpSiftWrapper ssRunner = new SnpSiftWrapper(getPipelineCtx().getLogger());
319-
ssRunner.runSnpSift(dbnsfpFile, clinvarAnnotated, snpSiftAnnotated);
351+
ssRunner.runSnpSift(dbnsfpFile, liftedToGRCh37, snpSiftAnnotated);
320352
}
321353
else
322354
{
@@ -326,48 +358,46 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno
326358
output.addIntermediateFile(snpSiftAnnotated);
327359
output.addIntermediateFile(new File(snpSiftAnnotated.getPath() + ".tbi"));
328360

329-
//annotate with cassandra
330-
getPipelineCtx().getLogger().info("annotating with Cassandra");
331-
String basename = SequenceAnalysisService.get().getUnzippedBaseName(liftedToGRCh37.getName()) + ".cassandra";
332-
File cassandraAnnotated = new File(outputDirectory, basename + ".vcf.gz");
333-
if (forceRecreate || !indexExists(cassandraAnnotated))
361+
//backport SnpSift
362+
getPipelineCtx().getLogger().info("Backport SnpSift to source genome");
363+
File snpSiftAnnotatedBackport = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(snpSiftAnnotated.getName()) + ".bp.vcf.gz");
364+
if (forceRecreate || !indexExists(snpSiftAnnotatedBackport))
334365
{
335-
//we can assume splitting happened upstream, so run over the full VCF
336-
cassandraAnnotated = runCassandra(liftedToGRCh37, cassandraAnnotated, output, forceRecreate);
366+
BackportLiftedVcfRunner bpRunner = new BackportLiftedVcfRunner(getPipelineCtx().getLogger());
367+
bpRunner.execute(snpSiftAnnotated, originalGenome.getWorkingFastaFile(), grch37Genome.getWorkingFastaFile(), snpSiftAnnotatedBackport);
337368
}
338369
else
339370
{
340-
getPipelineCtx().getLogger().info("resuming with existing file: " + cassandraAnnotated.getPath());
371+
getPipelineCtx().getLogger().info("resuming with existing file: " + snpSiftAnnotatedBackport.getPath());
341372
}
373+
output.addOutput(snpSiftAnnotatedBackport, "VCF Annotated With SnpSift, Backported");
374+
output.addIntermediateFile(snpSiftAnnotatedBackport);
375+
output.addIntermediateFile(new File(snpSiftAnnotatedBackport.getPath() + ".tbi"));
342376

343-
if (cassandraAnnotated != null)
377+
//annotate with cassandra
378+
File cassandraAnnotatedBackport = null;
379+
if (getProvider().getParameterByName("useCassandra").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class, false))
344380
{
381+
getPipelineCtx().getLogger().info("annotating with Cassandra");
382+
String basename = SequenceAnalysisService.get().getUnzippedBaseName(liftedToGRCh37.getName()) + ".cassandra";
383+
File cassandraAnnotated = new File(outputDirectory, basename + ".vcf.gz");
384+
if (forceRecreate || !indexExists(cassandraAnnotated))
385+
{
386+
//we can assume splitting happened upstream, so run over the full VCF
387+
runCassandra(liftedToGRCh37, cassandraAnnotated, output, forceRecreate);
388+
}
389+
else
390+
{
391+
getPipelineCtx().getLogger().info("resuming with existing file: " + cassandraAnnotated.getPath());
392+
}
393+
345394
output.addOutput(cassandraAnnotated, "VCF Annotated With Cassandra");
346395
output.addIntermediateFile(cassandraAnnotated);
347396
output.addIntermediateFile(new File(cassandraAnnotated.getPath() + ".tbi"));
348-
}
349-
350-
//backport ClinVar
351-
getPipelineCtx().getLogger().info("backport ClinVar 2.0 to source genome");
352-
File clinvarAnnotatedBackport = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(clinvarAnnotated.getName()) + ".bp.vcf.gz");
353-
if (forceRecreate || !indexExists(clinvarAnnotatedBackport ))
354-
{
355-
BackportLiftedVcfRunner bpRunner = new BackportLiftedVcfRunner(getPipelineCtx().getLogger());
356-
bpRunner.execute(clinvarAnnotated, originalGenome.getWorkingFastaFile(), grch37Genome.getWorkingFastaFile(), clinvarAnnotatedBackport);
357-
}
358-
else
359-
{
360-
getPipelineCtx().getLogger().info("resuming with existing file: " + clinvarAnnotatedBackport.getPath());
361-
}
362-
output.addOutput(clinvarAnnotatedBackport, "VCF Annotated With Clinvar, Backported");
363-
output.addIntermediateFile(clinvarAnnotatedBackport);
364-
output.addIntermediateFile(new File(clinvarAnnotatedBackport.getPath() + ".tbi"));
365397

366-
//backport Cassandra
367-
getPipelineCtx().getLogger().info("backport Cassandra to source genome");
368-
File cassandraAnnotatedBackport = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(cassandraAnnotated.getName()) + ".bp.vcf.gz");
369-
if (cassandraAnnotated != null)
370-
{
398+
//backport Cassandra
399+
getPipelineCtx().getLogger().info("backport Cassandra to source genome");
400+
cassandraAnnotatedBackport = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(cassandraAnnotated.getName()) + ".bp.vcf.gz");
371401
if (forceRecreate || !indexExists(cassandraAnnotatedBackport))
372402
{
373403
BackportLiftedVcfRunner bpRunner = new BackportLiftedVcfRunner(getPipelineCtx().getLogger());
@@ -383,8 +413,50 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno
383413
}
384414
else
385415
{
386-
getPipelineCtx().getLogger().info("No cassandra output, will not backport");
387-
cassandraAnnotatedBackport = null;
416+
getPipelineCtx().getLogger().debug("Cassandra will be skipped");
417+
}
418+
419+
//annotate with funcotator
420+
File funcotatorAnnotatedBackport = null;
421+
if (useFuncotator)
422+
{
423+
getPipelineCtx().getLogger().info("annotating with Funcotator");
424+
String basename = SequenceAnalysisService.get().getUnzippedBaseName(liftedToGRCh37.getName()) + ".funcotator";
425+
File funcotatorAnnotated = new File(outputDirectory, basename + ".vcf.gz");
426+
if (forceRecreate || !indexExists(funcotatorAnnotated))
427+
{
428+
//we can assume splitting happened upstream, so run over the full VCF
429+
FuncotatorWrapper fr = new FuncotatorWrapper(getPipelineCtx().getLogger());
430+
fr.runFuncotator(funcotatorSourceDir, liftedToGRCh37, snpSiftAnnotated, genome);
431+
}
432+
else
433+
{
434+
getPipelineCtx().getLogger().info("resuming with existing file: " + funcotatorAnnotated.getPath());
435+
}
436+
437+
output.addOutput(funcotatorAnnotated, "VCF Annotated With Funcotator");
438+
output.addIntermediateFile(funcotatorAnnotated);
439+
output.addIntermediateFile(new File(funcotatorAnnotated.getPath() + ".tbi"));
440+
441+
//backport Funcotator
442+
getPipelineCtx().getLogger().info("backport Funcotator to source genome");
443+
funcotatorAnnotatedBackport = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(funcotatorAnnotated.getName()) + ".bp.vcf.gz");
444+
if (forceRecreate || !indexExists(funcotatorAnnotatedBackport))
445+
{
446+
BackportLiftedVcfRunner bpRunner = new BackportLiftedVcfRunner(getPipelineCtx().getLogger());
447+
bpRunner.execute(funcotatorAnnotated, originalGenome.getWorkingFastaFile(), grch37Genome.getWorkingFastaFile(), funcotatorAnnotatedBackport);
448+
}
449+
else
450+
{
451+
getPipelineCtx().getLogger().info("resuming with existing file: " + funcotatorAnnotatedBackport.getPath());
452+
}
453+
output.addOutput(funcotatorAnnotatedBackport, "VCF Annotated With Funcotator, Backported");
454+
output.addIntermediateFile(funcotatorAnnotatedBackport);
455+
output.addIntermediateFile(new File(funcotatorAnnotatedBackport.getPath() + ".tbi"));
456+
}
457+
else
458+
{
459+
getPipelineCtx().getLogger().debug("Funcotator will be skipped");
388460
}
389461

390462
//multiannotator
@@ -405,7 +477,7 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno
405477
needToSubsetToInterval = false;
406478
}
407479

408-
maRunner.execute(inputVCF, cassandraAnnotatedBackport, clinvarAnnotatedBackport, liftoverRejects, multiAnnotated, options);
480+
maRunner.execute(inputVCF, cassandraAnnotatedBackport, clinvarAnnotatedBackport, liftoverRejects, funcotatorAnnotatedBackport, snpSiftAnnotatedBackport, multiAnnotated, options);
409481
}
410482
else
411483
{
@@ -422,7 +494,7 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno
422494
return output;
423495
}
424496

425-
private File runCassandra(File liftedToGRCh37, File finalOutput, VariantProcessingStepOutputImpl output, boolean forceRecreate) throws PipelineJobException
497+
private void runCassandra(File liftedToGRCh37, File finalOutput, VariantProcessingStepOutputImpl output, boolean forceRecreate) throws PipelineJobException
426498
{
427499
List<String> extraArgs = new ArrayList<>();
428500

@@ -489,8 +561,6 @@ private File runCassandra(File liftedToGRCh37, File finalOutput, VariantProcessi
489561
{
490562
throw new PipelineJobException(e);
491563
}
492-
493-
return finalOutput;
494564
}
495565

496566
protected static boolean indexExists(File vcf)
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
package org.labkey.mgap.pipeline;
2+
3+
import org.apache.logging.log4j.Logger;
4+
import org.labkey.api.pipeline.PipelineJobException;
5+
import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
6+
import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
7+
import org.labkey.api.sequenceanalysis.run.AbstractDiscvrSeqWrapper;
8+
9+
import java.io.File;
10+
import java.io.IOException;
11+
import java.util.ArrayList;
12+
import java.util.List;
13+
14+
/**
15+
* Created by bimber on 8/24/2016.
16+
*/
17+
public class FuncotatorWrapper extends AbstractDiscvrSeqWrapper
18+
{
19+
public FuncotatorWrapper(Logger log)
20+
{
21+
super(log);
22+
}
23+
24+
public void runFuncotator(File dataDir, File input, File output, ReferenceGenome genome) throws PipelineJobException
25+
{
26+
getLogger().info("Annotating VCF with Funcotator");
27+
28+
List<String> params = new ArrayList<>(getBaseArgs());
29+
30+
params.add("-R");
31+
params.add(genome.getWorkingFastaFile().getPath());
32+
33+
params.add("--ref-version");
34+
params.add("hg19");
35+
36+
params.add("--data-sources-path");
37+
params.add(dataDir.getPath());
38+
39+
params.add("--output-file-format");
40+
params.add("VCF");
41+
42+
params.add("-cf");
43+
params.add(new File(dataDir, "fieldConfig.txt").getPath());
44+
45+
params.add("-V");
46+
params.add(input.getPath());
47+
48+
params.add("-O");
49+
params.add(output.getPath());
50+
51+
execute(params);
52+
53+
if (!output.exists())
54+
{
55+
throw new PipelineJobException("output not found: " + output.getName());
56+
}
57+
58+
try
59+
{
60+
SequenceAnalysisService.get().ensureVcfIndex(output, getLogger());
61+
}
62+
catch (IOException e)
63+
{
64+
throw new PipelineJobException(e);
65+
}
66+
}
67+
}

mGAP/src/org/labkey/mgap/pipeline/MultiSourceAnnotatorRunner.java

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ public MultiSourceAnnotatorRunner(Logger log)
1515
super(log);
1616
}
1717

18-
public File execute(File inputVcf, File cassandraVcf, File clinvarVcf, File liftoverRejects, File outputVcf, @Nullable List<String> options) throws PipelineJobException
18+
public File execute(File inputVcf, @Nullable File cassandraVcf, File clinvarVcf, File liftoverRejects, @Nullable File funcotator, @Nullable File snpSift, File outputVcf, @Nullable List<String> options) throws PipelineJobException
1919
{
2020
List<String> args = getBaseArgs("MultiSourceAnnotator");
2121

@@ -34,6 +34,18 @@ public File execute(File inputVcf, File cassandraVcf, File clinvarVcf, File lift
3434
args.add("-lr");
3535
args.add(liftoverRejects.getPath());
3636

37+
if (funcotator != null)
38+
{
39+
args.add("-f");
40+
args.add(funcotator.getPath());
41+
}
42+
43+
if (snpSift != null)
44+
{
45+
args.add("-ss");
46+
args.add(snpSift.getPath());
47+
}
48+
3749
args.add("-O");
3850
args.add(outputVcf.getPath());
3951

0 commit comments

Comments
 (0)