Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding new argument to control variant output interval filtering. #6388

Merged
merged 9 commits into from
Sep 18, 2024
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ private StandardArgumentDefinitions(){}
public static final String INVALIDATE_PREVIOUS_FILTERS_LONG_NAME = "invalidate-previous-filters";
public static final String SORT_ORDER_LONG_NAME = "sort-order";
public static final String FLOW_ORDER_FOR_ANNOTATIONS = "flow-order-for-annotations";

public static final String VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME = "variant-output-filtering";

public static final String INPUT_SHORT_NAME = "I";
public static final String OUTPUT_SHORT_NAME = "O";
Expand Down
42 changes: 32 additions & 10 deletions src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@
import java.util.*;
import java.util.stream.Stream;


import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLinePluginDescriptor;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKAnnotationPluginDescriptor;
Expand Down Expand Up @@ -45,6 +48,7 @@
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.writers.ShardingVCFWriter;
import org.broadinstitute.hellbender.utils.variant.writers.IntervalFilteringVcfWriter;

/**
* Base class for all GATK tools. Tool authors that want to write a "GATK" tool but not use one of
Expand Down Expand Up @@ -417,6 +421,13 @@ public int getDefaultCloudIndexPrefetchBufferSize() {
*/
public String getProgressMeterRecordLabel() { return ProgressMeter.DEFAULT_RECORD_LABEL; }

/**
* @return null for no output filtering of variants to the variant writer. Subclasses may override this to enforce other filtering schemes.
*/
public IntervalFilteringVcfWriter.Mode getVariantFilteringOutputModeIfApplicable(){
return IntervalFilteringVcfWriter.Mode.ANYWHERE;
}

protected List<SimpleInterval> transformTraversalIntervals(final List<SimpleInterval> getIntervals, final SAMSequenceDictionary sequenceDictionary) {
return getIntervals;
}
Expand Down Expand Up @@ -600,7 +611,7 @@ public boolean requiresIntervals() {

/**
* Does this tool want to disable the progress meter? If so, override here to return true
*
*
* @return true if this tools wants to disable progress meter output, otherwise false
*/
public boolean disableProgressMeter() {
Expand Down Expand Up @@ -727,12 +738,16 @@ protected void onStartup() {

initializeIntervals(); // Must be initialized after reference, reads and features, since intervals currently require a sequence dictionary from another data source

if ( seqValidationArguments.performSequenceDictionaryValidation()) {
if (seqValidationArguments.performSequenceDictionaryValidation()) {
validateSequenceDictionaries();
}

checkToolRequirements();

if ((getVariantFilteringOutputModeIfApplicable() != IntervalFilteringVcfWriter.Mode.ANYWHERE ) && userIntervals == null){
throw new CommandLineException.MissingArgument("-L or -XL", "Intervals are required if --" + StandardArgumentDefinitions.VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME + " was specified.");
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If a tool overrides the default here its going to spit this message if the user doesn't provide intervals at all which i think is confusing about this message.... We should probably disambiguate the default and user supplied here...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good point. I'm not sure it's addressed by your changes though?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well pulling this down into variant walker base makes this a little more obnoxious to fix... I'm just going to add a comment clarifying but this is a head scratcher

}

initializeProgressMeter(getProgressMeterRecordLabel());
}

Expand Down Expand Up @@ -911,20 +926,27 @@ public VariantContextWriter createVCFWriter(final Path outPath) {
if (outputSitesOnlyVCFs) {
options.add(Options.DO_NOT_WRITE_GENOTYPES);
}

final VariantContextWriter unfilteredWriter;
if (maxVariantsPerShard > 0) {
return new ShardingVCFWriter(
unfilteredWriter = new ShardingVCFWriter(
outPath,
maxVariantsPerShard,
sequenceDictionary,
createOutputVariantMD5,
options.toArray(new Options[options.size()]));
options.toArray(new Options[0]));
} else {
unfilteredWriter = GATKVariantContextUtils.createVCFWriter(
jamesemery marked this conversation as resolved.
Show resolved Hide resolved
outPath,
sequenceDictionary,
createOutputVariantMD5,
options.toArray(new Options[0]));
}
return GATKVariantContextUtils.createVCFWriter(
outPath,
sequenceDictionary,
createOutputVariantMD5,
options.toArray(new Options[options.size()]));

return getVariantFilteringOutputModeIfApplicable() == IntervalFilteringVcfWriter.Mode.ANYWHERE ?
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not quite sure this rename helps that much, it just adds another layer of indirection. It's then kind of confusing because tools below variantwalker might not be sure which to override.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

changed to getVariantOutputFilteringMode I don't really know what IS better here

unfilteredWriter :
new IntervalFilteringVcfWriter(unfilteredWriter,
intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary()),
getVariantFilteringOutputModeIfApplicable());
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.filters.CountingReadFilter;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.variant.writers.IntervalFilteringVcfWriter;

import java.util.Spliterator;

Expand All @@ -31,6 +33,28 @@ public abstract class VariantWalker extends VariantWalkerBase {
private FeatureDataSource<VariantContext> drivingVariants;
private FeatureInput<VariantContext> drivingVariantsFeatureInput;

@Argument(fullName = StandardArgumentDefinitions.VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm pretty sure all these changes need to go in VariantWalkerBase, not VariantWalker. I think that's why all the tests exploded.

doc = "Restrict the output variants to ones that match the specified intervals according to the specified matching mode.",
optional = true)
@Advanced
public IntervalFilteringVcfWriter.Mode outputVariantIntervalFilteringMode = getDefaultVariantOutputFilterMode();

/**
* @return Default interval filtering mode for variant output. Subclasses may override this to set a different default.
*/
public IntervalFilteringVcfWriter.Mode getDefaultVariantOutputFilterMode(){
return IntervalFilteringVcfWriter.Mode.ANYWHERE;
}

@Override
public IntervalFilteringVcfWriter.Mode getVariantFilteringOutputModeIfApplicable() {
if (outputVariantIntervalFilteringMode != null) {
return outputVariantIntervalFilteringMode;
} else {
return super.getVariantFilteringOutputModeIfApplicable();
jamesemery marked this conversation as resolved.
Show resolved Hide resolved
}
}

@Override
protected SAMSequenceDictionary getSequenceDictionaryForDrivingVariants() { return drivingVariants.getSequenceDictionary(); }

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.broadinstitute.barclay.argparser.*;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLinePluginDescriptor;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKAnnotationPluginDescriptor;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKReadFilterPluginDescriptor;
Expand All @@ -29,11 +33,23 @@
import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.mutect.M2ArgumentCollection;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.GenomeLoc;
import org.broadinstitute.hellbender.utils.GenomeLocParser;
import org.broadinstitute.hellbender.utils.GenomeLocSortedSet;
import org.broadinstitute.hellbender.utils.IntervalMergingRule;
import org.broadinstitute.hellbender.utils.IntervalSetRule;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotation;

import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

/**
Expand Down Expand Up @@ -114,7 +130,7 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
/**
* Import all data between specified intervals. Improves performance using large lists of intervals, as in exome
* sequencing, especially if GVCF data only exists for specified intervals. Use with
* --only-output-calls-starting-in-intervals if input GVCFs contain calls outside the specified intervals.
* --{@value StandardArgumentDefinitions#VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME} if input GVCFs contain calls outside the specified intervals.
*/
@Argument(fullName = GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME,
shortName = GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME,
Expand Down Expand Up @@ -155,16 +171,6 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
@ArgumentCollection
private GenomicsDBArgumentCollection genomicsdbArgs = new GenomicsDBArgumentCollection();

/**
* This option can only be activated if intervals are specified.
*/
@Advanced
@Argument(fullName= ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME,
doc="Restrict variant output to sites that start within provided intervals",
optional=true)
private boolean onlyOutputCallsStartingInIntervals = false;


@Argument(fullName = FORCE_OUTPUT_INTERVALS_NAME,
suppressFileExpansion = true, doc = "sites at which to output genotypes even if non-variant in samples", optional = true)
protected final List<String> forceOutputIntervalStrings = new ArrayList<>();
Expand All @@ -186,9 +192,6 @@ public final class GenotypeGVCFs extends VariantLocusWalker {

private VariantContextWriter vcfWriter;

/** these are used when {@link #onlyOutputCallsStartingInIntervals) is true */
private List<SimpleInterval> intervals;

private OverlapDetector<GenomeLoc> forceOutputIntervals;

private boolean forceOutputIntervalsPresent;
Expand Down Expand Up @@ -269,23 +272,14 @@ public void onTraversalStart() {

final VCFHeader inputVCFHeader = getHeaderForVariants();

if(onlyOutputCallsStartingInIntervals) {
if( !hasUserSuppliedIntervals()) {
throw new CommandLineException.MissingArgument("-L or -XL", "Intervals are required if --" + ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME + " was specified.");
}
}

intervals = hasUserSuppliedIntervals() ? intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary()) :
Collections.emptyList();

final Collection<Annotation> variantAnnotations = makeVariantAnnotations();
final Collection<Annotation> variantAnnotations = makeVariantAnnotations();
final Set<Annotation> annotationsToKeep = getAnnotationsToKeep();
annotationEngine = new VariantAnnotatorEngine(variantAnnotations, dbsnp.dbsnp, Collections.emptyList(), false, keepCombined, annotationsToKeep);

merger = new ReferenceConfidenceVariantContextMerger(annotationEngine, getHeaderForVariants(), somaticInput, false, true);

//methods that cannot be called in engine bc its protected
Set<VCFHeaderLine> defaultToolVCFHeaderLines = getDefaultToolVCFHeaderLines();
final Set<VCFHeaderLine> defaultToolVCFHeaderLines = getDefaultToolVCFHeaderLines();
vcfWriter = createVCFWriter(outputFile);

//create engine object
Expand All @@ -294,7 +288,6 @@ public void onTraversalStart() {

//call initialize method in engine class that creates VCFWriter object and writes a header to it
vcfWriter = gvcfEngine.setupVCFWriter(defaultToolVCFHeaderLines, keepCombined, dbsnp, vcfWriter);

}

private Set<Annotation> getAnnotationsToKeep() {
Expand All @@ -316,9 +309,7 @@ public void apply(final Locatable loc, List<VariantContext> variants, ReadsConte
final VariantContext regenotypedVC = gvcfEngine.callRegion(loc, variants, ref, features, merger, somaticInput, tlodThreshold, afTolerance, forceOutput);

if (regenotypedVC != null) {
final SimpleInterval variantStart = new SimpleInterval(regenotypedVC.getContig(), regenotypedVC.getStart(), regenotypedVC.getStart());
if ((forceOutput || !GATKVariantContextUtils.isSpanningDeletionOnly(regenotypedVC)) &&
(!onlyOutputCallsStartingInIntervals || intervals.stream().anyMatch(interval -> interval.contains (variantStart)))) {
if ((forceOutput || !GATKVariantContextUtils.isSpanningDeletionOnly(regenotypedVC))) {
vcfWriter.add(regenotypedVC);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,15 +108,6 @@ public final class GnarlyGenotyper extends VariantWalker {
@Argument(fullName = "keep-all-sites", doc="Retain low quality and non-variant sites, applying appropriate filters", optional=true)
private boolean keepAllSites = false;

/**
* This option can only be activated if intervals are specified.
*/
@Advanced
@Argument(fullName = GenotypeGVCFs.ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME,
doc="Restrict variant output to sites that start within provided intervals",
optional=true)
private boolean onlyOutputCallsStartingInIntervals = false;

@Argument(fullName = GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME,
shortName = GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME,
doc = "Boolean flag to read in all data in between intervals. Improves performance reading from GenomicsDB " +
Expand Down Expand Up @@ -145,9 +136,6 @@ public final class GnarlyGenotyper extends VariantWalker {
private final RMSMappingQuality mqCalculator = RMSMappingQuality.getInstance();
private final Set<Class<? extends InfoFieldAnnotation>> allAlleleSpecificAnnotations = new HashSet<>();

/** these are used when {@link #onlyOutputCallsStartingInIntervals) is true */
private List<SimpleInterval> intervals;

@Override
public boolean requiresReference() {
return true;
Expand Down Expand Up @@ -182,14 +170,6 @@ protected GenomicsDBOptions getGenomicsDBOptions() {
public void onTraversalStart() {
final VCFHeader inputVCFHeader = getHeaderForVariants();

if(onlyOutputCallsStartingInIntervals) {
if( !intervalArgumentCollection.intervalsSpecified()) {
throw new CommandLineException.MissingArgument("-L or -XL", "Intervals are required if --" + GenotypeGVCFs.ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME + " was specified.");
}
}
intervals = intervalArgumentCollection.intervalsSpecified() ? intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary()) :
Collections.emptyList();

final SampleList samples = new IndexedSampleList(inputVCFHeader.getGenotypeSamples());

setupVCFWriter(inputVCFHeader, samples);
Expand Down Expand Up @@ -260,11 +240,10 @@ private void setupVCFWriter(VCFHeader inputVCFHeader, SampleList samples) {
@SuppressWarnings({"unchecked", "rawtypes"})
@Override
public void apply(VariantContext variant, ReadsContext reads, ReferenceContext ref, FeatureContext features) {
SimpleInterval variantStart = new SimpleInterval(variant.getContig(), variant.getStart(), variant.getStart());
//return early if there's no non-symbolic ALT since GDB already did the merging
if ( !variant.isVariant() || !GATKVariantContextUtils.isProperlyPolymorphic(variant)
|| variant.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) == 0
|| (onlyOutputCallsStartingInIntervals && !intervals.stream().anyMatch(interval -> interval.contains(variantStart)))) {
|| variant.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) == 0 )
{
if (keepAllSites) {
VariantContextBuilder builder = new VariantContextBuilder(mqCalculator.finalizeRawMQ(variant)); //don't fill in QUAL here because there's no alt data
builder.filter(GATKVCFConstants.LOW_QUAL_FILTER_NAME);
Expand All @@ -291,7 +270,7 @@ public void apply(VariantContext variant, ReadsContext reads, ReferenceContext r
finalizedVC = genotyperEngine.finalizeGenotype(variant);
}
//could return null if the variant didn't pass the genotyping arg calling/emission threshold
if (finalizedVC != null && (!onlyOutputCallsStartingInIntervals || intervals.stream().anyMatch(interval -> interval.contains(variantStart)))) {
if (finalizedVC != null) {
vcfWriter.add(finalizedVC);
}
}
Expand Down
Loading
Loading