This repository has been archived by the owner on Jan 24, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
170 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
9 changes: 9 additions & 0 deletions
9
...tator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/GenomeVersion.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
package com.github.bihealth.varfish_annotator.annotate; | ||
|
||
/**
 * Genome releases that can be distinguished when inspecting an input file.
 *
 * <p>Each assembly appears twice: once under its GRC name and once under its UCSC name.
 */
public enum GenomeVersion {
  /** GRCh37 assembly (GRC naming). */
  GRCH37,
  /** hg19 assembly (UCSC naming of the GRCh37 release). */
  HG19,
  /** GRCh38 assembly (GRC naming). */
  GRCH38,
  /** hg38 assembly (UCSC naming of the GRCh38 release). */
  HG38
}
17 changes: 17 additions & 0 deletions
17
...rc/main/java/com/github/bihealth/varfish_annotator/annotate/IncompatibleVcfException.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
package com.github.bihealth.varfish_annotator.annotate; | ||
|
||
/**
 * Checked exception signalling that a VCF file is not compatible with the annotation step.
 *
 * <p>All constructors simply delegate to the corresponding {@link Exception} constructor.
 */
public class IncompatibleVcfException extends Exception {

  /**
   * Create the exception with a description only.
   *
   * @param message Human-readable description of the incompatibility.
   */
  public IncompatibleVcfException(String message) {
    super(message);
  }

  /**
   * Create the exception wrapping an underlying cause.
   *
   * @param cause The underlying problem that triggered this exception.
   */
  public IncompatibleVcfException(Throwable cause) {
    super(cause);
  }

  /**
   * Create the exception with both a description and an underlying cause.
   *
   * @param message Human-readable description of the incompatibility.
   * @param cause The underlying problem that triggered this exception.
   */
  public IncompatibleVcfException(String message, Throwable cause) {
    super(message, cause);
  }
}
80 changes: 80 additions & 0 deletions
80
...src/main/java/com/github/bihealth/varfish_annotator/annotate/VcfCompatibilityChecker.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
package com.github.bihealth.varfish_annotator.annotate; | ||
|
||
import htsjdk.samtools.SAMSequenceRecord; | ||
import htsjdk.variant.vcf.VCFContigHeaderLine; | ||
import htsjdk.variant.vcf.VCFFileReader; | ||
import java.util.List; | ||
|
||
/** | ||
* Check VCF file for compatibility with annotation. | ||
* | ||
* <p>At the moment, only a simple check is implemented that tests whether the dataset looks like | ||
* GRCh37/hg19 as this is the Genome build that VarFish supports. | ||
*/ | ||
public class VcfCompatibilityChecker { | ||
|
||
/** Length of chr1 in hg19. */ | ||
private static final int CHR1_HG19_LENGTH = 249250621; | ||
/** Length of chr1 in hg38. */ | ||
private static final int CHR1_HG38_LENGTH = 248956422; | ||
|
||
/** The {@link VCFFileReader} that is to be used for checking. */ | ||
private VCFFileReader reader; | ||
|
||
/** | ||
* Construct a new {@link VcfCompatibilityChecker}. | ||
* | ||
* @param reader The {@link VCFFileReader} to use for checking headers etc. | ||
*/ | ||
public VcfCompatibilityChecker(VCFFileReader reader) { | ||
this.reader = reader; | ||
} | ||
|
||
/** | ||
* Check whether the VCF file given to the construtor as <tt>reader</tt> looks to be compatible. | ||
* | ||
* <p>Throws an exception in case of problems and otherwise just returns. Will print a warning to | ||
* stderr if no reliable decision could be made. | ||
* | ||
* @throws IncompatibleVcfException If the VCF file looks to be incompatible. | ||
*/ | ||
public void check() throws IncompatibleVcfException { | ||
// Check whether this looks like GRCh37/h19. | ||
GenomeVersion genomeVersion = this.guessGenomeVersion(); | ||
if (genomeVersion == GenomeVersion.GRCH37 || genomeVersion == GenomeVersion.HG19) { | ||
System.err.println( | ||
"INFO: Genome looks like GRCh" + 37 + " (sequence only; regardless of 'chr' prefix)."); | ||
} else if (genomeVersion == GenomeVersion.GRCH38 || genomeVersion == GenomeVersion.HG38) { | ||
throw new IncompatibleVcfException( | ||
"VCF file looks like hg38 by chr1 length but we only support hg19/GRCh37"); | ||
} else { | ||
System.err.println("WARNING: VCF file did not contain contig line for '1' or 'chr1'"); | ||
System.err.println("WARNING: Will proceed as if it is hg19/GRCh37."); | ||
} | ||
} | ||
|
||
public GenomeVersion guessGenomeVersion() { | ||
final List<VCFContigHeaderLine> contigLines = this.reader.getHeader().getContigLines(); | ||
if (contigLines.isEmpty()) { | ||
System.err.println("WARNING: VCF file did not contain any contig lines."); | ||
return null; | ||
} else { | ||
for (VCFContigHeaderLine line : contigLines) { | ||
final SAMSequenceRecord seqRecord = line.getSAMSequenceRecord(); | ||
if (seqRecord.getSequenceName().equals("1") || seqRecord.getSequenceName().equals("chr1")) { | ||
if (seqRecord.getSequenceLength() == CHR1_HG19_LENGTH) { | ||
return seqRecord.getSequenceName().startsWith("chr") | ||
? GenomeVersion.HG19 | ||
: GenomeVersion.GRCH37; | ||
} else if (seqRecord.getSequenceLength() == CHR1_HG38_LENGTH) { | ||
return seqRecord.getSequenceName().startsWith("chr") | ||
? GenomeVersion.HG38 | ||
: GenomeVersion.GRCH37; | ||
} | ||
} | ||
} | ||
} | ||
|
||
return null; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters