From 69bd7d95e5330c2bc58d1b78fd3b652029ba4dce Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Mon, 12 Apr 2021 13:23:13 +0200 Subject: [PATCH] Fixing handling of hg19 genome. --- CHANGES.md | 6 ++ pom.xml | 2 +- varfish-annotator-cli/pom.xml | 2 +- .../annotate/AnnotateVcf.java | 18 ++++- .../annotate/GenomeVersion.java | 9 +++ .../annotate/IncompatibleVcfException.java | 17 ++++ .../annotate/VcfCompatibilityChecker.java | 80 +++++++++++++++++++ .../annotate_svs/AnnotateSvsVcf.java | 46 +++++++++-- varfish-annotator-core/pom.xml | 2 +- 9 files changed, 170 insertions(+), 12 deletions(-) create mode 100644 varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/GenomeVersion.java create mode 100644 varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/IncompatibleVcfException.java create mode 100644 varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/VcfCompatibilityChecker.java diff --git a/CHANGES.md b/CHANGES.md index 43b7c3f..cf1a3af 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,11 @@ # Changelog +## v0.15 (work in progress) + +- Properly removing ``"chr"`` prefix for data aligned to `hg19`. +- Checking whether the supported release GRCh37/hg19 was used. + Will only allow processing of such genomes and block variants from GRCh38/hg38 which would lead to incorrect results. + ## v0.14 - Bumping junit and guava dependency. diff --git a/pom.xml b/pom.xml index 422464f..17d2242 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ com.github.bihealth VarfishAnnotator pom - 0.14 + 0.15-SNAPSHOT ${project.groupId}:${project.artifactId} Variant annotation for Varfish diff --git a/varfish-annotator-cli/pom.xml b/varfish-annotator-cli/pom.xml index b23732d..323fc50 100644 --- a/varfish-annotator-cli/pom.xml +++ b/varfish-annotator-cli/pom.xml @@ -11,7 +11,7 @@ com.github.bihealth VarfishAnnotator - 0.14 + 0.15-SNAPSHOT diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/AnnotateVcf.java b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/AnnotateVcf.java index 5f54f24..cf65227 100644 --- a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/AnnotateVcf.java +++ b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/AnnotateVcf.java @@ -145,6 +145,7 @@ public void run() { FileWriter gtWriter = new FileWriter(new File(args.getOutputGts())); FileWriter dbInfoWriter = new FileWriter(new File(args.getOutputDbInfos())); BufferedWriter dbInfoBufWriter = new BufferedWriter(dbInfoWriter); ) { + new VcfCompatibilityChecker(reader).check(); System.err.println("Deserializing Jannovar file..."); JannovarData refseqJvData = new JannovarDataSerializer(args.getRefseqSerPath()).load(); JannovarData ensemblJvData = new JannovarDataSerializer(args.getEnsemblSerPath()).load(); @@ -167,6 +168,10 @@ public void run() { System.err.println("Problem opening output files database"); e.printStackTrace(); System.exit(1); + } catch (IncompatibleVcfException e) { + System.err.println("Problem with VCF compatibility: " + e.getMessage()); + e.printStackTrace(); + System.exit(1); } } @@ -232,6 +237,9 @@ private void annotateVcf( FileWriter gtWriter) throws VarfishAnnotatorException { + // Guess genome version. + GenomeVersion genomeVersion = new VcfCompatibilityChecker(reader).guessGenomeVersion(); + // Write out header. try { gtWriter.append(Joiner.on("\t").join(HEADERS_GT) + "\n"); @@ -269,6 +277,7 @@ private void annotateVcf( } annotateVariantContext( conn, + genomeVersion, refseqJv.getRefDict(), refseqAnnotator, ensemblAnnotator, @@ -283,6 +292,7 @@ private void annotateVcf( * Annotate ctx, write out annotated variant call to gtWriter. * * @param conn Database connection. + * @param genomeVersion The genome version of the VCF file. * @param refDict {@code ReferenceDictionary} to use for chromosome mapping * @param refseqAnnotator Helper class to use for annotation of variants with Refseq * @param ensemblAnnotator Helper class to use for annotation of variants with ENSEMBL @@ -293,6 +303,7 @@ private void annotateVcf( */ private void annotateVariantContext( Connection conn, + GenomeVersion genomeVersion, ReferenceDictionary refDict, VariantContextAnnotator refseqAnnotator, VariantContextAnnotator ensemblAnnotator, @@ -305,13 +316,18 @@ private void annotateVariantContext( ImmutableList ensemblAnnotationsList = silentBuildAnnotations(ctx, ensemblAnnotator); + final String contigName = + (genomeVersion == GenomeVersion.HG19) + ? ctx.getContig().replaceFirst("chr", "") + : ctx.getContig(); + final int numAlleles = ctx.getAlleles().size(); for (int i = 1; i < numAlleles; ++i) { // Normalize the from the VCF (will probably pad variant to the left). final VariantDescription normalizedVar = normalizer.normalizeInsertion( new VariantDescription( - ctx.getContig(), + contigName, ctx.getStart() - 1, ctx.getReference().getBaseString(), ctx.getAlternateAllele(i - 1).getBaseString())); diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/GenomeVersion.java b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/GenomeVersion.java new file mode 100644 index 0000000..854192a --- /dev/null +++ b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/GenomeVersion.java @@ -0,0 +1,9 @@ +package com.github.bihealth.varfish_annotator.annotate; + +/** Enumerate known genome versions. */ +public enum GenomeVersion { + GRCH37, + HG19, + GRCH38, + HG38 +} diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/IncompatibleVcfException.java b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/IncompatibleVcfException.java new file mode 100644 index 0000000..be33e37 --- /dev/null +++ b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/IncompatibleVcfException.java @@ -0,0 +1,17 @@ +package com.github.bihealth.varfish_annotator.annotate; + +/** Raised on incompatible VCF files. */ +public class IncompatibleVcfException extends Exception { + + public IncompatibleVcfException(String message, Throwable cause) { + super(message, cause); + } + + public IncompatibleVcfException(String message) { + super(message); + } + + public IncompatibleVcfException(Throwable cause) { + super(cause); + } +} diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/VcfCompatibilityChecker.java b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/VcfCompatibilityChecker.java new file mode 100644 index 0000000..e186c59 --- /dev/null +++ b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/VcfCompatibilityChecker.java @@ -0,0 +1,80 @@ +package com.github.bihealth.varfish_annotator.annotate; + +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.variant.vcf.VCFContigHeaderLine; +import htsjdk.variant.vcf.VCFFileReader; +import java.util.List; + +/** + * Check VCF file for compatibility with annotation. + * + *

At the moment, only a simple check is implemented that tests whether the dataset looks like + * GRCh37/hg19 as this is the Genome build that VarFish supports. + */ +public class VcfCompatibilityChecker { + + /** Length of chr1 in hg19. */ + private static final int CHR1_HG19_LENGTH = 249250621; + /** Length of chr1 in hg38. */ + private static final int CHR1_HG38_LENGTH = 248956422; + + /** The {@link VCFFileReader} that is to be used for checking. */ + private VCFFileReader reader; + + /** + * Construct a new {@link VcfCompatibilityChecker}. + * + * @param reader The {@link VCFFileReader} to use for checking headers etc. + */ + public VcfCompatibilityChecker(VCFFileReader reader) { + this.reader = reader; + } + + /** + * Check whether the VCF file given to the construtor as reader looks to be compatible. + * + *

Throws an exception in case of problems and otherwise just returns. Will print a warning to + * stderr if no reliable decision could be made. + * + * @throws IncompatibleVcfException If the VCF file looks to be incompatible. + */ + public void check() throws IncompatibleVcfException { + // Check whether this looks like GRCh37/h19. + GenomeVersion genomeVersion = this.guessGenomeVersion(); + if (genomeVersion == GenomeVersion.GRCH37 || genomeVersion == GenomeVersion.HG19) { + System.err.println( + "INFO: Genome looks like GRCh" + 37 + " (sequence only; regardless of 'chr' prefix)."); + } else if (genomeVersion == GenomeVersion.GRCH38 || genomeVersion == GenomeVersion.HG38) { + throw new IncompatibleVcfException( + "VCF file looks like hg38 by chr1 length but we only support hg19/GRCh37"); + } else { + System.err.println("WARNING: VCF file did not contain contig line for '1' or 'chr1'"); + System.err.println("WARNING: Will proceed as if it is hg19/GRCh37."); + } + } + + public GenomeVersion guessGenomeVersion() { + final List contigLines = this.reader.getHeader().getContigLines(); + if (contigLines.isEmpty()) { + System.err.println("WARNING: VCF file did not contain any contig lines."); + return null; + } else { + for (VCFContigHeaderLine line : contigLines) { + final SAMSequenceRecord seqRecord = line.getSAMSequenceRecord(); + if (seqRecord.getSequenceName().equals("1") || seqRecord.getSequenceName().equals("chr1")) { + if (seqRecord.getSequenceLength() == CHR1_HG19_LENGTH) { + return seqRecord.getSequenceName().startsWith("chr") + ? GenomeVersion.HG19 + : GenomeVersion.GRCH37; + } else if (seqRecord.getSequenceLength() == CHR1_HG38_LENGTH) { + return seqRecord.getSequenceName().startsWith("chr") + ? GenomeVersion.HG38 + : GenomeVersion.GRCH37; + } + } + } + } + + return null; + } +} diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/AnnotateSvsVcf.java b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/AnnotateSvsVcf.java index 9f9e445..ec48123 100644 --- a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/AnnotateSvsVcf.java +++ b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/AnnotateSvsVcf.java @@ -1,6 +1,9 @@ package com.github.bihealth.varfish_annotator.annotate_svs; import com.github.bihealth.varfish_annotator.VarfishAnnotatorException; +import com.github.bihealth.varfish_annotator.annotate.GenomeVersion; +import com.github.bihealth.varfish_annotator.annotate.IncompatibleVcfException; +import com.github.bihealth.varfish_annotator.annotate.VcfCompatibilityChecker; import com.github.bihealth.varfish_annotator.init_db.DbReleaseUpdater; import com.github.bihealth.varfish_annotator.utils.UcscBinning; import com.google.common.base.Joiner; @@ -120,10 +123,15 @@ public void run() { FileWriter featureEffectsWriter = new FileWriter(new File(args.getOutputFeatureEffects())); FileWriter dbInfoWriter = new FileWriter(new File(args.getOutputDbInfos())); BufferedWriter dbInfoBufWriter = new BufferedWriter(dbInfoWriter); ) { + // Guess genome version. + GenomeVersion genomeVersion = new VcfCompatibilityChecker(reader).guessGenomeVersion(); + + new VcfCompatibilityChecker(reader).check(); System.err.println("Deserializing Jannovar file..."); JannovarData refseqJvData = new JannovarDataSerializer(args.getRefseqSerPath()).load(); JannovarData ensemblJvData = new JannovarDataSerializer(args.getEnsemblSerPath()).load(); - annotateSvVcf(conn, reader, refseqJvData, ensemblJvData, gtWriter, featureEffectsWriter); + annotateSvVcf( + conn, genomeVersion, reader, refseqJvData, ensemblJvData, gtWriter, featureEffectsWriter); writeDbInfos(conn, dbInfoBufWriter); } catch (SQLException e) { System.err.println("Problem with database connection"); @@ -141,6 +149,10 @@ public void run() { System.err.println("Problem opening output files database"); e.printStackTrace(); System.exit(1); + } catch (IncompatibleVcfException e) { + System.err.println("Problem with VCF compatibility: " + e.getMessage()); + e.printStackTrace(); + System.exit(1); } } @@ -199,6 +211,7 @@ private void writeDbInfos(Connection conn, BufferedWriter dbInfoWriter) */ private void annotateSvVcf( Connection conn, + GenomeVersion genomeVersion, VCFFileReader reader, JannovarData refseqJv, JannovarData ensemblJv, @@ -248,7 +261,7 @@ private void annotateSvVcf( System.err.println("Now on contig " + ctx.getContig()); } annotateVariantContext( - refseqAnnotator, ensemblAnnotator, ctx, gtWriter, featureEffectsWriter); + refseqAnnotator, ensemblAnnotator, ctx, genomeVersion, gtWriter, featureEffectsWriter); prevChr = ctx.getContig(); } } @@ -260,6 +273,7 @@ private void annotateSvVcf( * @param refseqAnnotator Helper class to use for annotation of variants with Refseq * @param ensemblAnnotator Helper class to use for annotation of variants with ENSEMBL * @param ctx The variant to annotate. + * @param genomeVersion The genome version that {@code ctx} uses. * @param gtWriter Writer for annotated genotypes. * @param featureEffectsWriter Writer for gene-wise feature effects. * @throws VarfishAnnotatorException in case of problems @@ -268,6 +282,7 @@ private void annotateVariantContext( VariantContextAnnotator refseqAnnotator, VariantContextAnnotator ensemblAnnotator, VariantContext ctx, + GenomeVersion genomeVersion, FileWriter gtWriter, FileWriter featureEffectsWriter) throws VarfishAnnotatorException { @@ -297,7 +312,7 @@ private void annotateVariantContext( } // Write out record with the genotype. - final List gtOutRec = buildGtRecord(variantId, svGenomeVar, ctx, i); + final List gtOutRec = buildGtRecord(variantId, svGenomeVar, ctx, genomeVersion, i); try { gtWriter.append(Joiner.on("\t").useForNull(".").join(gtOutRec) + "\n"); } catch (IOException e) { @@ -383,14 +398,23 @@ private void annotateVariantContext( /** Buidl string list with the information for the genotype call record. */ private List buildGtRecord( - UUID variantId, SVGenomeVariant svGenomeVar, VariantContext ctx, int alleleNo) { + UUID variantId, + SVGenomeVariant svGenomeVar, + VariantContext ctx, + GenomeVersion genomeVersion, + int alleleNo) { final String svMethod = ctx.getCommonInfo().getAttributeAsString("SVMETHOD", null); final boolean isBnd = (svGenomeVar.getType() == Type.BND); final int pos2 = isBnd ? svGenomeVar.getPos() + 1 : svGenomeVar.getPos2(); + final String contigName = + (genomeVersion == GenomeVersion.HG19) + ? svGenomeVar.getChrName().replaceFirst("chr", "") + : svGenomeVar.getChrName(); + return ImmutableList.of( args.getRelease(), - svGenomeVar.getChrName(), + contigName, svGenomeVar.getChr(), svGenomeVar.getPos() + 1, pos2, @@ -406,15 +430,21 @@ private List buildGtRecord( // TODO: improve type and sub type annotation! svGenomeVar.getType(), svGenomeVar.getType(), - buildInfoValue(ctx, svGenomeVar), + buildInfoValue(ctx, genomeVersion, svGenomeVar), buildGenotypeValue(ctx, alleleNo)); } - private String buildInfoValue(VariantContext ctx, SVGenomeVariant svGenomeVar) { + private String buildInfoValue( + VariantContext ctx, GenomeVersion genomeVersion, SVGenomeVariant svGenomeVar) { final List mappings = new ArrayList<>(); if (svGenomeVar.getType() == Type.BND) { - mappings.add(tripleQuote("chr2") + ":" + tripleQuote(svGenomeVar.getChr2Name())); + final String contigName2 = + (genomeVersion == GenomeVersion.HG19) + ? svGenomeVar.getChr2Name().replaceFirst("chr", "") + : svGenomeVar.getChr2Name(); + + mappings.add(tripleQuote("chr2") + ":" + tripleQuote(contigName2)); mappings.add(tripleQuote("pos2") + ":" + svGenomeVar.getPos2()); } diff --git a/varfish-annotator-core/pom.xml b/varfish-annotator-core/pom.xml index c8aea22..78b82cd 100644 --- a/varfish-annotator-core/pom.xml +++ b/varfish-annotator-core/pom.xml @@ -11,7 +11,7 @@ com.github.bihealth VarfishAnnotator - 0.14 + 0.15-SNAPSHOT