Fixing handling of hg19 genome.

varfish-org · Apr 12, 2021 · 69bd7d9 · 69bd7d9
1 parent 515ef85
commit 69bd7d9
Show file tree

Hide file tree

Showing 9 changed files with 170 additions and 12 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## v0.15 (work in progress)
+
+- Properly removing ``"chr"`` prefix for data aligned to `hg19`.
+- Checking whether the supported release GRCh37/hg19 was used.
+  Will only allow processing of such genomes and block variants from GRCh38/hg38 which would lead to incorrect results.
+
 ## v0.14
 
 - Bumping junit and guava dependency.

diff --git a/pom.xml b/pom.xml
@@ -4,7 +4,7 @@
 	<groupId>com.github.bihealth</groupId>
 	<artifactId>VarfishAnnotator</artifactId>
 	<packaging>pom</packaging>
-	<version>0.14</version>
+	<version>0.15-SNAPSHOT</version>
 
 	<name>${project.groupId}:${project.artifactId}</name>
 	<description>Variant annotation for Varfish</description>

diff --git a/varfish-annotator-cli/pom.xml b/varfish-annotator-cli/pom.xml
@@ -11,7 +11,7 @@
   <parent>
     <groupId>com.github.bihealth</groupId>
     <artifactId>VarfishAnnotator</artifactId>
-    <version>0.14</version>
+    <version>0.15-SNAPSHOT</version>
   </parent>
 
   <properties>

diff --git a/...notator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/AnnotateVcf.java b/...notator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/AnnotateVcf.java
@@ -145,6 +145,7 @@ public void run() {
         FileWriter gtWriter = new FileWriter(new File(args.getOutputGts()));
         FileWriter dbInfoWriter = new FileWriter(new File(args.getOutputDbInfos()));
         BufferedWriter dbInfoBufWriter = new BufferedWriter(dbInfoWriter); ) {
+      new VcfCompatibilityChecker(reader).check();
       System.err.println("Deserializing Jannovar file...");
       JannovarData refseqJvData = new JannovarDataSerializer(args.getRefseqSerPath()).load();
       JannovarData ensemblJvData = new JannovarDataSerializer(args.getEnsemblSerPath()).load();
@@ -167,6 +168,10 @@ public void run() {
       System.err.println("Problem opening output files database");
       e.printStackTrace();
       System.exit(1);
+    } catch (IncompatibleVcfException e) {
+      System.err.println("Problem with VCF compatibility: " + e.getMessage());
+      e.printStackTrace();
+      System.exit(1);
     }
   }
 
@@ -232,6 +237,9 @@ private void annotateVcf(
       FileWriter gtWriter)
       throws VarfishAnnotatorException {
 
+    // Guess genome version.
+    GenomeVersion genomeVersion = new VcfCompatibilityChecker(reader).guessGenomeVersion();
+
     // Write out header.
     try {
       gtWriter.append(Joiner.on("\t").join(HEADERS_GT) + "\n");
@@ -269,6 +277,7 @@ private void annotateVcf(
       }
       annotateVariantContext(
           conn,
+          genomeVersion,
           refseqJv.getRefDict(),
           refseqAnnotator,
           ensemblAnnotator,
@@ -283,6 +292,7 @@ private void annotateVcf(
    * Annotate <tt>ctx</tt>, write out annotated variant call to <tt>gtWriter</tt>.
    *
    * @param conn Database connection.
+   * @param genomeVersion The genome version of the VCF file.
    * @param refDict {@code ReferenceDictionary} to use for chromosome mapping
    * @param refseqAnnotator Helper class to use for annotation of variants with Refseq
    * @param ensemblAnnotator Helper class to use for annotation of variants with ENSEMBL
@@ -293,6 +303,7 @@ private void annotateVcf(
    */
   private void annotateVariantContext(
       Connection conn,
+      GenomeVersion genomeVersion,
       ReferenceDictionary refDict,
       VariantContextAnnotator refseqAnnotator,
       VariantContextAnnotator ensemblAnnotator,
@@ -305,13 +316,18 @@ private void annotateVariantContext(
     ImmutableList<VariantAnnotations> ensemblAnnotationsList =
         silentBuildAnnotations(ctx, ensemblAnnotator);
 
+    final String contigName =
+        (genomeVersion == GenomeVersion.HG19)
+            ? ctx.getContig().replaceFirst("chr", "")
+            : ctx.getContig();
+
     final int numAlleles = ctx.getAlleles().size();
     for (int i = 1; i < numAlleles; ++i) {
       // Normalize the from the VCF (will probably pad variant to the left).
       final VariantDescription normalizedVar =
           normalizer.normalizeInsertion(
               new VariantDescription(
-                  ctx.getContig(),
+                  contigName,
                   ctx.getStart() - 1,
                   ctx.getReference().getBaseString(),
                   ctx.getAlternateAllele(i - 1).getBaseString()));

diff --git a/...tator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/GenomeVersion.java b/...tator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/GenomeVersion.java
@@ -0,0 +1,9 @@
+package com.github.bihealth.varfish_annotator.annotate;
+
+/** Enumerate known genome versions. */
+public enum GenomeVersion {
+  GRCH37,
+  HG19,
+  GRCH38,
+  HG38
+}
diff --git a/...rc/main/java/com/github/bihealth/varfish_annotator/annotate/IncompatibleVcfException.java b/...rc/main/java/com/github/bihealth/varfish_annotator/annotate/IncompatibleVcfException.java
@@ -0,0 +1,17 @@
+package com.github.bihealth.varfish_annotator.annotate;
+
+/** Raised on incompatible VCF files. */
+public class IncompatibleVcfException extends Exception {
+
+  public IncompatibleVcfException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  public IncompatibleVcfException(String message) {
+    super(message);
+  }
+
+  public IncompatibleVcfException(Throwable cause) {
+    super(cause);
+  }
+}
diff --git a/...src/main/java/com/github/bihealth/varfish_annotator/annotate/VcfCompatibilityChecker.java b/...src/main/java/com/github/bihealth/varfish_annotator/annotate/VcfCompatibilityChecker.java
@@ -0,0 +1,80 @@
+package com.github.bihealth.varfish_annotator.annotate;
+
+import htsjdk.samtools.SAMSequenceRecord;
+import htsjdk.variant.vcf.VCFContigHeaderLine;
+import htsjdk.variant.vcf.VCFFileReader;
+import java.util.List;
+
+/**
+ * Check VCF file for compatibility with annotation.
+ *
+ * <p>At the moment, only a simple check is implemented that tests whether the dataset looks like
+ * GRCh37/hg19 as this is the Genome build that VarFish supports.
+ */
+public class VcfCompatibilityChecker {
+
+  /** Length of chr1 in hg19. */
+  private static final int CHR1_HG19_LENGTH = 249250621;
+  /** Length of chr1 in hg38. */
+  private static final int CHR1_HG38_LENGTH = 248956422;
+
+  /** The {@link VCFFileReader} that is to be used for checking. */
+  private VCFFileReader reader;
+
+  /**
+   * Construct a new {@link VcfCompatibilityChecker}.
+   *
+   * @param reader The {@link VCFFileReader} to use for checking headers etc.
+   */
+  public VcfCompatibilityChecker(VCFFileReader reader) {
+    this.reader = reader;
+  }
+
+  /**
+   * Check whether the VCF file given to the construtor as <tt>reader</tt> looks to be compatible.
+   *
+   * <p>Throws an exception in case of problems and otherwise just returns. Will print a warning to
+   * stderr if no reliable decision could be made.
+   *
+   * @throws IncompatibleVcfException If the VCF file looks to be incompatible.
+   */
+  public void check() throws IncompatibleVcfException {
+    // Check whether this looks like GRCh37/h19.
+    GenomeVersion genomeVersion = this.guessGenomeVersion();
+    if (genomeVersion == GenomeVersion.GRCH37 || genomeVersion == GenomeVersion.HG19) {
+      System.err.println(
+          "INFO: Genome looks like GRCh" + 37 + " (sequence only; regardless of 'chr' prefix).");
+    } else if (genomeVersion == GenomeVersion.GRCH38 || genomeVersion == GenomeVersion.HG38) {
+      throw new IncompatibleVcfException(
+          "VCF file looks like hg38 by chr1 length but we only support hg19/GRCh37");
+    } else {
+      System.err.println("WARNING: VCF file did not contain contig line for '1' or 'chr1'");
+      System.err.println("WARNING: Will proceed as if it is hg19/GRCh37.");
+    }
+  }
+
+  public GenomeVersion guessGenomeVersion() {
+    final List<VCFContigHeaderLine> contigLines = this.reader.getHeader().getContigLines();
+    if (contigLines.isEmpty()) {
+      System.err.println("WARNING: VCF file did not contain any contig lines.");
+      return null;
+    } else {
+      for (VCFContigHeaderLine line : contigLines) {
+        final SAMSequenceRecord seqRecord = line.getSAMSequenceRecord();
+        if (seqRecord.getSequenceName().equals("1") || seqRecord.getSequenceName().equals("chr1")) {
+          if (seqRecord.getSequenceLength() == CHR1_HG19_LENGTH) {
+            return seqRecord.getSequenceName().startsWith("chr")
+                ? GenomeVersion.HG19
+                : GenomeVersion.GRCH37;
+          } else if (seqRecord.getSequenceLength() == CHR1_HG38_LENGTH) {
+            return seqRecord.getSequenceName().startsWith("chr")
+                ? GenomeVersion.HG38
+                : GenomeVersion.GRCH37;
+          }
+        }
+      }
+    }
+
+    return null;
+  }
+}
diff --git a/...-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/AnnotateSvsVcf.java b/...-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/AnnotateSvsVcf.java
@@ -1,6 +1,9 @@
 package com.github.bihealth.varfish_annotator.annotate_svs;
 
 import com.github.bihealth.varfish_annotator.VarfishAnnotatorException;
+import com.github.bihealth.varfish_annotator.annotate.GenomeVersion;
+import com.github.bihealth.varfish_annotator.annotate.IncompatibleVcfException;
+import com.github.bihealth.varfish_annotator.annotate.VcfCompatibilityChecker;
 import com.github.bihealth.varfish_annotator.init_db.DbReleaseUpdater;
 import com.github.bihealth.varfish_annotator.utils.UcscBinning;
 import com.google.common.base.Joiner;
@@ -120,10 +123,15 @@ public void run() {
         FileWriter featureEffectsWriter = new FileWriter(new File(args.getOutputFeatureEffects()));
         FileWriter dbInfoWriter = new FileWriter(new File(args.getOutputDbInfos()));
         BufferedWriter dbInfoBufWriter = new BufferedWriter(dbInfoWriter); ) {
+      // Guess genome version.
+      GenomeVersion genomeVersion = new VcfCompatibilityChecker(reader).guessGenomeVersion();
+
+      new VcfCompatibilityChecker(reader).check();
       System.err.println("Deserializing Jannovar file...");
       JannovarData refseqJvData = new JannovarDataSerializer(args.getRefseqSerPath()).load();
       JannovarData ensemblJvData = new JannovarDataSerializer(args.getEnsemblSerPath()).load();
-      annotateSvVcf(conn, reader, refseqJvData, ensemblJvData, gtWriter, featureEffectsWriter);
+      annotateSvVcf(
+          conn, genomeVersion, reader, refseqJvData, ensemblJvData, gtWriter, featureEffectsWriter);
       writeDbInfos(conn, dbInfoBufWriter);
     } catch (SQLException e) {
       System.err.println("Problem with database connection");
@@ -141,6 +149,10 @@ public void run() {
       System.err.println("Problem opening output files database");
       e.printStackTrace();
       System.exit(1);
+    } catch (IncompatibleVcfException e) {
+      System.err.println("Problem with VCF compatibility: " + e.getMessage());
+      e.printStackTrace();
+      System.exit(1);
     }
   }
 
@@ -199,6 +211,7 @@ private void writeDbInfos(Connection conn, BufferedWriter dbInfoWriter)
    */
   private void annotateSvVcf(
       Connection conn,
+      GenomeVersion genomeVersion,
       VCFFileReader reader,
       JannovarData refseqJv,
       JannovarData ensemblJv,
@@ -248,7 +261,7 @@ private void annotateSvVcf(
         System.err.println("Now on contig " + ctx.getContig());
       }
       annotateVariantContext(
-          refseqAnnotator, ensemblAnnotator, ctx, gtWriter, featureEffectsWriter);
+          refseqAnnotator, ensemblAnnotator, ctx, genomeVersion, gtWriter, featureEffectsWriter);
       prevChr = ctx.getContig();
     }
   }
@@ -260,6 +273,7 @@ private void annotateSvVcf(
    * @param refseqAnnotator Helper class to use for annotation of variants with Refseq
    * @param ensemblAnnotator Helper class to use for annotation of variants with ENSEMBL
    * @param ctx The variant to annotate.
+   * @param genomeVersion The genome version that {@code ctx} uses.
    * @param gtWriter Writer for annotated genotypes.
    * @param featureEffectsWriter Writer for gene-wise feature effects.
    * @throws VarfishAnnotatorException in case of problems
@@ -268,6 +282,7 @@ private void annotateVariantContext(
       VariantContextAnnotator refseqAnnotator,
       VariantContextAnnotator ensemblAnnotator,
       VariantContext ctx,
+      GenomeVersion genomeVersion,
       FileWriter gtWriter,
       FileWriter featureEffectsWriter)
       throws VarfishAnnotatorException {
@@ -297,7 +312,7 @@ private void annotateVariantContext(
       }
 
       // Write out record with the genotype.
-      final List<Object> gtOutRec = buildGtRecord(variantId, svGenomeVar, ctx, i);
+      final List<Object> gtOutRec = buildGtRecord(variantId, svGenomeVar, ctx, genomeVersion, i);
       try {
         gtWriter.append(Joiner.on("\t").useForNull(".").join(gtOutRec) + "\n");
       } catch (IOException e) {
@@ -383,14 +398,23 @@ private void annotateVariantContext(
 
   /** Buidl string list with the information for the genotype call record. */
   private List<Object> buildGtRecord(
-      UUID variantId, SVGenomeVariant svGenomeVar, VariantContext ctx, int alleleNo) {
+      UUID variantId,
+      SVGenomeVariant svGenomeVar,
+      VariantContext ctx,
+      GenomeVersion genomeVersion,
+      int alleleNo) {
     final String svMethod = ctx.getCommonInfo().getAttributeAsString("SVMETHOD", null);
     final boolean isBnd = (svGenomeVar.getType() == Type.BND);
     final int pos2 = isBnd ? svGenomeVar.getPos() + 1 : svGenomeVar.getPos2();
 
+    final String contigName =
+        (genomeVersion == GenomeVersion.HG19)
+            ? svGenomeVar.getChrName().replaceFirst("chr", "")
+            : svGenomeVar.getChrName();
+
     return ImmutableList.of(
         args.getRelease(),
-        svGenomeVar.getChrName(),
+        contigName,
         svGenomeVar.getChr(),
         svGenomeVar.getPos() + 1,
         pos2,
@@ -406,15 +430,21 @@ private List<Object> buildGtRecord(
         // TODO: improve type and sub type annotation!
         svGenomeVar.getType(),
         svGenomeVar.getType(),
-        buildInfoValue(ctx, svGenomeVar),
+        buildInfoValue(ctx, genomeVersion, svGenomeVar),
         buildGenotypeValue(ctx, alleleNo));
   }
 
-  private String buildInfoValue(VariantContext ctx, SVGenomeVariant svGenomeVar) {
+  private String buildInfoValue(
+      VariantContext ctx, GenomeVersion genomeVersion, SVGenomeVariant svGenomeVar) {
     final List<String> mappings = new ArrayList<>();
 
     if (svGenomeVar.getType() == Type.BND) {
-      mappings.add(tripleQuote("chr2") + ":" + tripleQuote(svGenomeVar.getChr2Name()));
+      final String contigName2 =
+          (genomeVersion == GenomeVersion.HG19)
+              ? svGenomeVar.getChr2Name().replaceFirst("chr", "")
+              : svGenomeVar.getChr2Name();
+
+      mappings.add(tripleQuote("chr2") + ":" + tripleQuote(contigName2));
       mappings.add(tripleQuote("pos2") + ":" + svGenomeVar.getPos2());
     }
 

diff --git a/varfish-annotator-core/pom.xml b/varfish-annotator-core/pom.xml
@@ -11,7 +11,7 @@
 	<parent>
 		<groupId>com.github.bihealth</groupId>
 		<artifactId>VarfishAnnotator</artifactId>
-		<version>0.14</version>
+		<version>0.15-SNAPSHOT</version>
 	</parent>
 
 	<properties>