Skip to content
This repository has been archived by the owner on Jan 24, 2024. It is now read-only.

Commit

Permalink
Fixing handling of hg19 genome.
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Apr 12, 2021
1 parent 515ef85 commit 69bd7d9
Show file tree
Hide file tree
Showing 9 changed files with 170 additions and 12 deletions.
6 changes: 6 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## v0.15 (work in progress)

- Properly removing ``"chr"`` prefix for data aligned to `hg19`.
- Checking whether the supported release GRCh37/hg19 was used.
Will only allow processing of such genomes and block variants from GRCh38/hg38 which would lead to incorrect results.

## v0.14

- Bumping junit and guava dependency.
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<groupId>com.github.bihealth</groupId>
<artifactId>VarfishAnnotator</artifactId>
<packaging>pom</packaging>
<version>0.14</version>
<version>0.15-SNAPSHOT</version>

<name>${project.groupId}:${project.artifactId}</name>
<description>Variant annotation for Varfish</description>
Expand Down
2 changes: 1 addition & 1 deletion varfish-annotator-cli/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
<parent>
<groupId>com.github.bihealth</groupId>
<artifactId>VarfishAnnotator</artifactId>
<version>0.14</version>
<version>0.15-SNAPSHOT</version>
</parent>

<properties>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ public void run() {
FileWriter gtWriter = new FileWriter(new File(args.getOutputGts()));
FileWriter dbInfoWriter = new FileWriter(new File(args.getOutputDbInfos()));
BufferedWriter dbInfoBufWriter = new BufferedWriter(dbInfoWriter); ) {
new VcfCompatibilityChecker(reader).check();
System.err.println("Deserializing Jannovar file...");
JannovarData refseqJvData = new JannovarDataSerializer(args.getRefseqSerPath()).load();
JannovarData ensemblJvData = new JannovarDataSerializer(args.getEnsemblSerPath()).load();
Expand All @@ -167,6 +168,10 @@ public void run() {
System.err.println("Problem opening output files database");
e.printStackTrace();
System.exit(1);
} catch (IncompatibleVcfException e) {
System.err.println("Problem with VCF compatibility: " + e.getMessage());
e.printStackTrace();
System.exit(1);
}
}

Expand Down Expand Up @@ -232,6 +237,9 @@ private void annotateVcf(
FileWriter gtWriter)
throws VarfishAnnotatorException {

// Guess genome version.
GenomeVersion genomeVersion = new VcfCompatibilityChecker(reader).guessGenomeVersion();

// Write out header.
try {
gtWriter.append(Joiner.on("\t").join(HEADERS_GT) + "\n");
Expand Down Expand Up @@ -269,6 +277,7 @@ private void annotateVcf(
}
annotateVariantContext(
conn,
genomeVersion,
refseqJv.getRefDict(),
refseqAnnotator,
ensemblAnnotator,
Expand All @@ -283,6 +292,7 @@ private void annotateVcf(
* Annotate <tt>ctx</tt>, write out annotated variant call to <tt>gtWriter</tt>.
*
* @param conn Database connection.
* @param genomeVersion The genome version of the VCF file.
* @param refDict {@code ReferenceDictionary} to use for chromosome mapping
* @param refseqAnnotator Helper class to use for annotation of variants with Refseq
* @param ensemblAnnotator Helper class to use for annotation of variants with ENSEMBL
Expand All @@ -293,6 +303,7 @@ private void annotateVcf(
*/
private void annotateVariantContext(
Connection conn,
GenomeVersion genomeVersion,
ReferenceDictionary refDict,
VariantContextAnnotator refseqAnnotator,
VariantContextAnnotator ensemblAnnotator,
Expand All @@ -305,13 +316,18 @@ private void annotateVariantContext(
ImmutableList<VariantAnnotations> ensemblAnnotationsList =
silentBuildAnnotations(ctx, ensemblAnnotator);

final String contigName =
(genomeVersion == GenomeVersion.HG19)
? ctx.getContig().replaceFirst("chr", "")
: ctx.getContig();

final int numAlleles = ctx.getAlleles().size();
for (int i = 1; i < numAlleles; ++i) {
// Normalize the from the VCF (will probably pad variant to the left).
final VariantDescription normalizedVar =
normalizer.normalizeInsertion(
new VariantDescription(
ctx.getContig(),
contigName,
ctx.getStart() - 1,
ctx.getReference().getBaseString(),
ctx.getAlternateAllele(i - 1).getBaseString()));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package com.github.bihealth.varfish_annotator.annotate;

/** Enumerate known genome versions. */
public enum GenomeVersion {
GRCH37,
HG19,
GRCH38,
HG38
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package com.github.bihealth.varfish_annotator.annotate;

/** Raised on incompatible VCF files. */
public class IncompatibleVcfException extends Exception {

public IncompatibleVcfException(String message, Throwable cause) {
super(message, cause);
}

public IncompatibleVcfException(String message) {
super(message);
}

public IncompatibleVcfException(Throwable cause) {
super(cause);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package com.github.bihealth.varfish_annotator.annotate;

import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.variant.vcf.VCFContigHeaderLine;
import htsjdk.variant.vcf.VCFFileReader;
import java.util.List;

/**
* Check VCF file for compatibility with annotation.
*
* <p>At the moment, only a simple check is implemented that tests whether the dataset looks like
* GRCh37/hg19 as this is the Genome build that VarFish supports.
*/
public class VcfCompatibilityChecker {

/** Length of chr1 in hg19. */
private static final int CHR1_HG19_LENGTH = 249250621;
/** Length of chr1 in hg38. */
private static final int CHR1_HG38_LENGTH = 248956422;

/** The {@link VCFFileReader} that is to be used for checking. */
private VCFFileReader reader;

/**
* Construct a new {@link VcfCompatibilityChecker}.
*
* @param reader The {@link VCFFileReader} to use for checking headers etc.
*/
public VcfCompatibilityChecker(VCFFileReader reader) {
this.reader = reader;
}

/**
* Check whether the VCF file given to the construtor as <tt>reader</tt> looks to be compatible.
*
* <p>Throws an exception in case of problems and otherwise just returns. Will print a warning to
* stderr if no reliable decision could be made.
*
* @throws IncompatibleVcfException If the VCF file looks to be incompatible.
*/
public void check() throws IncompatibleVcfException {
// Check whether this looks like GRCh37/h19.
GenomeVersion genomeVersion = this.guessGenomeVersion();
if (genomeVersion == GenomeVersion.GRCH37 || genomeVersion == GenomeVersion.HG19) {
System.err.println(
"INFO: Genome looks like GRCh" + 37 + " (sequence only; regardless of 'chr' prefix).");
} else if (genomeVersion == GenomeVersion.GRCH38 || genomeVersion == GenomeVersion.HG38) {
throw new IncompatibleVcfException(
"VCF file looks like hg38 by chr1 length but we only support hg19/GRCh37");
} else {
System.err.println("WARNING: VCF file did not contain contig line for '1' or 'chr1'");
System.err.println("WARNING: Will proceed as if it is hg19/GRCh37.");
}
}

public GenomeVersion guessGenomeVersion() {
final List<VCFContigHeaderLine> contigLines = this.reader.getHeader().getContigLines();
if (contigLines.isEmpty()) {
System.err.println("WARNING: VCF file did not contain any contig lines.");
return null;
} else {
for (VCFContigHeaderLine line : contigLines) {
final SAMSequenceRecord seqRecord = line.getSAMSequenceRecord();
if (seqRecord.getSequenceName().equals("1") || seqRecord.getSequenceName().equals("chr1")) {
if (seqRecord.getSequenceLength() == CHR1_HG19_LENGTH) {
return seqRecord.getSequenceName().startsWith("chr")
? GenomeVersion.HG19
: GenomeVersion.GRCH37;
} else if (seqRecord.getSequenceLength() == CHR1_HG38_LENGTH) {
return seqRecord.getSequenceName().startsWith("chr")
? GenomeVersion.HG38
: GenomeVersion.GRCH37;
}
}
}
}

return null;
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package com.github.bihealth.varfish_annotator.annotate_svs;

import com.github.bihealth.varfish_annotator.VarfishAnnotatorException;
import com.github.bihealth.varfish_annotator.annotate.GenomeVersion;
import com.github.bihealth.varfish_annotator.annotate.IncompatibleVcfException;
import com.github.bihealth.varfish_annotator.annotate.VcfCompatibilityChecker;
import com.github.bihealth.varfish_annotator.init_db.DbReleaseUpdater;
import com.github.bihealth.varfish_annotator.utils.UcscBinning;
import com.google.common.base.Joiner;
Expand Down Expand Up @@ -120,10 +123,15 @@ public void run() {
FileWriter featureEffectsWriter = new FileWriter(new File(args.getOutputFeatureEffects()));
FileWriter dbInfoWriter = new FileWriter(new File(args.getOutputDbInfos()));
BufferedWriter dbInfoBufWriter = new BufferedWriter(dbInfoWriter); ) {
// Guess genome version.
GenomeVersion genomeVersion = new VcfCompatibilityChecker(reader).guessGenomeVersion();

new VcfCompatibilityChecker(reader).check();
System.err.println("Deserializing Jannovar file...");
JannovarData refseqJvData = new JannovarDataSerializer(args.getRefseqSerPath()).load();
JannovarData ensemblJvData = new JannovarDataSerializer(args.getEnsemblSerPath()).load();
annotateSvVcf(conn, reader, refseqJvData, ensemblJvData, gtWriter, featureEffectsWriter);
annotateSvVcf(
conn, genomeVersion, reader, refseqJvData, ensemblJvData, gtWriter, featureEffectsWriter);
writeDbInfos(conn, dbInfoBufWriter);
} catch (SQLException e) {
System.err.println("Problem with database connection");
Expand All @@ -141,6 +149,10 @@ public void run() {
System.err.println("Problem opening output files database");
e.printStackTrace();
System.exit(1);
} catch (IncompatibleVcfException e) {
System.err.println("Problem with VCF compatibility: " + e.getMessage());
e.printStackTrace();
System.exit(1);
}
}

Expand Down Expand Up @@ -199,6 +211,7 @@ private void writeDbInfos(Connection conn, BufferedWriter dbInfoWriter)
*/
private void annotateSvVcf(
Connection conn,
GenomeVersion genomeVersion,
VCFFileReader reader,
JannovarData refseqJv,
JannovarData ensemblJv,
Expand Down Expand Up @@ -248,7 +261,7 @@ private void annotateSvVcf(
System.err.println("Now on contig " + ctx.getContig());
}
annotateVariantContext(
refseqAnnotator, ensemblAnnotator, ctx, gtWriter, featureEffectsWriter);
refseqAnnotator, ensemblAnnotator, ctx, genomeVersion, gtWriter, featureEffectsWriter);
prevChr = ctx.getContig();
}
}
Expand All @@ -260,6 +273,7 @@ private void annotateSvVcf(
* @param refseqAnnotator Helper class to use for annotation of variants with Refseq
* @param ensemblAnnotator Helper class to use for annotation of variants with ENSEMBL
* @param ctx The variant to annotate.
* @param genomeVersion The genome version that {@code ctx} uses.
* @param gtWriter Writer for annotated genotypes.
* @param featureEffectsWriter Writer for gene-wise feature effects.
* @throws VarfishAnnotatorException in case of problems
Expand All @@ -268,6 +282,7 @@ private void annotateVariantContext(
VariantContextAnnotator refseqAnnotator,
VariantContextAnnotator ensemblAnnotator,
VariantContext ctx,
GenomeVersion genomeVersion,
FileWriter gtWriter,
FileWriter featureEffectsWriter)
throws VarfishAnnotatorException {
Expand Down Expand Up @@ -297,7 +312,7 @@ private void annotateVariantContext(
}

// Write out record with the genotype.
final List<Object> gtOutRec = buildGtRecord(variantId, svGenomeVar, ctx, i);
final List<Object> gtOutRec = buildGtRecord(variantId, svGenomeVar, ctx, genomeVersion, i);
try {
gtWriter.append(Joiner.on("\t").useForNull(".").join(gtOutRec) + "\n");
} catch (IOException e) {
Expand Down Expand Up @@ -383,14 +398,23 @@ private void annotateVariantContext(

/** Buidl string list with the information for the genotype call record. */
private List<Object> buildGtRecord(
UUID variantId, SVGenomeVariant svGenomeVar, VariantContext ctx, int alleleNo) {
UUID variantId,
SVGenomeVariant svGenomeVar,
VariantContext ctx,
GenomeVersion genomeVersion,
int alleleNo) {
final String svMethod = ctx.getCommonInfo().getAttributeAsString("SVMETHOD", null);
final boolean isBnd = (svGenomeVar.getType() == Type.BND);
final int pos2 = isBnd ? svGenomeVar.getPos() + 1 : svGenomeVar.getPos2();

final String contigName =
(genomeVersion == GenomeVersion.HG19)
? svGenomeVar.getChrName().replaceFirst("chr", "")
: svGenomeVar.getChrName();

return ImmutableList.of(
args.getRelease(),
svGenomeVar.getChrName(),
contigName,
svGenomeVar.getChr(),
svGenomeVar.getPos() + 1,
pos2,
Expand All @@ -406,15 +430,21 @@ private List<Object> buildGtRecord(
// TODO: improve type and sub type annotation!
svGenomeVar.getType(),
svGenomeVar.getType(),
buildInfoValue(ctx, svGenomeVar),
buildInfoValue(ctx, genomeVersion, svGenomeVar),
buildGenotypeValue(ctx, alleleNo));
}

private String buildInfoValue(VariantContext ctx, SVGenomeVariant svGenomeVar) {
private String buildInfoValue(
VariantContext ctx, GenomeVersion genomeVersion, SVGenomeVariant svGenomeVar) {
final List<String> mappings = new ArrayList<>();

if (svGenomeVar.getType() == Type.BND) {
mappings.add(tripleQuote("chr2") + ":" + tripleQuote(svGenomeVar.getChr2Name()));
final String contigName2 =
(genomeVersion == GenomeVersion.HG19)
? svGenomeVar.getChr2Name().replaceFirst("chr", "")
: svGenomeVar.getChr2Name();

mappings.add(tripleQuote("chr2") + ":" + tripleQuote(contigName2));
mappings.add(tripleQuote("pos2") + ":" + svGenomeVar.getPos2());
}

Expand Down
2 changes: 1 addition & 1 deletion varfish-annotator-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
<parent>
<groupId>com.github.bihealth</groupId>
<artifactId>VarfishAnnotator</artifactId>
<version>0.14</version>
<version>0.15-SNAPSHOT</version>
</parent>

<properties>
Expand Down

0 comments on commit 69bd7d9

Please sign in to comment.