From d71e5eb3de46570ab2b9a4d860582d74a515e229 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 29 Mar 2016 17:35:14 +0100 Subject: [PATCH] Updates for pysam 0.9.0 + minor refactor. - Mostly minor encoding issues, as pysam is more strict about expecting bytes as input. For variant annotations: - Minor change for pysam API to info fields. - Took time conversion out of tight inner loop - Factored out duplicated code in SnpEff and VEP code paths. Conflicts: requirements.txt setup.py --- ga4gh/converters.py | 3 +- ga4gh/datamodel/variants.py | 98 ++++++++++++------------------------- requirements.txt | 2 +- scripts/download_data.py | 39 ++++++++++----- 4 files changed, 62 insertions(+), 80 deletions(-) diff --git a/ga4gh/converters.py b/ga4gh/converters.py index f64b0696e..f11d1d1c0 100644 --- a/ga4gh/converters.py +++ b/ga4gh/converters.py @@ -241,8 +241,7 @@ def toTags(cls, read): tags = [] for tag, value in read.info.items(): val = cls._parseTagValue(tag, value) - tagTuple = (tag, val) - tags.append(tagTuple) + tags.append((tag.encode(cls._encoding), val)) retval = tuple(tags) return retval diff --git a/ga4gh/datamodel/variants.py b/ga4gh/datamodel/variants.py index c5e6f0bcb..e2014eeb1 100644 --- a/ga4gh/datamodel/variants.py +++ b/ga4gh/datamodel/variants.py @@ -809,6 +809,20 @@ def __init__(self, parentContainer, localId, dataDir, self._sequenceOntology = backend.getOntology('sequence_ontology') self._creationTime = datetime.datetime.now().isoformat() + "Z" self._updatedTime = datetime.datetime.now().isoformat() + "Z" + # Annotations are currently either from VEP or SNPEff. If they are + # not from VEP we assume they are from SNPEff. + # TODO Detect this more rigorously at import time and throw an + # exception if we don't see the formats we're expecting. + self._isVep = "VEP" in self.toProtocolElement().analysis.info + # Parse the annotation creation time out of the VCF header. + # TODO Check this at import time, and raise an exception if the + # time is not in the expected format. + self._annotationCreatedDateTime = self._creationTime + for r in self.getMetadata().records: + # TODO handle more date formats + if r.key == "created": + self._annotationCreatedDateTime = datetime.datetime.strptime( + r.value, "%Y-%m-%d").isoformat() + "Z" def getVariantAnnotations(self, referenceName, startPosition, endPosition): """ @@ -826,12 +840,12 @@ def getVariantAnnotations(self, referenceName, startPosition, endPosition): referenceName, startPosition, endPosition) cursor = self.getFileHandle(varFileName).fetch( referenceName, startPosition, endPosition) - if "VEP" in self.toProtocolElement().analysis.info: - for record in cursor: - yield self.convertVariantAnnotationVEP(record) - else: - for record in cursor: - yield self.convertVariantAnnotationSnpEff(record) + transcriptConverter = self.convertTranscriptEffectSnpEff + if self._isVep: + transcriptConverter = self.convertTranscriptEffectVEP + for record in cursor: + yield self.convertVariantAnnotation( + record, transcriptConverter) def convertLocation(self, pos): """ @@ -999,78 +1013,30 @@ def convertSeqOntology(self, seqOntStr): # TODO We must fill the ontology ID based on the SO name return soTerms - def convertVariantAnnotationSnpEff(self, record): + def convertVariantAnnotation(self, record, transcriptConverter): """ - Accepts a HTSLib variant record and returns a GA4GH - annotation object with populated fields. - :param record: HTSLib record - :return: annotation protocol.VariantAnnotation + Converts the specfied pysam variant record into a GA4GH variant + annotation object using the specified function to convert the + transcripts. """ variant = self.convertVariant(record, None) annotation = self._createGaVariantAnnotation() annotation.start = variant.start annotation.end = variant.end - for r in self.getMetadata().records: - # TODO handle more date formats - if r.key == "created": - annotation.createDateTime = datetime.datetime.strptime( - r.value, "%Y-%m-%d").isoformat() + "Z" - annotation.variantId = variant.id - # Convert annotations from INFO field into TranscriptEffect - annStr = record.info.get('ANN') - hgvsG = record.info.get('HGVS.g') - transcriptEffects = [] - i = 0 - if annStr is not None: - for ann in annStr.split(','): - if hgvsG is not None: - splithgvsG = hgvsG.split(',') - # The HGVS.g field contains an element for each alternate - # allele - altshgvsG = splithgvsG[i % len(variant.alternateBases)] - else: - altshgvsG = "" - transcriptEffects.append( - self.convertTranscriptEffectSnpEff(ann, altshgvsG)) - i += 1 - annotation.transcriptEffects = transcriptEffects - annotation.id = self.getVariantAnnotationId(variant, annotation) - return annotation - - def convertVariantAnnotationVEP(self, record): - """ - Accepts a HTSLib variant record from a VEP generated - VCF and returns a GA4GH annotation object with populated fields. - :param record: HTSLib record - :return: annotation protocol.VariantAnnotation - """ - variant = self.convertVariant(record, None) - annotation = self._createGaVariantAnnotation() - annotation.start = variant.start - annotation.end = variant.end - for r in self.getMetadata().records: - # TODO handle more date formats - if r.key == "created": - annotation.createDateTime = datetime.datetime.strptime( - r.value, "%Y-%m-%d").isoformat() + "Z" + annotation.createdDateTime = self._annotationCreatedDateTime annotation.variantId = variant.id # Convert annotations from INFO field into TranscriptEffect - annStr = record.info.get('ANN') - hgvsG = record.info.get('HGVS.g') + annotations = record.info.get('ANN'.encode()) + hgvsG = record.info.get('HGVS.g'.encode()) transcriptEffects = [] - i = 0 - if annStr is not None: - for ann in annStr.split(','): + if annotations is not None: + for index, ann in enumerate(annotations): + altshgvsG = "" if hgvsG is not None: - splithgvsG = hgvsG.split(',') # The HGVS.g field contains an element for each alternate # allele - altshgvsG = splithgvsG[i % len(variant.alternateBases)] - else: - altshgvsG = "" - transcriptEffects.append( - self.convertTranscriptEffectVEP(ann, altshgvsG)) - i += 1 + altshgvsG = hgvsG[index % len(variant.alternateBases)] + transcriptEffects.append(transcriptConverter(ann, altshgvsG)) annotation.transcriptEffects = transcriptEffects annotation.id = self.getVariantAnnotationId(variant, annotation) return annotation diff --git a/requirements.txt b/requirements.txt index 05e887eaa..7a2262a10 100644 --- a/requirements.txt +++ b/requirements.txt @@ -38,7 +38,7 @@ Flask-Cors==2.0.1 Flask==0.10.1 avro==1.7.7 humanize==0.5.1 -pysam==0.8.4 +pysam==0.9.0 requests==2.7.0 oic==0.7.6 pyOpenSSL==0.15.1 diff --git a/scripts/download_data.py b/scripts/download_data.py index fffae1787..59dd433c5 100644 --- a/scripts/download_data.py +++ b/scripts/download_data.py @@ -23,10 +23,11 @@ import argparse import gzip -import json import hashlib +import json import os import requests +import shutil import tempfile import urllib2 @@ -190,6 +191,7 @@ def __init__(self, args): self.maxVariants = args.num_variants self.maxReads = args.num_reads self.samples = args.samples.split(',') + self.tempDir = tempfile.mkdtemp(prefix="ga4gh-download") self.numChromosomes = args.num_chromosomes self.chromosomes = [str(j + 1) for j in range(self.numChromosomes)] self.dirName = args.dir_name @@ -327,6 +329,13 @@ def createBamHeader(self, baseHeader): header['SQ'] = newSequences return header + def _downloadIndex(self, indexUrl, localIndexFile): + utils.log("Downloading index from {} to {}".format( + indexUrl, localIndexFile)) + response = urllib2.urlopen(indexUrl) + with open(localIndexFile, "w") as destFile: + destFile.write(response.read()) + def _downloadBam(self, sample): samplePath = '{}/alignment/'.format(sample) study = self.studyMap[sample] @@ -336,8 +345,11 @@ def _downloadBam(self, sample): destFileName = "{}.bam".format(sample) baseUrl = self.getBamBaseUrl() sampleUrl = os.path.join(baseUrl, samplePath, sourceFileName) - utils.log("Downloading index for '{}'".format(sampleUrl)) - remoteFile = pysam.AlignmentFile(sampleUrl) + indexUrl = sampleUrl + ".bai" + localIndexFile = os.path.join(self.tempDir, sourceFileName + ".bai") + self._downloadIndex(indexUrl, localIndexFile) + remoteFile = pysam.AlignmentFile( + sampleUrl, filepath_index=localIndexFile) header = self.createBamHeader(remoteFile.header) utils.log("Writing '{}'".format(destFileName)) localFile = pysam.AlignmentFile( @@ -360,8 +372,6 @@ def _downloadBam(self, sample): utils.log("{} records written".format(index)) remoteFile.close() localFile.close() - baiFileName = sourceFileName + '.bai' - os.remove(baiFileName) utils.log("Indexing '{}'".format(destFileName)) pysam.index(destFileName.encode('utf-8')) @@ -432,13 +442,17 @@ def downloadFastas(self): createCheckpointFile(chromosome) escapeDir(3) - def cleanup(self): + def removeCheckpoints(self): utils.log('Removing checkpoint files') for root, dirs, files in os.walk(self.dirName): for currentFile in files: if currentFile.endswith('.checkpoint'): os.remove(os.path.join(root, currentFile)) + def cleanup(self): + utils.log('Removing temporary files') + shutil.rmtree(self.tempDir) + class NcbiFileDownloader(AbstractFileDownloader): """ @@ -500,11 +514,14 @@ def parseArgs(): @utils.Timed() def main(args): downloaderClass = sources[args.source] - downloader = downloaderClass(args) - downloader.downloadVcfs() - downloader.downloadBams() - downloader.downloadFastas() - downloader.cleanup() + try: + downloader = downloaderClass(args) + downloader.downloadVcfs() + downloader.downloadBams() + downloader.downloadFastas() + downloader.removeCheckpoints() + finally: + downloader.cleanup() if __name__ == '__main__':