From d71e5eb3de46570ab2b9a4d860582d74a515e229 Mon Sep 17 00:00:00 2001
From: Jerome Kelleher <jk@well.ox.ac.uk>
Date: Tue, 29 Mar 2016 17:35:14 +0100
Subject: [PATCH] Updates for pysam 0.9.0 + minor refactor.

- Mostly minor encoding issues, as pysam is more strict about expecting
  bytes as input.

For variant annotations:

- Minor change for pysam API to info fields.
- Took time conversion out of tight inner loop
- Factored out duplicated code in SnpEff and VEP code paths.

Conflicts:
	requirements.txt
	setup.py
---
 ga4gh/converters.py         |  3 +-
 ga4gh/datamodel/variants.py | 98 ++++++++++++-------------------------
 requirements.txt            |  2 +-
 scripts/download_data.py    | 39 ++++++++++-----
 4 files changed, 62 insertions(+), 80 deletions(-)

diff --git a/ga4gh/converters.py b/ga4gh/converters.py
index f64b0696e..f11d1d1c0 100644
--- a/ga4gh/converters.py
+++ b/ga4gh/converters.py
@@ -241,8 +241,7 @@ def toTags(cls, read):
         tags = []
         for tag, value in read.info.items():
             val = cls._parseTagValue(tag, value)
-            tagTuple = (tag, val)
-            tags.append(tagTuple)
+            tags.append((tag.encode(cls._encoding), val))
         retval = tuple(tags)
         return retval
 
diff --git a/ga4gh/datamodel/variants.py b/ga4gh/datamodel/variants.py
index c5e6f0bcb..e2014eeb1 100644
--- a/ga4gh/datamodel/variants.py
+++ b/ga4gh/datamodel/variants.py
@@ -809,6 +809,20 @@ def __init__(self, parentContainer, localId, dataDir,
         self._sequenceOntology = backend.getOntology('sequence_ontology')
         self._creationTime = datetime.datetime.now().isoformat() + "Z"
         self._updatedTime = datetime.datetime.now().isoformat() + "Z"
+        # Annotations are currently either from VEP or SNPEff. If they are
+        # not from VEP we assume they are from SNPEff.
+        # TODO Detect this more rigorously at import time and throw an
+        # exception if we don't see the formats we're expecting.
+        self._isVep = "VEP" in self.toProtocolElement().analysis.info
+        # Parse the annotation creation time out of the VCF header.
+        # TODO Check this at import time, and raise an exception if the
+        # time is not in the expected format.
+        self._annotationCreatedDateTime = self._creationTime
+        for r in self.getMetadata().records:
+            # TODO handle more date formats
+            if r.key == "created":
+                self._annotationCreatedDateTime = datetime.datetime.strptime(
+                    r.value, "%Y-%m-%d").isoformat() + "Z"
 
     def getVariantAnnotations(self, referenceName, startPosition, endPosition):
         """
@@ -826,12 +840,12 @@ def getVariantAnnotations(self, referenceName, startPosition, endPosition):
                     referenceName, startPosition, endPosition)
             cursor = self.getFileHandle(varFileName).fetch(
                 referenceName, startPosition, endPosition)
-            if "VEP" in self.toProtocolElement().analysis.info:
-                for record in cursor:
-                    yield self.convertVariantAnnotationVEP(record)
-            else:
-                for record in cursor:
-                    yield self.convertVariantAnnotationSnpEff(record)
+            transcriptConverter = self.convertTranscriptEffectSnpEff
+            if self._isVep:
+                transcriptConverter = self.convertTranscriptEffectVEP
+            for record in cursor:
+                yield self.convertVariantAnnotation(
+                    record, transcriptConverter)
 
     def convertLocation(self, pos):
         """
@@ -999,78 +1013,30 @@ def convertSeqOntology(self, seqOntStr):
             # TODO We must fill the ontology ID based on the SO name
         return soTerms
 
-    def convertVariantAnnotationSnpEff(self, record):
+    def convertVariantAnnotation(self, record, transcriptConverter):
         """
-        Accepts a HTSLib variant record and returns a GA4GH
-        annotation object with populated fields.
-        :param record: HTSLib record
-        :return: annotation protocol.VariantAnnotation
+        Converts the specfied pysam variant record into a GA4GH variant
+        annotation object using the specified function to convert the
+        transcripts.
         """
         variant = self.convertVariant(record, None)
         annotation = self._createGaVariantAnnotation()
         annotation.start = variant.start
         annotation.end = variant.end
-        for r in self.getMetadata().records:
-            # TODO handle more date formats
-            if r.key == "created":
-                annotation.createDateTime = datetime.datetime.strptime(
-                    r.value, "%Y-%m-%d").isoformat() + "Z"
-        annotation.variantId = variant.id
-        # Convert annotations from INFO field into TranscriptEffect
-        annStr = record.info.get('ANN')
-        hgvsG = record.info.get('HGVS.g')
-        transcriptEffects = []
-        i = 0
-        if annStr is not None:
-            for ann in annStr.split(','):
-                if hgvsG is not None:
-                    splithgvsG = hgvsG.split(',')
-                    # The HGVS.g field contains an element for each alternate
-                    # allele
-                    altshgvsG = splithgvsG[i % len(variant.alternateBases)]
-                else:
-                    altshgvsG = ""
-                transcriptEffects.append(
-                    self.convertTranscriptEffectSnpEff(ann, altshgvsG))
-                i += 1
-        annotation.transcriptEffects = transcriptEffects
-        annotation.id = self.getVariantAnnotationId(variant, annotation)
-        return annotation
-
-    def convertVariantAnnotationVEP(self, record):
-        """
-        Accepts a HTSLib variant record from a VEP generated
-        VCF and returns a GA4GH annotation object with populated fields.
-        :param record: HTSLib record
-        :return: annotation protocol.VariantAnnotation
-        """
-        variant = self.convertVariant(record, None)
-        annotation = self._createGaVariantAnnotation()
-        annotation.start = variant.start
-        annotation.end = variant.end
-        for r in self.getMetadata().records:
-            # TODO handle more date formats
-            if r.key == "created":
-                annotation.createDateTime = datetime.datetime.strptime(
-                    r.value, "%Y-%m-%d").isoformat() + "Z"
+        annotation.createdDateTime = self._annotationCreatedDateTime
         annotation.variantId = variant.id
         # Convert annotations from INFO field into TranscriptEffect
-        annStr = record.info.get('ANN')
-        hgvsG = record.info.get('HGVS.g')
+        annotations = record.info.get('ANN'.encode())
+        hgvsG = record.info.get('HGVS.g'.encode())
         transcriptEffects = []
-        i = 0
-        if annStr is not None:
-            for ann in annStr.split(','):
+        if annotations is not None:
+            for index, ann in enumerate(annotations):
+                altshgvsG = ""
                 if hgvsG is not None:
-                    splithgvsG = hgvsG.split(',')
                     # The HGVS.g field contains an element for each alternate
                     # allele
-                    altshgvsG = splithgvsG[i % len(variant.alternateBases)]
-                else:
-                    altshgvsG = ""
-                transcriptEffects.append(
-                    self.convertTranscriptEffectVEP(ann, altshgvsG))
-                i += 1
+                    altshgvsG = hgvsG[index % len(variant.alternateBases)]
+                transcriptEffects.append(transcriptConverter(ann, altshgvsG))
         annotation.transcriptEffects = transcriptEffects
         annotation.id = self.getVariantAnnotationId(variant, annotation)
         return annotation
diff --git a/requirements.txt b/requirements.txt
index 05e887eaa..7a2262a10 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -38,7 +38,7 @@ Flask-Cors==2.0.1
 Flask==0.10.1
 avro==1.7.7
 humanize==0.5.1
-pysam==0.8.4
+pysam==0.9.0
 requests==2.7.0
 oic==0.7.6
 pyOpenSSL==0.15.1
diff --git a/scripts/download_data.py b/scripts/download_data.py
index fffae1787..59dd433c5 100644
--- a/scripts/download_data.py
+++ b/scripts/download_data.py
@@ -23,10 +23,11 @@
 
 import argparse
 import gzip
-import json
 import hashlib
+import json
 import os
 import requests
+import shutil
 import tempfile
 import urllib2
 
@@ -190,6 +191,7 @@ def __init__(self, args):
         self.maxVariants = args.num_variants
         self.maxReads = args.num_reads
         self.samples = args.samples.split(',')
+        self.tempDir = tempfile.mkdtemp(prefix="ga4gh-download")
         self.numChromosomes = args.num_chromosomes
         self.chromosomes = [str(j + 1) for j in range(self.numChromosomes)]
         self.dirName = args.dir_name
@@ -327,6 +329,13 @@ def createBamHeader(self, baseHeader):
         header['SQ'] = newSequences
         return header
 
+    def _downloadIndex(self, indexUrl, localIndexFile):
+        utils.log("Downloading index from {} to {}".format(
+            indexUrl, localIndexFile))
+        response = urllib2.urlopen(indexUrl)
+        with open(localIndexFile, "w") as destFile:
+            destFile.write(response.read())
+
     def _downloadBam(self, sample):
         samplePath = '{}/alignment/'.format(sample)
         study = self.studyMap[sample]
@@ -336,8 +345,11 @@ def _downloadBam(self, sample):
         destFileName = "{}.bam".format(sample)
         baseUrl = self.getBamBaseUrl()
         sampleUrl = os.path.join(baseUrl, samplePath, sourceFileName)
-        utils.log("Downloading index for '{}'".format(sampleUrl))
-        remoteFile = pysam.AlignmentFile(sampleUrl)
+        indexUrl = sampleUrl + ".bai"
+        localIndexFile = os.path.join(self.tempDir, sourceFileName + ".bai")
+        self._downloadIndex(indexUrl, localIndexFile)
+        remoteFile = pysam.AlignmentFile(
+            sampleUrl, filepath_index=localIndexFile)
         header = self.createBamHeader(remoteFile.header)
         utils.log("Writing '{}'".format(destFileName))
         localFile = pysam.AlignmentFile(
@@ -360,8 +372,6 @@ def _downloadBam(self, sample):
             utils.log("{} records written".format(index))
         remoteFile.close()
         localFile.close()
-        baiFileName = sourceFileName + '.bai'
-        os.remove(baiFileName)
         utils.log("Indexing '{}'".format(destFileName))
         pysam.index(destFileName.encode('utf-8'))
 
@@ -432,13 +442,17 @@ def downloadFastas(self):
                 createCheckpointFile(chromosome)
         escapeDir(3)
 
-    def cleanup(self):
+    def removeCheckpoints(self):
         utils.log('Removing checkpoint files')
         for root, dirs, files in os.walk(self.dirName):
             for currentFile in files:
                 if currentFile.endswith('.checkpoint'):
                     os.remove(os.path.join(root, currentFile))
 
+    def cleanup(self):
+        utils.log('Removing temporary files')
+        shutil.rmtree(self.tempDir)
+
 
 class NcbiFileDownloader(AbstractFileDownloader):
     """
@@ -500,11 +514,14 @@ def parseArgs():
 @utils.Timed()
 def main(args):
     downloaderClass = sources[args.source]
-    downloader = downloaderClass(args)
-    downloader.downloadVcfs()
-    downloader.downloadBams()
-    downloader.downloadFastas()
-    downloader.cleanup()
+    try:
+        downloader = downloaderClass(args)
+        downloader.downloadVcfs()
+        downloader.downloadBams()
+        downloader.downloadFastas()
+        downloader.removeCheckpoints()
+    finally:
+        downloader.cleanup()
 
 
 if __name__ == '__main__':