Skip to content
This repository has been archived by the owner on Jan 24, 2018. It is now read-only.

Commit

Permalink
Merge pull request #880 from jeromekelleher/pysam-0.8.5
Browse files Browse the repository at this point in the history
Updates for pysam 0.8.5.
  • Loading branch information
jeromekelleher committed Mar 31, 2016
2 parents e787eab + d71e5eb commit 849a230
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 80 deletions.
3 changes: 1 addition & 2 deletions ga4gh/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,7 @@ def toTags(cls, read):
tags = []
for tag, value in read.info.items():
val = cls._parseTagValue(tag, value)
tagTuple = (tag, val)
tags.append(tagTuple)
tags.append((tag.encode(cls._encoding), val))
retval = tuple(tags)
return retval

Expand Down
98 changes: 32 additions & 66 deletions ga4gh/datamodel/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -809,6 +809,20 @@ def __init__(self, parentContainer, localId, dataDir,
self._sequenceOntology = backend.getOntology('sequence_ontology')
self._creationTime = datetime.datetime.now().isoformat() + "Z"
self._updatedTime = datetime.datetime.now().isoformat() + "Z"
# Annotations are currently either from VEP or SNPEff. If they are
# not from VEP we assume they are from SNPEff.
# TODO Detect this more rigorously at import time and throw an
# exception if we don't see the formats we're expecting.
self._isVep = "VEP" in self.toProtocolElement().analysis.info
# Parse the annotation creation time out of the VCF header.
# TODO Check this at import time, and raise an exception if the
# time is not in the expected format.
self._annotationCreatedDateTime = self._creationTime
for r in self.getMetadata().records:
# TODO handle more date formats
if r.key == "created":
self._annotationCreatedDateTime = datetime.datetime.strptime(
r.value, "%Y-%m-%d").isoformat() + "Z"

def getVariantAnnotations(self, referenceName, startPosition, endPosition):
"""
Expand All @@ -826,12 +840,12 @@ def getVariantAnnotations(self, referenceName, startPosition, endPosition):
referenceName, startPosition, endPosition)
cursor = self.getFileHandle(varFileName).fetch(
referenceName, startPosition, endPosition)
if "VEP" in self.toProtocolElement().analysis.info:
for record in cursor:
yield self.convertVariantAnnotationVEP(record)
else:
for record in cursor:
yield self.convertVariantAnnotationSnpEff(record)
transcriptConverter = self.convertTranscriptEffectSnpEff
if self._isVep:
transcriptConverter = self.convertTranscriptEffectVEP
for record in cursor:
yield self.convertVariantAnnotation(
record, transcriptConverter)

def convertLocation(self, pos):
"""
Expand Down Expand Up @@ -999,78 +1013,30 @@ def convertSeqOntology(self, seqOntStr):
# TODO We must fill the ontology ID based on the SO name
return soTerms

def convertVariantAnnotationSnpEff(self, record):
def convertVariantAnnotation(self, record, transcriptConverter):
"""
Accepts a HTSLib variant record and returns a GA4GH
annotation object with populated fields.
:param record: HTSLib record
:return: annotation protocol.VariantAnnotation
Converts the specfied pysam variant record into a GA4GH variant
annotation object using the specified function to convert the
transcripts.
"""
variant = self.convertVariant(record, None)
annotation = self._createGaVariantAnnotation()
annotation.start = variant.start
annotation.end = variant.end
for r in self.getMetadata().records:
# TODO handle more date formats
if r.key == "created":
annotation.createDateTime = datetime.datetime.strptime(
r.value, "%Y-%m-%d").isoformat() + "Z"
annotation.variantId = variant.id
# Convert annotations from INFO field into TranscriptEffect
annStr = record.info.get('ANN')
hgvsG = record.info.get('HGVS.g')
transcriptEffects = []
i = 0
if annStr is not None:
for ann in annStr.split(','):
if hgvsG is not None:
splithgvsG = hgvsG.split(',')
# The HGVS.g field contains an element for each alternate
# allele
altshgvsG = splithgvsG[i % len(variant.alternateBases)]
else:
altshgvsG = ""
transcriptEffects.append(
self.convertTranscriptEffectSnpEff(ann, altshgvsG))
i += 1
annotation.transcriptEffects = transcriptEffects
annotation.id = self.getVariantAnnotationId(variant, annotation)
return annotation

def convertVariantAnnotationVEP(self, record):
"""
Accepts a HTSLib variant record from a VEP generated
VCF and returns a GA4GH annotation object with populated fields.
:param record: HTSLib record
:return: annotation protocol.VariantAnnotation
"""
variant = self.convertVariant(record, None)
annotation = self._createGaVariantAnnotation()
annotation.start = variant.start
annotation.end = variant.end
for r in self.getMetadata().records:
# TODO handle more date formats
if r.key == "created":
annotation.createDateTime = datetime.datetime.strptime(
r.value, "%Y-%m-%d").isoformat() + "Z"
annotation.createdDateTime = self._annotationCreatedDateTime
annotation.variantId = variant.id
# Convert annotations from INFO field into TranscriptEffect
annStr = record.info.get('ANN')
hgvsG = record.info.get('HGVS.g')
annotations = record.info.get('ANN'.encode())
hgvsG = record.info.get('HGVS.g'.encode())
transcriptEffects = []
i = 0
if annStr is not None:
for ann in annStr.split(','):
if annotations is not None:
for index, ann in enumerate(annotations):
altshgvsG = ""
if hgvsG is not None:
splithgvsG = hgvsG.split(',')
# The HGVS.g field contains an element for each alternate
# allele
altshgvsG = splithgvsG[i % len(variant.alternateBases)]
else:
altshgvsG = ""
transcriptEffects.append(
self.convertTranscriptEffectVEP(ann, altshgvsG))
i += 1
altshgvsG = hgvsG[index % len(variant.alternateBases)]
transcriptEffects.append(transcriptConverter(ann, altshgvsG))
annotation.transcriptEffects = transcriptEffects
annotation.id = self.getVariantAnnotationId(variant, annotation)
return annotation
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ Flask-Cors==2.0.1
Flask==0.10.1
avro==1.7.7
humanize==0.5.1
pysam==0.8.4
pysam==0.9.0
requests==2.7.0
oic==0.7.6
pyOpenSSL==0.15.1
Expand Down
39 changes: 28 additions & 11 deletions scripts/download_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,11 @@

import argparse
import gzip
import json
import hashlib
import json
import os
import requests
import shutil
import tempfile
import urllib2

Expand Down Expand Up @@ -190,6 +191,7 @@ def __init__(self, args):
self.maxVariants = args.num_variants
self.maxReads = args.num_reads
self.samples = args.samples.split(',')
self.tempDir = tempfile.mkdtemp(prefix="ga4gh-download")
self.numChromosomes = args.num_chromosomes
self.chromosomes = [str(j + 1) for j in range(self.numChromosomes)]
self.dirName = args.dir_name
Expand Down Expand Up @@ -327,6 +329,13 @@ def createBamHeader(self, baseHeader):
header['SQ'] = newSequences
return header

def _downloadIndex(self, indexUrl, localIndexFile):
utils.log("Downloading index from {} to {}".format(
indexUrl, localIndexFile))
response = urllib2.urlopen(indexUrl)
with open(localIndexFile, "w") as destFile:
destFile.write(response.read())

def _downloadBam(self, sample):
samplePath = '{}/alignment/'.format(sample)
study = self.studyMap[sample]
Expand All @@ -336,8 +345,11 @@ def _downloadBam(self, sample):
destFileName = "{}.bam".format(sample)
baseUrl = self.getBamBaseUrl()
sampleUrl = os.path.join(baseUrl, samplePath, sourceFileName)
utils.log("Downloading index for '{}'".format(sampleUrl))
remoteFile = pysam.AlignmentFile(sampleUrl)
indexUrl = sampleUrl + ".bai"
localIndexFile = os.path.join(self.tempDir, sourceFileName + ".bai")
self._downloadIndex(indexUrl, localIndexFile)
remoteFile = pysam.AlignmentFile(
sampleUrl, filepath_index=localIndexFile)
header = self.createBamHeader(remoteFile.header)
utils.log("Writing '{}'".format(destFileName))
localFile = pysam.AlignmentFile(
Expand All @@ -360,8 +372,6 @@ def _downloadBam(self, sample):
utils.log("{} records written".format(index))
remoteFile.close()
localFile.close()
baiFileName = sourceFileName + '.bai'
os.remove(baiFileName)
utils.log("Indexing '{}'".format(destFileName))
pysam.index(destFileName.encode('utf-8'))

Expand Down Expand Up @@ -432,13 +442,17 @@ def downloadFastas(self):
createCheckpointFile(chromosome)
escapeDir(3)

def cleanup(self):
def removeCheckpoints(self):
utils.log('Removing checkpoint files')
for root, dirs, files in os.walk(self.dirName):
for currentFile in files:
if currentFile.endswith('.checkpoint'):
os.remove(os.path.join(root, currentFile))

def cleanup(self):
utils.log('Removing temporary files')
shutil.rmtree(self.tempDir)


class NcbiFileDownloader(AbstractFileDownloader):
"""
Expand Down Expand Up @@ -500,11 +514,14 @@ def parseArgs():
@utils.Timed()
def main(args):
downloaderClass = sources[args.source]
downloader = downloaderClass(args)
downloader.downloadVcfs()
downloader.downloadBams()
downloader.downloadFastas()
downloader.cleanup()
try:
downloader = downloaderClass(args)
downloader.downloadVcfs()
downloader.downloadBams()
downloader.downloadFastas()
downloader.removeCheckpoints()
finally:
downloader.cleanup()


if __name__ == '__main__':
Expand Down

0 comments on commit 849a230

Please sign in to comment.