Skip to content
This repository was archived by the owner on Jan 24, 2018. It is now read-only.

Commit b013734

Browse files
authored
Merge pull request #1441 from saupchurch/rna-etl-perf
performance fixes for rna import
2 parents 0598dce + c67995b commit b013734

File tree

1 file changed

+14
-7
lines changed

1 file changed

+14
-7
lines changed

ga4gh/repo/rnaseq2ga.py

Lines changed: 14 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ class RnaSqliteStore(object):
2020
def __init__(self, sqliteFileName):
2121
self._dbConn = sqlite3.connect(sqliteFileName)
2222
self._cursor = self._dbConn.cursor()
23-
self._batchSize = 100
23+
self._batchSize = 2000
2424
self._rnaValueList = []
2525
self._expressionValueList = []
2626

@@ -109,16 +109,20 @@ def setUnits(self, units):
109109
elif units == "tpm":
110110
self._units = 2
111111

112-
def writeExpression(self, rnaQuantificationId, quantfilename):
112+
def writeExpression(self, rnaQuantificationId, quantfilename,
113+
featureSetNames=None):
113114
"""
114115
Reads the quantification results file and adds entries to the
115116
specified database.
116117
"""
117118
isNormalized = self._isNormalized
118119
units = self._units
119120
featureSets = None
120-
if self._dataset:
121-
featureSets = self._dataset.getFeatureSets()
121+
if self._dataset and featureSetNames:
122+
featureSets = []
123+
for annotationName in featureSetNames.split(","):
124+
featureSets.append(
125+
self._dataset.getFeatureSetByName(annotationName))
122126
with open(quantfilename, "r") as quantFile:
123127
quantificationReader = csv.DictReader(quantFile, delimiter=b"\t")
124128
for expression in quantificationReader:
@@ -246,9 +250,10 @@ def writeRnaseqTable(rnaDB, analysisIds, description, annotationId,
246250
rnaDB.batchaddRNAQuantification()
247251

248252

249-
def writeExpressionTable(writer, data):
253+
def writeExpressionTable(writer, data, featureSetNames=None):
250254
for rnaQuantId, quantfilename in data:
251-
writer.writeExpression(rnaQuantId, quantfilename)
255+
writer.writeExpression(
256+
rnaQuantId, quantfilename, featureSetNames=featureSetNames)
252257

253258

254259
def rnaseq2ga(quantificationFilename, sqlFilename, localName, rnaType,
@@ -291,4 +296,6 @@ def rnaseq2ga(quantificationFilename, sqlFilename, localName, rnaType,
291296
featureSetIds,
292297
readGroupId=readGroupIds, programs=programs,
293298
bioSampleId=bioSampleId)
294-
writeExpressionTable(writer, [(localName, quantificationFilename)])
299+
writeExpressionTable(
300+
writer, [(localName, quantificationFilename)],
301+
featureSetNames=featureSetNames)

0 commit comments

Comments
 (0)