Skip to content

Commit

Permalink
Merge pull request #292 from ambrosejcarr/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
ambrosejcarr authored Dec 23, 2016
2 parents c4093c7 + e4025c7 commit b1a7f6a
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 81 deletions.
7 changes: 5 additions & 2 deletions src/seqc/ec2.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,9 @@ def pickle_function(function: object, args, kwargs) -> str:
:param dict kwargs: keyword arguments for the function
:return str: filename of the pickled function
"""
filename = '{}{}.p'.format(os.environ['TMPDIR'], function.__name__)
filename = '{}{!s}_{}.p'.format(
os.environ['TMPDIR'], random.randint(0, 1e9), function.__name__)

with open(filename, 'wb') as f:
dill.dump(dict(function=function, args=args, kwargs=kwargs), f)
return filename
Expand Down Expand Up @@ -467,7 +469,8 @@ def write_script(cls, function) -> str:
:param object function: function to be called
:return str: filename of the python script
"""
script_name = '{}{}.py'.format(os.environ['TMPDIR'], function.__name__)
script_name = '{}{!s}_{}.py'.format(
os.environ['TMPDIR'], random.randint(0, 1e9), function.__name__)
script_body = (
'{imports}'
'with open("func.p", "rb") as fin:\n'
Expand Down
6 changes: 3 additions & 3 deletions src/seqc/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,8 @@ def download_awscli(cls, link, prefix='./', overwrite=True, recursive=False):
out, err = d.communicate()
if err:
raise ChildProcessError(err.decode())
downloaded_files = [line.strip().split()[-1] for line in
out.decode().strip().split('\n') if line.strip()]
downloaded_files = sorted([line.strip().split()[-1] for line in
out.decode().strip().split('\n') if line.strip()])
if log:
log.notify('downloaded files:\n\t[%s]' % ',\n\t'.join(
downloaded_files))
Expand Down Expand Up @@ -719,7 +719,7 @@ def download(
genomic_fastq = [f for f in filenames if '_R1_' in f]
barcode_fastq = [f for f in filenames if '_R2_' in f]

return barcode_fastq, genomic_fastq
return sorted(barcode_fastq), sorted(genomic_fastq)


class ProcessManager:
Expand Down
8 changes: 3 additions & 5 deletions src/seqc/sequence/gtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,9 +387,8 @@ def iter_transcripts(self):
if record[2] == 'exon':
exons.append(record)
elif record[2] == 'transcript':
# we want exons in inverse order, - is already in inverse order.
if transcript_strand == '+':
exons = exons[::-1]
# we want exons in inverse order
exons = exons[::-1]
yield (
(transcript_chromosome, transcript_strand, transcript_gene_id), exons)
exons = []
Expand All @@ -398,8 +397,7 @@ def iter_transcripts(self):
transcript_gene_id = self.strip_gene_num(record[8])

# yield the final transcript
if transcript_strand == '+':
exons = exons[::-1]
exons = exons[::-1]
yield (transcript_chromosome, transcript_strand, transcript_gene_id), exons


Expand Down
48 changes: 32 additions & 16 deletions src/seqc/summary/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,27 +93,33 @@ def from_status_filters(cls, ra, filename):
description = (
'Initial filters are run over the sam file while our ReadArray database is '
'being constructed. These filters indicate heuristic reasons why reads '
'should be omitted from downstream operations:\n'
'no gene: Regardless of the read\'s genomic alignment status, there was no '
'transcriptomic alignment for this read.\n'
'gene not unique: this indicates that more than one alignment was recovered '
'for this read. We attempt to resolve these multi-alignments downstream. '
'primer missing: This is an in-drop specific filter, it indices that the '
'should be omitted from downstream operations:<br><br>'
'<b>no gene</b>: Regardless of the read\'s genomic alignment status, there was no '
'transcriptomic alignment for this read.<br>'
'<b>gene not unique</b>: this indicates that more than one alignment was recovered '
'for this read. We attempt to resolve these multi-alignments downstream. <br>'
'<b>primer missing</b>: This is an in-drop specific filter, it indices that the '
'spacer sequence could not be identified, and thus neither a cell barcode '
'nor an rmt were recorded for this read.\n'
'low poly t: the primer did not display enough t-sequence in the primer '
'nor an rmt were recorded for this read.<br>'
'<b>low poly t</b>: the primer did not display enough t-sequence in the primer '
'tail, where these nucleotides are expected. This indicates an increased '
'probability that this primer randomly primed, instead of hybridizing with '
'the poly-a tail of an mRNA molecule.')
description_section = TextContent(description)

keys = ('length of read array', 'no gene', 'gene not unique', 'primer missing', 'low poly t')
# Get counts
no_gene = np.sum(ra.data['status'] & ra.filter_codes['no_gene'] > 0)
gene_not_unique = np.sum(ra.data['status'] & ra.filter_codes['gene_not_unique'] > 0)
primer_missing = np.sum(ra.data['status'] & ra.filter_codes['primer_missing'] > 0)
low_polyt = np.sum(ra.data['status'] & ra.filter_codes['low_polyt'] > 0)

keys = ('length of read array', 'reads with genomic alignments alone', 'reads mapping to multiple genes', 'primer missing', 'low poly t')
values = (
len(ra.data),
np.sum(ra.data['status'] & ra.filter_codes['no_gene'] > 0),
np.sum(ra.data['status'] & ra.filter_codes['gene_not_unique'] > 0),
np.sum(ra.data['status'] & ra.filter_codes['primer_missing'] > 0),
np.sum(ra.data['status'] & ra.filter_codes['low_polyt'] > 0)
'%d (%.2f%%)' % (no_gene, no_gene / len(ra.data) * 100),
'%d (%.2f%%)' % (gene_not_unique, gene_not_unique / len(ra.data) * 100),
'%d (%.2f%%)' % (primer_missing, primer_missing / len(ra.data) * 100),
'%d (%.2f%%)' % (low_polyt, low_polyt / len(ra.data) * 100),
)
data_section = DataContent(keys, values)
return cls(
Expand All @@ -134,9 +140,10 @@ def from_cell_barcode_correction(cls, ra, filename):
"""
description = 'description for cell barcode correction' # todo implement
description_section = TextContent(description)
count = np.sum(ra.data['status'] & ra.filter_codes['cell_error'] > 0)
data_section = DataContent(
['cell error'],
[np.sum(ra.data['status'] & ra.filter_codes['cell_error'] > 0)])
['%d (%.2f%%)' % (count, count / len(ra.data) * 100)])
return cls(
'Cell Barcode Correction',
{'Description': description_section, 'Results': data_section},
Expand All @@ -155,9 +162,10 @@ def from_rmt_correction(cls, ra, filename):

description = 'description for rmt correction' # todo implement
description_section = TextContent(description)
count = np.sum(ra.data['status'] & ra.filter_codes['rmt_error'])
data_section = DataContent(
['rmt error'],
[np.sum(ra.data['status'] & ra.filter_codes['rmt_error'])])
['%d (%.2f%%)' % (count, count / len(ra.data) * 100)])
return cls(
'RMT Barcode Correction',
{'Description': description_section, 'Results': data_section},
Expand Down Expand Up @@ -213,7 +221,15 @@ def from_final_matrix(cls, counts_matrix, figure_path, filename):
:return:
"""
plot.Diagnostics.cell_size_histogram(counts_matrix, save=figure_path)
image_legend = 'histogram legend'

# Number of cells and molecule count distributions
image_legend = "Number of cells: {} <br>".format(counts_matrix.shape[0])
ms = counts_matrix.sum(axis=1)
image_legend += "Min number of molecules: {}<br>".format(ms.min())
for prctile in [25, 50, 75]:
image_legend += '{}th percentile: {}<br>'.format(prctile, np.percentile(ms, prctile))
image_legend += "Max number of molecules: {}<br>".format(ms.max())

image_section = ImageContent(figure_path, 'cell size figure', image_legend)
return cls('Cell Summary',
{'Library Size Distribution': image_section},
Expand Down
92 changes: 37 additions & 55 deletions src/seqc/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,26 @@ def makedirs(output):

class TestSEQC(unittest.TestCase):

email = globals()['EMAIL'] if 'EMAIL' in globals() else None
bucket = globals()['TEST_BUCKET'] if 'TEST_BUCKET' in globals() else None
rsa_key = globals()['RSA_KEY'] if 'RSA_KEY' in globals() else None
if rsa_key is None:
try:
rsa_key = os.environ['AWS_RSA_KEY']
except KeyError:
pass

def check_parameters(self):
if self.email is None:
self.email = input('please provide an email address for SEQC to mail '
'results: ')
if self.bucket is None:
self.bucket = input('please provide an amazon s3 bucket to upload test '
'results: ')
if self.rsa_key is None:
self.rsa_key = input('please provide an RSA key with permission to create '
'aws instances: ')

# @params('in_drop', 'in_drop_v2', 'drop_seq', 'ten_x', 'mars_seq')
def test_local(self, platform='in_drop_v2'):
"""test seqc after pre-downloading all files"""
Expand All @@ -49,10 +69,8 @@ def test_local(self, platform='in_drop_v2'):
'pipeline errors.\n')
test_name = 'test_no_aws_%s' % platform
makedirs(LOCAL_OUTPUT % test_name)
if 'EMAIL' in globals():
email = globals()['EMAIL']
else:
email = input('please provide an email address for SEQC to mail results: ')
if self.email is None:
self.email = input('please provide an email address for SEQC to mail results: ')
argv = [
'run',
platform,
Expand All @@ -61,68 +79,44 @@ def test_local(self, platform='in_drop_v2'):
'-b', BARCODE_FASTQ % platform,
'-g', GENOMIC_FASTQ % platform,
'--barcode-files', PLATFORM_BARCODES % platform,
'-e', email,
'-e', self.email,
'--local']
main.main(argv)
os.remove('./seqc_log.txt') # clean up the dummy log we made.

# @params('in_drop', 'in_drop_v2', 'drop_seq', 'ten_x', 'mars_seq')
def test_remote_from_raw_fastq(self, platform='in_drop_v2'):
test_name = 'test_remote_%s' % platform
if 'EMAIL' in globals():
email = globals()['EMAIL']
else:
email = input('please provide an email address for SEQC to mail results: ')
if 'TEST_BUCKET' in globals():
bucket = globals()['TEST_BUCKET']
else:
bucket = input('please provide an amazon s3 bucket to upload test results: ')
if 'RSA_KEY' in globals():
rsa_key = globals()['RSA_KEY']
else:
rsa_key = input('please provide an RSA key with permission to create aws '
'instances: ')
self.check_parameters()
argv = [
'run',
platform,
'-o', REMOTE_OUTPUT,
'-u', UPLOAD % (bucket, test_name),
'-u', UPLOAD % (self.bucket, test_name),
'-i', INDEX,
'-e', email,
'-e', self.email,
'-b', BARCODE_FASTQ % platform,
'-g', GENOMIC_FASTQ % platform,
'--instance-type', 'c4.large',
'--spot-bid', '1.0',
'-k', rsa_key, '--debug']
'-k', self.rsa_key, '--debug']
if platform != 'drop_seq':
argv += ['--barcode-files', PLATFORM_BARCODES % platform]
main.main(argv)

# @params('in_drop', 'in_drop_v2', 'drop_seq', 'ten_x', 'mars_seq')
def test_remote_from_merged(self, platform='in_drop_v2'):
test_name = 'test_remote_%s' % platform
if 'EMAIL' in globals():
email = globals()['EMAIL']
else:
email = input('please provide an email address for SEQC to mail results: ')
if 'TEST_BUCKET' in globals():
bucket = globals()['TEST_BUCKET']
else:
bucket = input('please provide an amazon s3 bucket to upload test results: ')
if 'RSA_KEY' in globals():
rsa_key = globals()['RSA_KEY']
else:
rsa_key = input('please provide an RSA key with permission to create aws '
'instances: ')
self.check_parameters()
argv = [
'run',
platform,
'-o', REMOTE_OUTPUT,
'-u', UPLOAD % (bucket, test_name),
'-u', UPLOAD % (self.bucket, test_name),
'-i', INDEX,
'-e', email,
'-e', self.email,
'-m', MERGED % (platform, platform),
'-k', rsa_key,
'-k', self.rsa_key,
'--instance-type', 'c4.large',
# '--spot-bid', '1.0'
]
Expand All @@ -133,28 +127,16 @@ def test_remote_from_merged(self, platform='in_drop_v2'):
# @params('in_drop', 'in_drop_v2', 'drop_seq', 'ten_x', 'mars_seq')
def test_remote_from_samfile(self, platform='in_drop_v2'):
test_name = 'test_remote_%s' % platform
if 'EMAIL' in globals():
email = globals()['EMAIL']
else:
email = input('please provide an email address for SEQC to mail results: ')
if 'TEST_BUCKET' in globals():
bucket = globals()['TEST_BUCKET']
else:
bucket = input('please provide an amazon s3 bucket to upload test results: ')
if 'RSA_KEY' in globals():
rsa_key = globals()['RSA_KEY']
else:
rsa_key = input('please provide an RSA key with permission to create aws '
'instances: ')
self.check_parameters()
argv = [
'run',
platform,
'-o', REMOTE_OUTPUT,
'-u', UPLOAD % (bucket, test_name),
'-u', UPLOAD % (self.bucket, test_name),
'-i', 's3://seqc-pub/genomes/hg38_long_polya/',
'-e', email,
'-e', self.email,
'-a', SAMFILE % platform,
'-k', rsa_key,
'-k', self.rsa_key,
'--instance-type', 'c4.8xlarge',
'--debug',
# '--spot-bid', '1.0'
Expand Down Expand Up @@ -361,7 +343,7 @@ class TestReadArrayCreation(unittest.TestCase):

@classmethod
def setUpClass(cls):
platform='in_drop_v2'
platform = 'in_drop_v2'
# cls.bamfile = LOCAL_OUTPUT % platform + '_bamfile.bam'
# cls.annotation = LOCAL_OUTPUT % platform + '_annotations.gtf'
# S3.download(SAMFILE % platform, cls.bamfile, recursive=False)
Expand Down Expand Up @@ -402,4 +384,4 @@ def get_length_of_gtf(self):
if __name__ == "__main__":
nose2.main()

#########################################################################################
#########################################################################################

0 comments on commit b1a7f6a

Please sign in to comment.