Merge pull request #292 from ambrosejcarr/develop

Develop
ambrosejcarr · Dec 23, 2016 · b1a7f6a · b1a7f6a
2 parents c4093c7 + e4025c7
commit b1a7f6a
Show file tree

Hide file tree

Showing 5 changed files with 80 additions and 81 deletions.
diff --git a/src/seqc/ec2.py b/src/seqc/ec2.py
@@ -438,7 +438,9 @@ def pickle_function(function: object, args, kwargs) -> str:
         :param dict kwargs: keyword arguments for the function
         :return str: filename of the pickled function
         """
-        filename = '{}{}.p'.format(os.environ['TMPDIR'], function.__name__)
+        filename = '{}{!s}_{}.p'.format(
+            os.environ['TMPDIR'], random.randint(0, 1e9), function.__name__)
+
         with open(filename, 'wb') as f:
             dill.dump(dict(function=function, args=args, kwargs=kwargs), f)
         return filename
@@ -467,7 +469,8 @@ def write_script(cls, function) -> str:
         :param object function: function to be called
         :return str: filename of the python script
         """
-        script_name = '{}{}.py'.format(os.environ['TMPDIR'], function.__name__)
+        script_name = '{}{!s}_{}.py'.format(
+            os.environ['TMPDIR'], random.randint(0, 1e9), function.__name__)
         script_body = (
             '{imports}'
             'with open("func.p", "rb") as fin:\n'

diff --git a/src/seqc/io.py b/src/seqc/io.py
@@ -153,8 +153,8 @@ def download_awscli(cls, link, prefix='./', overwrite=True, recursive=False):
         out, err = d.communicate()
         if err:
             raise ChildProcessError(err.decode())
-        downloaded_files = [line.strip().split()[-1] for line in
-                            out.decode().strip().split('\n') if line.strip()]
+        downloaded_files = sorted([line.strip().split()[-1] for line in
+                                   out.decode().strip().split('\n') if line.strip()])
         if log:
             log.notify('downloaded files:\n\t[%s]' % ',\n\t'.join(
                 downloaded_files))
@@ -719,7 +719,7 @@ def download(
             genomic_fastq = [f for f in filenames if '_R1_' in f]
             barcode_fastq = [f for f in filenames if '_R2_' in f]
 
-        return barcode_fastq, genomic_fastq
+        return sorted(barcode_fastq), sorted(genomic_fastq)
 
 
 class ProcessManager:

diff --git a/src/seqc/sequence/gtf.py b/src/seqc/sequence/gtf.py
@@ -387,9 +387,8 @@ def iter_transcripts(self):
             if record[2] == 'exon':
                 exons.append(record)
             elif record[2] == 'transcript':
-                # we want exons in inverse order, - is already in inverse order.
-                if transcript_strand == '+':
-                    exons = exons[::-1]
+                # we want exons in inverse order
+                exons = exons[::-1]
                 yield (
                     (transcript_chromosome, transcript_strand, transcript_gene_id), exons)
                 exons = []
@@ -398,8 +397,7 @@ def iter_transcripts(self):
                 transcript_gene_id = self.strip_gene_num(record[8])
 
         # yield the final transcript
-        if transcript_strand == '+':
-            exons = exons[::-1]
+        exons = exons[::-1]
         yield (transcript_chromosome, transcript_strand, transcript_gene_id), exons
 
 

diff --git a/src/seqc/summary/summary.py b/src/seqc/summary/summary.py
@@ -93,27 +93,33 @@ def from_status_filters(cls, ra, filename):
         description = (
             'Initial filters are run over the sam file while our ReadArray database is '
             'being constructed. These filters indicate heuristic reasons why reads '
-            'should be omitted from downstream operations:\n'
-            'no gene: Regardless of the read\'s genomic alignment status, there was no '
-            'transcriptomic alignment for this read.\n'
-            'gene not unique: this indicates that more than one alignment was recovered '
-            'for this read. We attempt to resolve these multi-alignments downstream. '
-            'primer missing: This is an in-drop specific filter, it indices that the '
+            'should be omitted from downstream operations:<br><br>'
+            '<b>no gene</b>: Regardless of the read\'s genomic alignment status, there was no '
+            'transcriptomic alignment for this read.<br>'
+            '<b>gene not unique</b>: this indicates that more than one alignment was recovered '
+            'for this read. We attempt to resolve these multi-alignments downstream. <br>'
+            '<b>primer missing</b>: This is an in-drop specific filter, it indices that the '
             'spacer sequence could not be identified, and thus neither a cell barcode '
-            'nor an rmt were recorded for this read.\n'
-            'low poly t: the primer did not display enough t-sequence in the primer '
+            'nor an rmt were recorded for this read.<br>'
+            '<b>low poly t</b>: the primer did not display enough t-sequence in the primer '
             'tail, where these nucleotides are expected. This indicates an increased '
             'probability that this primer randomly primed, instead of hybridizing with '
             'the poly-a tail of an mRNA molecule.')
         description_section = TextContent(description)
 
-        keys = ('length of read array',  'no gene', 'gene not unique', 'primer missing', 'low poly t')
+        # Get counts 
+        no_gene = np.sum(ra.data['status'] & ra.filter_codes['no_gene'] > 0)
+        gene_not_unique = np.sum(ra.data['status'] & ra.filter_codes['gene_not_unique'] > 0)
+        primer_missing = np.sum(ra.data['status'] & ra.filter_codes['primer_missing'] > 0)
+        low_polyt = np.sum(ra.data['status'] & ra.filter_codes['low_polyt'] > 0)
+
+        keys = ('length of read array',  'reads with genomic alignments alone', 'reads mapping to multiple genes', 'primer missing', 'low poly t')
         values = (
             len(ra.data),
-            np.sum(ra.data['status'] & ra.filter_codes['no_gene'] > 0),
-            np.sum(ra.data['status'] & ra.filter_codes['gene_not_unique'] > 0),
-            np.sum(ra.data['status'] & ra.filter_codes['primer_missing'] > 0),
-            np.sum(ra.data['status'] & ra.filter_codes['low_polyt'] > 0)
+            '%d (%.2f%%)' % (no_gene, no_gene / len(ra.data) * 100),
+            '%d (%.2f%%)' % (gene_not_unique, gene_not_unique / len(ra.data) * 100),
+            '%d (%.2f%%)' % (primer_missing, primer_missing / len(ra.data) * 100),
+            '%d (%.2f%%)' % (low_polyt, low_polyt / len(ra.data) * 100),
         )
         data_section = DataContent(keys, values)
         return cls(
@@ -134,9 +140,10 @@ def from_cell_barcode_correction(cls, ra, filename):
         """
         description = 'description for cell barcode correction'  # todo implement
         description_section = TextContent(description)
+        count = np.sum(ra.data['status'] & ra.filter_codes['cell_error'] > 0)
         data_section = DataContent(
             ['cell error'],
-            [np.sum(ra.data['status'] & ra.filter_codes['cell_error'] > 0)])
+            ['%d (%.2f%%)' % (count, count / len(ra.data) * 100)])
         return cls(
             'Cell Barcode Correction',
             {'Description': description_section, 'Results': data_section},
@@ -155,9 +162,10 @@ def from_rmt_correction(cls, ra, filename):
 
         description = 'description for rmt correction'  # todo implement
         description_section = TextContent(description)
+        count = np.sum(ra.data['status'] & ra.filter_codes['rmt_error'])
         data_section = DataContent(
             ['rmt error'],
-            [np.sum(ra.data['status'] & ra.filter_codes['rmt_error'])])
+            ['%d (%.2f%%)' % (count, count / len(ra.data) * 100)])
         return cls(
             'RMT Barcode Correction',
             {'Description': description_section, 'Results': data_section},
@@ -213,7 +221,15 @@ def from_final_matrix(cls, counts_matrix, figure_path, filename):
         :return:
         """
         plot.Diagnostics.cell_size_histogram(counts_matrix, save=figure_path)
-        image_legend = 'histogram legend'
+
+        # Number of cells and molecule count distributions
+        image_legend = "Number of cells: {} <br>".format(counts_matrix.shape[0])
+        ms = counts_matrix.sum(axis=1)
+        image_legend += "Min number of molecules: {}<br>".format(ms.min())
+        for prctile in [25, 50, 75]:
+            image_legend += '{}th percentile: {}<br>'.format(prctile, np.percentile(ms, prctile))
+        image_legend += "Max number of molecules: {}<br>".format(ms.max())
+
         image_section = ImageContent(figure_path, 'cell size figure', image_legend)
         return cls('Cell Summary',
                    {'Library Size Distribution': image_section},

diff --git a/src/seqc/test.py b/src/seqc/test.py
@@ -41,6 +41,26 @@ def makedirs(output):
 
 class TestSEQC(unittest.TestCase):
 
+    email = globals()['EMAIL'] if 'EMAIL' in globals() else None
+    bucket = globals()['TEST_BUCKET'] if 'TEST_BUCKET' in globals() else None
+    rsa_key = globals()['RSA_KEY'] if 'RSA_KEY' in globals() else None
+    if rsa_key is None:
+        try:
+            rsa_key = os.environ['AWS_RSA_KEY']
+        except KeyError:
+            pass
+
+    def check_parameters(self):
+        if self.email is None:
+            self.email = input('please provide an email address for SEQC to mail '
+                               'results: ')
+        if self.bucket is None:
+            self.bucket = input('please provide an amazon s3 bucket to upload test '
+                                'results: ')
+        if self.rsa_key is None:
+            self.rsa_key = input('please provide an RSA key with permission to create '
+                                 'aws instances: ')
+
     # @params('in_drop', 'in_drop_v2', 'drop_seq', 'ten_x', 'mars_seq')
     def test_local(self, platform='in_drop_v2'):
         """test seqc after pre-downloading all files"""
@@ -49,10 +69,8 @@ def test_local(self, platform='in_drop_v2'):
                     'pipeline errors.\n')
         test_name = 'test_no_aws_%s' % platform
         makedirs(LOCAL_OUTPUT % test_name)
-        if 'EMAIL' in globals():
-            email = globals()['EMAIL']
-        else:
-            email = input('please provide an email address for SEQC to mail results: ')
+        if self.email is None:
+            self.email = input('please provide an email address for SEQC to mail results: ')
         argv = [
             'run',
             platform,
@@ -61,68 +79,44 @@ def test_local(self, platform='in_drop_v2'):
             '-b', BARCODE_FASTQ % platform,
             '-g', GENOMIC_FASTQ % platform,
             '--barcode-files', PLATFORM_BARCODES % platform,
-            '-e', email,
+            '-e', self.email,
             '--local']
         main.main(argv)
         os.remove('./seqc_log.txt')  # clean up the dummy log we made.
 
     # @params('in_drop', 'in_drop_v2', 'drop_seq', 'ten_x', 'mars_seq')
     def test_remote_from_raw_fastq(self, platform='in_drop_v2'):
         test_name = 'test_remote_%s' % platform
-        if 'EMAIL' in globals():
-            email = globals()['EMAIL']
-        else:
-            email = input('please provide an email address for SEQC to mail results: ')
-        if 'TEST_BUCKET' in globals():
-            bucket = globals()['TEST_BUCKET']
-        else:
-            bucket = input('please provide an amazon s3 bucket to upload test results: ')
-        if 'RSA_KEY' in globals():
-            rsa_key = globals()['RSA_KEY']
-        else:
-            rsa_key = input('please provide an RSA key with permission to create aws '
-                            'instances: ')
+        self.check_parameters()
         argv = [
             'run',
             platform,
             '-o', REMOTE_OUTPUT,
-            '-u', UPLOAD % (bucket, test_name),
+            '-u', UPLOAD % (self.bucket, test_name),
             '-i', INDEX,
-            '-e', email,
+            '-e', self.email,
             '-b', BARCODE_FASTQ % platform,
             '-g', GENOMIC_FASTQ % platform,
             '--instance-type', 'c4.large',
             '--spot-bid', '1.0',
-            '-k', rsa_key, '--debug']
+            '-k', self.rsa_key, '--debug']
         if platform != 'drop_seq':
             argv += ['--barcode-files', PLATFORM_BARCODES % platform]
         main.main(argv)
 
     # @params('in_drop', 'in_drop_v2', 'drop_seq', 'ten_x', 'mars_seq')
     def test_remote_from_merged(self, platform='in_drop_v2'):
         test_name = 'test_remote_%s' % platform
-        if 'EMAIL' in globals():
-            email = globals()['EMAIL']
-        else:
-            email = input('please provide an email address for SEQC to mail results: ')
-        if 'TEST_BUCKET' in globals():
-            bucket = globals()['TEST_BUCKET']
-        else:
-            bucket = input('please provide an amazon s3 bucket to upload test results: ')
-        if 'RSA_KEY' in globals():
-            rsa_key = globals()['RSA_KEY']
-        else:
-            rsa_key = input('please provide an RSA key with permission to create aws '
-                            'instances: ')
+        self.check_parameters()
         argv = [
             'run',
             platform,
             '-o', REMOTE_OUTPUT,
-            '-u', UPLOAD % (bucket, test_name),
+            '-u', UPLOAD % (self.bucket, test_name),
             '-i', INDEX,
-            '-e', email,
+            '-e', self.email,
             '-m', MERGED % (platform, platform),
-            '-k', rsa_key,
+            '-k', self.rsa_key,
             '--instance-type', 'c4.large',
             # '--spot-bid', '1.0'
         ]
@@ -133,28 +127,16 @@ def test_remote_from_merged(self, platform='in_drop_v2'):
     # @params('in_drop', 'in_drop_v2', 'drop_seq', 'ten_x', 'mars_seq')
     def test_remote_from_samfile(self, platform='in_drop_v2'):
         test_name = 'test_remote_%s' % platform
-        if 'EMAIL' in globals():
-            email = globals()['EMAIL']
-        else:
-            email = input('please provide an email address for SEQC to mail results: ')
-        if 'TEST_BUCKET' in globals():
-            bucket = globals()['TEST_BUCKET']
-        else:
-            bucket = input('please provide an amazon s3 bucket to upload test results: ')
-        if 'RSA_KEY' in globals():
-            rsa_key = globals()['RSA_KEY']
-        else:
-            rsa_key = input('please provide an RSA key with permission to create aws '
-                            'instances: ')
+        self.check_parameters()
         argv = [
             'run',
             platform,
             '-o', REMOTE_OUTPUT,
-            '-u', UPLOAD % (bucket, test_name),
+            '-u', UPLOAD % (self.bucket, test_name),
             '-i', 's3://seqc-pub/genomes/hg38_long_polya/',
-            '-e', email,
+            '-e', self.email,
             '-a', SAMFILE % platform,
-            '-k', rsa_key,
+            '-k', self.rsa_key,
             '--instance-type', 'c4.8xlarge',
             '--debug',
             # '--spot-bid', '1.0'
@@ -361,7 +343,7 @@ class TestReadArrayCreation(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
-        platform='in_drop_v2'
+        platform = 'in_drop_v2'
         # cls.bamfile = LOCAL_OUTPUT % platform + '_bamfile.bam'
         # cls.annotation = LOCAL_OUTPUT % platform + '_annotations.gtf'
         # S3.download(SAMFILE % platform, cls.bamfile, recursive=False)
@@ -402,4 +384,4 @@ def get_length_of_gtf(self):
 if __name__ == "__main__":
     nose2.main()
 
-#########################################################################################
+#########################################################################################