Develop (#312)

Merging develop back to master; using master as main branch from now on.
ambrosejcarr · Nov 10, 2017 · 7f5b70a · 7f5b70a
1 parent b1a7f6a
commit 7f5b70a
Show file tree

Hide file tree

Showing 30 changed files with 1,090 additions and 286 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,7 @@
 testfiles*
 *.DS_Store*
 *seqc.log
+build/*
+dist/*
+.project
+.pydevproject
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,5 @@
+include src/seqc/summary/*/*.css
+include src/seqc/summary/fonts/*
+include src/seqc/summary/*/*.py
+include src/seqc/summary/*/*.js
+include src/seqc/summary/*/*.html
diff --git a/README.md b/README.md
@@ -44,6 +44,10 @@ Once all dependencies have been installed, SEQC can be installed on any machine
 
 Please note that to avoid passing the -k/--rsa-key command when you execute SEQC runs, you can also set the environment variable `AWS_RSA_KEY` to the path to your newly created key.
 
+### Testing SEQC:
+
+All the unit tests in class `TestSEQC` in `test.py` have been tested. Currently, only two platforms `ten_x_v2` and `in_drop_v2` have been tested. Old unit tests from these two platforms together with other platforms are stored at `s3://dp-lab-data/seqc-old-unit-test/`. 
+
 ### Running SEQC:
 
 After SEQC is installed, help can be listed:

diff --git a/setup.py b/setup.py
@@ -4,6 +4,15 @@
 from subprocess import call, check_output
 from setuptools import setup
 from warnings import warn
+import py_compile
+
+# Replace py_compile.compile with a function that calls it with doraise=True so stop when there is a syntax error
+orig_py_compile = py_compile.compile
+
+def doraise_py_compile(file, cfile=None, dfile=None, doraise=False):
+    orig_py_compile(file, cfile=cfile, dfile=dfile, doraise=True)
+
+py_compile.compile = doraise_py_compile
 
 if sys.version_info.major != 3:
     raise RuntimeError('SEQC requires Python 3')
@@ -18,45 +27,50 @@
 with open('src/seqc/version.py') as f:
     exec(f.read())
 
-setup(name='seqc',
-      version=__version__,  # read in from the exec of version.py; ignore error
-      description='Single Cell Sequencing Processing and QC Suite',
-      author='Ambrose J. Carr',
-      author_email='mail@ambrosejcarr.com',
-      package_dir={'': 'src'},
-      packages=['seqc', 'seqc.sequence', 'seqc.alignment', 'seqc.core', 'seqc.stats',
-                'seqc.summary'],
-      install_requires=[
-          'numpy>=1.10.0',
-          'bhtsne',
-          'wikipedia',
-          'awscli',
-          'cython>0.14',
-          'numexpr>=2.4',
-          'pandas>=0.18.1',
-          'paramiko>=2.0.2',
-          'regex',
-          'requests',
-          'nose2',
-          'scipy>=0.14.0',
-          'boto3',
-          'intervaltree',
-          'matplotlib',
-          'tinydb',
-          'tables',
-          'pysam',
-          'fastcluster',
-          'statsmodels',
-          'ecdsa',
-          'dill',
-          'jinja2',
-          'pycrypto',
-          'scikit_learn>=0.17'],
-      scripts=['src/scripts/SEQC'],
-      extras_require={
-          'GSEA_XML': ['html5lib', 'lxml', 'BeautifulSoup4'],
-      }
-      )
+setup(
+    name='seqc',
+    version=__version__,  # read in from the exec of version.py; ignore error
+    description='Single Cell Sequencing Processing and QC Suite',
+    author='Ambrose J. Carr',
+    author_email='mail@ambrosejcarr.com',
+    package_dir={'': 'src'},
+    package_data={'': ['*.r', '*.R']},
+    packages=['seqc', 'seqc.sequence', 'seqc.alignment', 'seqc.core', 'seqc.stats',
+            'seqc.summary'],
+    install_requires=[
+      'numpy>=1.10.0',
+      'bhtsne',
+      'wikipedia',
+      'awscli',
+      'Cython>0.14',
+      'numexpr>=2.4',
+      'pandas>=0.18.1',
+      'paramiko>=2.0.2',
+      'regex',
+      'requests',
+      'nose2',
+      'scipy>=0.14.0',
+      'boto3',
+      'intervaltree',
+      'matplotlib',
+      'tinydb',
+      'tables',
+      'fastcluster',
+      'statsmodels',
+      'ecdsa',
+      'dill',
+      'multiprocessing_on_dill',
+      'jinja2',
+      'pycrypto',
+      'cairocffi>=0.8.0',
+      'weasyprint',
+      'scikit_learn>=0.17'],
+    scripts=['src/scripts/SEQC'],
+    extras_require={
+      'GSEA_XML': ['html5lib', 'lxml', 'BeautifulSoup4'],
+    },
+    include_package_data=True
+)
 
 # look for star
 if not shutil.which('STAR'):

diff --git a/src/seqc/__init__.py b/src/seqc/__init__.py
@@ -1,4 +1,4 @@
 from .h5 import H5
 from .version import __version__
 from . import stats
-from . import plot
+# from . import plot
diff --git a/src/seqc/alignment/star.py b/src/seqc/alignment/star.py
@@ -21,7 +21,7 @@ def default_alignment_args(
         '--runThreadN': str(n_threads),
         '--genomeDir': index,
         '--outFilterType': 'BySJout',
-        '--outFilterMultimapNmax': '100',  # require unique alignments
+        '--outFilterMultimapNmax': '10',  # require unique alignments
         '--limitOutSJcollapsed': '2000000',  # deal with many splice variants
         '--alignSJDBoverhangMin': '8',
         '--outFilterMismatchNoverLmax': '0.04',

diff --git a/src/seqc/barcode_correction.py b/src/seqc/barcode_correction.py
@@ -5,12 +5,72 @@
 import seqc.sequence.barcodes
 from seqc import log
 
-# todo document me
+
+def ten_x_barcode_correction(ra, platform, barcode_files, max_ed=2,
+            default_error_rate=0.02):
+    '''
+    Correct reads with incorrect barcodes according to the correct barcodes files.
+    Reads with barcodes that have too many errors are filtered out.    
+    :param ra: seqc.read_array.ReadArray object
+    :param platform: the platform object
+    :param barcode_files: the list of the paths of barcode files
+    :param max_ed: maximum allowed Hamming distance from known cell barcodes 
+    :param default_error_rate: assumed sequencing error rate
+    :return:
+    '''
+
+    # Read the barcodes into lists
+    valid_barcodes = set()
+    for barcode_file in barcode_files:
+        with open(barcode_file, 'r') as f:
+            valid_barcodes = set([DNA3Bit.encode(line.strip()) for line in
+                                      f.readlines()])
+
+    # Group reads by cells
+    indices_grouped_by_cells = ra.group_indices_by_cell(multimapping=True)
+
+    # Find all valid barcodes and their counts
+    valid_barcode_count = dict()
+    for inds in indices_grouped_by_cells:
+        # Extract barcodes for one of the reads
+        barcode = platform.extract_barcodes(ra.data['cell'][inds[0]])[0]
+        if barcode in valid_barcodes:
+            valid_barcode_count[barcode] = len(inds)
+
+    # Find the set of invalid barcodes and check out whether they can be corrected
+    for inds in indices_grouped_by_cells:
+
+        # Extract barcodes for one of the reads
+        barcode = platform.extract_barcodes(ra.data['cell'][inds[0]])[0]
+        if barcode not in valid_barcode_count:
+        # Identify correct barcode as one Hamming distance away with most reads
+            hammind_dist_1_barcodes = seqc.sequence.barcodes.generate_hamming_dist_1(barcode)
+            fat_bc = -1
+            fat_bc_count = 0
+            for bc in hammind_dist_1_barcodes:
+                if (bc in valid_barcode_count) and (valid_barcode_count[bc] > fat_bc_count):
+                    fat_bc = bc
+                    fat_bc_count = valid_barcode_count[bc]
+
+            if fat_bc < 0:
+                ra.data['status'][inds] |= ra.filter_codes['cell_error']
+            else:
+                # Update the read array with the correct barcode
+                ra.data['cell'][inds] = fat_bc
+
+
+
 def in_drop(ra, platform, barcode_files, max_ed=2,
             default_error_rate=0.02):
     """
     Correct reads with incorrect barcodes according to the correct barcodes files.
     Reads with barcodes that have too many errors are filtered out.
+    :param ra: seqc.read_array.ReadArray object
+    :param platform: the platform object
+    :param barcode_files: the list of the paths of barcode files
+    :param max_ed: maximum allowed Hamming distance from known cell barcodes 
+    :param default_error_rate: assumed sequencing error rate
+    :return:
     """
 
     # Read the barcodes into lists

diff --git a/src/seqc/core/__init__.py b/src/seqc/core/__init__.py
@@ -3,4 +3,4 @@
 from .index import index
 from .instances import instances
 from .terminate import terminate
-from .start import start
+from .start import start
diff --git a/src/seqc/core/main.py b/src/seqc/core/main.py
@@ -4,7 +4,30 @@
 from seqc import core
 from seqc.core import parser, verify
 from seqc import ec2
+import boto3
 
+def clean_up_security_groups():
+    '''
+    Cleanning all the unused security groups that were created/started using SEQC
+    when the number of unused ones is greater than 300 
+    '''
+    ec2 = boto3.resource('ec2') 
+    sgs = list(ec2.security_groups.all())
+    insts = list(ec2.instances.all())
+
+    all_sgs = set([sg.group_name for sg in sgs])                    # get all security groups                        
+    all_inst_sgs = set([sg['GroupName'] 
+        for inst in insts for sg in inst.security_groups])          # get security groups associated with instances
+    unused_sgs = all_sgs - all_inst_sgs                             # get ones without instance association
+
+    if len(unused_sgs) >= 300:
+        print("Cleaning up the unused security groups:")
+        client = boto3.client('ec2')
+        for g in unused_sgs:
+            all_inst_sgs = set([sg['GroupName'] for inst in insts for sg in inst.security_groups])      # since deleting ones takes a while, doublecheck whether 
+            if g.startswith("SEQC") and (g not in all_inst_sgs):    # only cleaning ones associated with SEQC                                        # the security group is still unused
+                client.delete_security_group(GroupName=g)
+                print(g+" deleted")
 
 def main(argv):
     """Check arguments, then call the appropriate sub-module
@@ -15,7 +38,9 @@ def main(argv):
     :param argv: output of sys.argv[1:]
     """
     arguments = parser.parse_args(argv)
+
     func = getattr(core, arguments.subparser_name)
+
     assert func is not None
     if arguments.remote:
         # todo improve how verification works; it's not really necessary, what is needed
@@ -26,6 +51,7 @@ def main(argv):
             k: getattr(verified_args, k) for k in
             ('rsa_key', 'instance_type', 'spot_bid', 'volume_size') if
             getattr(verified_args, k)}
+        clean_up_security_groups()
         ec2.AWSInstance(synchronous=False, **remote_args)(func)(verified_args)
     else:
         func(arguments)

diff --git a/src/seqc/core/parser.py b/src/seqc/core/parser.py
@@ -1,8 +1,9 @@
 import os
 import argparse
 import sys
+import inspect
 from subprocess import Popen, PIPE
-from seqc import version
+from seqc import version, platforms
 
 
 def parse_args(args):
@@ -21,12 +22,14 @@ def parse_args(args):
     subparsers = meta.add_subparsers(dest='subparser_name')
 
     # subparser for running experiments
-    # can use to make prettier: formatter_class=partial(argparse.HelpFormatter, width=200)
+    # can use to make prettier: formatter_class=partial(argparse.HelpFormatter, width=200)    
     p = subparsers.add_parser('run', help='initiate SEQC runs')
 
+    # Platform choices
+    choices = [x[0] for x in inspect.getmembers(platforms, inspect.isclass) if
+           issubclass(x[1], platforms.AbstractPlatform)][1:]
     p.add_argument('platform',
-                   choices=['in_drop', 'drop_seq', 'mars1_seq', 'mars2_seq',
-                            'mars_germany', 'in_drop_v2', 'in_drop_v3', 'ten_x', 'ten_x_v2'],
+                   choices=choices,
                    help='which platform are you merging annotations from?')
 
     a = p.add_argument_group('required arguments')
@@ -87,6 +90,14 @@ def parse_args(args):
                    dest='filter_mitochondrial_rna',
                    help='Do not filter cells with greater than 20 percent mitochondrial '
                         'RNA ')
+    f.set_defaults(filter_low_coverage=True)
+    f.add_argument('--no-filter-low-coverage', action='store_false',
+                   dest='filter_low_coverage',
+                   help='Do not filter cells with low coverage')
+    f.set_defaults(filter_low_gene_abundance=True)
+    f.add_argument('--no-filter-low-gene-abundance', action='store_false',
+                   dest='filter_low_gene_abundance',
+                   help='Do not filter cells with low coverage')
     f.add_argument('--low-coverage-alpha', metavar='LA',
                    help='FDR rate for low coverage reads filter in mars-seq datasets. '
                         'Float between 0 and 1, default=0.25',