Skip to content

Commit

Permalink
Develop (#312)
Browse files Browse the repository at this point in the history
Merging develop back to master; using master as main branch from now on.
  • Loading branch information
phuongdao1 authored and ambrosejcarr committed Nov 10, 2017
1 parent b1a7f6a commit 7f5b70a
Show file tree
Hide file tree
Showing 30 changed files with 1,090 additions and 286 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,7 @@
testfiles*
*.DS_Store*
*seqc.log
build/*
dist/*
.project
.pydevproject
5 changes: 5 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
include src/seqc/summary/*/*.css
include src/seqc/summary/fonts/*
include src/seqc/summary/*/*.py
include src/seqc/summary/*/*.js
include src/seqc/summary/*/*.html
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ Once all dependencies have been installed, SEQC can be installed on any machine

Please note that to avoid passing the -k/--rsa-key command when you execute SEQC runs, you can also set the environment variable `AWS_RSA_KEY` to the path to your newly created key.

### Testing SEQC:

All the unit tests in class `TestSEQC` in `test.py` have been tested. Currently, only two platforms `ten_x_v2` and `in_drop_v2` have been tested. Old unit tests from these two platforms together with other platforms are stored at `s3://dp-lab-data/seqc-old-unit-test/`.

### Running SEQC:

After SEQC is installed, help can be listed:
Expand Down
92 changes: 53 additions & 39 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@
from subprocess import call, check_output
from setuptools import setup
from warnings import warn
import py_compile

# Replace py_compile.compile with a function that calls it with doraise=True so stop when there is a syntax error
orig_py_compile = py_compile.compile

def doraise_py_compile(file, cfile=None, dfile=None, doraise=False):
orig_py_compile(file, cfile=cfile, dfile=dfile, doraise=True)

py_compile.compile = doraise_py_compile

if sys.version_info.major != 3:
raise RuntimeError('SEQC requires Python 3')
Expand All @@ -18,45 +27,50 @@
with open('src/seqc/version.py') as f:
exec(f.read())

setup(name='seqc',
version=__version__, # read in from the exec of version.py; ignore error
description='Single Cell Sequencing Processing and QC Suite',
author='Ambrose J. Carr',
author_email='mail@ambrosejcarr.com',
package_dir={'': 'src'},
packages=['seqc', 'seqc.sequence', 'seqc.alignment', 'seqc.core', 'seqc.stats',
'seqc.summary'],
install_requires=[
'numpy>=1.10.0',
'bhtsne',
'wikipedia',
'awscli',
'cython>0.14',
'numexpr>=2.4',
'pandas>=0.18.1',
'paramiko>=2.0.2',
'regex',
'requests',
'nose2',
'scipy>=0.14.0',
'boto3',
'intervaltree',
'matplotlib',
'tinydb',
'tables',
'pysam',
'fastcluster',
'statsmodels',
'ecdsa',
'dill',
'jinja2',
'pycrypto',
'scikit_learn>=0.17'],
scripts=['src/scripts/SEQC'],
extras_require={
'GSEA_XML': ['html5lib', 'lxml', 'BeautifulSoup4'],
}
)
setup(
name='seqc',
version=__version__, # read in from the exec of version.py; ignore error
description='Single Cell Sequencing Processing and QC Suite',
author='Ambrose J. Carr',
author_email='mail@ambrosejcarr.com',
package_dir={'': 'src'},
package_data={'': ['*.r', '*.R']},
packages=['seqc', 'seqc.sequence', 'seqc.alignment', 'seqc.core', 'seqc.stats',
'seqc.summary'],
install_requires=[
'numpy>=1.10.0',
'bhtsne',
'wikipedia',
'awscli',
'Cython>0.14',
'numexpr>=2.4',
'pandas>=0.18.1',
'paramiko>=2.0.2',
'regex',
'requests',
'nose2',
'scipy>=0.14.0',
'boto3',
'intervaltree',
'matplotlib',
'tinydb',
'tables',
'fastcluster',
'statsmodels',
'ecdsa',
'dill',
'multiprocessing_on_dill',
'jinja2',
'pycrypto',
'cairocffi>=0.8.0',
'weasyprint',
'scikit_learn>=0.17'],
scripts=['src/scripts/SEQC'],
extras_require={
'GSEA_XML': ['html5lib', 'lxml', 'BeautifulSoup4'],
},
include_package_data=True
)

# look for star
if not shutil.which('STAR'):
Expand Down
2 changes: 1 addition & 1 deletion src/seqc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .h5 import H5
from .version import __version__
from . import stats
from . import plot
# from . import plot
2 changes: 1 addition & 1 deletion src/seqc/alignment/star.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def default_alignment_args(
'--runThreadN': str(n_threads),
'--genomeDir': index,
'--outFilterType': 'BySJout',
'--outFilterMultimapNmax': '100', # require unique alignments
'--outFilterMultimapNmax': '10', # require unique alignments
'--limitOutSJcollapsed': '2000000', # deal with many splice variants
'--alignSJDBoverhangMin': '8',
'--outFilterMismatchNoverLmax': '0.04',
Expand Down
62 changes: 61 additions & 1 deletion src/seqc/barcode_correction.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,72 @@
import seqc.sequence.barcodes
from seqc import log

# todo document me

def ten_x_barcode_correction(ra, platform, barcode_files, max_ed=2,
default_error_rate=0.02):
'''
Correct reads with incorrect barcodes according to the correct barcodes files.
Reads with barcodes that have too many errors are filtered out.
:param ra: seqc.read_array.ReadArray object
:param platform: the platform object
:param barcode_files: the list of the paths of barcode files
:param max_ed: maximum allowed Hamming distance from known cell barcodes
:param default_error_rate: assumed sequencing error rate
:return:
'''

# Read the barcodes into lists
valid_barcodes = set()
for barcode_file in barcode_files:
with open(barcode_file, 'r') as f:
valid_barcodes = set([DNA3Bit.encode(line.strip()) for line in
f.readlines()])

# Group reads by cells
indices_grouped_by_cells = ra.group_indices_by_cell(multimapping=True)

# Find all valid barcodes and their counts
valid_barcode_count = dict()
for inds in indices_grouped_by_cells:
# Extract barcodes for one of the reads
barcode = platform.extract_barcodes(ra.data['cell'][inds[0]])[0]
if barcode in valid_barcodes:
valid_barcode_count[barcode] = len(inds)

# Find the set of invalid barcodes and check out whether they can be corrected
for inds in indices_grouped_by_cells:

# Extract barcodes for one of the reads
barcode = platform.extract_barcodes(ra.data['cell'][inds[0]])[0]
if barcode not in valid_barcode_count:
# Identify correct barcode as one Hamming distance away with most reads
hammind_dist_1_barcodes = seqc.sequence.barcodes.generate_hamming_dist_1(barcode)
fat_bc = -1
fat_bc_count = 0
for bc in hammind_dist_1_barcodes:
if (bc in valid_barcode_count) and (valid_barcode_count[bc] > fat_bc_count):
fat_bc = bc
fat_bc_count = valid_barcode_count[bc]

if fat_bc < 0:
ra.data['status'][inds] |= ra.filter_codes['cell_error']
else:
# Update the read array with the correct barcode
ra.data['cell'][inds] = fat_bc



def in_drop(ra, platform, barcode_files, max_ed=2,
default_error_rate=0.02):
"""
Correct reads with incorrect barcodes according to the correct barcodes files.
Reads with barcodes that have too many errors are filtered out.
:param ra: seqc.read_array.ReadArray object
:param platform: the platform object
:param barcode_files: the list of the paths of barcode files
:param max_ed: maximum allowed Hamming distance from known cell barcodes
:param default_error_rate: assumed sequencing error rate
:return:
"""

# Read the barcodes into lists
Expand Down
2 changes: 1 addition & 1 deletion src/seqc/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
from .index import index
from .instances import instances
from .terminate import terminate
from .start import start
from .start import start
26 changes: 26 additions & 0 deletions src/seqc/core/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,30 @@
from seqc import core
from seqc.core import parser, verify
from seqc import ec2
import boto3

def clean_up_security_groups():
'''
Cleanning all the unused security groups that were created/started using SEQC
when the number of unused ones is greater than 300
'''
ec2 = boto3.resource('ec2')
sgs = list(ec2.security_groups.all())
insts = list(ec2.instances.all())

all_sgs = set([sg.group_name for sg in sgs]) # get all security groups
all_inst_sgs = set([sg['GroupName']
for inst in insts for sg in inst.security_groups]) # get security groups associated with instances
unused_sgs = all_sgs - all_inst_sgs # get ones without instance association

if len(unused_sgs) >= 300:
print("Cleaning up the unused security groups:")
client = boto3.client('ec2')
for g in unused_sgs:
all_inst_sgs = set([sg['GroupName'] for inst in insts for sg in inst.security_groups]) # since deleting ones takes a while, doublecheck whether
if g.startswith("SEQC") and (g not in all_inst_sgs): # only cleaning ones associated with SEQC # the security group is still unused
client.delete_security_group(GroupName=g)
print(g+" deleted")

def main(argv):
"""Check arguments, then call the appropriate sub-module
Expand All @@ -15,7 +38,9 @@ def main(argv):
:param argv: output of sys.argv[1:]
"""
arguments = parser.parse_args(argv)

func = getattr(core, arguments.subparser_name)

assert func is not None
if arguments.remote:
# todo improve how verification works; it's not really necessary, what is needed
Expand All @@ -26,6 +51,7 @@ def main(argv):
k: getattr(verified_args, k) for k in
('rsa_key', 'instance_type', 'spot_bid', 'volume_size') if
getattr(verified_args, k)}
clean_up_security_groups()
ec2.AWSInstance(synchronous=False, **remote_args)(func)(verified_args)
else:
func(arguments)
Expand Down
19 changes: 15 additions & 4 deletions src/seqc/core/parser.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import os
import argparse
import sys
import inspect
from subprocess import Popen, PIPE
from seqc import version
from seqc import version, platforms


def parse_args(args):
Expand All @@ -21,12 +22,14 @@ def parse_args(args):
subparsers = meta.add_subparsers(dest='subparser_name')

# subparser for running experiments
# can use to make prettier: formatter_class=partial(argparse.HelpFormatter, width=200)
# can use to make prettier: formatter_class=partial(argparse.HelpFormatter, width=200)
p = subparsers.add_parser('run', help='initiate SEQC runs')

# Platform choices
choices = [x[0] for x in inspect.getmembers(platforms, inspect.isclass) if
issubclass(x[1], platforms.AbstractPlatform)][1:]
p.add_argument('platform',
choices=['in_drop', 'drop_seq', 'mars1_seq', 'mars2_seq',
'mars_germany', 'in_drop_v2', 'in_drop_v3', 'ten_x', 'ten_x_v2'],
choices=choices,
help='which platform are you merging annotations from?')

a = p.add_argument_group('required arguments')
Expand Down Expand Up @@ -87,6 +90,14 @@ def parse_args(args):
dest='filter_mitochondrial_rna',
help='Do not filter cells with greater than 20 percent mitochondrial '
'RNA ')
f.set_defaults(filter_low_coverage=True)
f.add_argument('--no-filter-low-coverage', action='store_false',
dest='filter_low_coverage',
help='Do not filter cells with low coverage')
f.set_defaults(filter_low_gene_abundance=True)
f.add_argument('--no-filter-low-gene-abundance', action='store_false',
dest='filter_low_gene_abundance',
help='Do not filter cells with low coverage')
f.add_argument('--low-coverage-alpha', metavar='LA',
help='FDR rate for low coverage reads filter in mars-seq datasets. '
'Float between 0 and 1, default=0.25',
Expand Down
Loading

0 comments on commit 7f5b70a

Please sign in to comment.