From e7b9fd04833de66827bd75c79b0a2431c7496445 Mon Sep 17 00:00:00 2001 From: ajc Date: Tue, 3 Nov 2015 10:56:55 -0500 Subject: [PATCH 01/68] added dummy starcluster config file, two starcluster plugins to (1) initialize and mount an n-Tb RAID array to the cluster (default: 4Tb) and automatically pull the current release version of SEQC and install on the cluster that is being started --- src/plugins/gitpull.py | 20 ++ src/plugins/raid.py | 59 ++++++ src/plugins/starcluster.config | 369 +++++++++++++++++++++++++++++++++ 3 files changed, 448 insertions(+) create mode 100644 src/plugins/gitpull.py create mode 100644 src/plugins/raid.py create mode 100644 src/plugins/starcluster.config diff --git a/src/plugins/gitpull.py b/src/plugins/gitpull.py new file mode 100644 index 0000000..4b2d62d --- /dev/null +++ b/src/plugins/gitpull.py @@ -0,0 +1,20 @@ +from starcluster.clustersetup import ClusterSetup +from starcluster.logger import log +from subprocess import check_output, call + +class DownloadRepo(ClusterSetup): + def __init__(self, dir_name='/data/software/'): + if not dir_name.endswith('/'): + dir_name += '/' + self.dir_name = dir_name + + def run(self, nodes, master, user, user_shell, volumes): + folder = self.dir_name + log.info('installing seqc repo onto %s' % folder) + + master.ssh.execute("mkdir %s" % folder) + location = folder + "seqc.tar.gz" + master.ssh.execute('curl -H "Authorization: token ab79b034a8caf4ae502f1374014f920a0a9062d7" -L ' + 'https://api.github.com/repos/ambrosejcarr/seqc/tarball > %s' % location) + log.info("seqc.tar.gz has been downloaded in /data/software directory") + master.ssh.execute('pip3 install %s' % location) diff --git a/src/plugins/raid.py b/src/plugins/raid.py new file mode 100644 index 0000000..772ba81 --- /dev/null +++ b/src/plugins/raid.py @@ -0,0 +1,59 @@ +from starcluster.clustersetup import ClusterSetup +from starcluster.logger import log +import string +import re +import time +import os +from subprocess import check_output, call + +class RaidAutomator(ClusterSetup): + def __init__(self, n_tb=4, availability_zone='us-east-1e'): + self.n_tb = n_tb + self.availability_zone = availability_zone + log.debug('total number of terabytes = %s' % n_tb) + + def run(self, nodes, master, user, user_shell, volumes): + log.info("setting up RAID array on master node...") + vol_num = self.n_tb + vol_size = 1024 + inst_id = master.id + dev_base = "/dev/xvd" + alphabet = string.ascii_lowercase[5:] #starts at f + dev_names = [] + availability_zone = self.availability_zone + + for i in range(int(vol_num)): + log.info("creating volume %s of %s..." 
% (i+1, vol_num)) + vol = check_output(["aws", "ec2", "create-volume", "--size", str(vol_size), "--region", "us-east-1", "--availability-zone", availability_zone, "--volume-type", "gp2"]) + vol_list = vol.decode().split() + vol_id = [s for s in vol_list if 'vol-' in s][0] + + #generating new device id to mount + dev_id = dev_base + alphabet[i] + dev_names.append(dev_id) + + log.info("waiting for volume to become available...") + vol_stat = check_output(["aws", "ec2", "describe-volumes", "--volume-ids", vol_id]) + vol_stat = vol_stat.decode().split() + + while vol_stat[6] != 'available': + time.sleep(10) + vol_stat = check_output(["aws", "ec2", "describe-volumes", "--volume-ids", vol_id]) + vol_stat = vol_stat.decode().split() + log.info("...") + + log.info("attaching new volume...") + call(["aws", "ec2", "attach-volume", "--volume-id", vol_id, "--instance-id",inst_id, "--device", dev_id]) + i+=1 + log.info("successfully attached %s TB in %s volumes!" % (self.n_tb, vol_num)) + + log.info("creating logical RAID device...") + all_dev = ' '.join(dev_names) + log.info('waiting for all volumes to be available...') + time.sleep(10) + master.ssh.execute("sudo mdadm --create --verbose /dev/md0 --level=0 --name=my_raid --raid-devices=%s %s" %(vol_num,all_dev)) + master.ssh.execute("sudo mkfs.ext4 -L my_raid /dev/md0") + master.ssh.execute("sudo mkdir -p /data") + master.ssh.execute("sudo mount LABEL=my_raid /data") + log.info("successfully created RAID array in /data!") + diff --git a/src/plugins/starcluster.config b/src/plugins/starcluster.config new file mode 100644 index 0000000..8eb42d9 --- /dev/null +++ b/src/plugins/starcluster.config @@ -0,0 +1,369 @@ +#################################### +## StarCluster Configuration File ## +#################################### +[global] +# Configure the default cluster template to use when starting a cluster +# defaults to 'smallcluster' defined below. This template should be usable +# out-of-the-box provided you've configured your keypair correctly +DEFAULT_TEMPLATE=c3.large +# enable experimental features for this release +#ENABLE_EXPERIMENTAL=True +# number of seconds to wait when polling instances (default: 30s) +#REFRESH_INTERVAL=15 +# specify a web browser to launch when viewing spot history plots +#WEB_BROWSER=chromium +# split the config into multiple files +#INCLUDE=~/.starcluster/aws, ~/.starcluster/keys, ~/.starcluster/vols + +############################################# +## AWS Credentials and Connection Settings ## +############################################# +[aws info] +# This is the AWS credentials section (required). +# These settings apply to all clusters +AWS_ACCESS_KEY_ID = +AWS_SECRET_ACCESS_KEY = +AWS_USER_ID= +AWS_REGION_NAME = us-east-1 +AWS_REGION_HOST = ec2.us-east-1.amazonaws.com +# Uncomment these settings when creating an instance-store (S3) AMI (OPTIONAL) +#EC2_CERT = /path/to/your/cert-asdf0as9df092039asdfi02089.pem +#EC2_PRIVATE_KEY = /path/to/your/pk-asdfasd890f200909.pem +# Uncomment these settings to use a proxy host when connecting to AWS +#AWS_PROXY = your.proxyhost.com +#AWS_PROXY_PORT = 8080 +#AWS_PROXY_USER = yourproxyuser +#AWS_PROXY_PASS = yourproxypass + +########################### +## Defining EC2 Keypairs ## +########################### +# Sections starting with "key" define your keypairs. See "starcluster createkey +# --help" for instructions on how to create a new keypair. 
Section name should +# match your key name e.g.: +# username should be your Columbia UNI, if a Columbia student +[key aws] +KEY_LOCATION= + +# You can of course have multiple keypair sections +# [key myotherkey] +# KEY_LOCATION=~/.ssh/myotherkey.rsa + +################################ +## Defining Cluster Templates ## +################################ + +# change this to the name of one of the keypair sections defined above +# KEYNAME = aws +# number of ec2 instances to launch +# CLUSTER_SIZE = 1 +# create the following user on the cluster +# CLUSTER_USER = sgeadmin +# optionally specify shell (defaults to bash) +# (options: tcsh, zsh, csh, bash, ksh) +# CLUSTER_SHELL = bash +# Uncomment to prepent the cluster tag to the dns name of all nodes created +# using this cluster config. ie: mycluster-master and mycluster-node001 +# If you choose to enable this option, it's recommended that you enable it in +# The DEFAULT_TEMPLATE so all nodes will automatically have the prefix +# DNS_PREFIX = True +# AMI to use for cluster nodes. These AMIs are for the us-east-1 region. +# Use the 'listpublic' command to list StarCluster AMIs in other regions +# The base i386 StarCluster AMI is ami-9bf9c9f2 +# The base x86_64 StarCluster AMI is ami-3393a45a +# The base HVM StarCluster AMI is ami-6b211202 +# Upgraded EBS-backed AMI w/ Python3 & Bowtie2: ami-16cd017e +# Upgraded S3-backed AMI w/ Python3 & Bowtie2: +# NODE_IMAGE_ID = ami-d9095dbc +# instance type for all cluster nodes +# (options: m3.large, i2.8xlarge, c3.2xlarge, hs1.8xlarge, c1.xlarge, r3.4xlarge, g2.2xlarge, m1.small, c1.medium, m3.2xlarge, c3.8xlarge, m2.xlarge, r3.2xlarge, t1.micro, cr1.8xlarge, r3.8xlarge, cc1.4xlarge, m1.medium, r3.large, c3.xlarge, i2.xlarge, m3.medium, cc2.8xlarge, m1.large, cg1.4xlarge, i2.2xlarge, c3.large, i2.4xlarge, c3.4xlarge, r3.xlarge, m1.xlarge, hi1.4xlarge, m2.4xlarge, m2.2xlarge, m3.xlarge) +# NODE_INSTANCE_TYPE = c3.large +# Launch cluster in a VPC subnet (OPTIONAL) +#SUBNET_ID=subnet-99999999 +# Uncomment to assign public IPs to cluster nodes (VPC-ONLY) (OPTIONAL) +# WARNING: Using public IPs with a VPC requires: +# 1. An internet gateway attached to the VPC +# 2. A route table entry linked to the VPC's internet gateway and associated +# with the VPC subnet with a destination CIDR block of 0.0.0.0/0 +# WARNING: Public IPs allow direct access to your VPC nodes from the internet +#PUBLIC_IPS=True +# Uncomment to disable installing/configuring a queueing system on the +# cluster (SGE) +#DISABLE_QUEUE=True +# Uncomment to specify a different instance type for the master node (OPTIONAL) +# (defaults to NODE_INSTANCE_TYPE if not specified) +#MASTER_INSTANCE_TYPE = m1.small +# Uncomment to specify a separate AMI to use for the master node. 
(OPTIONAL) +# (defaults to NODE_IMAGE_ID if not specified) +#MASTER_IMAGE_ID = ami-3393a45a (OPTIONAL) +# availability zone to launch the cluster in (OPTIONAL) +# (automatically determined based on volumes (if any) or +# selected by Amazon if not specified) +#AVAILABILITY_ZONE = us-east-1c +# list of volumes to attach to the master node (OPTIONAL) +# these volumes, if any, will be NFS shared to the worker nodes +# see "Configuring EBS Volumes" below on how to define volume sections +#VOLUMES = oceandata, biodata +# list of plugins to load after StarCluster's default setup routines (OPTIONAL) +# see "Configuring StarCluster Plugins" below on how to define plugin sections +#PLUGINS = myplugin, myplugin2 +# list of permissions (or firewall rules) to apply to the cluster's security +# group (OPTIONAL). +#PERMISSIONS = ssh, http +# Uncomment to always create a spot cluster when creating a new cluster from +# this template. The following example will place a $0.50 bid for each spot +# request. +#SPOT_BID = 0.50 +# Uncomment to specify one or more userdata scripts to use when launching +# cluster instances. Supports cloudinit. All scripts combined must be less than +# 16KB +#USERDATA_SCRIPTS = /path/to/script1, /path/to/script2 + +########################################### +## Defining Additional Cluster Templates ## +########################################### +# You can also define multiple cluster templates. You can either supply all +# configuration options as with smallcluster above, or create an +# EXTENDS= variable in the new cluster section to use all +# settings from as defaults. Below are example templates that +# use the EXTENDS feature: + +# DEFINE AMIs here +# C3 AMI = ami-6b13470e +# C4 & R3 AMI = ami-d9095dbc + +[cluster c3.large] +KEYNAME = aws +CLUSTER_SHELL = bash +CLUSTER_USER = ec2-user +CLUSTER_SIZE = 1 +NODE_INSTANCE_TYPE = c3.large +NODE_IMAGE_ID = ami-a12177c4 +# This availability zone will propagate to all clusters below +AVAILABILITY_ZONE = us-east-1e + +[cluster c4.large] +EXTENDS = c3.large +NODE_INSTANCE_TYPE = c4.large +NODE_IMAGE_ID = ami-ab2177ce + +[cluster r3.8xlarge] +EXTENDS = c4.large +NODE_INSTANCE_TYPE = r3.8xlarge +SUBNET_ID = subnet-28c89300 +FORCE_SPOT_MASTER = True +SPOT_BID = 0.6 +PLUGINS = raid, gitpull + +[cluster c3.8xlarge] +EXTENDS = c3.large +NODE_INSTANCE_TYPE = c3.8xlarge +FORCE_SPOT_MASTER = True +SPOT_BID = 0.6 +PLUGINS = raid, gitpull + +[cluster c4.8xlarge] +EXTENDS = c4.large +NODE_INSTANCE_TYPE = c4.8xlarge +FORCE_SPOT_MASTER = True +SPOT_BID = 0.6 +PLUGINS = raid, gitpull + +[cluster ipcluster] +EXTENDS = c3.large +PLUGINS = ipcluster + +############################# +## Configuring EBS Volumes ## +############################# +# StarCluster can attach one or more EBS volumes to the master and then +# NFS_share these volumes to all of the worker nodes. A new [volume] section +# must be created for each EBS volume you wish to use with StarCluser. The +# section name is a tag for your volume. This tag is used in the VOLUMES +# setting of a cluster template to declare that an EBS volume is to be mounted +# and nfs shared on the cluster. 
(see the commented VOLUMES setting in the +# example 'smallcluster' template above) Below are some examples of defining +# and configuring EBS volumes to be used with StarCluster: + +[volume ipcluster] +volume_id= +mount_path=/ipcluster + +# Sections starting with "volume" define your EBS volumes +# [volume biodata] +# attach vol-c9999999 to /home on master node and NFS-shre to worker nodes +# VOLUME_ID = vol-c999999 +# MOUNT_PATH = /home + +# Same volume as above, but mounts to different location +# [volume biodata2] +# VOLUME_ID = vol-c999999 +# MOUNT_PATH = /opt/ + +# Another volume example +# [volume oceandata] +# VOLUME_ID = vol-d7777777 +# MOUNT_PATH = /mydata + +# By default StarCluster will attempt first to mount the entire volume device, +# failing that it will try the first partition. If you have more than one +# partition you will need to set the PARTITION number, e.g.: +# [volume oceandata] +# VOLUME_ID = vol-d7777777 +# MOUNT_PATH = /mydata +# PARTITION = 2 + +############################################ +## Configuring Security Group Permissions ## +############################################ +# Sections starting with "permission" define security group rules to +# automatically apply to newly created clusters. IP_PROTOCOL in the following +# examples can be can be: tcp, udp, or icmp. CIDR_IP defaults to 0.0.0.0/0 or +# "open to the # world" + +# open port 80 on the cluster to the world +# [permission http] +# IP_PROTOCOL = tcp +# FROM_PORT = 80 +# TO_PORT = 80 + +# open https on the cluster to the world +# [permission https] +# IP_PROTOCOL = tcp +# FROM_PORT = 443 +# TO_PORT = 443 + +# open port 80 on the cluster to an ip range using CIDR_IP +# [permission http] +# IP_PROTOCOL = tcp +# FROM_PORT = 80 +# TO_PORT = 80 +# CIDR_IP = 18.0.0.0/8 + +# restrict ssh access to a single ip address () +[permission ssh] +IP_PROTOCOL = tcp +FROM_PORT = 22 +TO_PORT = 22 +# CIDR_IP = 160.39.203.0/24 + + +##################################### +## Configuring StarCluster Plugins ## +##################################### +# Sections starting with "plugin" define a custom python class which perform +# additional configurations to StarCluster's default routines. These plugins +# can be assigned to a cluster template to customize the setup procedure when +# starting a cluster from this template (see the commented PLUGINS setting in +# the 'smallcluster' template above). Below is an example of defining a user +# plugin called 'myplugin': + +# [plugin myplugin] +# NOTE: myplugin module must either live in ~/.starcluster/plugins or be +# on your PYTHONPATH +# SETUP_CLASS = myplugin.SetupClass +# extra settings are passed as __init__ arguments to your plugin: +# SOME_PARAM_FOR_MY_PLUGIN = 1 +# SOME_OTHER_PARAM = 2 + +###################### +## Built-in Plugins ## +###################### + +[plugin raid] +SETUP_CLASS = raid.RaidAutomator +n_tb = 4 # set this to a different value to add more or less disk space +availability_zone = us-east-1e + +[plugin gitpull] +SETUP_CLASS = gitpull.DownloadRepo +dir_name = /data/software + +[plugin ipcluster] +setup_class = starcluster.plugins.ipcluster.IPCluster +enable_notebook = True +notebook_directory = notebooks +# set a password for the notebook for increased security +notebook_passwd = +packer = pickle + +[plugin ipclusterrestart] +SETUP_CLASS = starcluster.plugins.ipcluster.IPClusterRestartEngines + +# The following plugins ship with StarCluster and should work out-of-the-box. +# Uncomment as needed. Don't forget to update your PLUGINS list! 
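+# (As an illustration only: enabling one of the plugins below in one of the templates
+# defined above would mean extending that template's plugin line to something like
+# "PLUGINS = raid, gitpull, pkginstaller".)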
+# See http://star.mit.edu/cluster/docs/latest/plugins for plugin details. +# +# Use this plugin to install one or more packages on all nodes +# [plugin pkginstaller] +# SETUP_CLASS = starcluster.plugins.pkginstaller.PackageInstaller +# # list of apt-get installable packages +# PACKAGES = mongodb, python-pymongo +# +# Use this plugin to create one or more cluster users and download all user ssh +# keys to $HOME/.starcluster/user_keys/-.tar.gz +# [plugin createusers] +# SETUP_CLASS = starcluster.plugins.users.CreateUsers +# NUM_USERS = 30 +# # you can also comment out NUM_USERS and specify exact usernames, e.g. +# # usernames = linus, tux, larry +# DOWNLOAD_KEYS = True +# +# Use this plugin to configure the Condor queueing system +# [plugin condor] +# SETUP_CLASS = starcluster.plugins.condor.CondorPlugin +# +# The SGE plugin is enabled by default and not strictly required. Only use this +# if you want to tweak advanced settings in which case you should also set +# DISABLE_QUEUE=TRUE in your cluster template. See the plugin doc for more +# details. +# [plugin sge] +# SETUP_CLASS = starcluster.plugins.sge.SGEPlugin +# Master_IS_EXEC_HOST = False +# +# The IPCluster plugin configures a parallel IPython cluster with optional +# web notebook support. This allows you to run Python code in parallel with low +# latency message passing via ZeroMQ. +# [plugin ipcluster] +# SETUP_CLASS = starcluster.plugins.ipcluster.IPCluster +# # Enable the IPython notebook server (optional) +# ENABLE_NOTEBOOK = True +# # Set a password for the notebook for increased security +# # This is optional but *highly* recommended +# NOTEBOOK_PASSWD = a-secret-password +# # Set a custom directory for storing/loading notebooks (optional) +# NOTEBOOK_DIRECTORY = /path/to/notebook/dir +# # Set a custom packer. Must be one of 'json', 'pickle', or 'msgpack' +# # This is optional. +# PACKER = pickle +# +# Use this plugin to create a cluster SSH "dashboard" using tmux. The plugin +# creates a tmux session on the master node that automatically connects to all +# the worker nodes over SSH. Attaching to the session shows a separate window +# for each node and each window is logged into the node via SSH. +# [plugin tmux] +# SETUP_CLASS = starcluster.plugins.tmux.TmuxControlCenter +# +# Use this plugin to change the default MPI implementation on the +# cluster from OpenMPI to MPICH2. +# [plugin mpich2] +# SETUP_CLASS = starcluster.plugins.mpich2.MPICH2Setup +# +# Configure a hadoop cluster. 
(includes dumbo setup) +# [plugin hadoop] +# SETUP_CLASS = starcluster.plugins.hadoop.Hadoop +# +# Configure a distributed MySQL Cluster +# [plugin mysqlcluster] +# SETUP_CLASS = starcluster.plugins.mysql.MysqlCluster +# NUM_REPLICAS = 2 +# DATA_MEMORY = 80M +# INDEX_MEMORY = 18M +# DUMP_FILE = test.sql +# DUMP_INTERVAL = 60 +# DEDICATED_QUERY = True +# NUM_DATA_NODES = 2 +# +# Install and setup an Xvfb server on each cluster node +# [plugin xvfb] +# SETUP_CLASS = starcluster.plugins.xvfb.XvfbSetup From 4f16d1140610f98f0698a0f74087d391ef566a8d Mon Sep 17 00:00:00 2001 From: ajc Date: Tue, 3 Nov 2015 11:02:24 -0500 Subject: [PATCH 02/68] added sam iterators (by-line and by-chunk) to seqc.sam --- src/seqc/sam.py | 85 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/src/seqc/sam.py b/src/seqc/sam.py index be56f0b..dae1cc1 100644 --- a/src/seqc/sam.py +++ b/src/seqc/sam.py @@ -352,4 +352,87 @@ def assess_outliers(samfile): 1: total input reads 2: aligned reads """ - raise NotImplementedError \ No newline at end of file + raise NotImplementedError + + +def iterate(samfile): + """iterate over the lines of a .sam or .bam file""" + if samfile.endswith('.bam'): + + from subprocess import PIPE, Popen, check_output + import sys + + samtools = check_output(['which', 'samtools']) + if not samtools: + print('Samtools binary not found on PATH. Exiting') + sys.exit(1) + else: + samtools = samtools.decode('UTF-8').strip() + args = [samtools, 'view', samfile] + p = Popen(args, stdout=PIPE, bufsize=256) + + while True: + line = p.stdout.readline() + + if not line: + break + + yield line.decode('UTF-8') + + elif samfile.endswith('.sam'): + with open(samfile, 'r') as f: + for line in f: + if line.startswith('@'): + continue + else: + yield line + else: + raise ValueError('file extension not recognized') + + +def iterate_chunk(samfile, n): + """read n gigabytes of sequencing information at a time.""" + + if samfile.endswith('.bam'): + + from subprocess import PIPE, Popen, check_output + import sys + + samtools = check_output(['which', 'samtools']) + if not samtools: + print('Samtools binary not found on PATH. 
Exiting') + sys.exit(1) + else: + samtools = samtools.decode('UTF-8').strip() + args = [samtools, 'view', samfile] + p = Popen(args, stdout=PIPE, bufsize=-1) + + while True: + lines = p.stdout.readlines(n * 1048576) + + if not lines: + break + lines = [line.decode('UTF-8') for line in lines] + + yield lines + + elif samfile.endswith('.sam'): + with open(samfile, 'r') as f: + first_lines = f.readlines(n * 1048576) + first_lines = [line for line in first_lines if not + line.startswith('@')] + + while True: + lines = f.readlines(n * 1048576) + + if first_lines: + lines = first_lines + lines + first_lines = None + + if not lines: + break + + yield lines + + else: + raise ValueError('file extension not recognized') From 3ce18f2d911fe2fb5f35a22404d8e39af54db437 Mon Sep 17 00:00:00 2001 From: ajc Date: Tue, 3 Nov 2015 11:04:46 -0500 Subject: [PATCH 03/68] removed convert_features import from seqc.sam --- src/seqc/sam.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/seqc/sam.py b/src/seqc/sam.py index dae1cc1..4103ec9 100644 --- a/src/seqc/sam.py +++ b/src/seqc/sam.py @@ -1,6 +1,5 @@ __author__ = 'ambrose' -from seqc.convert_features import construct_feature_table from threading import Thread from queue import Queue, Full, Empty from time import sleep From d98e10763e8eda928d6df5b944daf4b0a1fc0a5a Mon Sep 17 00:00:00 2001 From: ajc Date: Tue, 3 Nov 2015 12:40:39 -0500 Subject: [PATCH 04/68] README.md rewrite; reformatted gitpull.py and raid.py starcluster plugins; added new OAuth cert to gitpull.py plugin to replace broken one --- README.md | 177 ++++++++++++++++++++++++++++++++++++++++- src/plugins/gitpull.py | 28 ++++--- src/plugins/raid.py | 109 +++++++++++++------------ 3 files changed, 249 insertions(+), 65 deletions(-) diff --git a/README.md b/README.md index bd23f5d..62cf6d9 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,175 @@ -# seqc -Single-Cell Sequencing Quality Control and Processing Software +# SEquence Quality Control + +## Setting up SEQC and starcluster + +### Creating a new AWS account: +1. Navigate here and click “Create an AWS Account.” +2. Enter your desired login e-mail and click the “I am a new user” radio button. +3. Fill out the Login Credentials with your name and password. +4. Fill out your contact information, read the AWS Customer Agreement, and accept if you +wish to create an account. +5. Save your AWS Access ID and Secret Key -- this information is very important! + +### Create an RSA key to launch a new cluster: +1. Sign into your AWS account and go to the EC2 Dashboard. +2. Click “Key Pairs” in the NETWORK & SECURITY tab. +3. Click “Create Key Pair” and give it a new name. +4. This will install a new key called .pem on your local machine. +5. Rename the key to an .rsa extension and move it to a secure directory. +6. example: `.rsa` and move it to a directory (e.g. `~/.ssh/`) +7. Change the permission settings with `$> chmod 600 /path/to/key/keyname.rsa` + +### Next, install StarCluster by cloning the Github repo. +1. `$> git clone https://github.com/ambrosejcarr/StarCluster.git` +2. `$> sudo python setup.py install` + +### Set up the default StarCluster config file. +1. Type `starcluster help` and enter `2` +2. Under `[aws info]` enter the following information: + 1. `AWS_ACCESS_KEY_ID = #access_id` (This is your AWS Access ID from Step (1)) + 2. `AWS_SECRET_ACCESS_KEY = #secret_key` (This is your AWS Secret Key from Step (1)) + 3. `AWS_USER_ID= #user_id` + 4. 
Click on your username on the top right corner of the AWS dashboard and click + “My Account” -- your Account Id should pop up at the top of the page (a 12-digit + number) + 5. `AWS_REGION_NAME = us-east-1a` (or any other of 1b, 1c, 1d, 1e - they have + different spot prices) +3. Under Defining EC2 Keypairs: + 1. Create a section called: `[key ]` + 2. below, write: `KEY_LOCATION=~/.ssh/.rsa` + 3. Under `[cluster smallcluster]`: + 4. `KEYNAME = ` + 5. `NODE_IMAGE_ID = %(x86_64_ami)s` + 6. `SUBNET_ID=subnet-99999999` + 1. Go to the “VPC” under the Networking Tab in the AWS Dashboard + 2. Click on “Subnets” + 3. Find the appropriate subnet according to your availability zone + (e.g. “subnet-99999999”) and write it there + 7. `AVAILABILITY_ZONE = us-east-1a` (or other, if you changed this in 2.5) + +### Start a cluster: +1. `$> starcluster start -c ` +2. Wait until the cluster is finished setting up. Then, the cluster can be accessed +using: +3. `$> starcluster sshmaster -X ` (-X gives you x-window plotting capability) +4. To exit the cluster, simply type “exit”. +5. Other things like `starcluster stop `, `terminate`, `start -c`, etc. +6. You can also copy files to/from the cluster using the put and get commands. +To copy a file or entire directory from your local computer to the cluster: +`$> starcluster put mycluster /path/to/local/file/or/dir /remote/path/` +7. To copy a file or an entire directory from the cluster to your local computer: +`$> starcluster get mycluster /path/to/remote/file/or/dir /local/path/` + + +## Running SEQC: + +After SEQC is installed, help can be listed: + + $> SEQC -h + usage: SEQC [-h] {in-drop,drop-seq,mars-seq,cel-seq,avo-seq,strt-seq} ... + + positional arguments: + {in-drop,drop-seq,mars-seq,cel-seq,avo-seq,strt-seq} + library construction method types + in-drop in-drop help + drop-seq drop-seq help + mars-seq mars-seq help + cel-seq cel-seq help + avo-seq avo-seq help + strt-seq strt-seq help + + optional arguments: + -h, --help show this help message and exit + + +Help on parsing individual data types can be obtained by typing: + + $> SEQC in-drop -h + usage: SEQC in-drop [-h] [-i I] [-n N] [-o O] [-b B] [-f [F [F ...]]] + [-r [R [R ...]]] [-s [S]] [-m M] [-l L] + [--star-args SA [SA ...]] [--list-default-star-args] + + optional arguments: + -h, --help show this help message and exit + + Required Arguments: + -i I, --index I star alignment index folder. This folder will be + created if it does not exist + -n N, --n-threads N number of threads to run + -o O, --output-file O + stem of filename in which to store output + -b B, --barcodes B location of serialized barcode object. + + Input Files: + pass one input file type: sam (-s), raw fastq (-f, [-r]), or processed + fastq (-m) + + -f [F [F ...]], --forward [F [F ...]] + forward fastq file(s) + -r [R [R ...]], --reverse [R [R ...]] + reverse fastq file(s) + -s [S], --sam [S] sam file(s) containing aligned, pre-processed reads + -m M, --merged-fastq M + fastq file containing merged, pre-processed records + + Optional arguments for disambiguation: + -l L, --frag-len L the number of bases from the 3 prime end to consider + when determining trancript overlaps + + Optional arguments for STAR aligner: + --star-args SA [SA ...] + additional arguments for STAR. Pass as arg=value + without leading "--". e.g. runMode=alignReads + --list-default-star-args + list SEQDB default args for the STAR aligner + + +All SEQC runs require that you pass a SEQC index (-i/--index). 
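+As an illustration, a hypothetical in-drop run (every file name below is a
+placeholder) might be invoked as:
+
+    $> SEQC in-drop -i <index_folder> -n <n_threads> -o <output_stem> \
+       -b <barcodes.p> -f <forward.fastq> -r <reverse.fastq>
+
+The `-i` argument points at the index folder.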
These are STAR indices, +augmented by SEQC-specific files: +1. `annotations.gtf`: a modified GTF file, containing truncated sequence sizes that + reflect that we expect all data to fall within ~ 1kb of the transcriptional termination + sites. In addition, transcripts are tagged with "SCIDs", identifiers that merge + transcripts and genes which cannot be distinguished in the ~ 1kb of sequence that we + expect to observe. +2. `p_coalignments_array.p`: a binary file containing, for each SCID, the probability of + observing a co-alignment to other genes. + +Human and mouse indices can be found on our aws s3 bucket at +`s3://dplab-data/genomes/mm38/` and `s3://dplab-data/genomes/hg38`. These indices +are built from recent releases of ENSEMBL genomes. + +If new indices must be generated, these can be produced by calling: + + $> INDEX= + $> ORG= # valid organisms are hg38 and mm38. Can also produce joint indicies with "[hg38, mm38]" + $> THREADS= # STAR will use this number of threads to produce the index + $> mkdir $INDEX # create index directory + $> python3 -c "import seqc.align; seqc.align.STAR.verify_index($INDEX, $ORG, $THREADS)" + +Some data types require serialized barcode objects (-b/--barcodes). These objects contain +all of the barcodes for an experiment, as they would be expected to be observed. +For example, if you expect to observe the reverse complement of the barcodes you used to +construct the library, then this object should be built from reverse complements. + +These barcode files can be found at `s3://dplab-data/barcodes/`. If you need to generate +a new barcode object, this can be accomplished with the built-in PROCESS_BARCODES utility: + + $> PROCESS_BARCODES -h + usage: PROCESS_BARCODES [-h] [-o O] [-b B [B ...]] [-p P] + [--reverse-complement] + + optional arguments: + -h, --help show this help message and exit + -o O, --output_stem O + name and path for output file + -b B [B ...], --barcode_files B [B ...] 
+ barcode files + -p P, --processor P type of experiment barcodes will be used in + --reverse-complement indicates that barcodes in fastq files are reverse + complements of the barcodes found in barcode files + +Example usage: + +`$> PROCESS_BARCODES -o ./in_drop_barcodes -b -p in-drop --reverse-complement` +would save a new, reverse-complemented barcode object built from `` at +`./in_drop_barcodes.p` diff --git a/src/plugins/gitpull.py b/src/plugins/gitpull.py index 4b2d62d..25c785a 100644 --- a/src/plugins/gitpull.py +++ b/src/plugins/gitpull.py @@ -2,19 +2,21 @@ from starcluster.logger import log from subprocess import check_output, call + class DownloadRepo(ClusterSetup): - def __init__(self, dir_name='/data/software/'): - if not dir_name.endswith('/'): - dir_name += '/' - self.dir_name = dir_name + def __init__(self, dir_name='/data/software/'): + if not dir_name.endswith('/'): + dir_name += '/' + self.dir_name = dir_name - def run(self, nodes, master, user, user_shell, volumes): - folder = self.dir_name - log.info('installing seqc repo onto %s' % folder) + def run(self, nodes, master, user, user_shell, volumes): + folder = self.dir_name + log.info('installing seqc repo onto %s' % folder) - master.ssh.execute("mkdir %s" % folder) - location = folder + "seqc.tar.gz" - master.ssh.execute('curl -H "Authorization: token ab79b034a8caf4ae502f1374014f920a0a9062d7" -L ' - 'https://api.github.com/repos/ambrosejcarr/seqc/tarball > %s' % location) - log.info("seqc.tar.gz has been downloaded in /data/software directory") - master.ssh.execute('pip3 install %s' % location) + master.ssh.execute("mkdir %s" % folder) + location = folder + "seqc.tar.gz" + master.ssh.execute( + 'curl -H "Authorization: token a22b2dc21f902a9a97883bcd136d9e1047d6d076" -L ' + 'https://api.github.com/repos/ambrosejcarr/seqc/tarball > %s' % location) + log.info("seqc.tar.gz has been downloaded in /data/software directory") + master.ssh.execute('pip3 install %s' % location) diff --git a/src/plugins/raid.py b/src/plugins/raid.py index 772ba81..807357b 100644 --- a/src/plugins/raid.py +++ b/src/plugins/raid.py @@ -6,54 +6,63 @@ import os from subprocess import check_output, call + class RaidAutomator(ClusterSetup): - def __init__(self, n_tb=4, availability_zone='us-east-1e'): - self.n_tb = n_tb - self.availability_zone = availability_zone - log.debug('total number of terabytes = %s' % n_tb) - - def run(self, nodes, master, user, user_shell, volumes): - log.info("setting up RAID array on master node...") - vol_num = self.n_tb - vol_size = 1024 - inst_id = master.id - dev_base = "/dev/xvd" - alphabet = string.ascii_lowercase[5:] #starts at f - dev_names = [] - availability_zone = self.availability_zone - - for i in range(int(vol_num)): - log.info("creating volume %s of %s..." 
% (i+1, vol_num)) - vol = check_output(["aws", "ec2", "create-volume", "--size", str(vol_size), "--region", "us-east-1", "--availability-zone", availability_zone, "--volume-type", "gp2"]) - vol_list = vol.decode().split() - vol_id = [s for s in vol_list if 'vol-' in s][0] - - #generating new device id to mount - dev_id = dev_base + alphabet[i] - dev_names.append(dev_id) - - log.info("waiting for volume to become available...") - vol_stat = check_output(["aws", "ec2", "describe-volumes", "--volume-ids", vol_id]) - vol_stat = vol_stat.decode().split() - - while vol_stat[6] != 'available': - time.sleep(10) - vol_stat = check_output(["aws", "ec2", "describe-volumes", "--volume-ids", vol_id]) - vol_stat = vol_stat.decode().split() - log.info("...") - - log.info("attaching new volume...") - call(["aws", "ec2", "attach-volume", "--volume-id", vol_id, "--instance-id",inst_id, "--device", dev_id]) - i+=1 - log.info("successfully attached %s TB in %s volumes!" % (self.n_tb, vol_num)) - - log.info("creating logical RAID device...") - all_dev = ' '.join(dev_names) - log.info('waiting for all volumes to be available...') - time.sleep(10) - master.ssh.execute("sudo mdadm --create --verbose /dev/md0 --level=0 --name=my_raid --raid-devices=%s %s" %(vol_num,all_dev)) - master.ssh.execute("sudo mkfs.ext4 -L my_raid /dev/md0") - master.ssh.execute("sudo mkdir -p /data") - master.ssh.execute("sudo mount LABEL=my_raid /data") - log.info("successfully created RAID array in /data!") - + def __init__(self, n_tb=4, availability_zone='us-east-1e'): + self.n_tb = n_tb + self.availability_zone = availability_zone + log.debug('total number of terabytes = %s' % n_tb) + + def run(self, nodes, master, user, user_shell, volumes): + log.info("setting up RAID array on master node...") + vol_num = self.n_tb + vol_size = 1024 + inst_id = master.id + dev_base = "/dev/xvd" + alphabet = string.ascii_lowercase[5:] # starts at f + dev_names = [] + availability_zone = self.availability_zone + + for i in range(int(vol_num)): + log.info("creating volume %s of %s..." % (i + 1, vol_num)) + vol = check_output( + ["aws", "ec2", "create-volume", "--size", str(vol_size), "--region", + "us-east-1", "--availability-zone", availability_zone, "--volume-type", + "gp2"]) + vol_list = vol.decode().split() + vol_id = [s for s in vol_list if 'vol-' in s][0] + + # generating new device id to mount + dev_id = dev_base + alphabet[i] + dev_names.append(dev_id) + + log.info("waiting for volume to become available...") + vol_stat = check_output( + ["aws", "ec2", "describe-volumes", "--volume-ids", vol_id]) + vol_stat = vol_stat.decode().split() + + while vol_stat[6] != 'available': + time.sleep(10) + vol_stat = check_output( + ["aws", "ec2", "describe-volumes", "--volume-ids", vol_id]) + vol_stat = vol_stat.decode().split() + log.info("...") + + log.info("attaching new volume...") + call(["aws", "ec2", "attach-volume", "--volume-id", vol_id, "--instance-id", + inst_id, "--device", dev_id]) + i += 1 + log.info("successfully attached %s TB in %s volumes!" 
% (self.n_tb, vol_num)) + + log.info("creating logical RAID device...") + all_dev = ' '.join(dev_names) + log.info('waiting for all volumes to be available...') + time.sleep(10) + master.ssh.execute( + "sudo mdadm --create --verbose /dev/md0 --level=0 --name=my_raid " + "--raid-devices=%s %s" % (vol_num, all_dev)) + + master.ssh.execute("sudo mkfs.ext4 -L my_raid /dev/md0") + master.ssh.execute("sudo mkdir -p /data") + master.ssh.execute("sudo mount LABEL=my_raid /data") + log.info("successfully created RAID array in /data!") From c85b63c01f7f3832b255b375a095830f9a446bb3 Mon Sep 17 00:00:00 2001 From: ajc Date: Tue, 3 Nov 2015 12:47:13 -0500 Subject: [PATCH 05/68] minor fixes to README.md --- README.md | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 62cf6d9..4b8be01 100644 --- a/README.md +++ b/README.md @@ -124,15 +124,16 @@ Help on parsing individual data types can be obtained by typing: list SEQDB default args for the STAR aligner -All SEQC runs require that you pass a SEQC index (-i/--index). These are STAR indices, +All SEQC runs require that you pass a SEQC index (`-i/--index`). These are STAR indices, augmented by SEQC-specific files: + 1. `annotations.gtf`: a modified GTF file, containing truncated sequence sizes that - reflect that we expect all data to fall within ~ 1kb of the transcriptional termination - sites. In addition, transcripts are tagged with "SCIDs", identifiers that merge - transcripts and genes which cannot be distinguished in the ~ 1kb of sequence that we - expect to observe. +reflect that we expect all data to fall within ~ 1kb of the transcriptional termination +sites. In addition, transcripts are tagged with "SCIDs", identifiers that merge +transcripts and genes which cannot be distinguished in the ~ 1kb of sequence that we +expect to observe. 2. `p_coalignments_array.p`: a binary file containing, for each SCID, the probability of - observing a co-alignment to other genes. +observing a co-alignment to other genes. Human and mouse indices can be found on our aws s3 bucket at `s3://dplab-data/genomes/mm38/` and `s3://dplab-data/genomes/hg38`. These indices @@ -141,18 +142,22 @@ are built from recent releases of ENSEMBL genomes. If new indices must be generated, these can be produced by calling: $> INDEX= - $> ORG= # valid organisms are hg38 and mm38. Can also produce joint indicies with "[hg38, mm38]" - $> THREADS= # STAR will use this number of threads to produce the index + $> # valid organisms are hg38 and mm38. Can also produce joint indicies with + $> # "[hg38, mm38]" + $> ORG= + $> # STAR will use this number of threads to produce the index + $> THREADS= $> mkdir $INDEX # create index directory $> python3 -c "import seqc.align; seqc.align.STAR.verify_index($INDEX, $ORG, $THREADS)" -Some data types require serialized barcode objects (-b/--barcodes). These objects contain +Some data types require serialized barcode objects (`-b/--barcodes`). These objects contain all of the barcodes for an experiment, as they would be expected to be observed. For example, if you expect to observe the reverse complement of the barcodes you used to construct the library, then this object should be built from reverse complements. These barcode files can be found at `s3://dplab-data/barcodes/`. 
If you need to generate -a new barcode object, this can be accomplished with the built-in PROCESS_BARCODES utility: +a new barcode object, this can be accomplished with the built-in `PROCESS_BARCODES` +utility: $> PROCESS_BARCODES -h usage: PROCESS_BARCODES [-h] [-o O] [-b B [B ...]] [-p P] From 7352049471a378fe0dd58865d8f52a3ac3401a40 Mon Sep 17 00:00:00 2001 From: ajc Date: Tue, 3 Nov 2015 13:02:29 -0500 Subject: [PATCH 06/68] added dependencies to README.md --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 4b8be01..3212c05 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,23 @@ # SEquence Quality Control +## Dependencies: + +Most dependencies will be automatically installed by SEQC. However, there are a few +external dependencies that must be installed in order for certain functions to be enabled. + +1. Amazon Web Services (optional): If you wish to run SEQC on amazon (AWS), +you must set up an AWS account (see below). +2. Starcluster (optional): If you run SEQC on aws, that you create aws compute instances +with the python 2.7 starcluster +package, which streamlines creation and shutdown of servers that are ready to run SEQC. +2. libhdf5: We have provided amazon machine images or "AMIs" with all of starcluster's +external dependencies pre-installed. However, if you wish to use your own machine image, +you will need to install libhdf5. Similarly, if +you wish to parse any intermediate data offline on your local machine, you will need to +install libhdf5. +3. SEQC depends on several python3 packages that are automatically installed and updated. +to view these packages, please view the `setup.py` file packaged with SEQC. + ## Setting up SEQC and starcluster ### Creating a new AWS account: From c6730b23ad510aae428e9a61f70f6a55e0f3e36d Mon Sep 17 00:00:00 2001 From: ajc Date: Tue, 3 Nov 2015 16:25:41 -0500 Subject: [PATCH 07/68] Updated resolve_alignments() to consider only the sub-set of alignments that are supported by the data. Additional testing with full-size files is necessary --- src/seqc/arrays.py | 135 +++++++++++++++++++++++++++++++++++---------- src/seqc/test.py | 7 ++- 2 files changed, 111 insertions(+), 31 deletions(-) diff --git a/src/seqc/arrays.py b/src/seqc/arrays.py index 1d56838..bba077d 100644 --- a/src/seqc/arrays.py +++ b/src/seqc/arrays.py @@ -7,7 +7,6 @@ from itertools import islice, chain from seqc.three_bit import ThreeBit from seqc.qc import UnionFind, multinomial_loglikelihood, likelihood_ratio_test -# from seqc.sam import average_quality from collections import deque, defaultdict from seqc import h5 from scipy.sparse import coo_matrix @@ -73,6 +72,10 @@ def __getitem__(self, item): 'slice is desired us self.get_slice()' % type(item)) + def __iter__(self): + for i in range(len(self)): + yield self[i] + def get_slice(self): min_index = self.index[slice.start, 0] max_index = self.index[slice.stop, 1] @@ -131,6 +134,47 @@ def extend(self, other): new_index = np.vstack([self.index, other_index]) return JaggedArray(new_data, new_index) + # todo add shrink() call to resolve_alignments() + # todo write tests + def shrink(self): + """eliminate any unused space in self._data + + if a method, such as ReadArray.resolve_alignments() has reduced the size of + internal arrays via __setitem__(), there will be unused space in the array. This + method will generate a new array object that uses the minimum amount of memory + + Note that this will generate a copy of self with memory usage <= self. 
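+        (A typical usage, once implemented, would be ja = ja.shrink(), rebinding the
+        name so that the original array loses its last reference.)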
The user + is resposible for deleting the original object so that it can be garbage + collected. + """ + raise NotImplementedError # todo implement + # build a new index; identify any empty space + index = np.zeros(self.index.shape, dtype=self.index.dtype) + empty_size = 0 + current_position = 0 + for i in self._index: + if i[0] > current_position: + empty_size += i[0] - current_position + index[i] = [current_position, len(self._data[i])] + + # check that there is no excess in the final position + last_index = self._index[-1][1] + if last_index > current_position: + empty_size += last_index - current_position + + # if no empty space, return self + if not empty_size: + return self + + # otherwise, build new data + new_size = self.data.shape[0] - empty_size + data = np.zeros(new_size, dtype=self.data.dtype) + for i, v in enumerate(self): + data[index[i][0]:index[i][1]] = v + + return JaggedArray(data, index) + + @staticmethod def size_to_dtype(n): if n > 2 * (2 ** 64): @@ -508,7 +552,7 @@ def group_for_disambiguation(self, required_poly_t): """ indices = np.arange(self.data.shape[0]) - mask = self.mask_failing_cells(n_poly_t_required=required_poly_t) + mask = self.mask_failing_records(n_poly_t_required=required_poly_t) # filter, then merge cell & rmt; this creates a copy of cell + rmt # memory overhead = n * 16b @@ -558,7 +602,7 @@ def group_for_error_correction(self, required_poly_t=0): """ indices = np.arange(self.data.shape[0]) - mask = self.mask_failing_cells(n_poly_t_required=required_poly_t) + mask = self.mask_failing_records(n_poly_t_required=required_poly_t) # filter and merge cell/rmt seq_data = np.vstack([self.data['cell'][mask], @@ -602,22 +646,17 @@ def resolve_alignments(self, expectations, required_poly_t=0, alpha=0.1): elif isinstance(expectations, str): raise FileNotFoundError('could not locate serialized expectations object: %s' % expectations) - elif isinstance(dict): + elif isinstance(expectations, dict): pass # expectations already loaded else: raise TypeError('invalid expecatation object type, must be a dict expectation' ' object or a string filepath') - # store indices, associated with their new features, whenever features are changed - # this will be used to create a new JaggedArray object at the end. 
- changing_features = {} - idx_map = {} - # loop over potential molecules for read_indices in molecules.values(): # get a view of the structarray - putative_molecule = self.data[read_indices] + # putative_molecule = self.data[read_indices] putative_features = self.features[read_indices] # check if all reads have a single, identical feature @@ -639,20 +678,41 @@ def resolve_alignments(self, expectations, required_poly_t=0, alpha=0.1): disjoint_features = putative_features[set_membership == s] disjoint_group_idx = read_indices[set_membership == s] # track index - # get observed counts; delete non-molecules + # get observed counts obs_counter = ArrayCounter(putative_features) - del obs_counter[0] - # check that creating disjoint sets haven't made this trivial - if len(obs_counter) == 1 and len(next(iter(obs_counter))) == 1: + # delete failing molecules + # this should no longer be necessary due to changes in + # mask_failing_records + # del obs_counter[0] + + # get the subset of features that are supported by the data + # these features should be present in all multi-alignments + possible_models = set(disjoint_features[0]) + for molecule_group in disjoint_features: + possible_models.intersection_update(molecule_group) + if not possible_models: # no molecule is supported by data. + continue # could not resolve BUT a subset could be unique! + + # check that creating disjoint sets or eliminating unsupported molecules + # haven't made this trivial + if len(possible_models) == 1: results[disjoint_group_idx] = 2 - continue # no need to calculate probabilities for single model + continue + + # convert possible models to np.array for downstream indexing + possible_models = np.array(list(possible_models)) - # get all potential molecule identities, create container for lh, df - possible_models = np.array( - list(set(chain(*disjoint_features))) - ) - possible_models = possible_models[possible_models != 0] + # # check that creating disjoint sets haven't made this trivial + # if len(obs_counter) == 1 and len(next(iter(obs_counter))) == 1: + # results[disjoint_group_idx] = 2 + # continue # no need to calculate probabilities for single model + + # # get all potential molecule identities, create container for lh, df + # possible_models = np.array( + # list(set(chain(*disjoint_features))) + # ) + # possible_models = possible_models[possible_models != 0] model_likelihoods = np.empty_like(possible_models, dtype=np.float) df = np.empty_like(possible_models, dtype=np.int) @@ -703,6 +763,7 @@ def resolve_alignments(self, expectations, required_poly_t=0, alpha=0.1): self.features[disjoint_group_idx] = passing_models self._data = append_fields(self.data, 'disambiguation_results', results) + # self._features = self.features.shrink() @staticmethod def multi_delete(sorted_deque, *lists): @@ -712,12 +773,30 @@ def multi_delete(sorted_deque, *lists): del l[i] return lists - def mask_failing_cells(self, n_poly_t_required): - return ((self.data['cell'] != 0) & - (self.data['rmt'] != 0) & - (self.data['n_poly_t'] >= n_poly_t_required) & - (self.data['is_aligned']) - ) + def mask_failing_records(self, n_poly_t_required): + """ + generates a boolean mask for any records lacking cell barcodes or rmts, records + that are not aligned, or records missing poly_t sequences + + args: + ----- + n_poly_t_required: the number of 'T' nucleotides that must be present for the + record to be considered to have a poly-T tail. 
+ + returns: + -------- + np.ndarray((n,) dtype=bool) # n = len(self._data) + """ + vbool = ((self.data['cell'] != 0) & + (self.data['rmt'] != 0) & + (self.data['n_poly_t'] >= n_poly_t_required) & + (self.data['is_aligned']) + ) + no_feature = np.array([0]) + has_feature = np.array([False if np.array_equal(v, no_feature) else True for v in + self._features], dtype=np.bool) + + return vbool & has_feature @staticmethod def translate_feature(reference_name, strand, true_position, feature_table, @@ -790,7 +869,7 @@ def from_h5(cls, archive_name): # todo write tests def to_sparse_counts(self, collapse_molecules, n_poly_t_required): - mask = self.mask_failing_cells(n_poly_t_required) + mask = self.mask_failing_records(n_poly_t_required) unmasked_inds = np.arange(self.data.shape[0])[mask] molecule_counts = defaultdict(dict) @@ -880,7 +959,7 @@ def to_sparse_counts(self, collapse_molecules, n_poly_t_required): # todo write tests def unique_features_to_sparse_counts(self, collapse_molecules, n_poly_t_required): - mask = self.mask_failing_cells(n_poly_t_required) + mask = self.mask_failing_records(n_poly_t_required) unmasked_inds = np.arange(self.data.shape[0])[mask] molecule_counts = defaultdict(dict) diff --git a/src/seqc/test.py b/src/seqc/test.py index e12582c..3f69326 100644 --- a/src/seqc/test.py +++ b/src/seqc/test.py @@ -1165,7 +1165,7 @@ def test_process_drop_seq_alignments(self): print(arr) -# @unittest.skip('') +@unittest.skip('') class TestSamToReadArray(unittest.TestCase): def setUp(self): @@ -1432,7 +1432,7 @@ def test_union_find_on_multiple_feature_group_with_repitition_with_overlaps(self @unittest.skip('') -class TestDisambiguation(unittest.TestCase): +class TestResolveAlignments(unittest.TestCase): """this may need more tests for more sophisticated input data.""" def setUp(self): @@ -1469,7 +1469,8 @@ def test_ra_disambiguate(self): ra = pickle.load(f) ra.resolve_alignments(expectations) # 4 == complete disambiguation. - self.assertTrue(np.all(ra.data['disambiguation_results'] == 4)) + vals, counts = np.unique(ra.data['disambiguation_results'], return_counts=True) + self.assertTrue(np.array_equal(vals, np.array([2, 4]))) @unittest.skip('') From 11eeac3a399971c9712503a152aebf35332704f2 Mon Sep 17 00:00:00 2001 From: ajc Date: Wed, 4 Nov 2015 11:00:51 -0500 Subject: [PATCH 08/68] added ability to provide s3 links for index and barcodes instead of local file locations. When provided, SEQC will download these files before initiating the run --- src/scripts/SEQC | 21 +++++++++++++++++---- src/seqc/io_lib.py | 30 ++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/src/scripts/SEQC b/src/scripts/SEQC index 44a3abd..264baff 100755 --- a/src/scripts/SEQC +++ b/src/scripts/SEQC @@ -6,17 +6,18 @@ import pickle import os import shutil from copy import copy +from seqc.io_lib import S3 from seqc.fastq import merge_fastq from seqc.align import STAR from seqc.barcodes import DropSeqCellBarcodes from seqc.arrays import ReadArray from seqc.convert_features import ConvertFeatureCoordinates -import numpy as np import sys import logging import json from datetime import datetime + # todo | allow fetching from s3 # todo | add ability to fetch genome from amazon (provide s3 link OR file path) # todo | add ability to fetch data from geo (provide additional alt. input argument -g) @@ -156,6 +157,13 @@ def set_up(fout, index, barcodes): if not temp_dir.endswith('/'): temp_dir += '/' + # check that index exists. 
If index is an aws link, download the index + if index.startswith('s3://'): + bucket, prefix = S3.split_link(index) + index = temp_dir + 'index/' + cut_dirs = prefix.count('/') + S3.download_files(bucket, prefix, index, cut_dirs) + # obtain gtf file from index argument gtf = index + 'annotations.gtf' if not os.path.isfile(gtf): @@ -165,10 +173,15 @@ def set_up(fout, index, barcodes): if not barcodes: cb = DropSeqCellBarcodes() else: + if barcodes.startswith('s3://'): + bucket, key = S3.split_link(barcodes) + fout = temp_dir + 'barcodes.p' + barcodes = S3.download_file(bucket, key, fout) + # now, we should definitely have the binary file. Load it. with open(barcodes, 'rb') as f: cb = pickle.load(f) - return temp_dir, gtf, cb + return temp_dir, gtf, cb, index def merge(fout, forward, reverse, samfile, merged_fastq, processor, temp_dir, cb): @@ -293,7 +306,7 @@ def clean_up(temp_dir): def in_drop(fout, forward, reverse, samfile, merged_fastq, processor, index, n_threads, frag_len, star_args, barcodes): - temp_dir, gtf, cb = set_up(fout, index, barcodes) + temp_dir, gtf, cb, index = set_up(fout, index, barcodes) merged_fastq = merge(fout, forward, reverse, samfile, merged_fastq, processor, temp_dir, cb) @@ -316,7 +329,7 @@ def in_drop(fout, forward, reverse, samfile, merged_fastq, processor, index, def drop_seq(fout, forward, reverse, samfile, merged_fastq, processor, index, n_threads, frag_len, star_args, barcodes): - temp_dir, gtf, cb = set_up(fout, index, barcodes) + temp_dir, gtf, cb, index = set_up(fout, index, barcodes) merged_fastq = merge(fout, forward, reverse, samfile, merged_fastq, processor, temp_dir, cb) diff --git a/src/seqc/io_lib.py b/src/seqc/io_lib.py index b7412f9..e55df79 100644 --- a/src/seqc/io_lib.py +++ b/src/seqc/io_lib.py @@ -3,7 +3,6 @@ # interfaces with ftp and s3 go here. from glob import glob -import boto3 import os import ftplib from threading import Thread @@ -11,6 +10,11 @@ from subprocess import Popen, check_output, PIPE from itertools import zip_longest +import boto3 +import logging +# turn off boto3 non-error logging, otherwise it logs tons of spurious information +logging.getLogger('boto').setLevel(logging.CRITICAL) + class S3: """A series of methods to upload and download files from amazon s3""" @@ -156,7 +160,6 @@ def upload_files(file_prefix, bucket, key_prefix, cut_dirs=True): for file_, key in zip(all_files, upload_keys): client.upload_file(file_, bucket, key) - @staticmethod def list(bucket, key_prefix): """ @@ -183,6 +186,29 @@ def remove_files(cls, bucket, key_prefix): for k in keys: _ = client.delete_object(Bucket=bucket, Key=k) + @staticmethod + def split_link(link_or_prefix): + """ + take an amazon s3 link or link prefix and return the bucket and key for use with + S3.download_file() or S3.download_files() + + args: + ----- + link_or_prefix: an amazon s3 link, e.g. s3://dplab-data/genomes/mm38/chrStart.txt + link prefix e.g. s3://dplab-data/genomes/mm38/ + + returns: + -------- + bucket: the aws bucket used in the link. Above, this would be dplab-data + key_or_prefix: the aws key or prefix provided in link_or_prefix. 
for the above + examples, either genomes/mm38/chrStart.txt (link) or genomes/mm38/ (prefix) + """ + if not link_or_prefix.startswith('s3://'): + raise ValueError('aws s3 links must start with s3://') + link_or_prefix = link_or_prefix[5:] # strip leading s3:// + bucket, *key_or_prefix = link_or_prefix.split('/') + return bucket, '/'.join(key_or_prefix) + class GEO: From cdd11dc4375fb51f6af4c0965191a067553123be Mon Sep 17 00:00:00 2001 From: ajc Date: Wed, 4 Nov 2015 11:17:59 -0500 Subject: [PATCH 09/68] updated logging to suppress boto3 instead of boto. Suspect this was failing silently --- src/scripts/SEQC | 2 +- src/seqc/io_lib.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/SEQC b/src/scripts/SEQC index 264baff..4d2f979 100755 --- a/src/scripts/SEQC +++ b/src/scripts/SEQC @@ -16,7 +16,7 @@ import sys import logging import json from datetime import datetime - +# logging.getLogger('boto3').setLevel(logging.CRITICAL) # todo | allow fetching from s3 # todo | add ability to fetch genome from amazon (provide s3 link OR file path) diff --git a/src/seqc/io_lib.py b/src/seqc/io_lib.py index e55df79..d10709a 100644 --- a/src/seqc/io_lib.py +++ b/src/seqc/io_lib.py @@ -13,7 +13,7 @@ import boto3 import logging # turn off boto3 non-error logging, otherwise it logs tons of spurious information -logging.getLogger('boto').setLevel(logging.CRITICAL) +logging.getLogger('boto3').setLevel(logging.CRITICAL) class S3: From ada5de7fae2c50fd40f5fabda9fb3634e9629ac2 Mon Sep 17 00:00:00 2001 From: ajc Date: Wed, 4 Nov 2015 12:05:20 -0500 Subject: [PATCH 10/68] added log messages and finally eliminated all irrelevant boto logging from seqc.log --- src/scripts/SEQC | 18 ++++++++++++------ src/seqc/io_lib.py | 1 + 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/scripts/SEQC b/src/scripts/SEQC index 4d2f979..c9d5d3d 100755 --- a/src/scripts/SEQC +++ b/src/scripts/SEQC @@ -18,8 +18,6 @@ import json from datetime import datetime # logging.getLogger('boto3').setLevel(logging.CRITICAL) -# todo | allow fetching from s3 -# todo | add ability to fetch genome from amazon (provide s3 link OR file path) # todo | add ability to fetch data from geo (provide additional alt. input argument -g) # todo | test index generation (pull from seqdb) # todo | merge all of the metadata into a dictionary that is written at the end of the @@ -59,8 +57,9 @@ def process_input(): for i, p in enumerate(subparser_list): r = p.add_argument_group('Required Arguments') # todo needs to take input from user on what organism it should be - r.add_argument('-i', '--index', metavar='I', help='star alignment index folder. ' - 'This folder will be created if it does not exist', default=None) + r.add_argument('-i', '--index', metavar='I', help='local or s3 location of star ' + 'alignment index folder. This folder will be created if it does ' + 'not exist', default=None) r.add_argument('-n', '--n-threads', help='number of threads to run', metavar='N', type=int, default=None) r.add_argument('-o', '--output-file', metavar='O', default=None, @@ -69,7 +68,7 @@ def process_input(): # for all experiments except drop-seq, barcodes are a required input argument if i < 5: r.add_argument('-b', '--barcodes', metavar='B', default=None, - help='location of serialized barcode object.') + help='local or s3 location of serialized barcode object.') i = p.add_argument_group( title='Input Files', @@ -159,6 +158,7 @@ def set_up(fout, index, barcodes): # check that index exists. 
If index is an aws link, download the index if index.startswith('s3://'): + log_info('AWS s3 link provided for index. Downloading index.') bucket, prefix = S3.split_link(index) index = temp_dir + 'index/' cut_dirs = prefix.count('/') @@ -174,9 +174,15 @@ def set_up(fout, index, barcodes): cb = DropSeqCellBarcodes() else: if barcodes.startswith('s3://'): + log_info('AWS s3 link provided for barcodes. Downloading barcodes') bucket, key = S3.split_link(barcodes) fout = temp_dir + 'barcodes.p' - barcodes = S3.download_file(bucket, key, fout) + try: + barcodes = S3.download_file(bucket, key, fout) + except FileExistsError: + barcodes = fout + pass # already have the file in this location from a previous run + # now, we should definitely have the binary file. Load it. with open(barcodes, 'rb') as f: cb = pickle.load(f) diff --git a/src/seqc/io_lib.py b/src/seqc/io_lib.py index d10709a..ecab776 100644 --- a/src/seqc/io_lib.py +++ b/src/seqc/io_lib.py @@ -13,6 +13,7 @@ import boto3 import logging # turn off boto3 non-error logging, otherwise it logs tons of spurious information +logging.getLogger('botocore').setLevel(logging.CRITICAL) logging.getLogger('boto3').setLevel(logging.CRITICAL) From c10170110d7b9ee089906dd1546834115083db6d Mon Sep 17 00:00:00 2001 From: ajc Date: Wed, 4 Nov 2015 12:44:25 -0500 Subject: [PATCH 11/68] wrote documentation for resolve_alignments --- src/seqc/arrays.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/seqc/arrays.py b/src/seqc/arrays.py index bba077d..8a81f23 100644 --- a/src/seqc/arrays.py +++ b/src/seqc/arrays.py @@ -636,6 +636,42 @@ def group_for_error_correction(self, required_poly_t=0): return dict(molecules) def resolve_alignments(self, expectations, required_poly_t=0, alpha=0.1): + """ + Resolve ambiguously aligned molecules and edit the ReadArray data structures + in-place to reflect the more specific gene assignments. + + args: + expectations: serialized coalignment expectation file. Often found in + index/p_coalignment_array.p. Accepts either the loaded object or the filesystem + location of the serialized object + required_poly_t (default: 0): the number of poly_t required for a read to be + considered a valid input for resolution. Reads lacking a poly_t tail are likely + to randomly prime into non-mRNA sequences, so requiring the presence of poly_t + in the 5' end of the forward read can reduce unwanted contamination + alpha (default: 0.1): significance threshold for differentiating potential donor + genes. lowering the alpha value causes the algorithm to require that models + are better differentiated from inferior alternatives to declare the model + differentiated. If the significance threshold is not reached, then the inferior + model cannot be distinguished from the best model, and resolve_alignments fails + to disambiguate the molecule in question. + + actions: + edits self.features and self.data in-place to reflect resolved alignments. adds + a column to self.data called 'disambiguation_results', this is an indicator + column that tracks the result of this method. + 0: no model is fully supported by the data. This represents a molecule with + higher-order complexity. These umi-cell combinations have two molecules + associated with overlapping alignments and cannot be differentiated without + considering multiple molecule contributions. This problem can be mitigated + by increasing the UMI length or decreasing the concentration of input RNA. 
+ 1: trivial result, molecule was unambiguous + 2: separating molecules into disjoint sets and limiting the algorithm to + supported gene fully disambiguated the molecule + 3: molecule was partially disambiguated using the multinomial expectation model + 4: molecule was fully disambiguated using the multinomial expectation model + 5: model failed to remove any ambiguity + """ + molecules, seqs = self.group_for_disambiguation(required_poly_t) results = np.zeros(self.data.shape[0], dtype=np.int8) From 70344b8bc50bb231c995f68b9ff1fee7f2743915 Mon Sep 17 00:00:00 2001 From: ajc Date: Wed, 4 Nov 2015 12:49:32 -0500 Subject: [PATCH 12/68] remove #todo for writing sparse_molecule() and unique_features_to_sparse_counts() functions. This is done --- src/seqc/arrays.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/seqc/arrays.py b/src/seqc/arrays.py index 1d56838..282cce4 100644 --- a/src/seqc/arrays.py +++ b/src/seqc/arrays.py @@ -787,7 +787,6 @@ def from_h5(cls, archive_name): return cls(data, features, positions) - # todo write tests def to_sparse_counts(self, collapse_molecules, n_poly_t_required): mask = self.mask_failing_cells(n_poly_t_required) @@ -877,7 +876,6 @@ def to_sparse_counts(self, collapse_molecules, n_poly_t_required): coo = coo_matrix((values, (row_ind, col_ind)), shape=shape, dtype=dtype) return coo, unq_row, unq_col - # todo write tests def unique_features_to_sparse_counts(self, collapse_molecules, n_poly_t_required): mask = self.mask_failing_cells(n_poly_t_required) From 8ff8dcf1265cfa1a837c173a61c1b24cae7aefe1 Mon Sep 17 00:00:00 2001 From: ajc Date: Wed, 4 Nov 2015 15:00:33 -0500 Subject: [PATCH 13/68] added ReadArray.mask_low_support_molecules() to exclude molecules supported by fewer than required_support reads --- src/seqc/arrays.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/seqc/arrays.py b/src/seqc/arrays.py index 282cce4..f5548cc 100644 --- a/src/seqc/arrays.py +++ b/src/seqc/arrays.py @@ -12,6 +12,7 @@ from seqc import h5 from scipy.sparse import coo_matrix import tables as tb +import pandas as pd import numbers import pickle from types import * @@ -712,6 +713,30 @@ def multi_delete(sorted_deque, *lists): del l[i] return lists + def mask_low_support_molecules(self, required_support=2): + """ + mask any molecule supported by fewer than reads + """ + # use the minimum number of columns to construct the dataframe, not sure if + # pandas copies data. 
+ view = self.data[['cell', 'rmt']] + df = pd.DataFrame(view) + grouped = df.groupby(['cell', 'rmt']) + failing = [] + for idx, g in grouped: + if len(g) < required_support: + failing.extend(g.index) + failing.sort() + ifail = 0 + imask = 0 + mask = np.ones(len(self.data), dtype=np.bool) + while ifail < len(failing): + if imask == failing[ifail]: + mask[imask] = 0 + ifail += 1 + imask += 1 + return mask + def mask_failing_cells(self, n_poly_t_required): return ((self.data['cell'] != 0) & (self.data['rmt'] != 0) & @@ -878,8 +903,9 @@ def to_sparse_counts(self, collapse_molecules, n_poly_t_required): def unique_features_to_sparse_counts(self, collapse_molecules, n_poly_t_required): - mask = self.mask_failing_cells(n_poly_t_required) - unmasked_inds = np.arange(self.data.shape[0])[mask] + read_mask = self.mask_failing_cells(n_poly_t_required) + low_coverage_mask = self.mask_low_support_molecules() + unmasked_inds = np.arange(self.data.shape[0])[read_mask & low_coverage_mask] molecule_counts = defaultdict(dict) if collapse_molecules: From d3e37370ace2788852b69e77f911cf66a44608d4 Mon Sep 17 00:00:00 2001 From: ajc Date: Wed, 4 Nov 2015 15:18:12 -0500 Subject: [PATCH 14/68] added call to low_coverage_mask() to to_sparse_counts() --- src/seqc/arrays.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/seqc/arrays.py b/src/seqc/arrays.py index f5548cc..301279f 100644 --- a/src/seqc/arrays.py +++ b/src/seqc/arrays.py @@ -814,8 +814,10 @@ def from_h5(cls, archive_name): def to_sparse_counts(self, collapse_molecules, n_poly_t_required): - mask = self.mask_failing_cells(n_poly_t_required) - unmasked_inds = np.arange(self.data.shape[0])[mask] + # mask failing cells and molecules with < 2 reads supporting them. + read_mask = self.mask_failing_cells(n_poly_t_required) + low_coverage_mask = self.mask_low_support_molecules() + unmasked_inds = np.arange(self.data.shape[0])[read_mask & low_coverage_mask] molecule_counts = defaultdict(dict) # get a bytes-representation of each feature object @@ -903,6 +905,7 @@ def to_sparse_counts(self, collapse_molecules, n_poly_t_required): def unique_features_to_sparse_counts(self, collapse_molecules, n_poly_t_required): + # mask failing cells and molecules with < 2 reads supporting them. 
read_mask = self.mask_failing_cells(n_poly_t_required) low_coverage_mask = self.mask_low_support_molecules() unmasked_inds = np.arange(self.data.shape[0])[read_mask & low_coverage_mask] From 03efb91cfc70352f95c9bb21361b06efef814232 Mon Sep 17 00:00:00 2001 From: ajc Date: Wed, 4 Nov 2015 16:02:03 -0500 Subject: [PATCH 15/68] added check for at least 1 model; was failing when no model was supported --- src/seqc/arrays.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/seqc/arrays.py b/src/seqc/arrays.py index 8a81f23..bf43221 100644 --- a/src/seqc/arrays.py +++ b/src/seqc/arrays.py @@ -735,6 +735,11 @@ def resolve_alignments(self, expectations, required_poly_t=0, alpha=0.1): if len(possible_models) == 1: results[disjoint_group_idx] = 2 continue + # if we no longer have any supported models, we cannot disambiguate this + # molecule + elif len(possible_models) == 0: + results[disjoint_group_idx] = 5 + continue # convert possible models to np.array for downstream indexing possible_models = np.array(list(possible_models)) From b8ed643401904f665a49b7e2b8834918e1cf669b Mon Sep 17 00:00:00 2001 From: ajc Date: Wed, 4 Nov 2015 21:42:31 -0500 Subject: [PATCH 16/68] added resolve_alignments() back to SEQC; tests passed on full-size CD34-enrichment in-drop data --- src/scripts/SEQC | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/SEQC b/src/scripts/SEQC index 44a3abd..4bc289e 100755 --- a/src/scripts/SEQC +++ b/src/scripts/SEQC @@ -302,7 +302,7 @@ def in_drop(fout, forward, reverse, samfile, merged_fastq, processor, index, arr = process_samfile(samfile, gtf, frag_len) - # resolve_alignments(index, arr, n=0, fout=fout) + resolve_alignments(index, arr, n=0, fout=fout) store_results(fout, arr) @@ -325,7 +325,7 @@ def drop_seq(fout, forward, reverse, samfile, merged_fastq, processor, index, arr = process_samfile(samfile, gtf, frag_len) - # resolve_alignments(index, arr, n=0, fout=fout) + resolve_alignments(index, arr, n=0, fout=fout) arr.save_h5(fout + '.h5') From 1a0175ab3c6e5fa7e12ce0fc77f1f7935ac5e50e Mon Sep 17 00:00:00 2001 From: ajc Date: Thu, 5 Nov 2015 16:46:37 -0500 Subject: [PATCH 17/68] updated index creation. index building can now be called directly from SEQC, and is independent of our AWS --- README.md | 2 +- src/scripts/SEQC | 105 +++++--- src/seqc/align.py | 228 ++++++++++-------- src/seqc/arrays.py | 46 ++++ .../sa_postprocess/ordering_out_stacks.py | 4 +- 5 files changed, 251 insertions(+), 134 deletions(-) diff --git a/README.md b/README.md index 3212c05..ff48a0c 100644 --- a/README.md +++ b/README.md @@ -166,7 +166,7 @@ If new indices must be generated, these can be produced by calling: $> # STAR will use this number of threads to produce the index $> THREADS= $> mkdir $INDEX # create index directory - $> python3 -c "import seqc.align; seqc.align.STAR.verify_index($INDEX, $ORG, $THREADS)" + $> python3 -c "import seqc.align; seqc.align.STAR.build_index($INDEX, $ORG, $THREADS)" Some data types require serialized barcode objects (`-b/--barcodes`). These objects contain all of the barcodes for an experiment, as they would be expected to be observed. 
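Both the index and barcode links above are fetched automatically before a run begins: an s3 link is split into a bucket and key prefix by the S3.split_link helper added earlier in this series and then handed to S3.download_files. A rough sketch of that flow, using only the io_lib interfaces shown above ('index/' is just an arbitrary local target directory chosen for this illustration):

    from seqc.io_lib import S3

    index_link = 's3://dplab-data/genomes/mm38/'
    bucket, prefix = S3.split_link(index_link)  # ('dplab-data', 'genomes/mm38/')
    cut_dirs = prefix.count('/')                # key directories to strip on download
    S3.download_files(bucket, prefix, 'index/', cut_dirs)
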
diff --git a/src/scripts/SEQC b/src/scripts/SEQC index dabd4be..e2d8760 100755 --- a/src/scripts/SEQC +++ b/src/scripts/SEQC @@ -8,7 +8,7 @@ import shutil from copy import copy from seqc.io_lib import S3 from seqc.fastq import merge_fastq -from seqc.align import STAR +from seqc.align import STAR, _download_links from seqc.barcodes import DropSeqCellBarcodes from seqc.arrays import ReadArray from seqc.convert_features import ConvertFeatureCoordinates @@ -16,7 +16,7 @@ import sys import logging import json from datetime import datetime -# logging.getLogger('boto3').setLevel(logging.CRITICAL) + # todo | add ability to fetch data from geo (provide additional alt. input argument -g) # todo | test index generation (pull from seqdb) @@ -102,35 +102,66 @@ def process_input(): parser.print_help() sys.exit(2) - arguments = parser.parse_args() + # add a sub-parser for building the index + pindex = subparsers.add_parser('index', help='SEQC index functions') + pindex.add_argument('-b', '--build', action='store_true', default=False, + help='build a SEQC index') + pindex.add_argument('-t', '--test', action='store_true', default=False, + help='test a SEQC index') + pindex.add_argument('-o', '--organism', required=True, nargs='+', metavar='O', + help='build index for these organism(s)') + pindex.add_argument('-i', '--index', help='name of folder where index should be ' + 'built or containing the index to be verified', + required=True, metavar='I') + pindex.add_argument('-n', '--n-threads', type=int, default=4, help='number of threads' + ' to use when building index', metavar='N') + # todo | phiX is going to need an SCID! will this be automatically generated? Is it + # todo | going to get fucked up by all the post-processing last 1kb stuff? + # todo | change p_coalignment to array + pindex.add_argument('--phix', help='add phiX to the genome index and GTF file.', + action='store_true', default=False) - # list star args if requested, then exit - if arguments.list_default_star_args: - printable_args = json.dumps( - STAR.default_alignment_args('$FASTQ', '$N_THREADS', '$INDEX', './$EXP_NAME/'), - separators=(',', ': '), indent=4) - print(printable_args) - sys.exit(2) + arguments = parser.parse_args() - # check that at least one input argument was passed: - check = [arguments.forward, arguments.reverse, arguments.sam, arguments.merged_fastq] - if not any(check): - print('SEQC %s: error: one or more of the following arguments must be provided: ' - '-f/--forward, -r/--reverse, -m/--merged-fastq, -s/--sam' % - arguments.subparser_name) - sys.exit(2) - required = [arguments.output_file, arguments.index, arguments.n_threads] - if not arguments.subparser_name == 'drop-seq': - if not all(required + [arguments.barcodes]): - print('SEQC %s: error: the following arguments are required: -i/--index, -n/' - '--n-threads, -o/--output-file, -b/--barcodes') + if arguments.subparser_name == 'index': + if arguments.build and not arguments.test: + arguments.func = STAR.build_index + elif arguments.test and not arguments.build: + arguments.func = STAR.test_index + else: + print('SEQC index: error: one but not both of the following arguments must ' + 'be provided: -b/--build, -t/--test') sys.exit(2) else: - if not all(required): - print('SEQC %s: error: the following arguments are required: -i/--index, -n/' - '--n-threads, -o/--output-file') - return arguments + # list star args if requested, then exit + if arguments.list_default_star_args: + printable_args = json.dumps( + STAR.default_alignment_args('$FASTQ', '$N_THREADS', '$INDEX', 
'./$EXP_NAME/'), + separators=(',', ': '), indent=4) + print(printable_args) + sys.exit(2) + + # check that at least one input argument was passed: + check = [arguments.forward, arguments.reverse, arguments.sam, + arguments.merged_fastq] + if not any(check): + print('SEQC %s: error: one or more of the following arguments must be ' + 'provided: -f/--forward, -r/--reverse, -m/--merged-fastq, -s/--sam' % + arguments.subparser_name) + sys.exit(2) + required = [arguments.output_file, arguments.index, arguments.n_threads] + if not arguments.subparser_name == 'drop-seq': + if not all(required + [arguments.barcodes]): + print('SEQC %s: error: the following arguments are required: -i/--index, ' + '-n/--n-threads, -o/--output-file, -b/--barcodes') + sys.exit(2) + else: + if not all(required): + print('SEQC %s: error: the following arguments are required: -i/--index, ' + '-n/--n-threads, -o/--output-file') + + return vars(arguments) def setup_logger(): @@ -310,7 +341,7 @@ def clean_up(temp_dir): def in_drop(fout, forward, reverse, samfile, merged_fastq, processor, index, - n_threads, frag_len, star_args, barcodes): + n_threads, frag_len, star_args, barcodes, **kwargs): temp_dir, gtf, cb, index = set_up(fout, index, barcodes) @@ -333,7 +364,7 @@ def in_drop(fout, forward, reverse, samfile, merged_fastq, processor, index, def drop_seq(fout, forward, reverse, samfile, merged_fastq, processor, index, - n_threads, frag_len, star_args, barcodes): + n_threads, frag_len, star_args, barcodes, **kwargs): temp_dir, gtf, cb, index = set_up(fout, index, barcodes) @@ -375,17 +406,21 @@ def strt_seq(): if __name__ == "__main__": - args = process_input() + kwargs = process_input() setup_logger() try: # log command line arguments for debugging - arg_dict = copy(vars(args)) - del arg_dict['func'] # function is not serializable + arg_copy = copy(kwargs) + del arg_copy['func'] # function is not serializable log_info('Passed command line arguments: %s' % - json.dumps(arg_dict, separators=(',', ': '), indent=4)) - args.func(args.output_file, args.forward, args.reverse, args.sam, - args.merged_fastq, args.subparser_name, args.index, args.n_threads, - args.frag_len, args.star_args, args.barcodes) + json.dumps(arg_copy, separators=(',', ': '), indent=4)) + + func = kwargs['func'] + func(**kwargs) + + # args.func(args.output_file, args.forward, args.reverse, args.sam, + # args.merged_fastq, args.subparser_name, args.index, args.n_threads, + # args.frag_len, args.star_args, args.barcodes) except: logging.exception(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ':main:') raise diff --git a/src/seqc/align.py b/src/seqc/align.py index b8e40b6..80bdb0b 100644 --- a/src/seqc/align.py +++ b/src/seqc/align.py @@ -12,8 +12,7 @@ import bz2 -# todo | make sure organism is provided as a "choice" in the argparser -# define download locations for mouse and human genome files and annotations +# download links for supported genomes on GEO _download_links = dict(hg38={ 'genome': ('ftp://ftp.ensembl.org/pub/release-80/fasta/homo_sapiens/dna/' 'Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz'), @@ -46,16 +45,6 @@ 'Ciona_savignyi.CSAV2.0.81.gtf.gz'), 'cdna': [('ftp://ftp.ensembl.org/pub/release-81/fasta/ciona_savignyi/cdna/' 'Ciona_savignyi.CSAV2.0.cdna.all.fa.gz')] -}, phix={ # use genome for both DNA and cDNA - 'genome': ('ftp://ftp.ncbi.nlm.nih.gov/genomes/Viruses/' - 'Enterobacteria_phage_phiX174_sensu_lato_uid14015/NC_001422.fna'), - 'cdna': ('ftp://ftp.ncbi.nlm.nih.gov/genomes/Viruses/' - 
'Enterobacteria_phage_phiX174_sensu_lato_uid14015/NC_001422.fna'), - # note GFF not GTF, this is not implemented yet - 'gff': ('ftp://ftp.ncbi.nlm.nih.gov/genomes/Viruses/' - 'Enterobacteria_phage_phiX174_sensu_lato_uid14015/NC_001422.gff') - # this is a potential gtf record: - # create a gtf using stringIO? NC_001422.1 RefSeq region 1 5386 . + . ID=id0;Dbxref=taxon:374840;Is_circular=true;gbkey=Src;genome=genomic;mol_type=genomic DNA;nat-host=Escherichia coli }) @@ -69,7 +58,7 @@ def _define_file_names(download_links): else: file_name = link.split('/')[-1] file_names[organism][file_type] = file_name - return file_names + return dict(file_names) _file_names = _define_file_names(_download_links) @@ -81,40 +70,6 @@ def _check_type(obj, type_): class STAR: - # def __init__(self, temp_dir, n_threads, index, *organism): - # """ - # classdocs - # """ - # - # # type-check arguments - # _check_type(temp_dir, str) - # _check_type(n_threads, int) - # _check_type(index, str) - # - # # create temp_dir if it does not exist - # if not os.path.isdir(temp_dir): - # os.makedirs(temp_dir) - # if not temp_dir.endswith('/'): - # temp_dir += '/' - # self.temp_dir = temp_dir - # - # # create index if it does not exist - # if not os.path.isdir(index): - # os.makedirs(index) - # if not index.endswith('/'): - # index += '/' - # self.index = index - # - # # check that organism is a valid argument - # valid_organisms = ['mm38', 'hg38', 'mmhg38', 'cs2', 'ci2'] - # if organism: - # if not all(org in valid_organisms for org in organism): - # raise ValueError('Invalid organism value. Supported organisms: %r' % - # valid_organisms) - # self.organism = organism - # - # self.n_threads = n_threads - @staticmethod def verify_organism(organism): valid_organisms = ['mm38', 'hg38', 'mmhg38', 'cs2', 'ci2'] @@ -151,9 +106,67 @@ def default_alignment_args(fastq_records, n_threads, index, temp_dir): } return default_align_args + @classmethod + def _append_phiX_to_fasta(cls, fasta, cdna=False): + + # download phiX genome + genome_link = ('ftp://ftp.ncbi.nlm.nih.gov/genomes/Viruses/' + 'Enterobacteria_phage_phiX174_sensu_lato_uid14015/NC_001422.fna') + phix_genome = cls._download_ftp_file(genome_link, './') + + # add it to the fasta file + with open(phix_genome, 'rb') as fin: + data = fin.read() + + if cdna: # change phix header to reflect our fake transcript id! 
+ data = data.decode() + data = data.split('\n') + data[0] = ('>ENSPXT00000000001 chromosome:NC_001422.1:1:5386:1 ' + 'gene:ENSPXG00000000001 gene_biotype:genome ' + 'transcript_biotype:genome') + data = ('\n'.join(data)).encode() + + if fasta.endswith('.gz'): + with gzip.open(fasta, 'ab') as fout: + fout.write(data) + elif fasta.endswith('.bz2'): + raise ValueError('appending to bz2 files is not currently supported') + else: + with open(fasta, 'ab') as fout: + fout.write(data) + + os.remove(phix_genome) + + @staticmethod + def _append_phiX_to_gtf(gtf): + # fake up some gtf records for a gene and transcript + gtf_data = '\n'.join([ + '\t'.join([ + 'NC_001422.1', 'RefSeq', 'gene', '1', '5386', '.', '+', '.', + 'ID "id0"; Dbxref "taxon:374840"; Is_circular "true"; gbkey "Src"; ' + 'genome "genomic"; mol_type "genomic DNA"; nat-host "Escherichia coli"; ' + 'gene_id "ENSPXG00000000001";' + ]), '\t'.join([ + 'NC_001422.1', 'RefSeq', 'transcript', '1', '5386', '.', '+', '.', + 'ID "id0"; Dbxref "taxon:374840"; Is_circular "true"; gbkey "Src"; ' + 'genome "genomic"; mol_type "genomic DNA"; nat-host "Escherichia coli"; ' + 'transcript_id "ENSPXT00000000001"; gene_id "ENSPXG00000000001";' + ]) + ]) + + if gtf.endswith('.gz'): + with gzip.open(gtf, 'ab') as fout: + fout.write(gtf_data.encode()) + elif gtf.endswith('.bz2'): + raise ValueError('appending to bz2 files is not currently supported') + else: + with open(gtf, 'ab') as fout: + fout.write(gtf_data.encode()) + @staticmethod def _download_ftp_file(link, prefix, clobber=False): """downloads ftp_file available at 'link' into the 'prefix' directory""" + # check link validity if not link.startswith('ftp://'): raise ValueError( @@ -184,10 +197,11 @@ def _download_ftp_file(link, prefix, clobber=False): finally: ftp.close() + return prefix + file_name + @staticmethod def _gunzip_file(gzipped_file): # todo want to replace with gzip module, but it is too slow! - # python is too damn slow. # with gzip.open(gzipped_file, 'rb') as f: # fout = gzipped_file.replace('.gz', '') # data = f.read() @@ -198,7 +212,7 @@ def _gunzip_file(gzipped_file): @staticmethod def _gzip_files(*file_names): - # todo replace with gzip module + # todo want to replace with gzip module; but it is too slow! 
for f in file_names: call(['gzip', f]) @@ -215,7 +229,7 @@ def _merge_files(fout, *file_names): copyfileobj(fd, wfd, 1024**2 * 128) @classmethod - def _generate_multiorganism_coalignment(cls, index, organism): + def _generate_multiorganism_coalignment(cls, index, organism, phix=True): # keys for coalignment relevant files keys = ['cdna', 'gtf'] @@ -228,9 +242,9 @@ def _generate_multiorganism_coalignment(cls, index, organism): # merge gtf files files = [] - for organism in organism: + for org in organism: try: - files.append(index + _file_names[organism]['gtf']) + files.append(index + _file_names[org]['gtf']) except KeyError: pass # no gtf file for this organism if not len(files) > 0: @@ -245,6 +259,11 @@ def _generate_multiorganism_coalignment(cls, index, organism): cdna_files.append(index + file_) cls._merge_files(names['cdna'], *cdna_files) + # add phix if requested + if phix: + cls._append_phiX_to_fasta(names['cdna'], cdna=True) + cls._append_phiX_to_gtf(names['gtf']) + # label the fasta file labeled_fasta = index + '%s_cdna_all_labeled.fa' % '_'.join(organism) prepare_fasta.standard_run(names['gtf'], names['cdna'], labeled_fasta) @@ -260,13 +279,12 @@ def _generate_multiorganism_coalignment(cls, index, organism): scid_to_tx = index + 'scid_to_feature.txt' ordering_out_stacks.standard_run( index + 'jelly_output_stack_50.txt', scid_to_tx, labeled_fasta, - index + 'p_coalignment.pckl') + index + 'p_coalignment_array.p') create_gtf_reduced.create_gtf(names['gtf'], scid_to_tx, index + 'annotations.gtf') @classmethod - def _generate_coalignment(cls, index, organism): - # todo | some files will need to be ungzipped + def _generate_coalignment(cls, index, organism, phix=True): # organism was of length 1, convert to string organism = organism[0] @@ -276,15 +294,16 @@ def _generate_coalignment(cls, index, organism): merged_cdna = index + organism + '_cdna_all.fa' cls._merge_files(merged_cdna, *cdna_files) - # # merge ncrna and transcriptome files - # ncrna = index + _file_names[organism]['ncrna'] - # tx = index + _file_names[organism]['tx'] - # merged_tx = index + organism + '_cdna_all.fa' - # cls._merge_files(merged_tx, ncrna, tx) - - # label the fasta file + # get the gtf file name gtf = index + _file_names[organism]['gtf'] - labeled_fasta = index + organism + '_cnda_all_labeled.fa' + + # add phix if requested + if phix: + cls._append_phiX_to_fasta(merged_cdna, cdna=True) + cls._append_phiX_to_gtf(gtf) + + # label the fasta with transcript ids + labeled_fasta = index + organism + '_cdna_all_labeled.fa' prepare_fasta.standard_run(gtf, merged_cdna, labeled_fasta) # find julia script @@ -298,11 +317,11 @@ def _generate_coalignment(cls, index, organism): scid_to_tx = index + 'scid_to_feature.txt' ordering_out_stacks.standard_run( index + 'jelly_output_stack_50.txt', scid_to_tx, labeled_fasta, - index + 'p_coalignment.pckl') + index + 'p_coalignment_array.p') create_gtf_reduced.create_gtf(gtf, scid_to_tx, index + 'annotations.gtf') @classmethod - def _build_multiorganism_index(cls, index, organism, n_threads): + def _build_multiorganism_index(cls, index, organism, n_threads, phix=True): """""" # check that genome has been unzipped genome_files = [index + _file_names[org]['genome'] for org in organism] @@ -315,10 +334,14 @@ def _build_multiorganism_index(cls, index, organism, n_threads): merged_genome = index + '%s_genome.fa' % '_'.join(organism) cls._merge_files(merged_genome, *unzipped_files) + # if requested, add phiX + if phix: + cls._append_phiX_to_fasta(merged_genome) + # check that 
coalignment files have been generated gtf = index + 'annotations.gtf' if not os.path.isfile(gtf): - cls._generate_coalignment(index, organism) + cls._generate_coalignment(index, organism) # organism -> list of organisms # make index star_args = [ @@ -335,7 +358,7 @@ def _build_multiorganism_index(cls, index, organism, n_threads): raise ChildProcessError(err) @classmethod - def _build_index(cls, index, organism, n_threads): + def _build_index(cls, index, organism, n_threads, phix=True): """""" # organism was singular, convert to string organism = organism[0] @@ -346,10 +369,14 @@ def _build_index(cls, index, organism, n_threads): if not os.path.isfile(unzipped): cls._gunzip_file(genome) + # if requested, add phiX + if phix: + cls._append_phiX_to_fasta(unzipped) + # check that coalignment files have been generated gtf = index + 'annotations.gtf' if not os.path.isfile(gtf): - cls._generate_coalignment(index, organism) + cls._generate_coalignment(index, organism, phix=phix) # make index star_args = [ @@ -366,41 +393,62 @@ def _build_index(cls, index, organism, n_threads): raise ChildProcessError(err) @classmethod - def verify_index(cls, index, organism, n_threads): + def build_index(cls, index, organism, n_threads, phix=True, **kwargs): + + # ensure index has a terminal '/' + if not index.endswith('/'): + index += '/' # check for expected index files. This list is not comprehensive - all_files = ['annotations.gtf', 'p_coalignment.pckl', 'SA', 'SAIndex', 'Genome', - 'scid_to_feature.txt'] + all_files = ['annotations.gtf', 'p_coalignment_array.p', 'SA', 'SAIndex', + 'Genome', 'scid_to_feature.txt'] + for_removal = [] # container for all the files we're downloading. # check that we have all the files we need to generate the index if not all(os.path.isfile(index + f) for f in all_files): - for organism in organism: - for file_type, name in _file_names[organism].items(): + for org in organism: + for file_type, name in _file_names[org].items(): if isinstance(name, list): for i, n in enumerate(name): if not os.path.isfile(index + n): - link = _download_links[organism][file_type][i] - cls._download_ftp_file(link, index) + link = _download_links[org][file_type][i] + dlfile = cls._download_ftp_file(link, index) + for_removal.extend([dlfile, dlfile.replace('.gz', '')]) else: if not os.path.isfile(index + name): - link = _download_links[organism][file_type] - cls._download_ftp_file(link, index) + link = _download_links[org][file_type] + dlfile = cls._download_ftp_file(link, index) + for_removal.extend([dlfile, dlfile.replace('.gz', '')]) # generate coalignment if necessary - coalignment_files = ['annotations.gtf', 'p_coalignment.pckl', + coalignment_files = ['annotations.gtf', 'p_coalignment_array.p', 'scid_to_feature.txt'] if not all(os.path.isfile(index + f) for f in coalignment_files): if len(organism) > 1: - cls._generate_multiorganism_coalignment(index, organism) + cls._generate_multiorganism_coalignment(index, organism, phix=phix) else: - cls._generate_coalignment(index, organism) + cls._generate_coalignment(index, organism, phix=phix) # build index if necessary index_files = ['SA', 'SAIndex', 'Genome'] if not all(os.path.isfile(index + f) for f in index_files): if len(organism) > 1: - cls._build_multiorganism_index(index, organism, n_threads) + cls._build_multiorganism_index(index, organism, n_threads, phix=phix) else: - cls._build_index(index, organism, n_threads) + cls._build_index(index, organism, n_threads, phix=phix) + + # these methods sometimes generate some additional files that should be 
removed + for_removal.append(index + '%s_cdna_all.fa' % '_'.join(organism)) + for_removal.append(index + '%s_cdna_all_labeled.fa' % '_'.join(organism)) + for_removal.append(index + '%s_genome.fa' % '_'.join(organism)) + for_removal.append(index + '%s.gtf' % '_'.join(organism)) + + for index_file in for_removal: + if os.path.isfile(index_file): + os.remove(index_file) + + @staticmethod + def test_index(index, organism, n_threads, **kwargs): + raise NotImplementedError @classmethod def align(cls, fastq_file, index, n_threads, temp_dir, reverse_fastq_file=None, @@ -530,7 +578,7 @@ def align_multiple_files(cls, fastq_files, index, n_threads, working_dir, @staticmethod def load_index(index): - # set shared memory; todo | this may bug out with larger indices; test! + # set shared memory; this may bug out with larger indices; test! try: _ = check_output(['sysctl', '-w', 'kernel.shmmax=36301783210']) _ = check_output(['sysctl', '-w', 'kernel.shmall=36301783210']) @@ -592,17 +640,3 @@ def alignment_metadata(log_final_out, meta): meta['insertion_size'] = float(lines[21].strip().split('\t')[-1]) return meta - - -# if __name__ == "__main__": -# import sys -# if not len(sys.argv) >= 5: -# print('usage: python3 align.py ' -# ' ... ') -# sys.exit(2) -# temp_dir = sys.argv[1] -# n_threads = int(sys.argv[2]) -# index = sys.argv[3] -# organisms = sys.argv[4:] -# s = STAR(temp_dir, n_threads, index, *organisms) -# s.verify_index() \ No newline at end of file diff --git a/src/seqc/arrays.py b/src/seqc/arrays.py index 0870c59..dcb3393 100644 --- a/src/seqc/arrays.py +++ b/src/seqc/arrays.py @@ -1110,6 +1110,52 @@ def unique_features_to_sparse_counts(self, collapse_molecules, n_poly_t_required coo = coo_matrix((values, (row_ind, col_ind)), shape=shape, dtype=dtype) return coo, unq_row, unq_col + def summarize(self, alignment_metadata=None, save_plots=False): + """return descriptive statistics for this dataset + + Note that this function can take a very long time to run, particularly if + save_plots is not False. + + summarize calculates several statistics: + 1. average reads per molecule, and the read per molecule distribution + 2. average molecules per cell, and the molecule per cell distribution + 3. percentage of records that: + a. align to the transcriptome + b. are phiX + c. are mitochondrial + d. that result from technical errors + e. have a valid cell barcode + f. contribute to molecules (pass all filters) + g. aligned ambiguously + h. aligned ambiguously but were corrected + i. aligned to unique features + 4. nucleotide error rates for: + a. each error type + b. overall average error rate + + # todo implement + if a folder location is provided to save_plots, summarize also generates several + plots: + 1. Cell Yield across a moving threshold + 2. Impact of UMI/RMT correction + 3. Impact of disambiguation & error correction + 4. GC bias across cell barcodes + 5. filter correlation heatmap + + args: + ----- + save_plots (str): folder location where plots should be saved. If False, not plots + are generated + + returns: + dictionary of summary statistics. 
If save_plots is not False, writes the result + to save_plots/summary.json + """ + + # get record filter percentages + metadata = {} + + def outer_join(left, right): """ diff --git a/src/seqc/sa_postprocess/ordering_out_stacks.py b/src/seqc/sa_postprocess/ordering_out_stacks.py index c766a84..6c67bce 100644 --- a/src/seqc/sa_postprocess/ordering_out_stacks.py +++ b/src/seqc/sa_postprocess/ordering_out_stacks.py @@ -266,10 +266,12 @@ def standard_run(input_file, nums_to_transcripts_path, labeled_fasta, pickle_des sc_translations) print("Now just have to pickle...") + # translate dict to arrays + overlap_probabilities = translate_feature_dict_to_arrays(overlap_probabilities) pickle_dictionary(overlap_probabilities, pickle_destination) print(time.time() - start_time, "seconds to complete") if __name__ == "__main__": standard_run("jelly_output_stack_50.txt", "nums_to_transcripts_99.txt", - "my_labeled_fasta.fa", "p_coalignment.pckl") + "my_labeled_fasta.fa", "p_coalignment_array.p") From 4eca32d980a361333a21aab25f25883265f18eb6 Mon Sep 17 00:00:00 2001 From: ajc Date: Thu, 5 Nov 2015 16:51:29 -0500 Subject: [PATCH 18/68] added ability to do simple checks on indices via SEQC --- src/seqc/align.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/seqc/align.py b/src/seqc/align.py index 80bdb0b..5022a40 100644 --- a/src/seqc/align.py +++ b/src/seqc/align.py @@ -448,7 +448,13 @@ def build_index(cls, index, organism, n_threads, phix=True, **kwargs): @staticmethod def test_index(index, organism, n_threads, **kwargs): - raise NotImplementedError + critical_files = ['annotations.gtf', 'p_coalignment_array.p', 'SA', 'SAIndex', + 'Genome', 'scid_to_feature.txt'] + # check that we have all the files we need to generate the index + if not all(os.path.isfile(index + f) for f in critical_files): + print('Invalid Index') + else: + print('Valid Index') @classmethod def align(cls, fastq_file, index, n_threads, temp_dir, reverse_fastq_file=None, From dad1d9943685040b8cc0b80b29e1112b43975e2b Mon Sep 17 00:00:00 2001 From: ajc Date: Thu, 5 Nov 2015 17:47:13 -0500 Subject: [PATCH 19/68] refactored logging to work across modules. Added logging to index creation --- src/scripts/SEQC | 25 ++++++++++--------- src/scripts/complete_sa.jl | 6 ++--- src/seqc/align.py | 19 ++++++++++---- src/seqc/log.py | 18 +++++++++++++ .../sa_postprocess/ordering_out_stacks.py | 23 +++++++++-------- src/seqc/sa_preprocess/prepare_fasta.py | 10 ++------ 6 files changed, 62 insertions(+), 39 deletions(-) create mode 100644 src/seqc/log.py diff --git a/src/scripts/SEQC b/src/scripts/SEQC index e2d8760..e4905fa 100755 --- a/src/scripts/SEQC +++ b/src/scripts/SEQC @@ -11,15 +11,15 @@ from seqc.fastq import merge_fastq from seqc.align import STAR, _download_links from seqc.barcodes import DropSeqCellBarcodes from seqc.arrays import ReadArray +from seqc.log import setup_logger, log_info, log_exception from seqc.convert_features import ConvertFeatureCoordinates import sys -import logging import json -from datetime import datetime +# import logging +# from datetime import datetime # todo | add ability to fetch data from geo (provide additional alt. 
input argument -g) -# todo | test index generation (pull from seqdb) # todo | merge all of the metadata into a dictionary that is written at the end of the # todo | pipeline or when any errors occur # todo | add ability to detect and quantify genomic/mitochondrial/other non-tx alignments @@ -164,14 +164,14 @@ def process_input(): return vars(arguments) -def setup_logger(): - """create a simple log file in the cwd to track progress and any errors""" - logging.basicConfig(filename='seqc.log', level=logging.DEBUG) - - -def log_info(message): - """print a timestamped update for the user""" - logging.info(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ':' + message) +# def setup_logger(): +# """create a simple log file in the cwd to track progress and any errors""" +# logging.basicConfig(filename='seqc.log', level=logging.DEBUG) +# +# +# def log_info(message): +# """print a timestamped update for the user""" +# logging.info(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ':' + message) def set_up(fout, index, barcodes): @@ -422,5 +422,6 @@ if __name__ == "__main__": # args.merged_fastq, args.subparser_name, args.index, args.n_threads, # args.frag_len, args.star_args, args.barcodes) except: - logging.exception(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ':main:') + log_exception() + # logging.exception(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ':main:') raise diff --git a/src/scripts/complete_sa.jl b/src/scripts/complete_sa.jl index cd2ac29..1a186c9 100755 --- a/src/scripts/complete_sa.jl +++ b/src/scripts/complete_sa.jl @@ -490,8 +490,8 @@ function main(fasta_in::String, text_out::String, k_value::Int) string_semi = join(semi_translated) final = append!(translated, [0, 0, 0]) - println("Starting to make sa.") - println(total_len) + #println("Starting to make sa.") + #println(total_len) sa = suffix_array(final, zeros(Int, total_len), total_len, 5 + length(genes) + 1) lcp = get_lcp(sa + 1, translated[1:end-3]) @@ -504,7 +504,7 @@ function main(fasta_in::String, text_out::String, k_value::Int) #println(cld_tab .- 1) close(out_file) - println("Done making sa.") + #println("Done making sa.") end #println(ARGS) diff --git a/src/seqc/align.py b/src/seqc/align.py index 5022a40..4878fbc 100644 --- a/src/seqc/align.py +++ b/src/seqc/align.py @@ -6,6 +6,7 @@ from collections import defaultdict from seqc.sa_preprocess import prepare_fasta from seqc.sa_postprocess import ordering_out_stacks, create_gtf_reduced +from seqc.log import log_info import seqc import ftplib import gzip @@ -273,6 +274,7 @@ def _generate_multiorganism_coalignment(cls, index, organism, phix=True): '/scripts/complete_sa.jl') # process with julia script + log_info('Generating co-alignment suffix array.') call(['julia', julia_script, labeled_fasta, index, '50']) # complete processing @@ -344,6 +346,7 @@ def _build_multiorganism_index(cls, index, organism, n_threads, phix=True): cls._generate_coalignment(index, organism) # organism -> list of organisms # make index + log_info('Beginning to generate STAR index.') star_args = [ 'STAR', '--genomeDir', index, @@ -352,10 +355,11 @@ def _build_multiorganism_index(cls, index, organism, n_threads, phix=True): '--genomeFastaFiles', merged_genome, '--sjdbGTFfile', index + 'annotations.gtf', '--sjdbOverhang', '75'] - star = Popen(star_args, stderr=PIPE) + star = Popen(star_args, stdout=PIPE, stderr=PIPE) _, err = star.communicate() if err: raise ChildProcessError(err) + log_info('Finished successfully. 
Run Complete.') @classmethod def _build_index(cls, index, organism, n_threads, phix=True): @@ -379,6 +383,7 @@ def _build_index(cls, index, organism, n_threads, phix=True): cls._generate_coalignment(index, organism, phix=phix) # make index + log_info('Beginning to generate STAR index.') star_args = [ 'STAR', '--genomeDir', index, @@ -387,10 +392,11 @@ def _build_index(cls, index, organism, n_threads, phix=True): '--genomeFastaFiles', unzipped, '--sjdbGTFfile', index + 'annotations.gtf', '--sjdbOverhang', '75'] - star = Popen(star_args, stderr=PIPE) - _, err = star.communicate() + star = Popen(star_args, stdout=PIPE, stderr=PIPE) + out, err = star.communicate() if err: raise ChildProcessError(err) + log_info('Finished successfully. Run Complete.') @classmethod def build_index(cls, index, organism, n_threads, phix=True, **kwargs): @@ -403,7 +409,8 @@ def build_index(cls, index, organism, n_threads, phix=True, **kwargs): all_files = ['annotations.gtf', 'p_coalignment_array.p', 'SA', 'SAIndex', 'Genome', 'scid_to_feature.txt'] for_removal = [] # container for all the files we're downloading. - # check that we have all the files we need to generate the index + + log_info("Downloading genome files.") if not all(os.path.isfile(index + f) for f in all_files): for org in organism: for file_type, name in _file_names[org].items(): @@ -447,7 +454,9 @@ def build_index(cls, index, organism, n_threads, phix=True, **kwargs): os.remove(index_file) @staticmethod - def test_index(index, organism, n_threads, **kwargs): + def test_index(index, **kwargs): + if not index.endswith('/'): + index += '/' critical_files = ['annotations.gtf', 'p_coalignment_array.p', 'SA', 'SAIndex', 'Genome', 'scid_to_feature.txt'] # check that we have all the files we need to generate the index diff --git a/src/seqc/log.py b/src/seqc/log.py new file mode 100644 index 0000000..259003d --- /dev/null +++ b/src/seqc/log.py @@ -0,0 +1,18 @@ +__author__ = 'ambrose' + +import logging +from datetime import datetime + + +def setup_logger(): + """create a simple log file in the cwd to track progress and any errors""" + logging.basicConfig(filename='seqc.log', level=logging.DEBUG) + + +def log_info(message): + """print a timestamped update for the user""" + logging.info(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ':' + message) + + +def log_exception(): + logging.exception(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ':main:') diff --git a/src/seqc/sa_postprocess/ordering_out_stacks.py b/src/seqc/sa_postprocess/ordering_out_stacks.py index 6c67bce..e8983cb 100644 --- a/src/seqc/sa_postprocess/ordering_out_stacks.py +++ b/src/seqc/sa_postprocess/ordering_out_stacks.py @@ -5,6 +5,7 @@ import time import pickle as pickle from seqc.sa_postprocess import set_classes as sc +from seqc.log import log_info from scipy.stats import poisson @@ -26,7 +27,7 @@ def create_spots(jelly_file): gene_spots = {} - print("Reading through file, creating gene spots.") + log_info("Reading through SA file, creating gene spots (1/2)") line_count = 0 for line in in_read: lsp = line.strip().split("\t") @@ -42,7 +43,7 @@ def create_spots(jelly_file): def determine_probabilities(gene_spots, in_read): - print("Beginning to examine gene-level information.") + log_info("Beginning to examine gene-level information (1/2).") gene_start = time.time() big_dictionary = {} giant_dictionary = {} @@ -109,7 +110,7 @@ def determine_sc_groups(big_dictionary, giant_dictionary, all_genes = set(all_genes) - print("Done reading file") + log_info("Done reading file.") dj = 
sc.DisjointSet() set_dictionary = {} @@ -118,7 +119,7 @@ def determine_sc_groups(big_dictionary, giant_dictionary, set_dictionary[gene] = new_aset dj.makeSet(new_aset) - print("joining up transcripts") + log_info("Joining overlapping transcripts.") for gene in sorted(giant_dictionary.keys()): for pair in giant_dictionary[gene]: # if we only want to merge features if they belong to same gene, need the next if. If we want to just reduce features, just do if True: @@ -132,7 +133,7 @@ def determine_sc_groups(big_dictionary, giant_dictionary, pass #print "Apparently missing one of", gene, pair - print("Collecting info") + log_info("Collecting transcript information.") raf, leaders = dj.collect() # representatives_and_families, leaders overlap_probabilities = {} @@ -160,7 +161,7 @@ def create_sc_spots(jelly_file, sc_translations, repres): gene_spots = {} - print("Reading through file, creating gene spots.") + log_info("Reading through file, creating gene spots (2/2).") line_count = 0 for line in in_read: lsp = line.strip().split("\t") @@ -180,7 +181,7 @@ def create_sc_spots(jelly_file, sc_translations, repres): def determine_probabilities_after_sc(sc_gene_spots, in_read, sc_translations): - print("Beginning to examine gene-level information.") + log_info("Beginning to examine gene-level information (2/2).") big_dictionary = {} gene_count = 0 for gene in list(sc_gene_spots.keys())[:]: @@ -254,9 +255,9 @@ def standard_run(input_file, nums_to_transcripts_path, labeled_fasta, pickle_des start_time = time.time() gene_spots, in_read = create_spots(input_file) - print("Done with create_spots()") + log_info("Done with create_spots().") big_dictionary, giant_dictionary = determine_probabilities(gene_spots, in_read) - print("Done with determine_probabilities()") + log_info("Done with determine_probabilities().") gene_spots = {} sc_translations, repres = determine_sc_groups(big_dictionary, giant_dictionary, labeled_fasta, @@ -265,11 +266,11 @@ def standard_run(input_file, nums_to_transcripts_path, labeled_fasta, pickle_des overlap_probabilities = determine_probabilities_after_sc(sc_gene_spots, in_read, sc_translations) - print("Now just have to pickle...") + log_info("Pickling coalignment file.") # translate dict to arrays overlap_probabilities = translate_feature_dict_to_arrays(overlap_probabilities) pickle_dictionary(overlap_probabilities, pickle_destination) - print(time.time() - start_time, "seconds to complete") + log_info("Completed coalignment generation.") if __name__ == "__main__": diff --git a/src/seqc/sa_preprocess/prepare_fasta.py b/src/seqc/sa_preprocess/prepare_fasta.py index 57ac561..a932221 100644 --- a/src/seqc/sa_preprocess/prepare_fasta.py +++ b/src/seqc/sa_preprocess/prepare_fasta.py @@ -3,6 +3,7 @@ from seqc.sa_preprocess import load_fasta as lf from seqc.sa_preprocess import buffered_printer as bf +from seqc.log import log_info import random import gzip import pickle @@ -126,16 +127,9 @@ def standard_run(annotations_gtf="annotations.gtf", of cdna.fa and ncrna.fa for organism of interest. output_file is the desired local path of the file to be outputted by this program. """ - # this get_seqs.sh probably shouldn't be run even if files are missing. Should be - # included to give possible documentation but not run. - # import os - # import subprocess - # if cdna_plus_ncrna_fa not in os.listdir(os.getcwd()) and "get_seqs.sh" in os.listdir(os.getcwd()): - # print("Seem to be missing required fasta files. 
Downloading from ensembl to local directory.") - # proc = subprocess.check_output("./get_seqs.sh") + log_info("Preprocessing fasta and gtf files") accepted_dict = find_acceptable(annotations_gtf, cdna_plus_ncrna_fa) process_sequences(accepted_dict, output_file) - print("Done preprocessing.") if __name__ == "__main__": From 1c8058388538e187e68e614a491b1b3ad6a6cc38 Mon Sep 17 00:00:00 2001 From: ajc Date: Thu, 5 Nov 2015 17:59:35 -0500 Subject: [PATCH 20/68] refactored SEQC to allow additional subparsers --- src/scripts/SEQC | 68 ++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/src/scripts/SEQC b/src/scripts/SEQC index e4905fa..bbd07a8 100755 --- a/src/scripts/SEQC +++ b/src/scripts/SEQC @@ -79,7 +79,7 @@ def process_input(): nargs='*', default=None) i.add_argument('-r', '--reverse', help='reverse fastq file(s)', metavar='R', nargs='*', default=None) - i.add_argument('-s', '--sam', metavar='S', nargs='?', default=None, + i.add_argument('-s', '--samfile', metavar='S', nargs='?', default=None, help='sam file(s) containing aligned, pre-processed reads') i.add_argument('-m', '--merged-fastq', metavar='M', default=None, help='fastq file containing merged, pre-processed records') @@ -143,7 +143,7 @@ def process_input(): sys.exit(2) # check that at least one input argument was passed: - check = [arguments.forward, arguments.reverse, arguments.sam, + check = [arguments.forward, arguments.reverse, arguments.samfile, arguments.merged_fastq] if not any(check): print('SEQC %s: error: one or more of the following arguments must be ' @@ -174,14 +174,14 @@ def process_input(): # logging.info(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ':' + message) -def set_up(fout, index, barcodes): +def set_up(output_file, index, barcodes): """ create temporary directory for run, find gtf in index, and create or load a serialized barcode object """ # create temporary directory based on experiment name - temp_dir = '.' + fout + temp_dir = '.' + output_file if not os.path.isdir(temp_dir): os.makedirs(temp_dir) if not temp_dir.endswith('/'): @@ -207,11 +207,11 @@ def set_up(fout, index, barcodes): if barcodes.startswith('s3://'): log_info('AWS s3 link provided for barcodes. Downloading barcodes') bucket, key = S3.split_link(barcodes) - fout = temp_dir + 'barcodes.p' + output_file = temp_dir + 'barcodes.p' try: - barcodes = S3.download_file(bucket, key, fout) + barcodes = S3.download_file(bucket, key, output_file) except FileExistsError: - barcodes = fout + barcodes = output_file pass # already have the file in this location from a previous run # now, we should definitely have the binary file. Load it. 
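The change above repeats a single download-or-reuse idiom for both the index and the barcode file: accept either a local path or an s3:// link, download once into the run's temporary directory, and fall back to the cached copy if an earlier run already fetched it. A minimal sketch of that pattern, assuming only the S3.split_link and S3.download_file interfaces from io_lib.py (resolve_input and dest are illustrative names, not part of the patch):

    from seqc.io_lib import S3

    def resolve_input(path_or_link, dest):
        """Return a local path for path_or_link, downloading from s3 when needed."""
        if not path_or_link.startswith('s3://'):
            return path_or_link  # already a local file
        bucket, key = S3.split_link(path_or_link)
        try:
            return S3.download_file(bucket, key, dest)
        except FileExistsError:
            return dest  # cached by a previous run
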
@@ -221,20 +221,20 @@ def set_up(fout, index, barcodes): return temp_dir, gtf, cb, index -def merge(fout, forward, reverse, samfile, merged_fastq, processor, temp_dir, cb): +def merge(output_file, forward, reverse, samfile, merged_fastq, processor, temp_dir, cb): if (forward or reverse) and not (samfile or merged_fastq): # pre-process reads log_info('Merging fastq files.') merged_fastq, nlc = merge_fastq(forward, reverse, processor, temp_dir, cb) - with open(fout + '_metadata.txt', 'a') as f: + with open(output_file + '_metadata.txt', 'a') as f: f.write('Low Complexity Reads: %d\n' % nlc) return merged_fastq -def align(merged_fastq, samfile, star_args, temp_dir, n_threads, index, fout): +def align(merged_fastq, samfile, star_args, temp_dir, n_threads, index, output_file): if merged_fastq and not samfile: # process any additional arguments for star passed from the command line if star_args: @@ -249,7 +249,7 @@ def align(merged_fastq, samfile, star_args, temp_dir, n_threads, index, fout): samfile = temp_dir + 'Aligned.out.sam' # copy alignment summary - shutil.copyfile(temp_dir + 'Log.final.out', fout + '_alignment_summary.txt') + shutil.copyfile(temp_dir + 'Log.final.out', output_file + '_alignment_summary.txt') return samfile @@ -281,7 +281,7 @@ def correct_errors(): raise NotImplementedError -def resolve_alignments(index, arr, n, fout): +def resolve_alignments(index, arr, n, output_file): log_info('Resolving ambiguous alignments.') try: # todo @@ -291,11 +291,11 @@ def resolve_alignments(index, arr, n, fout): arr.resolve_alignments(expectations, required_poly_t=n) except: log_info('Caught error in resolve_alignments(), saving data') - arr.save_h5(fout + '.h5') + arr.save_h5(output_file + '.h5') raise -def save_counts_matrices(fout, arr, n): +def save_counts_matrices(output_file, arr, n): log_info('Generating Gene x Cell SparseMatrices for reads and molecules.') mols, mrow, mcol = arr.unique_features_to_sparse_counts( collapse_molecules=True, n_poly_t_required=n) @@ -303,7 +303,7 @@ def save_counts_matrices(fout, arr, n): collapse_molecules=False, n_poly_t_required=n) matrices = {'molecules': {'matrix': mols, 'row_ids': mrow, 'col_ids': mcol}, 'reads': {'matrix': reads, 'row_ids': rrow, 'col_ids': rcol}} - with open(fout + '_read_and_mol_matrices.p', 'wb') as f: + with open(output_file + '_read_and_mol_matrices.p', 'wb') as f: pickle.dump(matrices, f) @@ -321,11 +321,11 @@ def select_cells(): raise NotImplementedError -def store_results(fout, arr): - if not fout.endswith('.h5'): - fout += '.h5' - log_info('Storing processed data in %s' % fout) - arr.save_h5(fout) +def store_results(output_file, arr): + if not output_file.endswith('.h5'): + output_file += '.h5' + log_info('Storing processed data in %s' % output_file) + arr.save_h5(output_file) def run_complete(): @@ -340,46 +340,46 @@ def clean_up(temp_dir): shutil.rmtree('_STARtmp') -def in_drop(fout, forward, reverse, samfile, merged_fastq, processor, index, +def in_drop(output_file, forward, reverse, samfile, merged_fastq, subparser_name, index, n_threads, frag_len, star_args, barcodes, **kwargs): - temp_dir, gtf, cb, index = set_up(fout, index, barcodes) + temp_dir, gtf, cb, index = set_up(output_file, index, barcodes) - merged_fastq = merge(fout, forward, reverse, samfile, merged_fastq, processor, + merged_fastq = merge(output_file, forward, reverse, samfile, merged_fastq, subparser_name, temp_dir, cb) - samfile = align(merged_fastq, samfile, star_args, temp_dir, n_threads, index, fout) + samfile = align(merged_fastq, samfile, 
star_args, temp_dir, n_threads, index, output_file) arr = process_samfile(samfile, gtf, frag_len) - resolve_alignments(index, arr, n=0, fout=fout) + resolve_alignments(index, arr, n=0, output_file=output_file) - store_results(fout, arr) + store_results(output_file, arr) - save_counts_matrices(fout, arr, n=3) + save_counts_matrices(output_file, arr, n=3) # clean_up(temp_dir) run_complete() -def drop_seq(fout, forward, reverse, samfile, merged_fastq, processor, index, +def drop_seq(output_file, forward, reverse, samfile, merged_fastq, subparser_name, index, n_threads, frag_len, star_args, barcodes, **kwargs): - temp_dir, gtf, cb, index = set_up(fout, index, barcodes) + temp_dir, gtf, cb, index = set_up(output_file, index, barcodes) - merged_fastq = merge(fout, forward, reverse, samfile, merged_fastq, processor, + merged_fastq = merge(output_file, forward, reverse, samfile, merged_fastq, subparser_name, temp_dir, cb) - samfile = align(merged_fastq, samfile, star_args, temp_dir, n_threads, index, fout) + samfile = align(merged_fastq, samfile, star_args, temp_dir, n_threads, index, output_file) arr = process_samfile(samfile, gtf, frag_len) - resolve_alignments(index, arr, n=0, fout=fout) + resolve_alignments(index, arr, n=0, output_file=output_file) - arr.save_h5(fout + '.h5') + arr.save_h5(output_file + '.h5') - save_counts_matrices(fout, arr, n=0) + save_counts_matrices(output_file, arr, n=0) # clean_up(temp_dir) From 37eead808846f5757bb79a5c126e43b2031e6369 Mon Sep 17 00:00:00 2001 From: ajc Date: Fri, 6 Nov 2015 17:04:43 -0500 Subject: [PATCH 21/68] updated README to reflect new capabilities and installation method --- README.md | 134 ++++++++++++++++++++------------- src/plugins/starcluster.config | 2 +- 2 files changed, 84 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index ff48a0c..4b993c5 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,61 @@ -# SEquence Quality Control +## SEquence Quality Control (SEQC -- /sek-si:/) -## Dependencies: +### Overview: -Most dependencies will be automatically installed by SEQC. However, there are a few -external dependencies that must be installed in order for certain functions to be enabled. +SEQC is a package that is designed to process sequencing data on the cloud. It also +contains tools to analyze processed data, which has a much smaller memory footprint. +Thus, it should be installed both locally and on your remote server. However, it should +be noted that some portions of the data pre-processing software require 30GB of RAM to +run, and are unlikely to work on most laptops and workstations. + +To faciliate easy installation and use, we have made available Amazon Machine Images +(AMIs) that come with all of SEQC's dependencies pre-installed. In addition, we have +uploaded the indices (-i/--index parameter, see "Running SEQC) and barcode data +(-b/--barcodes) to public amazon s3 repositories. These links can be provided to SEQC and +it will automatically fetch them prior to initiating an analysis run. + +Amazon Web Services (AWS) is only accessible through a programmatic interface. To +simplify the starting and stopping of amazon compute servers, we have written several +plug-ins for a popular AWS interface, StarCluster. By installing starcluster, users can +simply call starcluster start ; starcluster sm for instant +access to a compute server that is immediately ready to run SEQC on data of your choosing. +### Installation \& Dependencies: + +#### Dependencies For Remotely Running on AWS: 1. 
Amazon Web Services (optional): If you wish to run SEQC on amazon (AWS), -you must set up an AWS account (see below). -2. Starcluster (optional): If you run SEQC on aws, that you create aws compute instances -with the python 2.7 starcluster +you must set up an AWS account (see below). SEQC will function on any machine running +a nix-based operating system, but we only provide AMIs for AWS. +3. Starcluster (optional; install locally): If you run SEQC on aws, then you create aws +compute instances with the python 2.7 +starcluster package, which streamlines creation and shutdown of servers that are ready to run SEQC. -2. libhdf5: We have provided amazon machine images or "AMIs" with all of starcluster's -external dependencies pre-installed. However, if you wish to use your own machine image, -you will need to install libhdf5. Similarly, if -you wish to parse any intermediate data offline on your local machine, you will need to -install libhdf5. +Requires Python 2.7 +with pip. + + sudo pip install starcluster + +3. Install Starcluster plugins and config template from SEQC: + + git clone https://github.com/ambrosejcarr/seqc.git + cp seqc/src/plugins/*.py ~/.starcluster/plugins/ + cp seqc/src/plugins/starcluster.config ~/.starcluster/config + +#### Dependencies for Local Installation or on Other Cloud Computing Platforms: +1. Python 3 +2. libhdf5, a highly efficient database used to +store SEQC output. 3. SEQC depends on several python3 packages that are automatically installed and updated. to view these packages, please view the `setup.py` file packaged with SEQC. -## Setting up SEQC and starcluster +### Setting up AWS, SEQC, and starcluster + +Once all dependencies have been installed, SEQC can be installed on any machine by typing: + + git clone https://github.com/ambrosejcarr/seqc.git + pip install -e seqc/ -### Creating a new AWS account: +#### Setting up an AWS Account: 1. Navigate here and click “Create an AWS Account.” 2. Enter your desired login e-mail and click the “I am a new user” radio button. 3. Fill out the Login Credentials with your name and password. @@ -28,7 +63,7 @@ to view these packages, please view the `setup.py` file packaged with SEQC. wish to create an account. 5. Save your AWS Access ID and Secret Key -- this information is very important! -### Create an RSA key to launch a new cluster: +#### Create an RSA key to allow you to launch a cluster 1. Sign into your AWS account and go to the EC2 Dashboard. 2. Click “Key Pairs” in the NETWORK & SECURITY tab. 3. Click “Create Key Pair” and give it a new name. @@ -37,35 +72,20 @@ wish to create an account. 6. example: `.rsa` and move it to a directory (e.g. `~/.ssh/`) 7. Change the permission settings with `$> chmod 600 /path/to/key/keyname.rsa` -### Next, install StarCluster by cloning the Github repo. -1. `$> git clone https://github.com/ambrosejcarr/StarCluster.git` -2. `$> sudo python setup.py install` - -### Set up the default StarCluster config file. -1. Type `starcluster help` and enter `2` +#### Personalize the Dummy StarCluster Config File Provided by SEQC. +1. Open the `~/.starcluster/config` file 2. Under `[aws info]` enter the following information: 1. `AWS_ACCESS_KEY_ID = #access_id` (This is your AWS Access ID from Step (1)) 2. `AWS_SECRET_ACCESS_KEY = #secret_key` (This is your AWS Secret Key from Step (1)) - 3. `AWS_USER_ID= #user_id` + 3. `AWS_USER_ID= #user_id` (This is a numerical ID from AWS, found under IAM users) 4. 
Click on your username on the top right corner of the AWS dashboard and click “My Account” -- your Account Id should pop up at the top of the page (a 12-digit number) - 5. `AWS_REGION_NAME = us-east-1a` (or any other of 1b, 1c, 1d, 1e - they have - different spot prices) 3. Under Defining EC2 Keypairs: - 1. Create a section called: `[key ]` - 2. below, write: `KEY_LOCATION=~/.ssh/.rsa` - 3. Under `[cluster smallcluster]`: - 4. `KEYNAME = ` - 5. `NODE_IMAGE_ID = %(x86_64_ami)s` - 6. `SUBNET_ID=subnet-99999999` - 1. Go to the “VPC” under the Networking Tab in the AWS Dashboard - 2. Click on “Subnets” - 3. Find the appropriate subnet according to your availability zone - (e.g. “subnet-99999999”) and write it there - 7. `AVAILABILITY_ZONE = us-east-1a` (or other, if you changed this in 2.5) - -### Start a cluster: + 2. change key location to the location of your `` file: + `KEY_LOCATION=~/.ssh/.rsa` + +#### Start a cluster: 1. `$> starcluster start -c ` 2. Wait until the cluster is finished setting up. Then, the cluster can be accessed using: @@ -79,15 +99,16 @@ To copy a file or entire directory from your local computer to the cluster: `$> starcluster get mycluster /path/to/remote/file/or/dir /local/path/` -## Running SEQC: +### Running SEQC: After SEQC is installed, help can be listed: $> SEQC -h - usage: SEQC [-h] {in-drop,drop-seq,mars-seq,cel-seq,avo-seq,strt-seq} ... + usage: SEQC [-h] + {in-drop,drop-seq,mars-seq,cel-seq,avo-seq,strt-seq,index} ... positional arguments: - {in-drop,drop-seq,mars-seq,cel-seq,avo-seq,strt-seq} + {in-drop,drop-seq,mars-seq,cel-seq,avo-seq,strt-seq,index} library construction method types in-drop in-drop help drop-seq drop-seq help @@ -95,6 +116,7 @@ After SEQC is installed, help can be listed: cel-seq cel-seq help avo-seq avo-seq help strt-seq strt-seq help + index SEQC index functions optional arguments: -h, --help show this help message and exit @@ -155,19 +177,29 @@ observing a co-alignment to other genes. Human and mouse indices can be found on our aws s3 bucket at `s3://dplab-data/genomes/mm38/` and `s3://dplab-data/genomes/hg38`. These indices -are built from recent releases of ENSEMBL genomes. +are built from recent releases of ENSEMBL genomes, and these links can be passed to SEQC, +which as index or barcode parameters. SEQC will download them before beginning the run. -If new indices must be generated, these can be produced by calling: - - $> INDEX= - $> # valid organisms are hg38 and mm38. Can also produce joint indicies with - $> # "[hg38, mm38]" - $> ORG= - $> # STAR will use this number of threads to produce the index - $> THREADS= - $> mkdir $INDEX # create index directory - $> python3 -c "import seqc.align; seqc.align.STAR.build_index($INDEX, $ORG, $THREADS)" +If new indices must be generated, these can be produced by the SEQC index method: + $> SEQC index -h + usage: SEQC index [-h] [-b] [-t] -o O [O ...] -i I [-n N] [--phix] + + optional arguments: + -h, --help show this help message and exit + -b, --build build a SEQC index + -t, --test test a SEQC index + -o O [O ...], --organism O [O ...] + build index for these organism(s) + -i I, --index I name of folder where index should be built or + containing the index to be verified + -n N, --n-threads N number of threads to use when building index + --phix add phiX to the genome index and GTF file. 
+ + $> # for example, to build a mouse index with phiX features added to mm38, in a + $> $ folder called 'mouse', using 7 threads + $> SEQC index -b -o mm38 -i mouse -n 7 --phix + Some data types require serialized barcode objects (`-b/--barcodes`). These objects contain all of the barcodes for an experiment, as they would be expected to be observed. For example, if you expect to observe the reverse complement of the barcodes you used to diff --git a/src/plugins/starcluster.config b/src/plugins/starcluster.config index 8eb42d9..3625e71 100644 --- a/src/plugins/starcluster.config +++ b/src/plugins/starcluster.config @@ -5,7 +5,7 @@ # Configure the default cluster template to use when starting a cluster # defaults to 'smallcluster' defined below. This template should be usable # out-of-the-box provided you've configured your keypair correctly -DEFAULT_TEMPLATE=c3.large +DEFAULT_TEMPLATE=c4.8xlarge # enable experimental features for this release #ENABLE_EXPERIMENTAL=True # number of seconds to wait when polling instances (default: 30s) From bdb906e4ce19c5bc7da63a1d14b1419c769b1c22 Mon Sep 17 00:00:00 2001 From: ajc Date: Fri, 6 Nov 2015 17:16:03 -0500 Subject: [PATCH 22/68] fixed spelling mistakes --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4b993c5..37890bd 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ with pip. cp seqc/src/plugins/*.py ~/.starcluster/plugins/ cp seqc/src/plugins/starcluster.config ~/.starcluster/config -#### Dependencies for Local Installation or on Other Cloud Computing Platforms: +#### Dependencies for Local Installation or Other Cloud Computing Platforms: 1. Python 3 2. libhdf5, a highly efficient database used to store SEQC output. @@ -177,8 +177,8 @@ observing a co-alignment to other genes. Human and mouse indices can be found on our aws s3 bucket at `s3://dplab-data/genomes/mm38/` and `s3://dplab-data/genomes/hg38`. These indices -are built from recent releases of ENSEMBL genomes, and these links can be passed to SEQC, -which as index or barcode parameters. SEQC will download them before beginning the run. +are built from recent releases of ENSEMBL genomes. These links can be passed directly to +SEQC, which will download them before beginning the analysis If new indices must be generated, these can be produced by the SEQC index method: From 74d1c64ae1a703a5d28e3423965b6314af0ec7b4 Mon Sep 17 00:00:00 2001 From: ajc Date: Fri, 6 Nov 2015 17:38:21 -0500 Subject: [PATCH 23/68] changed config file and readme to reflect how to create an RSA key --- README.md | 3 +++ src/plugins/starcluster.config | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 37890bd..2dafd37 100644 --- a/README.md +++ b/README.md @@ -82,8 +82,11 @@ wish to create an account. “My Account” -- your Account Id should pop up at the top of the page (a 12-digit number) 3. Under Defining EC2 Keypairs: + 1. rename `[key ]` to the name of the key you generate above. 2. change key location to the location of your `` file: `KEY_LOCATION=~/.ssh/.rsa` +5. Under templates, find `[cluster c3.large]` + 1. change key to `` #### Start a cluster: 1. `$> starcluster start -c ` diff --git a/src/plugins/starcluster.config b/src/plugins/starcluster.config index 3625e71..925babe 100644 --- a/src/plugins/starcluster.config +++ b/src/plugins/starcluster.config @@ -42,7 +42,7 @@ AWS_REGION_HOST = ec2.us-east-1.amazonaws.com # --help" for instructions on how to create a new keypair. 
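# A filled-in example of the keypair personalization described in the README above,
# using hypothetical names: if the key created in the EC2 console is called
# "seqc-key" and was saved to ~/.ssh/seqc-key.rsa, the personalized sections read:
#
#   [key seqc-key]
#   KEY_LOCATION=~/.ssh/seqc-key.rsa
#
# and, in the [cluster c3.large] template further below:
#
#   KEYNAME = seqc-key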
Section name should # match your key name e.g.: # username should be your Columbia UNI, if a Columbia student -[key aws] +[key ] KEY_LOCATION= # You can of course have multiple keypair sections @@ -133,7 +133,7 @@ KEY_LOCATION= # C4 & R3 AMI = ami-d9095dbc [cluster c3.large] -KEYNAME = aws +KEYNAME = CLUSTER_SHELL = bash CLUSTER_USER = ec2-user CLUSTER_SIZE = 1 From d49d012dbe525668fc0439f0c9864a32003ed1c1 Mon Sep 17 00:00:00 2001 From: ajc Date: Sat, 7 Nov 2015 08:56:28 -0500 Subject: [PATCH 24/68] updated README to reflect that starcluster should be installed from jtriley's repo --- README.md | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 2dafd37..6480f93 100644 --- a/README.md +++ b/README.md @@ -32,14 +32,17 @@ compute instances with the python 2.7 package, which streamlines creation and shutdown of servers that are ready to run SEQC. Requires Python 2.7 with pip. - - sudo pip install starcluster - + + $> git clone git://github.com/jtriley/StarCluster.git + $> cd StarCluster + $> sudo python distribute_setup.py + $> sudo python setup.py install + 3. Install Starcluster plugins and config template from SEQC: - git clone https://github.com/ambrosejcarr/seqc.git - cp seqc/src/plugins/*.py ~/.starcluster/plugins/ - cp seqc/src/plugins/starcluster.config ~/.starcluster/config + $> git clone https://github.com/ambrosejcarr/seqc.git + $> cp seqc/src/plugins/*.py ~/.starcluster/plugins/ + $> cp seqc/src/plugins/starcluster.config ~/.starcluster/config #### Dependencies for Local Installation or Other Cloud Computing Platforms: 1. Python 3 @@ -52,8 +55,8 @@ to view these packages, please view the `setup.py` file packaged with SEQC. Once all dependencies have been installed, SEQC can be installed on any machine by typing: - git clone https://github.com/ambrosejcarr/seqc.git - pip install -e seqc/ + $> git clone https://github.com/ambrosejcarr/seqc.git + $> pip install -e seqc/ #### Setting up an AWS Account: 1. 
Navigate here and click “Create an AWS Account.” From a7b823065774bdef4d1b00577666097d03d5e038 Mon Sep 17 00:00:00 2001 From: ajc Date: Sat, 7 Nov 2015 13:02:57 -0500 Subject: [PATCH 25/68] bugfix change to save_counts_matrices --- src/scripts/SEQC | 49 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/src/scripts/SEQC b/src/scripts/SEQC index bbd07a8..2026953 100755 --- a/src/scripts/SEQC +++ b/src/scripts/SEQC @@ -13,6 +13,7 @@ from seqc.barcodes import DropSeqCellBarcodes from seqc.arrays import ReadArray from seqc.log import setup_logger, log_info, log_exception from seqc.convert_features import ConvertFeatureCoordinates +import numpy as np import sys import json # import logging @@ -296,15 +297,45 @@ def resolve_alignments(index, arr, n, output_file): def save_counts_matrices(output_file, arr, n): - log_info('Generating Gene x Cell SparseMatrices for reads and molecules.') - mols, mrow, mcol = arr.unique_features_to_sparse_counts( - collapse_molecules=True, n_poly_t_required=n) - reads, rrow, rcol = arr.unique_features_to_sparse_counts( - collapse_molecules=False, n_poly_t_required=n) - matrices = {'molecules': {'matrix': mols, 'row_ids': mrow, 'col_ids': mcol}, - 'reads': {'matrix': reads, 'row_ids': rrow, 'col_ids': rcol}} - with open(output_file + '_read_and_mol_matrices.p', 'wb') as f: - pickle.dump(matrices, f) + +#### DEBUGGING TRY/EXCEPT; remove when clear what the problem with this error is #### +# Traceback (most recent call last): +# File "/data/seqc/src/scripts/SEQC", line 419, in +# func(**kwargs) +# File "/data/seqc/src/scripts/SEQC", line 359, in in_drop +# save_counts_matrices(output_file, arr, n=3) +# File "/data/seqc/src/scripts/SEQC", line 301, in save_counts_matrices +# collapse_molecules=True, n_poly_t_required=n) +# File "/data/seqc/src/seqc/arrays.py", line 1030, in unique_features_to_sparse_counts +# low_coverage_mask = self.mask_low_support_molecules() +# File "/data/seqc/src/seqc/arrays.py", line 850, in mask_low_support_molecules +# df = pd.DataFrame(view) +# File "/usr/local/lib/python3.5/site-packages/pandas/core/frame.py", line 238, in __init__ +# if mask.any(): +# File "/usr/local/lib/python3.5/site-packages/numpy/core/_methods.py", line 38, in _any +# return umr_any(a, axis, dtype, out, keepdims) +# TypeError: cannot perform reduce with flexible type + + try: + log_info('Generating Gene x Cell SparseMatrices for reads and molecules.') + mols, mrow, mcol = arr.unique_features_to_sparse_counts( + collapse_molecules=True, n_poly_t_required=n) + reads, rrow, rcol = arr.unique_features_to_sparse_counts( + collapse_molecules=False, n_poly_t_required=n) + matrices = {'molecules': {'matrix': mols, 'row_ids': mrow, 'col_ids': mcol}, + 'reads': {'matrix': reads, 'row_ids': rrow, 'col_ids': rcol}} + with open(output_file + '_read_and_mol_matrices.p', 'wb') as f: + pickle.dump(matrices, f) + except TypeError: + all_objects = dir() + log_info('all array objects:' + repr(all_objects)) + for obj in all_objects: + if isinstance(obj, np.ndarray): + log_info('Raised TypeError; logging all array object types and shapes') + log_info(repr(obj)) + log_info(repr(obj.dtype)) + log_info(repr(obj.shape)) + raise def select_cells(): From 9fb2ba2bbc1860c3b18edc0e4a3ce76f2eb25839 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Sat, 7 Nov 2015 14:23:12 -0500 Subject: [PATCH 26/68] reflects waiting changes --- src/plugins/raid.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git 
a/src/plugins/raid.py b/src/plugins/raid.py index 807357b..cac9591 100644 --- a/src/plugins/raid.py +++ b/src/plugins/raid.py @@ -42,25 +42,30 @@ def run(self, nodes, master, user, user_shell, volumes): vol_stat = vol_stat.decode().split() while vol_stat[6] != 'available': - time.sleep(10) - vol_stat = check_output( - ["aws", "ec2", "describe-volumes", "--volume-ids", vol_id]) + time.sleep(5) + vol_stat = check_output(["aws", "ec2", "describe-volumes", "--volume-ids", vol_id]) vol_stat = vol_stat.decode().split() - log.info("...") + log.info('...') log.info("attaching new volume...") - call(["aws", "ec2", "attach-volume", "--volume-id", vol_id, "--instance-id", - inst_id, "--device", dev_id]) + call(["aws", "ec2", "attach-volume", "--volume-id", vol_id, "--instance-id", inst_id, "--device", dev_id]) + + while vol_stat[14] != 'attached': + time.sleep(5) + vol_stat = check_output(["aws", "ec2", "describe-volumes", "--volume-ids", vol_id]) + vol_stat = vol_stat.decode().split() + log.info("...") + # configure volumes here to delete on termination :( + call(["aws", "ec2", "modify-instance-attribute", "--instance-id", inst_id, "--block-device-mappings", + "[{\"DeviceName\": \"" + dev_id + "\",\"Ebs\":{\"DeleteOnTermination\":true}}]"]) i += 1 - log.info("successfully attached %s TB in %s volumes!" % (self.n_tb, vol_num)) + log.info("successfully attached %s TB in %s volumes!" % (self.n_tb, vol_num)) log.info("creating logical RAID device...") all_dev = ' '.join(dev_names) - log.info('waiting for all volumes to be available...') - time.sleep(10) master.ssh.execute( - "sudo mdadm --create --verbose /dev/md0 --level=0 --name=my_raid " - "--raid-devices=%s %s" % (vol_num, all_dev)) + "sudo mdadm --create --verbose /dev/md0 --level=0 --name=my_raid " + "--raid-devices=%s %s" % (vol_num, all_dev)) master.ssh.execute("sudo mkfs.ext4 -L my_raid /dev/md0") master.ssh.execute("sudo mkdir -p /data") From a326f219538ebd7b09612f7d72bfe76cc5db191e Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Sat, 7 Nov 2015 14:29:02 -0500 Subject: [PATCH 27/68] todo edit volume file --- src/plugins/raid.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/plugins/raid.py b/src/plugins/raid.py index cac9591..28d49f2 100644 --- a/src/plugins/raid.py +++ b/src/plugins/raid.py @@ -50,6 +50,10 @@ def run(self, nodes, master, user, user_shell, volumes): log.info("attaching new volume...") call(["aws", "ec2", "attach-volume", "--volume-id", vol_id, "--instance-id", inst_id, "--device", dev_id]) + log.info("waiting for volume to finish attaching...") + vol_stat = check_output(["aws", "ec2", "describe-volumes", "--volume-ids", vol_id]) + vol_stat = vol_stat.decode().split() + while vol_stat[14] != 'attached': time.sleep(5) vol_stat = check_output(["aws", "ec2", "describe-volumes", "--volume-ids", vol_id]) @@ -57,16 +61,18 @@ def run(self, nodes, master, user, user_shell, volumes): log.info("...") # configure volumes here to delete on termination :( call(["aws", "ec2", "modify-instance-attribute", "--instance-id", inst_id, "--block-device-mappings", - "[{\"DeviceName\": \"" + dev_id + "\",\"Ebs\":{\"DeleteOnTermination\":true}}]"]) + "[{\"DeviceName\": \"" + dev_id + "\",\"Ebs\":{\"DeleteOnTermination\":true}}]"]) i += 1 log.info("successfully attached %s TB in %s volumes!" 
% (self.n_tb, vol_num)) log.info("creating logical RAID device...") all_dev = ' '.join(dev_names) - master.ssh.execute( - "sudo mdadm --create --verbose /dev/md0 --level=0 --name=my_raid " - "--raid-devices=%s %s" % (vol_num, all_dev)) + #write names to file here + ##TODO## + master.ssh.execute( + "sudo mdadm --create --verbose /dev/md0 --level=0 --name=my_raid " + "--raid-devices=%s %s" % (vol_num, all_dev)) master.ssh.execute("sudo mkfs.ext4 -L my_raid /dev/md0") master.ssh.execute("sudo mkdir -p /data") master.ssh.execute("sudo mount LABEL=my_raid /data") From 552714637d9bf27ffddd7d9c8ca5b1ee85c22c4f Mon Sep 17 00:00:00 2001 From: dvdijk Date: Sat, 7 Nov 2015 14:47:11 -0500 Subject: [PATCH 28/68] Update README.md seqc qith pip3 not pip --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6480f93..23fdb3f 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ to view these packages, please view the `setup.py` file packaged with SEQC. Once all dependencies have been installed, SEQC can be installed on any machine by typing: $> git clone https://github.com/ambrosejcarr/seqc.git - $> pip install -e seqc/ + $> pip3 install -e seqc/ #### Setting up an AWS Account: 1. Navigate here and click “Create an AWS Account.” From e3c00102d807d98a1ee498a93b5d99ca743bc9ff Mon Sep 17 00:00:00 2001 From: ajc Date: Sat, 7 Nov 2015 14:47:58 -0500 Subject: [PATCH 29/68] changed readme to reflect installation with python3 instead of python (pip -> pip3) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6480f93..23fdb3f 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ to view these packages, please view the `setup.py` file packaged with SEQC. Once all dependencies have been installed, SEQC can be installed on any machine by typing: $> git clone https://github.com/ambrosejcarr/seqc.git - $> pip install -e seqc/ + $> pip3 install -e seqc/ #### Setting up an AWS Account: 1. 
Navigate here and click “Create an AWS Account.” From a49e08708e3b92362f0a28a4d49f5b6640551f1d Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Sat, 7 Nov 2015 15:35:39 -0500 Subject: [PATCH 30/68] fixed errors, writes volumes to vol_names.txt --- src/plugins/raid.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/plugins/raid.py b/src/plugins/raid.py index 28d49f2..032c608 100644 --- a/src/plugins/raid.py +++ b/src/plugins/raid.py @@ -68,8 +68,11 @@ def run(self, nodes, master, user, user_shell, volumes): log.info("creating logical RAID device...") all_dev = ' '.join(dev_names) - #write names to file here - ##TODO## + #writing names to file for future volume cleanup + with open('vol_names.txt','w') as f: + for x in dev_names: + f.write('%s\n' %x) + master.ssh.execute( "sudo mdadm --create --verbose /dev/md0 --level=0 --name=my_raid " "--raid-devices=%s %s" % (vol_num, all_dev)) From 05284679be75db128d5e9d601c5a1cf91cda1418 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Sat, 7 Nov 2015 15:57:26 -0500 Subject: [PATCH 31/68] fixed volume names --- src/plugins/raid.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/plugins/raid.py b/src/plugins/raid.py index 032c608..ea05393 100644 --- a/src/plugins/raid.py +++ b/src/plugins/raid.py @@ -21,6 +21,7 @@ def run(self, nodes, master, user, user_shell, volumes): dev_base = "/dev/xvd" alphabet = string.ascii_lowercase[5:] # starts at f dev_names = [] + vol_names = [] availability_zone = self.availability_zone for i in range(int(vol_num)): @@ -31,6 +32,7 @@ def run(self, nodes, master, user, user_shell, volumes): "gp2"]) vol_list = vol.decode().split() vol_id = [s for s in vol_list if 'vol-' in s][0] + vol_names.append(vol_id) # generating new device id to mount dev_id = dev_base + alphabet[i] @@ -70,7 +72,7 @@ def run(self, nodes, master, user, user_shell, volumes): #writing names to file for future volume cleanup with open('vol_names.txt','w') as f: - for x in dev_names: + for x in vol_names: f.write('%s\n' %x) master.ssh.execute( From f3554c245520f009696fc8a79003667b11af7f14 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Sat, 7 Nov 2015 16:15:07 -0500 Subject: [PATCH 32/68] still need to test --- src/scripts/VOLUME_CLEANUP | 40 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 src/scripts/VOLUME_CLEANUP diff --git a/src/scripts/VOLUME_CLEANUP b/src/scripts/VOLUME_CLEANUP new file mode 100644 index 0000000..567362f --- /dev/null +++ b/src/scripts/VOLUME_CLEANUP @@ -0,0 +1,40 @@ +#!/usr/local/bin/python3 +import sys +import boto3 +from subprocess import call + + +def main(): + ec2 = boto3.resource('ec2') + + try: + with open('../plugins/vol_names.txt', 'r') as f: + vol_names = [line for line in f] + except IOError: + print('No new volumes for cleanup!') + sys.exit(2) + + for name in vol_names: + volume = ec2.Volume(name) + if not volume.attachments: + call(["aws", "ec2", "delete-volume", "--volume-id", name]) + vol_names.remove(name) + print('Finished deleting all new, unattached volumes') + + #updating vol_names.txt file + call(["rm", "../plugins/vol_names.txt"]) + with open('../plugins/vol_names.txt','r') as f: + for name in vol_names: + f.write('%s\n' %name) + + +if __name__ == "__main__": + help = """ + Iterates through all newly created volumes and deletes any + that is not currently being used. 
+ """ + if len(sys.argv) > 1: + print(help) + sys.exit(2) + else: + main() From 275235110fb4fafd54df6963903ed8a7271e2120 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Sat, 7 Nov 2015 16:30:27 -0500 Subject: [PATCH 33/68] tweaks --- src/scripts/VOLUME_CLEANUP | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/VOLUME_CLEANUP b/src/scripts/VOLUME_CLEANUP index 567362f..d1a4016 100644 --- a/src/scripts/VOLUME_CLEANUP +++ b/src/scripts/VOLUME_CLEANUP @@ -23,7 +23,7 @@ def main(): #updating vol_names.txt file call(["rm", "../plugins/vol_names.txt"]) - with open('../plugins/vol_names.txt','r') as f: + with open('../plugins/vol_names.txt','w') as f: for name in vol_names: f.write('%s\n' %name) From d4fa68360fd8e1f876ff6f84949235694d9b6720 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Sun, 8 Nov 2015 15:28:28 -0500 Subject: [PATCH 34/68] fixed code --- src/scripts/VOLUME_CLEANUP | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) mode change 100644 => 100755 src/scripts/VOLUME_CLEANUP diff --git a/src/scripts/VOLUME_CLEANUP b/src/scripts/VOLUME_CLEANUP old mode 100644 new mode 100755 index d1a4016..b207453 --- a/src/scripts/VOLUME_CLEANUP +++ b/src/scripts/VOLUME_CLEANUP @@ -8,30 +8,34 @@ def main(): ec2 = boto3.resource('ec2') try: - with open('../plugins/vol_names.txt', 'r') as f: - vol_names = [line for line in f] + with open('/Users/kristyc/.starcluster/plugins/vol_names.txt', 'r') as f: + vol_names = [line.strip() for line in f] except IOError: print('No new volumes for cleanup!') sys.exit(2) + remain_vol = [] for name in vol_names: volume = ec2.Volume(name) if not volume.attachments: call(["aws", "ec2", "delete-volume", "--volume-id", name]) - vol_names.remove(name) + else: + remain_vol.append(name) print('Finished deleting all new, unattached volumes') + print('There are %d new volumes still being used' % len(remain_vol)) - #updating vol_names.txt file - call(["rm", "../plugins/vol_names.txt"]) - with open('../plugins/vol_names.txt','w') as f: - for name in vol_names: - f.write('%s\n' %name) + # updating vol_names.txt file + call(["rm", "/Users/kristyc/.starcluster/plugins/vol_names.txt"]) + if remain_vol: + with open('/Users/kristyc/.starcluster/plugins/vol_names.txt', 'w') as f: + for name in remain_vol: + f.write('%s\n' % name) if __name__ == "__main__": help = """ - Iterates through all newly created volumes and deletes any - that is not currently being used. + VOLUME_CLEANUP iterates through all newly created volumes + and deletes any that is not currently being used. 
""" if len(sys.argv) > 1: print(help) From a2f7c2f7e01cba60f49b8dedb848c533c045e0d7 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Sun, 8 Nov 2015 15:29:47 -0500 Subject: [PATCH 35/68] edited textfile location --- src/scripts/VOLUME_CLEANUP | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scripts/VOLUME_CLEANUP b/src/scripts/VOLUME_CLEANUP index b207453..565c1c8 100755 --- a/src/scripts/VOLUME_CLEANUP +++ b/src/scripts/VOLUME_CLEANUP @@ -8,7 +8,7 @@ def main(): ec2 = boto3.resource('ec2') try: - with open('/Users/kristyc/.starcluster/plugins/vol_names.txt', 'r') as f: + with open('../plugins/vol_names.txt', 'r') as f: vol_names = [line.strip() for line in f] except IOError: print('No new volumes for cleanup!') @@ -25,9 +25,9 @@ def main(): print('There are %d new volumes still being used' % len(remain_vol)) # updating vol_names.txt file - call(["rm", "/Users/kristyc/.starcluster/plugins/vol_names.txt"]) + call(["rm", "../plugins/vol_names.txt"]) if remain_vol: - with open('/Users/kristyc/.starcluster/plugins/vol_names.txt', 'w') as f: + with open('../plugins/vol_names.txt', 'w') as f: for name in remain_vol: f.write('%s\n' % name) From f544047baf36714c465485b7ae3a8a236b648aa3 Mon Sep 17 00:00:00 2001 From: ajc Date: Mon, 9 Nov 2015 09:40:27 -0500 Subject: [PATCH 36/68] resolves iss #10 --- src/scripts/SEQC | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scripts/SEQC b/src/scripts/SEQC index 2026953..cb5497c 100755 --- a/src/scripts/SEQC +++ b/src/scripts/SEQC @@ -443,6 +443,7 @@ if __name__ == "__main__": # log command line arguments for debugging arg_copy = copy(kwargs) del arg_copy['func'] # function is not serializable + log_info('SEQC working directory: %s' % os.getcwd()) log_info('Passed command line arguments: %s' % json.dumps(arg_copy, separators=(',', ': '), indent=4)) From cf47467625009cf05e957b0692c4a230ad9c28cd Mon Sep 17 00:00:00 2001 From: ajc Date: Mon, 9 Nov 2015 09:59:06 -0500 Subject: [PATCH 37/68] changed how the temporary directory is created to allow absolute file paths with multiple levels to be passed. Fixes #11 --- src/scripts/SEQC | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/scripts/SEQC b/src/scripts/SEQC index cb5497c..ba9cf3c 100755 --- a/src/scripts/SEQC +++ b/src/scripts/SEQC @@ -181,8 +181,11 @@ def set_up(output_file, index, barcodes): serialized barcode object """ + # temporary directory should be made in the same directory as the output prefix + *stem, final_dir = output_file.split('/') + temp_dir = '/'.join(stem + ['.' + final_dir]) + # create temporary directory based on experiment name - temp_dir = '.' + output_file if not os.path.isdir(temp_dir): os.makedirs(temp_dir) if not temp_dir.endswith('/'): From cfd511885caf11610c28eaed132f72f61295fdc1 Mon Sep 17 00:00:00 2001 From: ajc Date: Mon, 9 Nov 2015 10:17:13 -0500 Subject: [PATCH 38/68] rewrote mask_low_support_records() without pandas or numpy indexing to avoid reduce/flexible dtype bug. 
Resolves #7 --- src/scripts/SEQC | 47 +++++++---------------------------- src/seqc/arrays.py | 61 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 52 insertions(+), 56 deletions(-) diff --git a/src/scripts/SEQC b/src/scripts/SEQC index ba9cf3c..0bf9ffe 100755 --- a/src/scripts/SEQC +++ b/src/scripts/SEQC @@ -301,44 +301,15 @@ def resolve_alignments(index, arr, n, output_file): def save_counts_matrices(output_file, arr, n): -#### DEBUGGING TRY/EXCEPT; remove when clear what the problem with this error is #### -# Traceback (most recent call last): -# File "/data/seqc/src/scripts/SEQC", line 419, in -# func(**kwargs) -# File "/data/seqc/src/scripts/SEQC", line 359, in in_drop -# save_counts_matrices(output_file, arr, n=3) -# File "/data/seqc/src/scripts/SEQC", line 301, in save_counts_matrices -# collapse_molecules=True, n_poly_t_required=n) -# File "/data/seqc/src/seqc/arrays.py", line 1030, in unique_features_to_sparse_counts -# low_coverage_mask = self.mask_low_support_molecules() -# File "/data/seqc/src/seqc/arrays.py", line 850, in mask_low_support_molecules -# df = pd.DataFrame(view) -# File "/usr/local/lib/python3.5/site-packages/pandas/core/frame.py", line 238, in __init__ -# if mask.any(): -# File "/usr/local/lib/python3.5/site-packages/numpy/core/_methods.py", line 38, in _any -# return umr_any(a, axis, dtype, out, keepdims) -# TypeError: cannot perform reduce with flexible type - - try: - log_info('Generating Gene x Cell SparseMatrices for reads and molecules.') - mols, mrow, mcol = arr.unique_features_to_sparse_counts( - collapse_molecules=True, n_poly_t_required=n) - reads, rrow, rcol = arr.unique_features_to_sparse_counts( - collapse_molecules=False, n_poly_t_required=n) - matrices = {'molecules': {'matrix': mols, 'row_ids': mrow, 'col_ids': mcol}, - 'reads': {'matrix': reads, 'row_ids': rrow, 'col_ids': rcol}} - with open(output_file + '_read_and_mol_matrices.p', 'wb') as f: - pickle.dump(matrices, f) - except TypeError: - all_objects = dir() - log_info('all array objects:' + repr(all_objects)) - for obj in all_objects: - if isinstance(obj, np.ndarray): - log_info('Raised TypeError; logging all array object types and shapes') - log_info(repr(obj)) - log_info(repr(obj.dtype)) - log_info(repr(obj.shape)) - raise + log_info('Generating Gene x Cell SparseMatrices for reads and molecules.') + mols, mrow, mcol = arr.unique_features_to_sparse_counts( + collapse_molecules=True, n_poly_t_required=n) + reads, rrow, rcol = arr.unique_features_to_sparse_counts( + collapse_molecules=False, n_poly_t_required=n) + matrices = {'molecules': {'matrix': mols, 'row_ids': mrow, 'col_ids': mcol}, + 'reads': {'matrix': reads, 'row_ids': rrow, 'col_ids': rcol}} + with open(output_file + '_read_and_mol_matrices.p', 'wb') as f: + pickle.dump(matrices, f) def select_cells(): diff --git a/src/seqc/arrays.py b/src/seqc/arrays.py index dcb3393..242f8c4 100644 --- a/src/seqc/arrays.py +++ b/src/seqc/arrays.py @@ -844,25 +844,48 @@ def mask_low_support_molecules(self, required_support=2): """ mask any molecule supported by fewer than reads """ - # use the minimum number of columns to construct the dataframe, not sure if - # pandas copies data. 
- view = self.data[['cell', 'rmt']] - df = pd.DataFrame(view) - grouped = df.groupby(['cell', 'rmt']) - failing = [] - for idx, g in grouped: - if len(g) < required_support: - failing.extend(g.index) - failing.sort() - ifail = 0 - imask = 0 - mask = np.ones(len(self.data), dtype=np.bool) - while ifail < len(failing): - if imask == failing[ifail]: - mask[imask] = 0 - ifail += 1 - imask += 1 + + # goal: track whether a given index has > molecule associated with it + # need to track molecule counts; defined as a combination of cell & rmt. + # simple way to track seems like a tuple of ints; can use defaultdict to build? + + # then, to build boolean mask, run through the list a second time + + # get counts + mol_counts = defaultdict(int) + for row in self.data: + # 0 = cell; 1 = rmt -- integer indexing is faster than row['cell'], row['rmt'] + mol_counts[(row[0], row[1])] += 1 + + mol_counts = dict(mol_counts) # defaultdict indexing can produce odd results + + # build mask + n = self.data.shape[0] + mask = np.zeros(n, dtype=np.bool) + for i, row in enumerate(self.data): + if mol_counts[(row[0], row[1])] > required_support: + mask[i] = 1 + return mask + # + # + # view = self.data[['cell', 'rmt']].copy() + # df = pd.DataFrame(view) + # grouped = df.groupby(['cell', 'rmt']) + # failing = [] + # for idx, g in grouped: + # if len(g) < required_support: + # failing.extend(g.index) + # failing.sort() + # ifail = 0 + # imask = 0 + # mask = np.ones(len(self.data), dtype=np.bool) + # while ifail < len(failing): + # if imask == failing[ifail]: + # mask[imask] = 0 + # ifail += 1 + # imask += 1 + # return mask @staticmethod def translate_feature(reference_name, strand, true_position, feature_table, @@ -1029,6 +1052,8 @@ def unique_features_to_sparse_counts(self, collapse_molecules, n_poly_t_required read_mask = self.mask_failing_records(n_poly_t_required) low_coverage_mask = self.mask_low_support_molecules() unmasked_inds = np.arange(self.data.shape[0])[read_mask & low_coverage_mask] + if unmasked_inds.shape[0] == 0: + raise ValueError('Zero reads passed filters. 
Cannot save sparse matrix') molecule_counts = defaultdict(dict) if collapse_molecules: From bea929b11b3e132ba5f3757b6b3fbc8ff4c5ad3f Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 10:44:20 -0500 Subject: [PATCH 39/68] configured for local use --- src/scripts/VOLUME_CLEANUP | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/scripts/VOLUME_CLEANUP b/src/scripts/VOLUME_CLEANUP index 565c1c8..de798e7 100755 --- a/src/scripts/VOLUME_CLEANUP +++ b/src/scripts/VOLUME_CLEANUP @@ -1,14 +1,17 @@ #!/usr/local/bin/python3 import sys import boto3 +from os.path import expanduser from subprocess import call def main(): ec2 = boto3.resource('ec2') + home = expanduser("~") + fpath = home + "/.starcluster/plugins/vol_names.txt" try: - with open('../plugins/vol_names.txt', 'r') as f: + with open(fpath, 'r') as f: vol_names = [line.strip() for line in f] except IOError: print('No new volumes for cleanup!') @@ -25,9 +28,9 @@ def main(): print('There are %d new volumes still being used' % len(remain_vol)) # updating vol_names.txt file - call(["rm", "../plugins/vol_names.txt"]) + call(["rm", fpath]) if remain_vol: - with open('../plugins/vol_names.txt', 'w') as f: + with open(fpath, 'w') as f: for name in remain_vol: f.write('%s\n' % name) From c579d44f5d62a4de93228ce7af8d92685da03528 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 10:48:51 -0500 Subject: [PATCH 40/68] fixed for local use --- src/scripts/VOLUME_CLEANUP | 1 - 1 file changed, 1 deletion(-) diff --git a/src/scripts/VOLUME_CLEANUP b/src/scripts/VOLUME_CLEANUP index de798e7..9a5e849 100755 --- a/src/scripts/VOLUME_CLEANUP +++ b/src/scripts/VOLUME_CLEANUP @@ -4,7 +4,6 @@ import boto3 from os.path import expanduser from subprocess import call - def main(): ec2 = boto3.resource('ec2') home = expanduser("~") From 8a36885a905fae8bc6e98fa55ed265d5d3d6784f Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 10:59:32 -0500 Subject: [PATCH 41/68] finalized changes --- src/scripts/VOLUME_CLEANUP | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/VOLUME_CLEANUP b/src/scripts/VOLUME_CLEANUP index 9a5e849..6db665d 100755 --- a/src/scripts/VOLUME_CLEANUP +++ b/src/scripts/VOLUME_CLEANUP @@ -24,11 +24,11 @@ def main(): else: remain_vol.append(name) print('Finished deleting all new, unattached volumes') - print('There are %d new volumes still being used' % len(remain_vol)) # updating vol_names.txt file call(["rm", fpath]) if remain_vol: + print('There are %d new volumes still being used' % len(remain_vol)) with open(fpath, 'w') as f: for name in remain_vol: f.write('%s\n' % name) From 6c50e5267ca1302042d79f305c293e3ca2a65cd6 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 11:34:31 -0500 Subject: [PATCH 42/68] Update README.md --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 23fdb3f..b97d001 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,18 @@ store SEQC output. 3. SEQC depends on several python3 packages that are automatically installed and updated. to view these packages, please view the `setup.py` file packaged with SEQC. +### Setting up HDF5 on your local computer: +1. After downloading libhdf5 from source, it can be installed by typing: + $> ./configure --prefix=/usr/local/ + $> make + $> make install +2. Then, you can install pytables by typing: + $> pip3 install tables +3. 
If you installed libhdf5 without giving arguments in the "configure" step, you can also: + (a) $> export HDF_DIR=/your/installation/directory/for/hdf5 + (b) You need to make sure that you have the prereqs installed previously: + - numpy, numexpr, cython + ### Setting up AWS, SEQC, and starcluster Once all dependencies have been installed, SEQC can be installed on any machine by typing: From ea13cf82fd5614f976357ba7a3dc1baa63e5a7db Mon Sep 17 00:00:00 2001 From: ajc Date: Mon, 9 Nov 2015 11:36:34 -0500 Subject: [PATCH 43/68] updated __init__.py to import all modules; now import seqc produces the expected result. Note that test.py and log.py are NOT installed by default. Closes #16 --- src/seqc/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/seqc/__init__.py b/src/seqc/__init__.py index ceaf57d..9411aca 100644 --- a/src/seqc/__init__.py +++ b/src/seqc/__init__.py @@ -1 +1,5 @@ __author__ = 'ambrose' + +from seqc import ( + align, arrays, barcodes, convert_features, fastq, h5, io_lib, plot, qc, sam, three_bit +) \ No newline at end of file From 5e02efc98ba860c88e5f5986f08a6a617ce2f922 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 11:56:11 -0500 Subject: [PATCH 44/68] Added part about AWSCLI --- README.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b97d001..46cdbd9 100644 --- a/README.md +++ b/README.md @@ -59,9 +59,9 @@ to view these packages, please view the `setup.py` file packaged with SEQC. 2. Then, you can install pytables by typing: $> pip3 install tables 3. If you installed libhdf5 without giving arguments in the "configure" step, you can also: - (a) $> export HDF_DIR=/your/installation/directory/for/hdf5 - (b) You need to make sure that you have the prereqs installed previously: - - numpy, numexpr, cython + $> export HDF_DIR=/your/installation/directory/for/hdf5 +But before you do that, you need to make sure that you have the prereqs installed previously: + - numpy, numexpr, cython ### Setting up AWS, SEQC, and starcluster @@ -102,6 +102,14 @@ wish to create an account. `KEY_LOCATION=~/.ssh/.rsa` 5. Under templates, find `[cluster c3.large]` 1. change key to `` + +#### Install and Configure AWS CLI (AWS Command Line Interface). +1. You can install by typing `pip install awscli` +2. Then, configure it by typing `aws configure`: +AWS Access Key ID [*******]: `access_id` +AWS Secret Access Key [*******]: `secret_key` +Default region name [us-west-2]: `us-east-1` (Adjust accordingly) +Default output format [None]: `text` #### Start a cluster: 1. `$> starcluster start -c ` From d7179525c959bdb56d650a9f1f810316ddbcad28 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 11:57:36 -0500 Subject: [PATCH 45/68] Formatting issues --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 46cdbd9..7adf50d 100644 --- a/README.md +++ b/README.md @@ -106,10 +106,10 @@ wish to create an account. #### Install and Configure AWS CLI (AWS Command Line Interface). 1. You can install by typing `pip install awscli` 2. 
Then, configure it by typing `aws configure`: -AWS Access Key ID [*******]: `access_id` -AWS Secret Access Key [*******]: `secret_key` -Default region name [us-west-2]: `us-east-1` (Adjust accordingly) -Default output format [None]: `text` + AWS Access Key ID [*******]: `access_id` + AWS Secret Access Key [*******]: `secret_key` + Default region name [us-west-2]: `us-east-1` (Adjust accordingly) + Default output format [None]: `text` #### Start a cluster: 1. `$> starcluster start -c ` From d526cc77b4c38f273c8e021706ee863604e63bc3 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 11:58:19 -0500 Subject: [PATCH 46/68] Formatting issues --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7adf50d..20054de 100644 --- a/README.md +++ b/README.md @@ -106,10 +106,10 @@ wish to create an account. #### Install and Configure AWS CLI (AWS Command Line Interface). 1. You can install by typing `pip install awscli` 2. Then, configure it by typing `aws configure`: - AWS Access Key ID [*******]: `access_id` - AWS Secret Access Key [*******]: `secret_key` - Default region name [us-west-2]: `us-east-1` (Adjust accordingly) - Default output format [None]: `text` + * AWS Access Key ID [*******]: `access_id` + * AWS Secret Access Key [*******]: `secret_key` + * Default region name [us-west-2]: `us-east-1` (Adjust accordingly) + * Default output format [None]: `text` #### Start a cluster: 1. `$> starcluster start -c ` From f8e2ec412f5ffb038807d38f9b2772693231c2ea Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 11:59:49 -0500 Subject: [PATCH 47/68] Formatting issues --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 20054de..a117a86 100644 --- a/README.md +++ b/README.md @@ -53,9 +53,11 @@ to view these packages, please view the `setup.py` file packaged with SEQC. ### Setting up HDF5 on your local computer: 1. After downloading libhdf5 from source, it can be installed by typing: + ``` $> ./configure --prefix=/usr/local/ $> make $> make install + ``` 2. Then, you can install pytables by typing: $> pip3 install tables 3. If you installed libhdf5 without giving arguments in the "configure" step, you can also: From ff850beb7ab1d968fc62bf82cd2004f72351340d Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 12:08:13 -0500 Subject: [PATCH 48/68] formatting --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a117a86..0938f47 100644 --- a/README.md +++ b/README.md @@ -53,11 +53,11 @@ to view these packages, please view the `setup.py` file packaged with SEQC. ### Setting up HDF5 on your local computer: 1. After downloading libhdf5 from source, it can be installed by typing: - ``` + $> ./configure --prefix=/usr/local/ $> make $> make install - ``` + 2. Then, you can install pytables by typing: $> pip3 install tables 3. If you installed libhdf5 without giving arguments in the "configure" step, you can also: From aecf9190f376c99d26840b9357dcbe286172362a Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 12:09:14 -0500 Subject: [PATCH 49/68] formatting --- README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0938f47..d10651e 100644 --- a/README.md +++ b/README.md @@ -53,17 +53,19 @@ to view these packages, please view the `setup.py` file packaged with SEQC. ### Setting up HDF5 on your local computer: 1. 
After downloading libhdf5 from source, it can be installed by typing: - +``` $> ./configure --prefix=/usr/local/ $> make $> make install - +``` 2. Then, you can install pytables by typing: - $> pip3 install tables + ```pip3 install tables``` 3. If you installed libhdf5 without giving arguments in the "configure" step, you can also: - $> export HDF_DIR=/your/installation/directory/for/hdf5 + ```export HDF_DIR=/your/installation/directory/for/hdf5``` But before you do that, you need to make sure that you have the prereqs installed previously: - - numpy, numexpr, cython + * numpy + *numexpr + *cython ### Setting up AWS, SEQC, and starcluster From 3ad5df96234bf3adf992924db9fba36c9ebe5713 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 12:09:39 -0500 Subject: [PATCH 50/68] formatting --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d10651e..524fd81 100644 --- a/README.md +++ b/README.md @@ -64,8 +64,8 @@ to view these packages, please view the `setup.py` file packaged with SEQC. ```export HDF_DIR=/your/installation/directory/for/hdf5``` But before you do that, you need to make sure that you have the prereqs installed previously: * numpy - *numexpr - *cython + * numexpr + * cython ### Setting up AWS, SEQC, and starcluster From 0895a6fd815bebf66588a04934070d978652d65c Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 12:11:12 -0500 Subject: [PATCH 51/68] formatting --- README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 524fd81..ab9af13 100644 --- a/README.md +++ b/README.md @@ -60,12 +60,16 @@ to view these packages, please view the `setup.py` file packaged with SEQC. ``` 2. Then, you can install pytables by typing: ```pip3 install tables``` -3. If you installed libhdf5 without giving arguments in the "configure" step, you can also: - ```export HDF_DIR=/your/installation/directory/for/hdf5``` -But before you do that, you need to make sure that you have the prereqs installed previously: +3. If you installed libhdf5 without giving arguments in the "configure" step, make sure +that you have the necessary prereqs already installed: * numpy * numexpr * cython +Then type the code below to complete your installation: +``` + export HDF_DIR=/your/installation/directory/for/hdf5 +``` + ### Setting up AWS, SEQC, and starcluster From 33b2124ba87184cc0657bf2df447ae061224b1fd Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 12:12:23 -0500 Subject: [PATCH 52/68] formatting --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ab9af13..bdd56b1 100644 --- a/README.md +++ b/README.md @@ -58,14 +58,13 @@ to view these packages, please view the `setup.py` file packaged with SEQC. $> make $> make install ``` -2. Then, you can install pytables by typing: - ```pip3 install tables``` +2. Then, you can install pytables by typing: ```pip3 install tables``` 3. If you installed libhdf5 without giving arguments in the "configure" step, make sure that you have the necessary prereqs already installed: * numpy * numexpr * cython -Then type the code below to complete your installation: +4. 
Then type the code below to complete your installation: ``` export HDF_DIR=/your/installation/directory/for/hdf5 ``` From 18af08017014838bb846f98606a00743a51aa561 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 12:13:57 -0500 Subject: [PATCH 53/68] formatting --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bdd56b1..0735a35 100644 --- a/README.md +++ b/README.md @@ -53,11 +53,11 @@ to view these packages, please view the `setup.py` file packaged with SEQC. ### Setting up HDF5 on your local computer: 1. After downloading libhdf5 from source, it can be installed by typing: -``` + ``` $> ./configure --prefix=/usr/local/ $> make $> make install -``` + ``` 2. Then, you can install pytables by typing: ```pip3 install tables``` 3. If you installed libhdf5 without giving arguments in the "configure" step, make sure that you have the necessary prereqs already installed: From 7378a5b96aa4e3da76ac9bdb8841822c8271ee94 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 12:15:28 -0500 Subject: [PATCH 54/68] formatting --- README.md | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 0735a35..f0b8820 100644 --- a/README.md +++ b/README.md @@ -54,9 +54,9 @@ to view these packages, please view the `setup.py` file packaged with SEQC. ### Setting up HDF5 on your local computer: 1. After downloading libhdf5 from source, it can be installed by typing: ``` - $> ./configure --prefix=/usr/local/ - $> make - $> make install + $> ./configure --prefix=/usr/local/ + $> make + $> make install ``` 2. Then, you can install pytables by typing: ```pip3 install tables``` 3. If you installed libhdf5 without giving arguments in the "configure" step, make sure @@ -64,10 +64,7 @@ that you have the necessary prereqs already installed: * numpy * numexpr * cython -4. Then type the code below to complete your installation: -``` - export HDF_DIR=/your/installation/directory/for/hdf5 -``` +4. Then set the $HDF_DIR environment variable to complete your installation: ```export HDF_DIR=/your/installation/directory/for/hdf5``` ### Setting up AWS, SEQC, and starcluster From 46638f46b816adccc9e809109fa563fa606d76f0 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 12:17:29 -0500 Subject: [PATCH 55/68] formatting --- README.md | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index f0b8820..8d74ddf 100644 --- a/README.md +++ b/README.md @@ -52,19 +52,21 @@ store SEQC output. to view these packages, please view the `setup.py` file packaged with SEQC. ### Setting up HDF5 on your local computer: +#### Installing from Source: 1. After downloading libhdf5 from source, it can be installed by typing: - ``` - $> ./configure --prefix=/usr/local/ - $> make - $> make install - ``` -2. Then, you can install pytables by typing: ```pip3 install tables``` -3. If you installed libhdf5 without giving arguments in the "configure" step, make sure +``` + $> ./configure --prefix=/usr/local/ + $> make + $> make install +``` +2. Install pytables by typing: ```pip3 install tables``` +#### Installing without previous configuration +1. If you installed libhdf5 without giving arguments in the "configure" step, make sure that you have the necessary prereqs already installed: * numpy * numexpr * cython -4. Then set the $HDF_DIR environment variable to complete your installation: ```export HDF_DIR=/your/installation/directory/for/hdf5``` +2. 
Then set the $HDF_DIR environment variable by typing: ```export HDF_DIR=/your/installation/directory/for/hdf5``` ### Setting up AWS, SEQC, and starcluster From 1c643f648398f902f080584cfe5ef001f7898c57 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 12:19:17 -0500 Subject: [PATCH 56/68] formatting --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8d74ddf..b4d9d4f 100644 --- a/README.md +++ b/README.md @@ -60,13 +60,17 @@ to view these packages, please view the `setup.py` file packaged with SEQC. $> make install ``` 2. Install pytables by typing: ```pip3 install tables``` + #### Installing without previous configuration 1. If you installed libhdf5 without giving arguments in the "configure" step, make sure that you have the necessary prereqs already installed: * numpy * numexpr * cython -2. Then set the $HDF_DIR environment variable by typing: ```export HDF_DIR=/your/installation/directory/for/hdf5``` +2. Then set the $HDF_DIR environment variable by typing: +``` +export HDF_DIR=/your/installation/directory/for/hdf5 +``` ### Setting up AWS, SEQC, and starcluster From 5ed475fa1e015e56a21d8d043c6e6b84ddd6e14d Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 12:19:47 -0500 Subject: [PATCH 57/68] formatting --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b4d9d4f..e79c7d0 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ to view these packages, please view the `setup.py` file packaged with SEQC. $> make $> make install ``` -2. Install pytables by typing: ```pip3 install tables``` +2. Install pytables by typing: `pip3 install tables` #### Installing without previous configuration 1. If you installed libhdf5 without giving arguments in the "configure" step, make sure From 417dfc0aafbee264f4e082cf206cb0ca00d74343 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 12:20:40 -0500 Subject: [PATCH 58/68] formatting --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e79c7d0..85e0013 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,7 @@ to view these packages, please view the `setup.py` file packaged with SEQC. $> make $> make install ``` + 2. Install pytables by typing: `pip3 install tables` #### Installing without previous configuration From f6554b6b46bab1aeccb5e72e49e8bf43bb1fe50e Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 12:21:58 -0500 Subject: [PATCH 59/68] formatting --- README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 85e0013..e5055ef 100644 --- a/README.md +++ b/README.md @@ -54,11 +54,10 @@ to view these packages, please view the `setup.py` file packaged with SEQC. ### Setting up HDF5 on your local computer: #### Installing from Source: 1. After downloading libhdf5 from source, it can be installed by typing: -``` - $> ./configure --prefix=/usr/local/ - $> make - $> make install -``` + + $> ./configure --prefix=/usr/local/ + $> make + $> make install 2. 
Install pytables by typing: `pip3 install tables` From 6f4eeeb85758a4771daa61be1ea51c78c0830f5d Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 12:22:45 -0500 Subject: [PATCH 60/68] formatting --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e5055ef..7b8b945 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ that you have the necessary prereqs already installed: ``` export HDF_DIR=/your/installation/directory/for/hdf5 ``` - +3. You should now be able to install pytables: `pip3 install tables` ### Setting up AWS, SEQC, and starcluster From 0314e8c43010f277ba547afc8aa559c79e4f1620 Mon Sep 17 00:00:00 2001 From: ajc Date: Mon, 9 Nov 2015 12:22:48 -0500 Subject: [PATCH 61/68] added checks for external dependencies to setup.py --- setup.py | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9ebf6c8..1a24873 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,25 @@ __author__ = 'Ambrose J. Carr' from setuptools import setup +from warnings import warn +import os +import shutil + +# pip3 cannot install external dependencies for python; warn user if external dependencies +# are missing; do this at the end so that the users are more likely to see it. + +# look in /usr/local/ and /usr/local/hdf5/ for hdf5 libraries; +# if found in /usr/local/hdf5/, set an environment variable to help pip3 install it. +h5fail = True +if os.path.isfile('/usr/lib/libhdf5.so'): + h5file = False +elif os.path.isfile('/usr/local/lib/libhdf5.so'): + h5fail = False +elif os.path.isfile('/usr/hdf5/lib/libhdf5.so'): + os.environ['HDF5_DIR'] = '/usr/hdf5/' +elif os.path.isfile('/usr/local/hdf5/lib/libhdf5.so'): + os.environ['HDF5_DIR'] = '/usr/local/hdf5/' + h5fail = False setup(name='seqc', version='0.1', @@ -12,13 +31,16 @@ packages=['seqc', 'seqc.sa_postprocess', 'seqc.sa_preprocess', 'seqc.sa_process'], install_requires=[ 'numpy>=1.10.0', + 'cython>0.14', # tables requirement + 'numexpr>=2.4', # tables requirement 'pandas>=0.16.0', 'matplotlib>=1.4.3', 'seaborn', 'scipy>=0.14.0', 'boto3', 'pyftpdlib', - 'intervaltree'], + 'intervaltree', + 'tables'], scripts=['src/scripts/SEQC', 'src/scripts/PROCESS_BARCODES', 'src/scripts/TEST_BARCODES', @@ -26,3 +48,17 @@ 'src/scripts/process_single_file_scseq_experiment.py'], ) +# print any warnings +if h5fail: + warn(""" +SEQC: libhdf5 shared library "libhdf5.so" not found in /usr/local/lib/, +/usr/lib/, /usr/hdf5/lib/, or /usr/local/lib/hdf5/. +tables will not find h5lib and installation will likely fail unless the +HDF5_DIR environment variable has been set to the location that HDF5 was +installed into. If HDF5 is not installed, please install it prior to +installing SEQC. +""") + +# look for star +if not shutil.which('STAR'): + warn('SEQC: STAR is not installed. SEQC will not be able to align files.') From 5e5161cee7d7698f5574ac66b0693297f0b0ef1c Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 12:23:01 -0500 Subject: [PATCH 62/68] formatting --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 7b8b945..4d713fd 100644 --- a/README.md +++ b/README.md @@ -68,9 +68,9 @@ that you have the necessary prereqs already installed: * numexpr * cython 2. Then set the $HDF_DIR environment variable by typing: -``` -export HDF_DIR=/your/installation/directory/for/hdf5 -``` + + export HDF_DIR=/your/installation/directory/for/hdf5 + 3. 
You should now be able to install pytables: `pip3 install tables` ### Setting up AWS, SEQC, and starcluster From b2b8c215b3b266cb92604fc741c1988d618bd52f Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 12:23:23 -0500 Subject: [PATCH 63/68] formatting --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4d713fd..7064bf4 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ that you have the necessary prereqs already installed: * cython 2. Then set the $HDF_DIR environment variable by typing: - export HDF_DIR=/your/installation/directory/for/hdf5 + $> export HDF_DIR=/your/installation/directory/for/hdf5 3. You should now be able to install pytables: `pip3 install tables` From f4148dd5712b1d5e18d91176bf3ba75d0342c957 Mon Sep 17 00:00:00 2001 From: ajc Date: Mon, 9 Nov 2015 13:45:16 -0500 Subject: [PATCH 64/68] made required_support a paramter in unique_features; now error correction is optional. If support of 1 or smaller is passed, error correction returns a mask of all 1s --- src/seqc/arrays.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/seqc/arrays.py b/src/seqc/arrays.py index 242f8c4..e456161 100644 --- a/src/seqc/arrays.py +++ b/src/seqc/arrays.py @@ -852,6 +852,10 @@ def mask_low_support_molecules(self, required_support=2): # then, to build boolean mask, run through the list a second time # get counts + n = self.data.shape[0] + if required_support <= 1: + return np.ones(n, dtype=np.bool) + mol_counts = defaultdict(int) for row in self.data: # 0 = cell; 1 = rmt -- integer indexing is faster than row['cell'], row['rmt'] @@ -860,10 +864,9 @@ def mask_low_support_molecules(self, required_support=2): mol_counts = dict(mol_counts) # defaultdict indexing can produce odd results # build mask - n = self.data.shape[0] mask = np.zeros(n, dtype=np.bool) for i, row in enumerate(self.data): - if mol_counts[(row[0], row[1])] > required_support: + if mol_counts[(row[0], row[1])] >= required_support: mask[i] = 1 return mask @@ -1046,11 +1049,12 @@ def to_sparse_counts(self, collapse_molecules, n_poly_t_required): coo = coo_matrix((values, (row_ind, col_ind)), shape=shape, dtype=dtype) return coo, unq_row, unq_col - def unique_features_to_sparse_counts(self, collapse_molecules, n_poly_t_required): + def unique_features_to_sparse_counts(self, collapse_molecules, n_poly_t_required, + support_required=2): # mask failing cells and molecules with < 2 reads supporting them. read_mask = self.mask_failing_records(n_poly_t_required) - low_coverage_mask = self.mask_low_support_molecules() + low_coverage_mask = self.mask_low_support_molecules(support_required) unmasked_inds = np.arange(self.data.shape[0])[read_mask & low_coverage_mask] if unmasked_inds.shape[0] == 0: raise ValueError('Zero reads passed filters. 
Cannot save sparse matrix') From 592c7ddace78632f57b9172d3c06c17f9dbd2fee Mon Sep 17 00:00:00 2001 From: ajc Date: Mon, 9 Nov 2015 14:20:03 -0500 Subject: [PATCH 65/68] added DUST filtering to SEQC; replaces homopolymer_filtering; eliminates _metadata.txt file; no longer needed, all reads are retained in merged.fastq --- src/scripts/SEQC | 18 ++++++++++++------ src/seqc/arrays.py | 2 +- src/seqc/fastq.py | 47 +++++++++++++++++++++++++++++++++------------- 3 files changed, 47 insertions(+), 20 deletions(-) diff --git a/src/scripts/SEQC b/src/scripts/SEQC index 0bf9ffe..7643c87 100755 --- a/src/scripts/SEQC +++ b/src/scripts/SEQC @@ -225,15 +225,19 @@ def set_up(output_file, index, barcodes): return temp_dir, gtf, cb, index -def merge(output_file, forward, reverse, samfile, merged_fastq, processor, temp_dir, cb): +# removed output_file argument because it was only used for metadata +def merge(forward, reverse, samfile, merged_fastq, processor, temp_dir, cb): if (forward or reverse) and not (samfile or merged_fastq): # pre-process reads log_info('Merging fastq files.') - merged_fastq, nlc = merge_fastq(forward, reverse, processor, temp_dir, cb) + # removed number_low_complexity because we are now scoring with a DUST-based + # low-quality implementation. + # merged_fastq, nlc = merge_fastq(forward, reverse, processor, temp_dir, cb) + merged_fastq = merge_fastq(forward, reverse, processor, temp_dir, cb) - with open(output_file + '_metadata.txt', 'a') as f: - f.write('Low Complexity Reads: %d\n' % nlc) + # with open(output_file + '_metadata.txt', 'a') as f: + # f.write('Low Complexity Reads: %d\n' % nlc) return merged_fastq @@ -350,7 +354,8 @@ def in_drop(output_file, forward, reverse, samfile, merged_fastq, subparser_name temp_dir, gtf, cb, index = set_up(output_file, index, barcodes) - merged_fastq = merge(output_file, forward, reverse, samfile, merged_fastq, subparser_name, + # removed output_file argument due to dust filtering + merged_fastq = merge(forward, reverse, samfile, merged_fastq, subparser_name, temp_dir, cb) samfile = align(merged_fastq, samfile, star_args, temp_dir, n_threads, index, output_file) @@ -373,7 +378,8 @@ def drop_seq(output_file, forward, reverse, samfile, merged_fastq, subparser_nam temp_dir, gtf, cb, index = set_up(output_file, index, barcodes) - merged_fastq = merge(output_file, forward, reverse, samfile, merged_fastq, subparser_name, + # removed output_file argument due to dust filtering + merged_fastq = merge(forward, reverse, samfile, merged_fastq, subparser_name, temp_dir, cb) samfile = align(merged_fastq, samfile, star_args, temp_dir, n_threads, index, output_file) diff --git a/src/seqc/arrays.py b/src/seqc/arrays.py index e456161..5274a2b 100644 --- a/src/seqc/arrays.py +++ b/src/seqc/arrays.py @@ -344,7 +344,7 @@ class ReadArray: ('rmt', np.int32), ('n_poly_t', np.uint8), ('valid_cell', np.bool), - ('trimmed_bases', np.uint8), + ('dust_score', np.uint8), ('rev_quality', np.uint8), ('fwd_quality', np.uint8), ('is_aligned', np.bool), diff --git a/src/seqc/fastq.py b/src/seqc/fastq.py index 29f0bba..a752761 100644 --- a/src/seqc/fastq.py +++ b/src/seqc/fastq.py @@ -126,6 +126,25 @@ def remove_homopolymer(r): return (r[0], seq + '\n', r[2], qual + '\n'), trimmed_bases +def dust_low_complexity_score(record): + # Sequence + seq = record[1].strip() + + # Counts of 3-mers in the sequence + counts = {} + for i in range(len(seq) - 2): + kmer = seq[i:i + 3] + counts[kmer] = counts.get(kmer, 0) + 1 + + # Calculate dust score + score = np.sum([i * (i - 1) / 2 for i in 
counts.values()]) / (len(seq) - 3) + + # Scale score (Max score possible is no. of 3mers/2) + score = np.int8(score / ((len(seq) - 2) / 2) * 100) + + return score + + def is_primer_dimer(cell_barcode, r): """ determine if the reverse sequence r is a primer_dimer if r contains a cell barcode @@ -158,9 +177,10 @@ def is_primer_dimer(cell_barcode, r): def annotate_fastq_record(r, cell, rmt, n_poly_t, valid_cell, trimmed_bases, fwd_quality): - name = ('@' + ':'.join(str(v) for v in [cell, rmt, n_poly_t, valid_cell, trimmed_bases, - fwd_quality]) + - ';' + r[0][1:]) + name = ( + '@' + ':'.join(str(v) for v in [cell, rmt, n_poly_t, valid_cell, trimmed_bases, + fwd_quality]) + + ';' + r[0][1:]) return ''.join([name] + list(r[1:])) @@ -176,12 +196,13 @@ def process_record(forward, reverse, tbp, cb): primer dimers""" cell, rmt, n_poly_t = tbp.process_forward_sequence(forward[1][:-1]) # exclude \n valid_cell = cb.close_match(cell) - r, trimmed_bases = remove_homopolymer(reverse) - if len(r[1]) < 20: # don't return short reads - return + # r, trimmed_bases = remove_homopolymer(reverse) + # if len(r[1]) < 20: # don't return short reads + # return + dust_score = dust_low_complexity_score(reverse) fwd_quality = average_quality(forward[3][:-1]) r = annotate_fastq_record( - r, cell, rmt, n_poly_t, valid_cell, trimmed_bases, fwd_quality) + reverse, cell, rmt, n_poly_t, valid_cell, dust_score, fwd_quality) return r @@ -208,17 +229,17 @@ def merge_fastq(forward: list, reverse: list, exp_type, temp_dir, cb, n_low_comp for f, r in paired_fastq_records(ffile, rfile): merged_record = process_record(f, r, tbp, cb) - if not merged_record: - n_low_complexity += 1 - else: - merged_file.write(merged_record) + # if not merged_record: + # n_low_complexity += 1 + # else: + merged_file.write(merged_record) finally: ffile.close() rfile.close() finally: merged_file.close() - return fout, n_low_complexity + return fout #, n_low_complexity def auto_detect_processor(experiment_name): @@ -239,7 +260,7 @@ def auto_detect_processor(experiment_name): # # todo not working right now # def merge_fastq_threaded(forward, reverse, n_threads, exp_type, temp_dir, -# cell_barcode_pickle): +# cell_barcode_pickle): # """multi-threaded merge of forward and reverse records into a single alignable file""" # # def read(forward_, reverse_, in_queue): From b993857917e2bef7f797d56222c62d3514539cd4 Mon Sep 17 00:00:00 2001 From: Kristy Choi Date: Mon, 9 Nov 2015 14:44:34 -0500 Subject: [PATCH 66/68] disable SGE installation for speed --- src/plugins/starcluster.config | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/plugins/starcluster.config b/src/plugins/starcluster.config index 925babe..e6e4261 100644 --- a/src/plugins/starcluster.config +++ b/src/plugins/starcluster.config @@ -129,8 +129,8 @@ KEY_LOCATION= # use the EXTENDS feature: # DEFINE AMIs here -# C3 AMI = ami-6b13470e -# C4 & R3 AMI = ami-d9095dbc +# C3 AMI = ami-a12177c4 +# C4 & R3 AMI = ami-ab2177ce [cluster c3.large] KEYNAME = @@ -141,11 +141,13 @@ NODE_INSTANCE_TYPE = c3.large NODE_IMAGE_ID = ami-a12177c4 # This availability zone will propagate to all clusters below AVAILABILITY_ZONE = us-east-1e +DISABLE_QUEUE=True [cluster c4.large] EXTENDS = c3.large NODE_INSTANCE_TYPE = c4.large NODE_IMAGE_ID = ami-ab2177ce +DISABLE_QUEUE=True [cluster r3.8xlarge] EXTENDS = c4.large @@ -154,6 +156,7 @@ SUBNET_ID = subnet-28c89300 FORCE_SPOT_MASTER = True SPOT_BID = 0.6 PLUGINS = raid, gitpull +DISABLE_QUEUE=True [cluster c3.8xlarge] EXTENDS = c3.large @@ -161,6 
+164,7 @@ NODE_INSTANCE_TYPE = c3.8xlarge FORCE_SPOT_MASTER = True SPOT_BID = 0.6 PLUGINS = raid, gitpull +DISABLE_QUEUE=True [cluster c4.8xlarge] EXTENDS = c4.large @@ -168,6 +172,7 @@ NODE_INSTANCE_TYPE = c4.8xlarge FORCE_SPOT_MASTER = True SPOT_BID = 0.6 PLUGINS = raid, gitpull +DISABLE_QUEUE=True [cluster ipcluster] EXTENDS = c3.large From a017b2a355ffb9fc6e215c31096df25757530c76 Mon Sep 17 00:00:00 2001 From: ajc Date: Tue, 10 Nov 2015 12:37:30 -0500 Subject: [PATCH 67/68] updated documentation for convert_features() from discussions with manu --- src/seqc/convert_features.py | 176 +++++++---------------------------- 1 file changed, 36 insertions(+), 140 deletions(-) diff --git a/src/seqc/convert_features.py b/src/seqc/convert_features.py index 0374ee2..dfcdb07 100644 --- a/src/seqc/convert_features.py +++ b/src/seqc/convert_features.py @@ -2,6 +2,7 @@ from collections import defaultdict, namedtuple from intervaltree import IntervalTree +import pickle import re from sys import argv, exit @@ -13,6 +14,10 @@ class Exon: + """ + Dummy class to store exon data for ConvertFeatureCoordinates. Replaceable with + NamedTuple + """ __slots__ = ['start', 'end'] @@ -22,6 +27,10 @@ def __init__(self, start, end): class Transcript: + """ + Dummy class to store transcript data for ConvertFeatureCoordinates. Not replaceable + with NamedTuple because the .exons property must be mutable. + """ __slots__ = ['start', 'end', 'strand', 'feature', 'exons', 'scid'] @@ -35,14 +44,27 @@ def __init__(self, start, end, strand, feature, exons, scid): class ConvertFeatureCoordinates: + """ + creates a lookup object whose .translate() method rapidly converts chromosome + coordinates to the associated transcripts if the alignment falls in the final n bases + of the transcript. + + feature_table = dict() (chr, strand position) -> {scids} + feature_positions = dict() scid -> (start, end) + + """ def __init__(self, feature_table, feature_positions): - # todo checks for these; this has bugged out multiple times... + # todo type checks for these; this has bugged out multiple times... self._table = feature_table self._positions = feature_positions @classmethod def from_gtf(cls, gtf, fragment_length): + """ + Create a ConvertFeatureCoordinates object from a gtf file. Include only + features that fall in the final of each transcript. 
+ """ feature_table = defaultdict(set) feature_positions = defaultdict(list) pattern = re.compile(r'(^.*?scseq_id "SC)(.*?)(";.*?$)') @@ -62,7 +84,7 @@ def from_gtf(cls, gtf, fragment_length): ) elif record[2] == 'exon': scid = int(re.match(pattern, record[-1]).group(2)) - exon = Exon(int(record[3]), int(record[4])) + exon = Exon(start=int(record[3]), end=int(record[4])) transcripts[scid].exons.append(exon) # process the exons, deleting duplicates where they are found @@ -76,6 +98,8 @@ def from_gtf(cls, gtf, fragment_length): exons = sorted((s for s in set(transcript.exons)), key=sort_key, reverse=True) transcript.exons = exons + # process_transcript modifies feature_table and feature_positions in place + # each time the function is called cls._process_transcript(transcript, feature_table, feature_positions, fragment_length) @@ -85,6 +109,7 @@ def from_gtf(cls, gtf, fragment_length): return cls(feature_table, feature_positions) def save(self, fout): + """save a serialized version of self as a pickle file""" dout = {'feature_table': self._table, 'feature_positions': self._positions, 'fragment_length': self._fragment_length} @@ -93,23 +118,25 @@ def save(self, fout): @classmethod def load(cls, fin): + """load a ConvertFeatureCoordinates file from a serialized pickle""" with open(fin, 'rb') as f: din = pickle.load(f) return cls(**din) @staticmethod def _round_up(x): + """round x up to the nearest hundred""" return x if x % 100 == 0 else x + 100 - x % 100 @staticmethod def _round_down(x): + """round x down to the nearest hundred""" return x // 100 * 100 @classmethod def _process_transcript(cls, transcript, feature_table, feature_positions, fragment_len): - - """updates feature_table and feature_positions (defined below)""" + """updates feature_table and feature_positions for the passed transcript""" remaining_fragment_size = fragment_len if transcript.strand == '+': for exon in transcript.exons[::-1]: # iterate backwards through exons @@ -171,6 +198,10 @@ def _process_transcript(cls, transcript, feature_table, feature_positions, break # move on to next set of exons def translate(self, strand, chromosome, position): + """ + translate a strand, chromosome, and position into all associated SCIDs, which + correspond to groups of overlapping transcripts. 
+ """ rounded_position = position // 100 * 100 try: potential_ids = self._table[(strand, chromosome, rounded_position)] @@ -192,142 +223,7 @@ def add_mtDNA(self, gtf): """ raise NotImplementedError -# -# -# def _round_up(x): -# return x if x % 100 == 0 else x + 100 - x % 100 -# -# -# def _round_down(x): -# return x // 100 * 100 -# -# -# def _process_transcript(transcript, fragment_len, feature_table, feature_positions): -# """updates feature_table and feature_positions (defined below)""" -# remaining_fragment_size = fragment_len -# if transcript.strand == '+': -# for exon in transcript.exons[::-1]: # iterate backwards through exons -# feature_end = _round_up(exon.end) -# if exon.start < exon.end - remaining_fragment_size: -# -# # break exon into 100 base pieces, round down start, up end -# feature_start = _round_down(exon.start) -# -# # add each 100-base position in interval to feature_table -# for position in range(feature_start, feature_end + 100, 100): -# key = (transcript.strand, transcript.feature, position) -# feature_table[key].add(transcript.scid) -# -# # add feature_positions to second look-up table -# feature_positions[transcript.scid].append(( -# exon.start, exon.end)) -# -# remaining_fragment_size -= exon.end - exon.start -# else: -# # get unrounded start and add to feature_positions -# feature_start = exon.end - remaining_fragment_size -# feature_positions[transcript.scid].append(( -# feature_start, exon.end)) -# # round and add positions to feature_table -# feature_start = _round_down(feature_start) -# for position in range(feature_start, feature_end + 100, 100): -# key = (transcript.strand, transcript.feature, position) -# feature_table[key].add(transcript.scid) -# break # move on to next set of exons -# else: -# for exon in transcript.exons[::-1]: # iterate backwards -# feature_start = _round_down(exon.start) -# if exon.start + remaining_fragment_size < exon.end: -# -# # break exon into 100 base pieces, round down start, up end -# feature_end = _round_up(exon.end) -# -# # add each 100-base position in interval to feature_table -# for position in range(feature_start, feature_end + 100, 100): -# key = (transcript.strand, transcript.feature, position) -# feature_table[key].add(transcript.scid) -# -# # add feature_positions to second look-up table -# feature_positions[transcript.scid].append( -# (exon.start, exon.end)) -# -# remaining_fragment_size -= exon.end - exon.start -# else: -# # get unrounded start and add to feature_positions -# feature_end = exon.start + remaining_fragment_size -# feature_positions[transcript.scid].append(( -# exon.start, feature_end)) -# # round and add positions to feature_table -# feature_end = _round_up(exon.start + remaining_fragment_size) -# for position in range(feature_start, feature_end + 100, 100): -# key = (transcript.strand, transcript.feature, position) -# feature_table[key].add(transcript.scid) -# break # move on to next set of exons -# -# -# # define some simple container classes for Exons and Transcripts -# class Exon: -# def __init__(self, start, end): -# self.start = start -# self.end = end -# -# -# class Transcript: -# def __init__(self, start, end, strand, feature, exons, scid): -# self.start = start -# self.end = end -# self.strand = strand -# self.feature = feature -# self.exons = exons -# self.scid = scid -# -# -# def construct_feature_table(gtf, fragment_len=1000): -# """ -# construct a feature table for n(3) conversions of genomic -> transcriptomic -# coordinates -# """ -# -# # define two helper functions to round to the 
nearest hundred -# -# feature_table = defaultdict(set) -# feature_positions = defaultdict(list) -# pattern = re.compile(r'(^.*?scseq_id "SC)(.*?)(";.*?$)') -# # pile up exons of each scid -# transcripts = {} -# with open(gtf, 'r') as f: -# for record in f: -# # get rid of comments -# if record.startswith('#'): -# continue -# record = record.split('\t') -# if record[2] == 'transcript': -# scid = int(re.match(pattern, record[-1]).group(2)) -# transcripts[scid] = Transcript( -# start=int(record[3]), end=int(record[4]), strand=record[6], -# feature=record[0], exons=[], scid=scid -# ) -# elif record[2] == 'exon': -# scid = int(re.match(pattern, record[-1]).group(2)) -# exon = Exon(int(record[3]), int(record[4])) -# transcripts[scid].exons.append(exon) -# -# # process the exons, deleting duplicates where they are found -# sort_key = lambda x: (x.start, x.end) -# for scid, transcript in transcripts.items(): -# # eliminate duplicate exons, then regenerate sorted order among exons -# if transcript.strand == '+': -# exons = sorted((s for s in set(transcript.exons)), key=sort_key) -# transcript.exons = exons -# else: -# exons = sorted((s for s in set(transcript.exons)), key=sort_key, -# reverse=True) -# transcript.exons = exons -# _process_transcript(transcript, fragment_len, feature_table, feature_positions) -# -# return dict(feature_table), dict(feature_positions) # get rid of defaultdict -# -# + class GeneTable: def __init__(self, gtf): From 7e1e2348ce56cc9fd961b81d373974d8f00a6926 Mon Sep 17 00:00:00 2001 From: ajc Date: Tue, 10 Nov 2015 12:45:22 -0500 Subject: [PATCH 68/68] updated AMIs with git config helper, new numpy, additional dependencies --- src/plugins/starcluster.config | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/plugins/starcluster.config b/src/plugins/starcluster.config index e6e4261..a7ca820 100644 --- a/src/plugins/starcluster.config +++ b/src/plugins/starcluster.config @@ -129,8 +129,8 @@ KEY_LOCATION= # use the EXTENDS feature: # DEFINE AMIs here -# C3 AMI = ami-a12177c4 -# C4 & R3 AMI = ami-ab2177ce +# C3 AMI = ami-2d116847 +# C4 & R3 AMI = ami-1711687d [cluster c3.large] KEYNAME = @@ -138,7 +138,7 @@ CLUSTER_SHELL = bash CLUSTER_USER = ec2-user CLUSTER_SIZE = 1 NODE_INSTANCE_TYPE = c3.large -NODE_IMAGE_ID = ami-a12177c4 +NODE_IMAGE_ID = ami-2d116847 # This availability zone will propagate to all clusters below AVAILABILITY_ZONE = us-east-1e DISABLE_QUEUE=True @@ -146,7 +146,7 @@ DISABLE_QUEUE=True [cluster c4.large] EXTENDS = c3.large NODE_INSTANCE_TYPE = c4.large -NODE_IMAGE_ID = ami-ab2177ce +NODE_IMAGE_ID = ami-1711687d DISABLE_QUEUE=True [cluster r3.8xlarge]
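
The patches above lean on a few small mechanisms that are easier to see in isolation. First, the external-dependency check added to setup.py in patch 61 boils down to probing common libhdf5 install prefixes and exporting HDF5_DIR when HDF5 lives somewhere pip cannot find on its own. The sketch below is a minimal, hypothetical version of that idea; the prefix list and the `locate_hdf5` helper are illustrative, not the setup.py code itself.

```
# Hypothetical sketch of the dependency probe from patch 61: look for libhdf5
# in a few common prefixes and point pip/tables at a non-standard install.
import os
from warnings import warn

def locate_hdf5(candidate_prefixes=('/usr', '/usr/local', '/usr/hdf5', '/usr/local/hdf5')):
    """Return the first prefix containing lib/libhdf5.so, or None if not found."""
    for prefix in candidate_prefixes:
        if os.path.isfile(os.path.join(prefix, 'lib', 'libhdf5.so')):
            return prefix
    return None

prefix = locate_hdf5()
if prefix is None:
    warn('libhdf5.so not found; set HDF5_DIR manually before installing tables.')
elif prefix not in ('/usr', '/usr/local'):
    # non-standard prefix: tell the tables build where HDF5 lives
    os.environ['HDF5_DIR'] = prefix
```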
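
Patch 64 makes molecule-level error correction optional by threading a support threshold through `unique_features_to_sparse_counts`: molecules backed by fewer reads than the threshold are masked, and a threshold of 1 or less keeps everything. A minimal sketch of that masking logic, assuming reads are plain (cell, rmt) pairs rather than SEQC's structured ReadArray rows:

```
# Hypothetical sketch of the behaviour patch 64 adds to
# mask_low_support_molecules; names here are illustrative, not the SEQC API.
import numpy as np
from collections import defaultdict

def mask_low_support(reads, required_support=2):
    """reads: sequence of (cell, rmt) pairs. Returns a boolean keep-mask."""
    n = len(reads)
    if required_support <= 1:
        # a support of 1 or smaller disables error correction: keep everything
        return np.ones(n, dtype=bool)
    counts = defaultdict(int)
    for cell, rmt in reads:
        counts[(cell, rmt)] += 1
    mask = np.zeros(n, dtype=bool)
    for i, (cell, rmt) in enumerate(reads):
        if counts[(cell, rmt)] >= required_support:
            mask[i] = True
    return mask

# A molecule seen once is dropped at the default threshold but kept when
# error correction is switched off:
reads = [(1, 7), (1, 7), (2, 9)]
print(mask_low_support(reads))                      # [ True  True False]
print(mask_low_support(reads, required_support=1))  # [ True  True  True]
```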
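
Patch 65 swaps homopolymer trimming for a DUST-style low-complexity score: count every 3-mer in the read, sum c*(c-1)/2 over the counts, normalize by the number of 3-mers minus one, and rescale so that a pure homopolymer scores 100. A standalone sketch of that scoring scheme (plain Python rather than the numpy version in fastq.py, with a short-read guard added for illustration):

```
# Minimal sketch of the 3-mer repetitiveness score used for DUST filtering in
# patch 65; not the fastq.py function itself.
from collections import Counter

def dust_score(seq):
    """Return a 0-100 low-complexity score; 100 = pure homopolymer."""
    if len(seq) < 4:
        return 0  # too short to normalize; treat as complex
    counts = Counter(seq[i:i + 3] for i in range(len(seq) - 2))
    # sum of c*(c-1)/2 over 3-mer counts, normalized by (number of 3-mers - 1)
    raw = sum(c * (c - 1) / 2 for c in counts.values()) / (len(seq) - 3)
    # the maximum possible raw score is (len(seq) - 2) / 2, so rescale to 0-100
    return int(raw / ((len(seq) - 2) / 2) * 100)

print(dust_score('A' * 50))                          # homopolymer -> 100
print(dust_score('ACGTGACTTGCATGCAGTACGGATTCAGGC'))  # diverse read -> near 0
```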
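
Finally, patch 67's docstrings describe the lookup that makes `ConvertFeatureCoordinates.translate()` fast: genomic positions are bucketed to the nearest 100 bases, so translating an alignment is one dictionary lookup followed by an exact interval check. A toy sketch of that two-table scheme; the feature data below are made up, and only the bucketing-plus-verification idea is taken from the docstrings.

```
# Toy illustration of the 100-base bucket lookup documented in patch 67;
# feature_table and feature_positions mirror the docstring descriptions only.
from collections import defaultdict

feature_table = defaultdict(set)   # (strand, chromosome, bucket) -> {scid}
feature_positions = {}             # scid -> list of (start, end) intervals

def add_feature(scid, strand, chromosome, start, end):
    feature_positions.setdefault(scid, []).append((start, end))
    for bucket in range(start // 100 * 100, end // 100 * 100 + 100, 100):
        feature_table[(strand, chromosome, bucket)].add(scid)

def translate(strand, chromosome, position):
    """Return all SCIDs whose stored intervals contain position."""
    bucket = position // 100 * 100
    candidates = feature_table.get((strand, chromosome, bucket), set())
    return [scid for scid in candidates
            if any(s <= position <= e for s, e in feature_positions[scid])]

add_feature(scid=12, strand='+', chromosome='chr1', start=1950, end=2840)
print(translate('+', 'chr1', 2000))   # [12]
print(translate('+', 'chr1', 5000))   # []
```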