From 99597f358dd7abc51c886a16df6ebe4d480d7b2f Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Sun, 7 Dec 2014 21:20:02 -0800 Subject: [PATCH 01/12] Initial commit Provides baseline functionality without tests. --- qiime/otu_significance.py | 31 ++++-- qiime/stats.py | 30 ++++++ scripts/correlations.py | 221 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 272 insertions(+), 10 deletions(-) create mode 100644 scripts/correlations.py diff --git a/qiime/otu_significance.py b/qiime/otu_significance.py index 0cf6e37df7..8a1f9ddc3d 100644 --- a/qiime/otu_significance.py +++ b/qiime/otu_significance.py @@ -331,16 +331,27 @@ def run_correlation_test(data_generator, test, test_choices, def correlation_output_formatter(bt, corr_coefs, pvals, fdr_pvals, bon_pvals, md_key): - """Format the output of the correlations for easy writing. - - """ - header = [ - 'OTU', - 'Correlation Coef', - 'pval', - 'pval_fdr', - 'pval_bon', - md_key] + '''Produce lines for a tab delimited text file for correlations.py. + + Paramaters + ---------- + bt : biom table object + corr_coefs : array-like + Floats representing correlation coefficients. + pvals : array-like + Floats representing pvalues for given correlation coefficients. + fdr_pvals : array-like + Floats representing FDR corrected pvals. + bon_pvals : array-like + Floats representing Bonferroni corrected pvals. + md_key : str or None + Key for extracting feature metadata from biom table. + + Returns + ------- + list of strs + ''' + header = ['Feature ID', 'Corr coef', 'pval', 'pval_fdr', 'pval_bon', md_key] num_lines = len(corr_coefs) lines = ['\t'.join(header)] for i in range(num_lines): diff --git a/qiime/stats.py b/qiime/stats.py index ded20cb96b..cd98129f89 100644 --- a/qiime/stats.py +++ b/qiime/stats.py @@ -2599,3 +2599,33 @@ def cscore(v1, v2): v2_b = v2.astype(bool) sij = (v1_b * v2_b).sum() return (v1_b.sum() - sij) * (v2_b.sum() - sij) + +def correlate(v1, v2, method): + '''Correlate vectors using method. + + Parameters + ---------- + v1 : array-like + List or array of ints or floats to be correlated. + v2 : array-like + List or array of ints or floats to be correlated. + method : str + One of 'spearman', 'pearson', 'kendall', 'cscore'. + + Returns + ------- + rho : float + Correlation between the vectors. + ''' + if method is 'pearson': + corr_fn = pearson + elif method is 'spearman': + corr_fn = spearman + elif method is 'kendall': + corr_fn = kendall + elif method is 'cscore': + corr_fn = cscore + else: + raise ValueError('Correlation function not recognized.') + + return corr_fn(v1, v2) diff --git a/scripts/correlations.py b/scripts/correlations.py new file mode 100644 index 0000000000..55a331c5f2 --- /dev/null +++ b/scripts/correlations.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python + +__author__ = "Will Van Treuren, Luke Ursell" +__copyright__ = "Copyright 2014, The QIIME project" +__credits__ = ["Will Van Treuren"] +__license__ = "GPL" +__version__ = "1.8.0-dev" +__maintainer__ = "Will Van Treuren" +__email__ = "wdwvt1@gmail.com" + +from qiime.util import (parse_command_line_parameters, make_option, + sync_biom_and_mf) +from qiime.stats import (benjamini_hochberg_step_down, bonferroni_correction, + assign_correlation_pval, correlate, + correlation_output_formatter, sort_by_pval, pearson, + spearman, kendall, cscore) +from qiime.parse import parse_mapping_file_to_dict +from biom import load_table + +filtration_error_text = '''The biom table does not have enough samples after +filtration to perform correlations (it has 3 or fewer). 
The filtration steps +that have occurred are: +1. Removing samples that were not found in the mapping file. +2. Removing samples that were found in the mapping file but whose metadata for +the given category was not convertable to float.''' + +correlation_assignemnt_choices = ['spearman', 'pearson', 'kendall', 'cscore'] +pvalue_assignment_choices = ['fisher_z_transform':, 'parametric_t_distribution', + 'bootstrapped', 'kendall'] +bootstrap_functions = {'spearman': spearman, 'pearson': pearson, + 'kendall': kendall, 'cscore': cscore} + +script_info = {} +script_info['brief_description'] = """ +This script allows the calculation of: + 1. Correlations between feature abundances (relative or absolute) and + numeric metadata. + 2. Paired t-tests between two groups of samples. +The available methods for calculating correlations are Spearmans +Rho, Pearson, Kendall's Tau, and the C or checkerboard score. The available +methods for assigning p-values to the calculated correlation scores are +bootstrapping, Fisher's Z transformation, a parametric t-distribution, and +a Kendall's Tau specific p-value calculation. +""" +script_info['script_description'] = """ +This script calculates the correlation between feature values and a gradient of +sample data. Several methods are provided to allow the user to correlate +features to sample metadata values. This script also allows paired t testing. +The paired t test is accomplished by passing a paired mapping file which is just +a two column (tab separation) table with the samples that should be paired in +each row. It should not have a header. +The available correlations tests are Kendall's Tau, Spearmans rank correlation, +Pearsons product moment correlation, and the C-score (or checkerboard score). +This script generates a tab separated output file which differs based on which +test you have chosen. If you have chosen simple correlation or paired_t then you +will see the following headers: +Feature ID - ID of the features being correlated. If these are OTUs, then they + will take the form of GreenGenes identifiers or de-novo identifiers. +Test-Statistic - the value of the test statistic for the given test +P - the raw P value returned by the given test. +FDR_P - the P value corrected by the Benjamini-Hochberg FDR procedure for + multiple comparisons. +Bonferroni_P - the P value corrected by the Bonferroni procedure for multiple + comparisons. +Metadata - this column will be present only if the biom table contained metadata + information for your features. If these are OTUs, and taxonomy is present in + the biom table, this category will contain that taxonomy (or other metadata). + +Warnings: +The only supported metric for P-value assignment with the C-score is +bootstrapping. For more information on the C-score, read Stone and Roberts 1990 +Oecologea paper 85: 74-79. If you fail to pass +pval_assignment_method='bootstrapped' while you have -s cscore, the script will +error. + +Assigning pvalues to Kendall's Tau scores with the bootstrapping method is +very slow. 
+ + +""" +script_info['script_usage'] = [] +script_info['script_usage'].append(("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came:", "", "%prog -i otu_table.biom -m map.txt -c pH -s spearman -o spearman_otu_gradient.txt")) +script_info['script_usage'].append(("Calculate paired t values for a before and after group of samples:", "", "%prog -i otu_table.biom --paired_t_fp=paired_samples.txt -o kendall_longitudinal_otu_gradient.txt")) + +script_info['output_description']= """ +If you have chosen simple correlation or paired_t then you +will see the following headers: +Feature ID - ID of the features being correlated. If these are OTUs, then they + will take the form of GreenGenes identifiers or de-novo identifiers. +Test-Statistic - the value of the test statistic for the given test +P - the raw P value returned by the given test. +FDR_P - the P value corrected by the Benjamini-Hochberg FDR procedure for + multiple comparisons. +Bonferroni_P - the P value corrected by the Bonferroni procedure for multiple + comparisons. +Metadata - this column will be present only if the biom table contained metadata + information for your features. If these are OTUs, and taxonomy is present in + the biom table, this category will contain that taxonomy (or other metadata). +""" +script_info['required_options']=[ + make_option('-i','--otu_table_fp', + help='path to input biom format table', + type='existing_path'), + make_option('-o', '--output_fp', type='new_filepath', + help='path to the output file to be created')] + +script_info['optional_options']=[ + make_option('-m','--mapping_fp', type='existing_filepath', + help='path to category mapping file'), + make_option('-c', '--category', type='string', + help='name of the category over which to run the analysis'), + make_option('-s', '--test', type="choice", + choices=correlation_assignment_choices, + default='spearman', help='Test to use. Choices are:\n%s' % \ + (', '.join(correlation_function_choices)+'\n\t' + \ + '[default: %default]')), + make_option('--pval_assignment_method', type="choice", + choices=pvalue_assignment_choices, + default='fisher_z_transform', help='Test to use. Choices are:\n%s' % \ + (', '.join(pvalue_assignment_choices))+'\n\t' + \ + '[default: %default]'), + make_option('--metadata_key', default='taxonomy', type=str, + help='Key to extract metadata from biom table. default: %default]'), + make_option('--paired_t_fp', type='existing_filepath', default=None, + help='Pass a paired sample map as described in help to test with a '+\ + 'paired_t_two_sample test. Overrides all other options. A '+\ + 'paired sample map must be two columns without header that are '+\ + 'tab separated. 
Each row contains samples which should be paired.'), + make_option('--permutations', default=1000, type=int, + help='Number of permutations to use for bootstrapped tests.'+\ + '[default: %default]')] + +script_info['version'] = __version__ + +def main(): + option_parser, opts, args = parse_command_line_parameters(**script_info) + + bt = parse_biom_table(open(opts.otu_table_fp)) + + if opts.paired_t_fp is not None: #user wants to conduct paired t_test + o = open(opts.paired_t_fp, 'U') + lines = o.readlines() + o.close() + b_samples = [] + a_samples = [] + for i in lines: + a,b = i.strip().split('\t') + a_samples.append(a) + b_samples.append(b) + data_feed = paired_t_generator(bt, b_samples, a_samples) + test_stats, pvals = run_paired_t(data_feed) + # calculate corrected pvals + fdr_pvals = array(benjamini_hochberg_step_down(pvals)) + bon_pvals = bonferroni_correction(pvals) + # write output results after sorting + lines = paired_t_output_formatter(bt, test_stats, pvals, fdr_pvals, + bon_pvals, md_key=opts.metadata_key) + lines = sort_by_pval(lines, ind=2) + o = open(opts.output_fp, 'w') + o.writelines('\n'.join(lines)) + o.close() + + else: # user wants normal correlation analysis + pmf, _ = parse_mapping_file_to_dict(mf_fp) + category = opts.category + + samples_to_correlate = [] + md_values_to_correlate = [] + bt_sample_ids = bt.ids(axis='sample') + + for sample_id, sample_md in pmf.items(): + if sample_id in bt_sample_ids: + try: + v = float(sample_md[category]) + samples_to_correlate.append(sample_id) + md_values_to_correlate.append(v) + except ValueError: + pass # sample in both biom and mf, but no float md in mf + else: + pass # sample in mf, but not bt + + # remove samples which are not found in the mapping file or do not have + # metadata that converts to float + bt.filter(ids_to_keep = samples_to_correlate, axis='sample') + + # sort the biom table so that feature values are retrieved in the same + # order as the metadata in the samples they correspond to + bt.sort(sort_f = lambda _: samples_to_correlate, axis='sample') + + if bt.shape[1] <= 3: + raise ValueError(filtration_error_text) + + rhos = [] + pvals = [] + for feature_vector in bt.iter_data(axis='observation'): + rho = correlate(feature_vector, md_values_to_correlate, + method=opts.test) + pval = assign_correlation_pval(corr, len(feature_vector), + method=opts.pval_assignment_method, + permutations=opts.permutations, + perm_test_fn=bootstrap_functions[opts.pval_assignment_method] + v1=feature_vector, + v2=md_values_to_correlate) + rhos.append(rho) + pvals.append(pval) + + fdr_pvals = benjamini_hochberg_step_down(pvals) + bon_pvals = bonferroni_correction(pvals) + + lines = correlation_output_formatter(bt, rhos, pvals, fdr_pvals, + bon_pvals, opts.metadata_key) + lines = sort_by_pval(lines, ind=2) + + o = open(opts.output_fp, 'w') + o.writelines('\n'.join(lines)) + o.close() + +if __name__ == "__main__": + main() + + From 9958edada8ac2de388512ba31532190fd9327072 Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Sun, 7 Dec 2014 21:25:16 -0800 Subject: [PATCH 02/12] Updated function names/calls. 
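A minimal, standalone sketch of what the 'bootstrapped' p-value option wired up in these first two patches amounts to: permute one vector, recompute the correlation each time, and report the two-tailed fraction of permuted statistics at least as extreme as the observed one. This is illustrative only, not code from the patch series; the helper name is invented, and scipy.stats.spearmanr stands in for the pearson/spearman/kendall/cscore functions in qiime.stats.

import numpy as np
from scipy.stats import spearmanr

def bootstrapped_corr_pval(v1, v2, permutations=1000, seed=0):
    """Two-tailed permutation p-value for the correlation of v1 and v2."""
    rng = np.random.RandomState(seed)
    v1 = np.asarray(v1, dtype=float)
    v2 = np.asarray(v2, dtype=float)
    observed = spearmanr(v1, v2)[0]
    # correlate v1 against shuffled copies of v2 and count how often the
    # permuted statistic is at least as extreme as the observed one
    permuted = np.empty(permutations)
    for i in range(permutations):
        permuted[i] = spearmanr(v1, rng.permutation(v2))[0]
    return (np.abs(permuted) >= np.abs(observed)).sum() / float(permutations)

# e.g. bootstrapped_corr_pval([1, 2, 3, 4, 5], [2.1, 1.9, 3.5, 4.8, 5.0],
#      permutations=100) returns a small p-value for a strongly monotonic pair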
--- scripts/correlations.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) mode change 100644 => 100755 scripts/correlations.py diff --git a/scripts/correlations.py b/scripts/correlations.py old mode 100644 new mode 100755 index 55a331c5f2..7ab3fdcd0c --- a/scripts/correlations.py +++ b/scripts/correlations.py @@ -11,9 +11,11 @@ from qiime.util import (parse_command_line_parameters, make_option, sync_biom_and_mf) from qiime.stats import (benjamini_hochberg_step_down, bonferroni_correction, - assign_correlation_pval, correlate, - correlation_output_formatter, sort_by_pval, pearson, + assign_correlation_pval, correlate, pearson, spearman, kendall, cscore) +from qiime.otu_significance import (correlation_output_formatter, sort_by_pval, + paired_t_output_formatter, + paired_t_generator, run_paired_t) from qiime.parse import parse_mapping_file_to_dict from biom import load_table @@ -24,8 +26,8 @@ 2. Removing samples that were found in the mapping file but whose metadata for the given category was not convertable to float.''' -correlation_assignemnt_choices = ['spearman', 'pearson', 'kendall', 'cscore'] -pvalue_assignment_choices = ['fisher_z_transform':, 'parametric_t_distribution', +correlation_assignment_choices = ['spearman', 'pearson', 'kendall', 'cscore'] +pvalue_assignment_choices = ['fisher_z_transform', 'parametric_t_distribution', 'bootstrapped', 'kendall'] bootstrap_functions = {'spearman': spearman, 'pearson': pearson, 'kendall': kendall, 'cscore': cscore} @@ -74,10 +76,9 @@ error. Assigning pvalues to Kendall's Tau scores with the bootstrapping method is -very slow. +very slow.""" -""" script_info['script_usage'] = [] script_info['script_usage'].append(("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came:", "", "%prog -i otu_table.biom -m map.txt -c pH -s spearman -o spearman_otu_gradient.txt")) script_info['script_usage'].append(("Calculate paired t values for a before and after group of samples:", "", "%prog -i otu_table.biom --paired_t_fp=paired_samples.txt -o kendall_longitudinal_otu_gradient.txt")) @@ -112,7 +113,7 @@ make_option('-s', '--test', type="choice", choices=correlation_assignment_choices, default='spearman', help='Test to use. Choices are:\n%s' % \ - (', '.join(correlation_function_choices)+'\n\t' + \ + (', '.join(correlation_assignment_choices)+'\n\t' + \ '[default: %default]')), make_option('--pval_assignment_method', type="choice", choices=pvalue_assignment_choices, @@ -198,7 +199,7 @@ def main(): pval = assign_correlation_pval(corr, len(feature_vector), method=opts.pval_assignment_method, permutations=opts.permutations, - perm_test_fn=bootstrap_functions[opts.pval_assignment_method] + perm_test_fn=bootstrap_functions[opts.pval_assignment_method], v1=feature_vector, v2=md_values_to_correlate) rhos.append(rho) From 6b33a60501b84f0be4423d9eade9aec4d0b0da44 Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Sun, 7 Dec 2014 21:34:18 -0800 Subject: [PATCH 03/12] small update removed a useless function, renamed a function, figured out which tests were missing. 
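For context on the sort_by_pval cleanup in this commit: the formatter functions produce tab-delimited rows with a header line first, and sort_by_pval orders the data rows by the p-value column while pushing rows whose p-value is nan to the end. A standalone sketch of that behaviour follows (not the QIIME function itself; the helper name is invented for illustration).

from numpy import isnan, inf

def sort_rows_by_pval(lines, ind=2):
    """Order tab-delimited rows by column `ind`; header stays first, nan rows sort last."""
    def _nan_safe_key(line):
        val = float(line.split('\t')[ind])
        return inf if isnan(val) else val
    return [lines[0]] + sorted(lines[1:], key=_nan_safe_key)

# e.g.
# rows = ['Feature ID\tTest stat.\tpval',
#         'OTU2\t2.78\t0.034',
#         'OTU3\tnan\tnan',
#         'OTU1\t3.56\t0.001']
# sort_rows_by_pval(rows) keeps the header first, then OTU1, OTU2, OTU3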
--- qiime/otu_significance.py | 24 ++---------------------- scripts/correlations.py | 10 +++++----- 2 files changed, 7 insertions(+), 27 deletions(-) diff --git a/qiime/otu_significance.py b/qiime/otu_significance.py index 8a1f9ddc3d..696aa59d53 100644 --- a/qiime/otu_significance.py +++ b/qiime/otu_significance.py @@ -329,7 +329,7 @@ def run_correlation_test(data_generator, test, test_choices, return corr_coefs, pvals -def correlation_output_formatter(bt, corr_coefs, pvals, fdr_pvals, bon_pvals, +def correlate_output_formatter(bt, corr_coefs, pvals, fdr_pvals, bon_pvals, md_key): '''Produce lines for a tab delimited text file for correlations.py. @@ -378,23 +378,6 @@ def run_paired_t(data_generator): pvals.append(pval) return test_stats, pvals - -def paired_t_output_formatter(bt, test_stats, pvals, fdr_pvals, bon_pvals, - md_key): - """Format the output for all tests so it can be easily written.""" - header = ['OTU', 'Test-Statistic', 'P', 'FDR_P', 'Bonferroni_P', md_key] - num_lines = len(pvals) - lines = ['\t'.join(header)] - for i in range(num_lines): - tmp = [bt.ids(axis='observation')[i], test_stats[i], pvals[i], fdr_pvals[i], - bon_pvals[i]] - lines.append('\t'.join(map(str, tmp))) - nls = _add_metadata(bt, md_key, lines) - return nls - -# Functions used by both scripts - - def sort_by_pval(lines, ind): """Sort lines with pvals in descending order. @@ -407,10 +390,7 @@ def _nan_safe_sort(line): return val else: return inf - return ( - # header+sorted lines - [lines[0]] + sorted(lines[1:], key=_nan_safe_sort) - ) + return [lines[0]] + sorted(lines[1:], key=_nan_safe_sort)) def _add_metadata(bt, md_key, lines): diff --git a/scripts/correlations.py b/scripts/correlations.py index 7ab3fdcd0c..141b4a033a 100755 --- a/scripts/correlations.py +++ b/scripts/correlations.py @@ -13,7 +13,7 @@ from qiime.stats import (benjamini_hochberg_step_down, bonferroni_correction, assign_correlation_pval, correlate, pearson, spearman, kendall, cscore) -from qiime.otu_significance import (correlation_output_formatter, sort_by_pval, +from qiime.otu_significance import (correlate_output_formatter, sort_by_pval, paired_t_output_formatter, paired_t_generator, run_paired_t) from qiime.parse import parse_mapping_file_to_dict @@ -154,8 +154,8 @@ def main(): fdr_pvals = array(benjamini_hochberg_step_down(pvals)) bon_pvals = bonferroni_correction(pvals) # write output results after sorting - lines = paired_t_output_formatter(bt, test_stats, pvals, fdr_pvals, - bon_pvals, md_key=opts.metadata_key) + lines = correlate_output_formatter(bt, test_stats, pvals, fdr_pvals, + bon_pvals, md_key=opts.metadata_key) lines = sort_by_pval(lines, ind=2) o = open(opts.output_fp, 'w') o.writelines('\n'.join(lines)) @@ -208,8 +208,8 @@ def main(): fdr_pvals = benjamini_hochberg_step_down(pvals) bon_pvals = bonferroni_correction(pvals) - lines = correlation_output_formatter(bt, rhos, pvals, fdr_pvals, - bon_pvals, opts.metadata_key) + lines = correlate_output_formatter(bt, rhos, pvals, fdr_pvals, + bon_pvals, opts.metadata_key) lines = sort_by_pval(lines, ind=2) o = open(opts.output_fp, 'w') From 8ba13767bc5131a0b4c8b9164f9b6febf67c4698 Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Sun, 7 Dec 2014 23:16:07 -0800 Subject: [PATCH 04/12] Add script usage tests + modifications This commit adds script usage tests and modifies the help text as well as minor tweaks to functionality for correlations.py --- qiime/otu_significance.py | 79 +++++++++++++----- qiime/stats.py | 20 +++-- qiime_test_data/correlations/map.txt | 11 +++ 
qiime_test_data/correlations/otu_table.biom | Bin 0 -> 37016 bytes .../correlations/paired_samples.txt | 5 ++ scripts/correlations.py | 45 ++++++---- 6 files changed, 117 insertions(+), 43 deletions(-) create mode 100644 qiime_test_data/correlations/map.txt create mode 100644 qiime_test_data/correlations/otu_table.biom create mode 100644 qiime_test_data/correlations/paired_samples.txt diff --git a/qiime/otu_significance.py b/qiime/otu_significance.py index 696aa59d53..6a547d8efa 100644 --- a/qiime/otu_significance.py +++ b/qiime/otu_significance.py @@ -12,7 +12,7 @@ from qiime.parse import parse_mapping_file_to_dict from numpy import (array, argsort, vstack, isnan, inf, nan, apply_along_axis, - mean, zeros) + mean, zeros, isinf, logical_or) from qiime.stats import (fisher_population_correlation, pearson, spearman, g_fit, ANOVA_one_way, @@ -329,15 +329,16 @@ def run_correlation_test(data_generator, test, test_choices, return corr_coefs, pvals -def correlate_output_formatter(bt, corr_coefs, pvals, fdr_pvals, bon_pvals, - md_key): +def correlate_output_formatter(bt, test_stats, pvals, fdr_pvals, bon_pvals, + md_key): '''Produce lines for a tab delimited text file for correlations.py. Paramaters ---------- bt : biom table object - corr_coefs : array-like - Floats representing correlation coefficients. + test_stats : array-like + Floats representing correlation coefficients or paired t test + statistics. pvals : array-like Floats representing pvalues for given correlation coefficients. fdr_pvals : array-like @@ -346,34 +347,50 @@ def correlate_output_formatter(bt, corr_coefs, pvals, fdr_pvals, bon_pvals, Floats representing Bonferroni corrected pvals. md_key : str or None Key for extracting feature metadata from biom table. + type_of_stat : str + Type of statistic - used to modify header to report either 'Co Returns ------- list of strs ''' - header = ['Feature ID', 'Corr coef', 'pval', 'pval_fdr', 'pval_bon', md_key] - num_lines = len(corr_coefs) + header = ['Feature ID', 'Test stat.', 'pval', 'pval_fdr', 'pval_bon', + md_key] + num_lines = len(test_stats) lines = ['\t'.join(header)] for i in range(num_lines): - tmp = [bt.ids(axis='observation')[i], corr_coefs[i], pvals[i], fdr_pvals[i], - bon_pvals[i]] + tmp = [bt.ids(axis='observation')[i], test_stats[i], pvals[i], + fdr_pvals[i], bon_pvals[i]] lines.append('\t'.join(map(str, tmp))) nls = _add_metadata(bt, md_key, lines) return nls +def run_paired_t(bt, s1, s2): + '''Perform a paired t test between samples. -def paired_t_generator(bt, s_before, s_after): - """Produce a generator to run paired t tests on each OTU.""" - b_data = vstack([bt.sampleData(i) for i in s_before]).T - a_data = vstack([bt.sampleData(i) for i in s_after]).T - return ((b_data[i], a_data[i]) for i in range(len(bt.ids(axis='observation')))) + Parameters + ---------- + bt : biom table object + Table containing data for samples in s1 and s2. + s1 : list + List of sample ids found in bt (strs). + s2 : list + List of sample ids found in bt (strs). + Returns + ------- + test_stats : list + Floats representing the test statistic for each paired comparison. + pvals : list + Floats representing the p-values of the reported test statistics. 
+ ''' + test_stats = [] + pvals = [] + s1_indices = [bt.index(i, axis='sample') for i in s1] + s2_indices = [bt.index(i, axis='sample') for i in s2] -def run_paired_t(data_generator): - """Run paired t test on data.""" - test_stats, pvals = [], [] - for b_data, a_data in data_generator: - test_stat, pval = t_paired(b_data, a_data) + for data in bt.iter_data(axis='observation'): + test_stat, pval = t_paired(data.take(s1_indices), data.take(s2_indices)) test_stats.append(test_stat) pvals.append(pval) return test_stats, pvals @@ -390,7 +407,7 @@ def _nan_safe_sort(line): return val else: return inf - return [lines[0]] + sorted(lines[1:], key=_nan_safe_sort)) + return [lines[0]] + sorted(lines[1:], key=_nan_safe_sort) def _add_metadata(bt, md_key, lines): @@ -404,3 +421,25 @@ def _add_metadata(bt, md_key, lines): else: # remove md_header from the first line nls = ['\t'.join(lines[0].split('\t')[:-1])] + lines[1:] return nls + +def is_computable_float(v): + '''Return float if v can be converted to float excluding nan and inf. + + Parameters + ---------- + v : variable + Value to be converted to float if possible. + + Returns + ------- + If v can be converted to a float that is not nan or inf, return v. + Otherwise return False. + ''' + try: + tmp = float(v) + if not logical_or(isnan(tmp), isinf(tmp)): + return tmp + else: + return False + except ValueError: + return False diff --git a/qiime/stats.py b/qiime/stats.py index cd98129f89..5bab8ee0ca 100644 --- a/qiime/stats.py +++ b/qiime/stats.py @@ -1701,10 +1701,13 @@ def assign_correlation_pval(corr, n, method, permutations=None, raise ValueError('You must specify vectors, permutation ' 'function, and number of permutations to calc ' 'bootstrapped pvalues. Cant continue.') - r = empty(permutations) - for i in range(permutations): - r[i] = perm_test_fn(v1, permutation(v2)) - return (abs(r) >= abs(corr)).sum() / float(permutations) + if any([isnan(corr), isinf(corr)]): + return nan + else: + r = empty(permutations) + for i in range(permutations): + r[i] = perm_test_fn(v1, permutation(v2)) + return (abs(r) >= abs(corr)).sum() / float(permutations) elif method == 'kendall': return kendall_pval(corr, n) else: @@ -2617,15 +2620,14 @@ def correlate(v1, v2, method): rho : float Correlation between the vectors. 
''' - if method is 'pearson': + if method == 'pearson': corr_fn = pearson - elif method is 'spearman': + elif method == 'spearman': corr_fn = spearman - elif method is 'kendall': + elif method == 'kendall': corr_fn = kendall - elif method is 'cscore': + elif method == 'cscore': corr_fn = cscore else: raise ValueError('Correlation function not recognized.') - return corr_fn(v1, v2) diff --git a/qiime_test_data/correlations/map.txt b/qiime_test_data/correlations/map.txt new file mode 100644 index 0000000000..72323e1438 --- /dev/null +++ b/qiime_test_data/correlations/map.txt @@ -0,0 +1,11 @@ +#SampleID pH +s0 1.1 +s1 2.1 +s2 1.7 +s3 4.7 +s4 300 +s5 -12 +s6 89.242251 +s7 ‘SKJDKJSD’ +s8 nan +s9 100 \ No newline at end of file diff --git a/qiime_test_data/correlations/otu_table.biom b/qiime_test_data/correlations/otu_table.biom new file mode 100644 index 0000000000000000000000000000000000000000..0072684d06915922b4f7ecbfa91b7ce5914d86d9 GIT binary patch literal 37016 zcmeHQU2IfE6rS5{w_DoM1^H_Mxj`T`TDmPOzX28~P=u5!O(Yn~?zUUn*zK;n3l>8Z zFbc##`aqNi6X*j04aO#7tRyJPUlVx%BC&}kR*lgIV?ZGR(csLTb8gvLZWrQi=(cAF zJ9o~UduGlzKb$!;bJ<&6wW4rD$p|Kb!@;td(|E+6yATd`ND8Jg-wO{K#Csq-w)#%yd1bFh|1DX^FqBu0?< zE6a2MpUltakynz8{C+t(hMd^8Ft06R$>azglp`Q0VAX=eW#$`lnnIDFFIKiC(rS>C z${WYoP@Wi?1xkk=3i7>!@n$b~+Dypnt5<;2Oe8+u2ZeMW;ksJ^03 zBFG6<)X!z4H9oTzaKzf$UNzDQ@(e0tq8&ugj6ePJCicVAqLMuH<;pnOyxBZS^3l90 zaTspD>bz;re=r9j9(=Nhw6rrmk3|hqFc9_mS&5_WSa9KYU$Fq(M7V*kuYu6aO zVnXBr-xKqo2sI(S&gX9p=&P!XG|n4Fg8zV-V2+FCSS&ohyxiXs3YN(^X=W(0neovj z1IDkkv>G!X3>PNGxd@f63U^sWWtnSEU1jBbSH*nSteNgP^IR^s%(u&kVLd{#Bmst7 zKOOiHMw7o%^5>~?DkK5t1;pVc;6pr+9tL^=#7J*|bfH2=IU^N1$|+LlC zO5sO2qZK;JdCY=7Mxmpeu?ii_#rY5Mj6Ne7#{<7fpDhVEubK2Yl3uLxL&wK{VDitC zj1yIUXm^-C&32DMyH8g6q1_)>>Co;|RJscY5cOe0$CD{V2oM5<03kpK5CVh%AwUQa z0)#-uAh51xZI#AXPq_kTgq|x`+($?705Qs7!hhvd`{~fqf9#~&F4`j1a zc%%A7`GDg)4C#)Il8%0^3mS6?I3o|=Kn87#RP%ay)5 zP*j7RRos6;{~5+Mo-bHO>5I+%8h1W(-v!5N4yTIzESnUY&vfGfwsZKi$k$k7ps7{t zsGxt<0dj3wS*OA0C5pu6V58wt$Ud~_%{m7ANj#a-Am%$!Kin_ISbCEai<*la5V8;_ zv*L-b5QPvR1PB2_fDj-AG8_SEhuTa-r3*LmEc8MkEzRpTF*`JMiTBetNXg_kQ z(|vK_?Ai@qYHa$&0-un>jh@|I>P-ymb9o z`JtY&C~&)lAIem~;yof00)zk|KnM^5gh1vZkj#FI;-61PB2_fDj-A2!V`8AesGyb^ykB4$A6&c?$ecJ~$7)D4y#`c!3Cn03kpK5CVh%A&~J1B(tCLK|_qwct8Fa99k7cndV30InMQI z#oX$A=MxsC1{M1WaWWOKc#p`003kpK5CVh%A&|KUB(tCJ{~^XW{r6$Ae%mGC6h)%j z@R8q-FSlso4#j>#oXm<_yhmh0fDj-A2mwNX5Xf8vtj?3F=i!o_=cn_encD|aQFHY? 
rDW1pNp`540bD6cND3r^v>c8qaQtH1c0(^*yvFg8`j(CMr-GBcBt|aue literal 0 HcmV?d00001 diff --git a/qiime_test_data/correlations/paired_samples.txt b/qiime_test_data/correlations/paired_samples.txt new file mode 100644 index 0000000000..4e40751eef --- /dev/null +++ b/qiime_test_data/correlations/paired_samples.txt @@ -0,0 +1,5 @@ +s1 s9 +s0 s5 +s2 s3 +s4 s7 +s6 s8 \ No newline at end of file diff --git a/scripts/correlations.py b/scripts/correlations.py index 141b4a033a..057c7d8a49 100755 --- a/scripts/correlations.py +++ b/scripts/correlations.py @@ -1,8 +1,10 @@ #!/usr/bin/env python -__author__ = "Will Van Treuren, Luke Ursell" +__author__ = "Will Van Treuren" __copyright__ = "Copyright 2014, The QIIME project" -__credits__ = ["Will Van Treuren"] +__credits__ = ["Will Van Treuren", "Luke Ursell", "Catherine Lozupone", + "Jesse Stombaugh", "Doug Wendel", "Dan Knights", "Greg Caporaso", + "Jai Ram Rideout"] __license__ = "GPL" __version__ = "1.8.0-dev" __maintainer__ = "Will Van Treuren" @@ -14,10 +16,10 @@ assign_correlation_pval, correlate, pearson, spearman, kendall, cscore) from qiime.otu_significance import (correlate_output_formatter, sort_by_pval, - paired_t_output_formatter, - paired_t_generator, run_paired_t) + run_paired_t, is_computable_float) from qiime.parse import parse_mapping_file_to_dict from biom import load_table +from numpy import array, where filtration_error_text = '''The biom table does not have enough samples after filtration to perform correlations (it has 3 or fewer). The filtration steps @@ -26,6 +28,10 @@ 2. Removing samples that were found in the mapping file but whose metadata for the given category was not convertable to float.''' +cscore_error_text = '''The only supported metric for P-value assignment with the +C-score is bootstrapping. 
For more information on the C-score, read Stone and +Roberts 1990 Oecologea paper 85: 74-79.''' + correlation_assignment_choices = ['spearman', 'pearson', 'kendall', 'cscore'] pvalue_assignment_choices = ['fisher_z_transform', 'parametric_t_distribution', 'bootstrapped', 'kendall'] @@ -81,7 +87,8 @@ script_info['script_usage'] = [] script_info['script_usage'].append(("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came:", "", "%prog -i otu_table.biom -m map.txt -c pH -s spearman -o spearman_otu_gradient.txt")) -script_info['script_usage'].append(("Calculate paired t values for a before and after group of samples:", "", "%prog -i otu_table.biom --paired_t_fp=paired_samples.txt -o kendall_longitudinal_otu_gradient.txt")) +script_info['script_usage'].append(("Calculate paired t values for a before and after group of samples:", "", "%prog -i otu_table.biom --paired_t_fp=paired_samples.txt -o paired.txt")) +script_info['script_usage'].append(("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came using bootstrapping and pearon correlation:", "", "%prog -i otu_table.biom -m map.txt -c pH -s pearson --pval_assignment_method bootstrapped --permutations 100 -o pearson_bootstrapped.txt")) script_info['output_description']= """ If you have chosen simple correlation or paired_t then you @@ -136,7 +143,10 @@ def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) - bt = parse_biom_table(open(opts.otu_table_fp)) + if opts.test == 'cscore' and opts.pval_assignment_method != 'bootstrapped': + raise ValueError(cscore_error_text) + + bt = load_table(opts.otu_table_fp) if opts.paired_t_fp is not None: #user wants to conduct paired t_test o = open(opts.paired_t_fp, 'U') @@ -148,11 +158,13 @@ def main(): a,b = i.strip().split('\t') a_samples.append(a) b_samples.append(b) - data_feed = paired_t_generator(bt, b_samples, a_samples) - test_stats, pvals = run_paired_t(data_feed) + test_stats, pvals = run_paired_t(bt, a_samples, b_samples) # calculate corrected pvals fdr_pvals = array(benjamini_hochberg_step_down(pvals)) bon_pvals = bonferroni_correction(pvals) + # correct for cases where values above 1.0 due to correction + fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals) + bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals) # write output results after sorting lines = correlate_output_formatter(bt, test_stats, pvals, fdr_pvals, bon_pvals, md_key=opts.metadata_key) @@ -162,7 +174,7 @@ def main(): o.close() else: # user wants normal correlation analysis - pmf, _ = parse_mapping_file_to_dict(mf_fp) + pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp) category = opts.category samples_to_correlate = [] @@ -172,11 +184,12 @@ def main(): for sample_id, sample_md in pmf.items(): if sample_id in bt_sample_ids: try: - v = float(sample_md[category]) + v = is_computable_float(sample_md[category]) samples_to_correlate.append(sample_id) md_values_to_correlate.append(v) - except ValueError: - pass # sample in both biom and mf, but no float md in mf + except KeyError: + raise ValueError(('The category (%s)' % opts.category) +\ + ' was not found in the mapping file.') else: pass # sample in mf, but not bt @@ -196,10 +209,11 @@ def main(): for feature_vector in bt.iter_data(axis='observation'): rho = correlate(feature_vector, md_values_to_correlate, method=opts.test) - pval = assign_correlation_pval(corr, len(feature_vector), + pval = assign_correlation_pval(rho, len(feature_vector), 
method=opts.pval_assignment_method, permutations=opts.permutations, - perm_test_fn=bootstrap_functions[opts.pval_assignment_method], + perm_test_fn=\ + bootstrap_functions[opts.test], v1=feature_vector, v2=md_values_to_correlate) rhos.append(rho) @@ -207,6 +221,9 @@ def main(): fdr_pvals = benjamini_hochberg_step_down(pvals) bon_pvals = bonferroni_correction(pvals) + # correct for cases where values above 1.0 due to correction + fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals) + bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals) lines = correlate_output_formatter(bt, rhos, pvals, fdr_pvals, bon_pvals, opts.metadata_key) From f58e3146c0c56dc8961a40558dd4cbf912890216 Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Tue, 9 Dec 2014 15:44:03 -0800 Subject: [PATCH 05/12] Addressed formatting comments --- ChangeLog.md | 1 + scripts/correlations.py | 121 +++++++++++++++++++--------------------- 2 files changed, 59 insertions(+), 63 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 0f9e8ceb16..ee3a9467ae 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,5 +1,6 @@ QIIME 1.8.0-dev (changes since 1.8.0 go here) ============================================= +* ``correlations.py`` has been added. This script allows the calculation of correlations between feature abundances and continuous-valued metadata. It also reintroduces the paired-t test functionality that has been missing in 1.8. * ``split_otu_table.py`` now allows multiple fields to be passed to split a biom table, and optionally a mapping file. Check out the new documentation for the naming conventions (which have changed slightly) and an example. diff --git a/scripts/correlations.py b/scripts/correlations.py index 057c7d8a49..f938ab69fd 100755 --- a/scripts/correlations.py +++ b/scripts/correlations.py @@ -39,42 +39,27 @@ 'kendall': kendall, 'cscore': cscore} script_info = {} -script_info['brief_description'] = """ +script_info['brief_description'] = """This script calculates correlations between feature abundances and continuous-valued metadata.""" +script_info['script_description'] = """ This script allows the calculation of: 1. Correlations between feature abundances (relative or absolute) and numeric metadata. 2. Paired t-tests between two groups of samples. -The available methods for calculating correlations are Spearmans -Rho, Pearson, Kendall's Tau, and the C or checkerboard score. The available -methods for assigning p-values to the calculated correlation scores are -bootstrapping, Fisher's Z transformation, a parametric t-distribution, and -a Kendall's Tau specific p-value calculation. -""" -script_info['script_description'] = """ -This script calculates the correlation between feature values and a gradient of -sample data. Several methods are provided to allow the user to correlate -features to sample metadata values. This script also allows paired t testing. -The paired t test is accomplished by passing a paired mapping file which is just -a two column (tab separation) table with the samples that should be paired in -each row. It should not have a header. -The available correlations tests are Kendall's Tau, Spearmans rank correlation, -Pearsons product moment correlation, and the C-score (or checkerboard score). -This script generates a tab separated output file which differs based on which -test you have chosen. If you have chosen simple correlation or paired_t then you -will see the following headers: -Feature ID - ID of the features being correlated. 
If these are OTUs, then they - will take the form of GreenGenes identifiers or de-novo identifiers. -Test-Statistic - the value of the test statistic for the given test -P - the raw P value returned by the given test. -FDR_P - the P value corrected by the Benjamini-Hochberg FDR procedure for - multiple comparisons. -Bonferroni_P - the P value corrected by the Bonferroni procedure for multiple - comparisons. -Metadata - this column will be present only if the biom table contained metadata - information for your features. If these are OTUs, and taxonomy is present in - the biom table, this category will contain that taxonomy (or other metadata). - -Warnings: + +Several methods are provided to allow the user to correlate +features to sample metadata values including Spearmans Rho, Pearson, Kendall's +Tau, and the C or checkerboard score. + +The available methods for assigning p-values to the calculated correlation +scores are bootstrapping, Fisher's Z transformation, a parametric +t-distribution, and a Kendall's Tau specific p-value calculation. + +This script also allows paired t testing. The paired t test is accomplished by +passing a paired mapping file which is just a two column (tab separation) table +with the samples that should be paired in each row. It should not have a header. + +Notes: + The only supported metric for P-value assignment with the C-score is bootstrapping. For more information on the C-score, read Stone and Roberts 1990 Oecologea paper 85: 74-79. If you fail to pass @@ -82,28 +67,39 @@ error. Assigning pvalues to Kendall's Tau scores with the bootstrapping method is -very slow.""" +very slow. +""" script_info['script_usage'] = [] -script_info['script_usage'].append(("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came:", "", "%prog -i otu_table.biom -m map.txt -c pH -s spearman -o spearman_otu_gradient.txt")) -script_info['script_usage'].append(("Calculate paired t values for a before and after group of samples:", "", "%prog -i otu_table.biom --paired_t_fp=paired_samples.txt -o paired.txt")) -script_info['script_usage'].append(("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came using bootstrapping and pearon correlation:", "", "%prog -i otu_table.biom -m map.txt -c pH -s pearson --pval_assignment_method bootstrapped --permutations 100 -o pearson_bootstrapped.txt")) +script_info['script_usage'].append( + ("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came:", + "", + "%prog -i otu_table.biom -m map.txt -c pH -s spearman -o spearman_otu_gradient.txt")) +script_info['script_usage'].append( + ("Calculate paired t values for a before and after group of samples:", + "", + "%prog -i otu_table.biom --paired_t_fp=paired_samples.txt -o paired.txt")) +script_info['script_usage'].append( + ("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came using bootstrapping and pearon correlation:", + "", + "%prog -i otu_table.biom -m map.txt -c pH -s pearson --pval_assignment_method bootstrapped --permutations 100 -o pearson_bootstrapped.txt")) script_info['output_description']= """ -If you have chosen simple correlation or paired_t then you -will see the following headers: -Feature ID - ID of the features being correlated. If these are OTUs, then they - will take the form of GreenGenes identifiers or de-novo identifiers. 
-Test-Statistic - the value of the test statistic for the given test
-P - the raw P value returned by the given test.
-FDR_P - the P value corrected by the Benjamini-Hochberg FDR procedure for
- multiple comparisons.
-Bonferroni_P - the P value corrected by the Bonferroni procedure for multiple
- comparisons.
-Metadata - this column will be present only if the biom table contained metadata
- information for your features. If these are OTUs, and taxonomy is present in
- the biom table, this category will contain that taxonomy (or other metadata).
+The output will be a tab delimited file with the following headers. Each row
+will record the values calculated for a given feature:
+- Feature ID: ID of the features being correlated. If these are OTUs, then they
+  will take the form of GreenGenes identifiers or de-novo identifiers.
+- Test-Statistic: the value of the test statistic for the given test.
+- P: the raw P value returned by the given test.
+- FDR_P: the P value corrected by the Benjamini-Hochberg FDR procedure for
+  multiple comparisons.
+- Bonferroni_P: the P value corrected by the Bonferroni procedure for multiple
+  comparisons.
+- Metadata: this column will be present only if the biom table contained
+  metadata information for your features. If these are OTUs, and taxonomy is
+  present in the biom table, this category will contain that taxonomy (or other
+  metadata).
 """
 script_info['required_options']=[
     make_option('-i','--otu_table_fp',
@@ -119,24 +115,23 @@
         help='name of the category over which to run the analysis'),
     make_option('-s', '--test', type="choice",
         choices=correlation_assignment_choices,
-        default='spearman', help='Test to use. Choices are:\n%s' % \
-            (', '.join(correlation_assignment_choices)+'\n\t' + \
-            '[default: %default]')),
+        default='spearman', help='Correlation method to use. Choices are: %s' %
+        (', '.join(correlation_assignment_choices)) + ' [default: %default]'),
     make_option('--pval_assignment_method', type="choice",
         choices=pvalue_assignment_choices,
-        default='fisher_z_transform', help='Test to use. Choices are:\n%s' % \
-            (', '.join(pvalue_assignment_choices))+'\n\t' + \
-            '[default: %default]'),
+        default='fisher_z_transform', help='Pvalue method to use. Choices are: '
+        '%s' % (', '.join(pvalue_assignment_choices)) + ' [default: %default]'),
     make_option('--metadata_key', default='taxonomy', type=str,
-        help='Key to extract metadata from biom table. default: %default]'),
+        help='Key to extract metadata from biom table. [default: %default]'),
     make_option('--paired_t_fp', type='existing_filepath', default=None,
-        help='Pass a paired sample map as described in help to test with a '+\
-            'paired_t_two_sample test. Overrides all other options. A '+\
-            'paired sample map must be two columns without header that are '+\
-            'tab separated. Each row contains samples which should be paired.'),
+        help='Pass a paired sample map as described in help to test with a '
+        'paired_t_two_sample test. Overrides all other options. A '
+        'paired sample map must be two columns without header that are '
+        'tab separated. Each row contains samples which should be paired.'
+        ' [default: %default]'),
     make_option('--permutations', default=1000, type=int,
-        help='Number of permutations to use for bootstrapped tests.'+\
-            '[default: %default]')]
+        help='Number of permutations to use for bootstrapped tests.'
+        ' [default: %default]')]

 script_info['version'] = __version__

@@ -188,7 +183,7 @@ def main():
                 samples_to_correlate.append(sample_id)
                 md_values_to_correlate.append(v)
             except KeyError:
-                raise ValueError(('The category (%s)' % opts.category) +\
+                raise ValueError('The category (%s)' % opts.category +
                     ' was not found in the mapping file.')
         else:
             pass # sample in mf, but not bt

From 12a2f638f4fc65245ef17cc13ba4ce2766c7a0b3 Mon Sep 17 00:00:00 2001
From: Will Van Treuren
Date: Tue, 9 Dec 2014 16:06:23 -0800
Subject: [PATCH 06/12] Updated Changelog message

---
 ChangeLog.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ChangeLog.md b/ChangeLog.md
index ee3a9467ae..ec6e2b7ea9 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,6 +1,6 @@
 QIIME 1.8.0-dev (changes since 1.8.0 go here)
 =============================================
-* ``correlations.py`` has been added. This script allows the calculation of correlations between feature abundances and continuous-valued metadata. It also reintroduces the paired-t test functionality that has been missing in 1.8.
+* ``correlations.py`` has been added. This script allows the calculation of correlations between feature abundances and continuous-valued metadata. It also reintroduces the paired-t test functionality that has been missing in 1.8. This script also replaces the functions that were in ``otu_category_significance.py`` in QIIME 1.7 and earlier.
 * ``split_otu_table.py`` now allows multiple fields to be passed to split a biom
 table, and optionally a mapping file. Check out the new documentation for the
 naming conventions (which have changed slightly) and an example.

From 3013f211317c5310ef9c890bfae632c26d57556d Mon Sep 17 00:00:00 2001
From: Will Van Treuren
Date: Wed, 10 Dec 2014 15:59:19 -0800
Subject: [PATCH 07/12] Updates after review

Updates the name of the script and adds tests to several untested
functions. Adds to help documentation. Removes the paired t
functionality.
---
 qiime/otu_significance.py                     |  11 +-
 .../map.txt                                   |   0
 .../otu_table.biom                            | Bin
 .../paired_samples.txt                        |   0
 ...py => observation_metadata_correlation.py} | 184 +++++++-----------
 tests/test_otu_significance.py                |  87 ++++++++-
 6 files changed, 158 insertions(+), 124 deletions(-)
 rename qiime_test_data/{correlations => observation_metadata_correlation}/map.txt (100%)
 rename qiime_test_data/{correlations => observation_metadata_correlation}/otu_table.biom (100%)
 rename qiime_test_data/{correlations => observation_metadata_correlation}/paired_samples.txt (100%)

diff --git a/qiime/otu_significance.py b/qiime/otu_significance.py
index 6a547d8efa..1a5cd3d172 100644
--- a/qiime/otu_significance.py
+++ b/qiime/otu_significance.py
@@ -435,11 +435,8 @@ def is_computable_float(v):
     If v can be converted to a float that is not nan or inf, return v.
     Otherwise return False.
 '''
-    try:
-        tmp = float(v)
-        if not logical_or(isnan(tmp), isinf(tmp)):
-            return tmp
-        else:
-            return False
-    except ValueError:
-        return False
+    tmp = float(v)
+    if not logical_or(isnan(tmp), isinf(tmp)):
+        return tmp
+    else:
+        return False
diff --git a/qiime_test_data/correlations/map.txt b/qiime_test_data/observation_metadata_correlation/map.txt
similarity index 100%
rename from qiime_test_data/correlations/map.txt
rename to qiime_test_data/observation_metadata_correlation/map.txt
diff --git a/qiime_test_data/correlations/otu_table.biom b/qiime_test_data/observation_metadata_correlation/otu_table.biom
similarity index 100%
rename from qiime_test_data/correlations/otu_table.biom
rename to qiime_test_data/observation_metadata_correlation/otu_table.biom
diff --git a/qiime_test_data/correlations/paired_samples.txt b/qiime_test_data/observation_metadata_correlation/paired_samples.txt
similarity index 100%
rename from qiime_test_data/correlations/paired_samples.txt
rename to qiime_test_data/observation_metadata_correlation/paired_samples.txt
diff --git a/scripts/correlations.py b/scripts/observation_metadata_correlation.py
similarity index 53%
rename from scripts/correlations.py
rename to scripts/observation_metadata_correlation.py
index f938ab69fd..1d71d0cf60 100755
--- a/scripts/correlations.py
+++ b/scripts/observation_metadata_correlation.py
@@ -41,25 +41,19 @@
 script_info = {}
 script_info['brief_description'] = """This script calculates correlations between feature abundances and continuous-valued metadata."""
 script_info['script_description'] = """
-This script allows the calculation of:
-    1. Correlations between feature abundances (relative or absolute) and
-    numeric metadata.
-    2. Paired t-tests between two groups of samples.
-
-Several methods are provided to allow the user to correlate
-features to sample metadata values including Spearmans Rho, Pearson, Kendall's
-Tau, and the C or checkerboard score.
+This script calculates correlations between feature (aka observation) abundances
+(relative or absolute) and numeric metadata. Several methods are provided to
+allow the user to correlate features to sample metadata values including
+Spearman's Rho, Pearson, Kendall's Tau, and the C or checkerboard score.
+References for these methods are numerous, but good descriptions may be found in
+'Biometry' by Sokal and Rohlf.

 The available methods for assigning p-values to the calculated correlation
 scores are bootstrapping, Fisher's Z transformation, a parametric
-t-distribution, and a Kendall's Tau specific p-value calculation.
-
-This script also allows paired t testing. The paired t test is accomplished by
-passing a paired mapping file which is just a two column (tab separation) table
-with the samples that should be paired in each row. It should not have a header.
+t-distribution, and a Kendall's Tau specific p-value calculation. These methods
+are also described in 'Biometry'.

 Notes:
-
 The only supported metric for P-value assignment with the C-score is
 bootstrapping. For more information on the C-score, read Stone and Roberts 1990
 Oecologea paper 85: 74-79. 
If you fail to pass @@ -76,10 +70,6 @@ ("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came:", "", "%prog -i otu_table.biom -m map.txt -c pH -s spearman -o spearman_otu_gradient.txt")) -script_info['script_usage'].append( - ("Calculate paired t values for a before and after group of samples:", - "", - "%prog -i otu_table.biom --paired_t_fp=paired_samples.txt -o paired.txt")) script_info['script_usage'].append( ("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came using bootstrapping and pearon correlation:", "", @@ -106,13 +96,13 @@ help='path to input biom format table', type='existing_path'), make_option('-o', '--output_fp', type='new_filepath', - help='path to the output file to be created')] - -script_info['optional_options']=[ + help='path to the output file to be created'), make_option('-m','--mapping_fp', type='existing_filepath', help='path to category mapping file'), make_option('-c', '--category', type='string', - help='name of the category over which to run the analysis'), + help='name of the category over which to run the analysis')] + +script_info['optional_options']=[ make_option('-s', '--test', type="choice", choices=correlation_assignment_choices, default='spearman', help='Correlation method to use. Choices are: %s' % @@ -123,12 +113,6 @@ '%s' % (', '.join(pvalue_assignment_choices)) + ' [default: %default]'), make_option('--metadata_key', default='taxonomy', type=str, help='Key to extract metadata from biom table. [default: %default]'), - make_option('--paired_t_fp', type='existing_filepath', default=None, - help='Pass a paired sample map as described in help to test with a ' - 'paired_t_two_sample test. Overrides all other options. A ' - 'paired sample map must be two columns without header that are ' - 'tab separated. Each row contains samples which should be paired.' - ' [default: %default]'), make_option('--permutations', default=1000, type=int, help='Number of permutations to use for bootstrapped tests.' 
' [default: %default]')] @@ -142,91 +126,65 @@ def main(): raise ValueError(cscore_error_text) bt = load_table(opts.otu_table_fp) - - if opts.paired_t_fp is not None: #user wants to conduct paired t_test - o = open(opts.paired_t_fp, 'U') - lines = o.readlines() - o.close() - b_samples = [] - a_samples = [] - for i in lines: - a,b = i.strip().split('\t') - a_samples.append(a) - b_samples.append(b) - test_stats, pvals = run_paired_t(bt, a_samples, b_samples) - # calculate corrected pvals - fdr_pvals = array(benjamini_hochberg_step_down(pvals)) - bon_pvals = bonferroni_correction(pvals) - # correct for cases where values above 1.0 due to correction - fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals) - bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals) - # write output results after sorting - lines = correlate_output_formatter(bt, test_stats, pvals, fdr_pvals, - bon_pvals, md_key=opts.metadata_key) - lines = sort_by_pval(lines, ind=2) - o = open(opts.output_fp, 'w') - o.writelines('\n'.join(lines)) - o.close() - - else: # user wants normal correlation analysis - pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp) - category = opts.category - - samples_to_correlate = [] - md_values_to_correlate = [] - bt_sample_ids = bt.ids(axis='sample') - - for sample_id, sample_md in pmf.items(): - if sample_id in bt_sample_ids: - try: - v = is_computable_float(sample_md[category]) - samples_to_correlate.append(sample_id) - md_values_to_correlate.append(v) - except KeyError: - raise ValueError('The category (%s)' % opts.category + - ' was not found in the mapping file.') - else: - pass # sample in mf, but not bt - - # remove samples which are not found in the mapping file or do not have - # metadata that converts to float - bt.filter(ids_to_keep = samples_to_correlate, axis='sample') - - # sort the biom table so that feature values are retrieved in the same - # order as the metadata in the samples they correspond to - bt.sort(sort_f = lambda _: samples_to_correlate, axis='sample') - - if bt.shape[1] <= 3: - raise ValueError(filtration_error_text) - - rhos = [] - pvals = [] - for feature_vector in bt.iter_data(axis='observation'): - rho = correlate(feature_vector, md_values_to_correlate, - method=opts.test) - pval = assign_correlation_pval(rho, len(feature_vector), - method=opts.pval_assignment_method, - permutations=opts.permutations, - perm_test_fn=\ - bootstrap_functions[opts.test], - v1=feature_vector, - v2=md_values_to_correlate) - rhos.append(rho) - pvals.append(pval) - - fdr_pvals = benjamini_hochberg_step_down(pvals) - bon_pvals = bonferroni_correction(pvals) - # correct for cases where values above 1.0 due to correction - fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals) - bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals) - - lines = correlate_output_formatter(bt, rhos, pvals, fdr_pvals, - bon_pvals, opts.metadata_key) - lines = sort_by_pval(lines, ind=2) - - o = open(opts.output_fp, 'w') - o.writelines('\n'.join(lines)) - o.close() + pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp) + + samples_to_correlate = [] + md_values_to_correlate = [] + bt_sample_ids = bt.ids(axis='sample') + + for sample_id, sample_md in pmf.items(): + if sample_id in bt_sample_ids: + try: + v = is_computable_float(sample_md[opts.category]) + samples_to_correlate.append(sample_id) + md_values_to_correlate.append(v) + except KeyError: + option_parser.error('The category (%s)' % opts.category + + ' was not found in the mapping file.') + except ValueError: + pass # value couldn't be converted 
to float, ignore this sample + else: + pass # sample in mf, but not bt + + # remove samples which are not found in the mapping file or do not have + # metadata that converts to float + bt.filter(ids_to_keep = samples_to_correlate, axis='sample') + + # sort the biom table so that feature values are retrieved in the same + # order as the metadata in the samples they correspond to + bt.sort(sort_f = lambda _: samples_to_correlate, axis='sample') + + if bt.shape[1] <= 3: + raise ValueError(filtration_error_text) + + rhos = [] + pvals = [] + for feature_vector in bt.iter_data(axis='observation'): + rho = correlate(feature_vector, md_values_to_correlate, + method=opts.test) + pval = assign_correlation_pval(rho, len(feature_vector), + method=opts.pval_assignment_method, + permutations=opts.permutations, + perm_test_fn=\ + bootstrap_functions[opts.test], + v1=feature_vector, + v2=md_values_to_correlate) + rhos.append(rho) + pvals.append(pval) + + fdr_pvals = benjamini_hochberg_step_down(pvals) + bon_pvals = bonferroni_correction(pvals) + # correct for cases where values above 1.0 due to correction + fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals) + bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals) + + lines = correlate_output_formatter(bt, rhos, pvals, fdr_pvals, + bon_pvals, opts.metadata_key) + lines = sort_by_pval(lines, ind=2) + + o = open(opts.output_fp, 'w') + o.writelines('\n'.join(lines)) + o.close() if __name__ == "__main__": main() diff --git a/tests/test_otu_significance.py b/tests/test_otu_significance.py index d09c170b3a..0a2b3b5a3c 100755 --- a/tests/test_otu_significance.py +++ b/tests/test_otu_significance.py @@ -17,10 +17,11 @@ GROUP_TEST_CHOICES, grouped_correlation_row_generator, run_grouped_correlation, CORRELATION_TEST_CHOICES, grouped_correlation_formatter, correlation_row_generator, - run_correlation_test) -from qiime.stats import (assign_correlation_pval, fisher, - fisher_population_correlation) -from numpy import array, hstack, corrcoef, asarray, nan + run_correlation_test, is_computable_float, + correlate_output_formatter, _add_metadata) +from qiime.stats import (assign_correlation_pval, fisher, + fisher_population_correlation) +from numpy import array, hstack, corrcoef, asarray, nan, inf from numpy.random import seed from numpy.testing import assert_almost_equal from os import remove @@ -959,9 +960,84 @@ def test_run_correlation_test(self): permutations=1000) assert_almost_equal(exp_bootstrapped_pvals, obs_pvals) + def test_is_computable_float(self): + '''Test that an arbitrary input can be converted to float.''' + self.assertRaises(ValueError, is_computable_float, 'adkfjsdkfj') + assert is_computable_float(nan) == False + assert is_computable_float(inf) == False + assert is_computable_float(-inf) == False + assert is_computable_float("3.5679") == 3.5679 + + def test_correlate_output_formatter(self): + '''Test that output lines are correctly generated.''' + # use BT_IN_1 + bt = parse_biom_table(BT_IN_1) + test_stats = [3.56, 2.78, nan, nan, nan, -4.78] + pvals = [.001, .034, nan, nan, nan, .00001] + fdr_pvals = [.001, .001, nan, nan, nan, .001] + bon_pvals = [.006, .192, nan, nan, nan, .006] + md_key = 'taxonomy' + type_of_stat = 'T' + exp = [['Feature ID', 'Test stat.', 'pval', 'pval_fdr', 'pval_bon', + md_key], + ['OTU1', '3.56', '0.001', '0.001', '0.006', 'k__One'], + ['OTU2', '2.78', '0.034', '0.001', '0.192', 'k__Two'], + ['OTU3', 'nan', 'nan', 'nan', 'nan', 'k__Three'], + ['OTU4', 'nan', 'nan', 'nan', 'nan', 'k__Four'], + ['OTU5', 'nan', 'nan', 
'nan', 'nan', 'k__Five'], + ['OTU6', '-4.78', '1e-05', '0.001', '0.006', 'k__Six']] + exp = ['\t'.join(i) for i in exp] + obs = correlate_output_formatter(bt, test_stats, pvals, fdr_pvals, + bon_pvals, md_key) + assert obs == exp + + def test_add_metadata(self): + '''Test metadata is added appropriately to output lines.''' + # test first with taxonomy + exp = \ + ['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon\ttaxonomy', + 'OTU1\t3.56\t0.001\t0.001\t0.006\tk__One', + 'OTU2\t2.78\t0.034\t0.001\t0.192\tk__Two', + 'OTU3\tnan\tnan\tnan\tnan\tk__Three', + 'OTU4\tnan\tnan\tnan\tnan\tk__Four', + 'OTU5\tnan\tnan\tnan\tnan\tk__Five', + 'OTU6\t-4.78\t1e-05\t0.001\t0.006\tk__Six'] + inp_lines = \ + ['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon\ttaxonomy', + 'OTU1\t3.56\t0.001\t0.001\t0.006', + 'OTU2\t2.78\t0.034\t0.001\t0.192', + 'OTU3\tnan\tnan\tnan\tnan', + 'OTU4\tnan\tnan\tnan\tnan', + 'OTU5\tnan\tnan\tnan\tnan', + 'OTU6\t-4.78\t1e-05\t0.001\t0.006'] + bt = parse_biom_table(BT_IN_1) + obs = _add_metadata(bt, 'taxonomy', inp_lines) + assert obs == exp + # test without taxonomy or other metadata + exp = \ + ['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon', + 'OTU1\t3.56\t0.001\t0.001\t0.006', + 'OTU2\t2.78\t0.034\t0.001\t0.192', + 'OTU3\tnan\tnan\tnan\tnan', + 'OTU4\tnan\tnan\tnan\tnan', + 'OTU5\tnan\tnan\tnan\tnan', + 'OTU6\t-4.78\t1e-05\t0.001\t0.006'] + inp_lines = \ + ['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon\ttaxonomy', + 'OTU1\t3.56\t0.001\t0.001\t0.006', + 'OTU2\t2.78\t0.034\t0.001\t0.192', + 'OTU3\tnan\tnan\tnan\tnan', + 'OTU4\tnan\tnan\tnan\tnan', + 'OTU5\tnan\tnan\tnan\tnan', + 'OTU6\t-4.78\t1e-05\t0.001\t0.006'] + bt = parse_biom_table(BT_IN_1_no_md) + obs = _add_metadata(bt, 'taxonomy', inp_lines) + assert obs == exp + # globals used by certain tests. 
BT_IN_1 = '{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "testCode","date": "2013-08-20T15:48:21.166180","matrix_type": "sparse","matrix_element_type": "float","shape": [6, 6],"data": [[0,0,28.0],[0,1,52.0],[0,2,51.0],[0,3,78.0],[0,4,16.0],[0,5,77.0],[1,0,25.0],[1,1,14.0],[1,2,11.0],[1,3,32.0],[1,4,48.0],[1,5,63.0],[2,0,31.0],[2,1,2.0],[2,2,15.0],[2,3,69.0],[2,4,64.0],[2,5,27.0],[3,0,36.0],[3,1,68.0],[3,2,70.0],[3,3,65.0],[3,4,33.0],[3,5,62.0],[4,0,16.0],[4,1,41.0],[4,2,59.0],[4,3,40.0],[4,4,15.0],[4,5,3.0],[5,0,32.0],[5,1,8.0],[5,2,54.0],[5,3,98.0],[5,4,29.0],[5,5,50.0]],"rows": [{"id": "OTU1", "metadata": {"taxonomy": ["k__One"]}},{"id": "OTU2", "metadata": {"taxonomy": ["k__Two"]}},{"id": "OTU3", "metadata": {"taxonomy": ["k__Three"]}},{"id": "OTU4", "metadata": {"taxonomy": ["k__Four"]}},{"id": "OTU5", "metadata": {"taxonomy": ["k__Five"]}},{"id": "OTU6", "metadata": {"taxonomy": ["k__Six"]}}],"columns": [{"id": "Sample1", "metadata": null},{"id": "Sample2", "metadata": null},{"id": "Sample3", "metadata": null},{"id": "Sample4", "metadata": null},{"id": "Sample5", "metadata": null},{"id": "Sample6", "metadata": null}]}' BT_IN_2 = '{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "testCode","date": "2013-08-20T15:48:21.166180","matrix_type": "sparse","matrix_element_type": "float","shape": [6, 6],"data": [[0,0,28.0],[0,1,28.0],[0,2,28.0],[0,3,28.0],[0,4,28.0],[0,5,28.0],[1,0,28.0],[1,1,14.0],[1,2,11.0],[1,3,32.0],[1,4,48.0],[1,5,63.0],[2,0,31.0],[2,1,2.0],[2,2,15.0],[2,3,69.0],[2,4,64.0],[2,5,27.0],[3,0,36.0],[3,1,68.0],[3,2,70.0],[3,3,65.0],[3,4,33.0],[3,5,62.0],[4,0,16.0],[4,1,41.0],[4,2,59.0],[4,3,40.0],[4,4,15.0],[4,5,3.0],[5,0,32.0],[5,1,8.0],[5,2,54.0],[5,3,98.0],[5,4,29.0],[5,5,50.0]],"rows": [{"id": "OTU1", "metadata": {"taxonomy": ["k__One"]}},{"id": "OTU2", "metadata": {"taxonomy": ["k__Two"]}},{"id": "OTU3", "metadata": {"taxonomy": ["k__Three"]}},{"id": "OTU4", "metadata": {"taxonomy": ["k__Four"]}},{"id": "OTU5", "metadata": {"taxonomy": ["k__Five"]}},{"id": "OTU6", "metadata": {"taxonomy": ["k__Six"]}}],"columns": [{"id": "Sample1", "metadata": null},{"id": "Sample2", "metadata": null},{"id": "Sample3", "metadata": null},{"id": "Sample4", "metadata": null},{"id": "Sample5", "metadata": null},{"id": "Sample6", "metadata": null}]}' +BT_IN_1_no_md = '{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "testCode","date": "2013-08-20T15:48:21.166180","matrix_type": "sparse","matrix_element_type": "float","shape": [6, 6],"data": [[0,0,28.0],[0,1,52.0],[0,2,51.0],[0,3,78.0],[0,4,16.0],[0,5,77.0],[1,0,25.0],[1,1,14.0],[1,2,11.0],[1,3,32.0],[1,4,48.0],[1,5,63.0],[2,0,31.0],[2,1,2.0],[2,2,15.0],[2,3,69.0],[2,4,64.0],[2,5,27.0],[3,0,36.0],[3,1,68.0],[3,2,70.0],[3,3,65.0],[3,4,33.0],[3,5,62.0],[4,0,16.0],[4,1,41.0],[4,2,59.0],[4,3,40.0],[4,4,15.0],[4,5,3.0],[5,0,32.0],[5,1,8.0],[5,2,54.0],[5,3,98.0],[5,4,29.0],[5,5,50.0]],"rows": [{"id": "OTU1", "metadata": null},{"id": "OTU2", "metadata": null},{"id": "OTU3", "metadata": null},{"id": "OTU4", "metadata": null},{"id": "OTU5", "metadata": null},{"id": "OTU6", "metadata": null}] ,"columns": [{"id": "Sample1", "metadata": null},{"id": "Sample2", "metadata": null},{"id": "Sample3", "metadata": null},{"id": "Sample4", "metadata": null},{"id": "Sample5", "metadata": null},{"id": 
"Sample6", "metadata": null}]}' MF_IN_1 = ['#SampleID\ttest_cat\ttest_corr', 'Sample1\tcat1\t1', @@ -974,6 +1050,9 @@ def test_run_correlation_test(self): 'NotInOtuTable2\tcat5\t8'] BT_4 = """{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "BIOM-Format 1.1.2","date": "2013-08-16T15:23:02.872397","matrix_type": "sparse","matrix_element_type": "float","shape": [6, 8],"data": [[0,0,28.0],[0,1,52.0],[0,2,51.0],[0,3,78.0],[0,4,16.0],[0,5,77.0],[0,6,73.0],[0,7,6.0],[1,0,25.0],[1,1,14.0],[1,2,11.0],[1,3,32.0],[1,4,48.0],[1,5,63.0],[1,6,27.0],[1,7,38.0],[2,0,31.0],[2,1,2.0],[2,2,15.0],[2,3,69.0],[2,4,64.0],[2,5,27.0],[2,6,64.0],[2,7,54.0],[3,0,36.0],[3,1,68.0],[3,2,70.0],[3,3,65.0],[3,4,33.0],[3,5,62.0],[3,6,60.0],[3,7,23.0],[4,0,16.0],[4,1,41.0],[4,2,59.0],[4,3,40.0],[4,4,15.0],[4,5,3.0],[4,6,35.0],[4,7,5.0],[5,0,32.0],[5,1,8.0],[5,2,54.0],[5,3,98.0],[5,4,29.0],[5,5,50.0],[5,6,93.0],[5,7,19.0]],"rows": [{"id": "OTU1", "metadata": {"taxonomy": "k__One"}},{"id": "OTU2", "metadata": {"taxonomy": "k__Two"}},{"id": "OTU3", "metadata": {"taxonomy": "k__Three"}},{"id": "OTU4", "metadata": {"taxonomy": "k__Four"}},{"id": "OTU5", "metadata": {"taxonomy": "k__Five"}},{"id": "OTU6", "metadata": {"taxonomy": "k__Six"}}],"columns": [{"id": "Sample1", "metadata": null},{"id": "Sample2", "metadata": null},{"id": "Sample3", "metadata": null},{"id": "Sample4", "metadata": null},{"id": "Sample5", "metadata": null},{"id": "Sample6", "metadata": null},{"id": "Sample7", "metadata": null},{"id": "Sample8", "metadata": null}]}""" +['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon\ttaxonomy', 'OTU1\t3.56\t0.001\t0.001\t0.006\tk__One', 'OTU2\t2.78\t0.034\t0.001\t0.192\tk__Two', 'OTU3\tnan\tnan\tnan\tnan\tk__Three', 'OTU4\tnan\tnan\tnan\tnan\tk__Four', 'OTU5\tnan\tnan\tnan\tnan\tk__Five', 'OTU6\t-4.78\t1e-05\t0.001\t0.006\tk__Six'] +['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon', 'OTU1\t3.56\t0.001\t0.001\t0.006', 'OTU2\t2.78\t0.034\t0.001\t0.192', 'OTU3\tnan\tnan\tnan\tnan', 'OTU4\tnan\tnan\tnan\tnan', 'OTU5\tnan\tnan\tnan\tnan', 'OTU6\t-4.78\t1e-05\t0.001\t0.006'] + # run unit tests if run from command-line if __name__ == '__main__': main() From 43a79e6e5ba5bd7722c43220552edfbfc6ae6e9e Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Wed, 10 Dec 2014 16:00:51 -0800 Subject: [PATCH 08/12] changelog update --- ChangeLog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog.md b/ChangeLog.md index e0523e6893..dc34777ced 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,6 +1,6 @@ QIIME 1.8.0-dev (changes since 1.8.0 go here) ============================================= -* ``correlations.py`` has been added. This script allows the calculation of correlations between feature abundances and continuous-valued metadata. It also reintroduces the paired-t test functionality that has been missing in 1.8. This script also replaces the functions that were in ``otu_category_significance.py`` in QIIME 1.7 and earlier. +* ``observation_metadata_correlation.py`` has been added. This script allows the calculation of correlations between feature abundances and continuous-valued metadata.This script replaces the functions that were in ``otu_category_significance.py`` in QIIME 1.7 and earlier. * ``split_otu_table.py`` now allows multiple fields to be passed to split a biom table, and optionally a mapping file. Check out the new documentation for the naming conventions (which have changed slightly) and an example. 
From 0b7b6f6f51523473cac88755e925e6e55f113822 Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Wed, 10 Dec 2014 17:06:13 -0800 Subject: [PATCH 09/12] documentation addition added a lot of help documentation to mimic group_significance.py. --- scripts/observation_metadata_correlation.py | 55 ++++++++++++++++++--- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/scripts/observation_metadata_correlation.py b/scripts/observation_metadata_correlation.py index 1d71d0cf60..292cb8afec 100755 --- a/scripts/observation_metadata_correlation.py +++ b/scripts/observation_metadata_correlation.py @@ -46,12 +46,55 @@ allow the user to correlate features to sample metadata values including Spearmans Rho, Pearson, Kendall's Tau, and the C or checkerboard score. References for these methods are numerous, but good descriptions may be found in -'Biometry' by Sokal and Rolhf. - -The available methods for assigning p-values to the calculated correlation -scores are bootstrapping, Fisher's Z transformation, a parametric -t-distribution, and a Kendall's Tau specific p-value calculation. These methods -are also described in 'Biometry'. +'Biometry' by Sokal and Rolhf. A brief description of the available tests +follows: + +- Pearson score: The Pearson score aka Pearson's Product Moment correlation, is + a scaled measure of the degree to which two sequences of numbers co-vary. For + 'correlated' sequences, Pearson > 0, and for 'anticorrelated' sequences + Pearson < 0 (uncorrelated implies Pearson = 0). Pearson is a paramateric + and linear measure of correlation. + +- Spearmans Rho: The Spearman correlation is a non-paramateric measure of + correlation between two sequences of numbers. Spearman correlation is + appropriate for data where the values of the observations are not necessarily + accurate, but for which their relative magnitudes are (see Biometry for more + details.) + +- Kendalls Tau: Kendall's Tau is an alternative method of calculating + correlation between two sequences of numbers. It is slower and less widely + utilized than Spearman or Pearson scores. + +- Cscore: The c-score or 'checkerboard score' is a measure of covariation + between two sequences that is derived from traditional ecology (Stone and + Roberts. 1990, Oecologia 85:74-79). + +Raw correlation statistics alone reflect only the degree of association between +two sequences of numbers or vectors. Assigning a likelihood to these score via +a p-value can be done with several methods depending on the particular +assumptions that are used. This script allows four methods for calculating +p-values: + +- Bootrapping: Bootstrapping is the most robust, but slowest procedure for + calculating the p-value of a given correlation score. Bootstrapping takes the + input sequences, shuffles the order of one, and then recomputes the + correlation score. The p-value is then the number of times (out of the given + number of permutations) that the score of the permuted sequence pair was more + extreme than the observed pair. Bootstrapping is good when the underlying + properties of the distributions are unknown. + +- Parametric t distribution: the traditional method for calculating the + significance of a correlation score, this method assumes that the scores are + normally distributed and computes a t statistic for each correlation score in + conjunction with the length of the sequences being correlated. 
+ +- Fisher Z transform: Fisher's Z transform is a way to make the distribution of + correlation scores (especially when there are many highly correlation scores) + look more normal. It is not to be confused with Fisher's transformation for + the combination of p-values. + +- Kendalls Tau: for the Kendalls Tau calculation, the specific Kendalls Tau + pvalue is provided. Notes: The only supported metric for P-value assignment with the C-score is From c7e66aed373cf5c4549136ede990d16cc098f83f Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Fri, 19 Dec 2014 11:52:18 -0500 Subject: [PATCH 10/12] Addressed comments --- ChangeLog.md | 2 +- qiime/otu_significance.py | 2 -- scripts/observation_metadata_correlation.py | 10 ++++------ 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 459ed68c44..f2da44499e 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,6 +1,6 @@ QIIME 1.8.0-dev (changes since 1.8.0 go here) ============================================= -* ``observation_metadata_correlation.py`` has been added. This script allows the calculation of correlations between feature abundances and continuous-valued metadata.This script replaces the functions that were in ``otu_category_significance.py`` in QIIME 1.7 and earlier. +* ``observation_metadata_correlation.py`` has been added. This script allows the calculation of correlations between feature abundances and continuous-valued metadata. This script replaces the continuous-valued correlation function that was in ``otu_category_significance.py`` in QIIME 1.7 and earlier. * ``split_otu_table.py`` now allows multiple fields to be passed to split a biom table, and optionally a mapping file. Check out the new documentation for the naming conventions (which have changed slightly) and an example. diff --git a/qiime/otu_significance.py b/qiime/otu_significance.py index 1a5cd3d172..be0e6c2abc 100644 --- a/qiime/otu_significance.py +++ b/qiime/otu_significance.py @@ -347,8 +347,6 @@ def correlate_output_formatter(bt, test_stats, pvals, fdr_pvals, bon_pvals, Floats representing Bonferroni corrected pvals. md_key : str or None Key for extracting feature metadata from biom table. - type_of_stat : str - Type of statistic - used to modify header to report either 'Co Returns ------- diff --git a/scripts/observation_metadata_correlation.py b/scripts/observation_metadata_correlation.py index 292cb8afec..827332f87f 100755 --- a/scripts/observation_metadata_correlation.py +++ b/scripts/observation_metadata_correlation.py @@ -42,7 +42,7 @@ script_info['brief_description'] = """This script calculates correlations between feature abundances and continuous-valued metadata.""" script_info['script_description'] = """ This script calculates correlations between feature (aka observation) abundances -(relative or absolute) and numeric metadata.Several methods are provided to +(relative or absolute) and numeric metadata. Several methods are provided to allow the user to correlate features to sample metadata values including Spearmans Rho, Pearson, Kendall's Tau, and the C or checkerboard score. References for these methods are numerous, but good descriptions may be found in @@ -59,7 +59,7 @@ correlation between two sequences of numbers. Spearman correlation is appropriate for data where the values of the observations are not necessarily accurate, but for which their relative magnitudes are (see Biometry for more - details.) + details). 
- Kendalls Tau: Kendall's Tau is an alternative method of calculating correlation between two sequences of numbers. It is slower and less widely @@ -166,7 +166,7 @@ def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) if opts.test == 'cscore' and opts.pval_assignment_method != 'bootstrapped': - raise ValueError(cscore_error_text) + option_parser.error(cscore_error_text) bt = load_table(opts.otu_table_fp) pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp) @@ -230,6 +230,4 @@ def main(): o.close() if __name__ == "__main__": - main() - - + main() \ No newline at end of file From 0f5bf05a12fa0bef8fa03a9823e3f254c1f74235 Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Fri, 19 Dec 2014 12:05:16 -0500 Subject: [PATCH 11/12] Resolved remaining comment. --- scripts/observation_metadata_correlation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/observation_metadata_correlation.py b/scripts/observation_metadata_correlation.py index 827332f87f..7fa6455496 100755 --- a/scripts/observation_metadata_correlation.py +++ b/scripts/observation_metadata_correlation.py @@ -198,7 +198,7 @@ def main(): bt.sort(sort_f = lambda _: samples_to_correlate, axis='sample') if bt.shape[1] <= 3: - raise ValueError(filtration_error_text) + option_parser.error(filtration_error_text) rhos = [] pvals = [] From 9d68237db23d2782c4acb8dd63fe06c58e5b9247 Mon Sep 17 00:00:00 2001 From: Jai Ram Rideout Date: Tue, 23 Dec 2014 10:56:20 -0700 Subject: [PATCH 12/12] DOC: update obseration_metadata_correlation.py script docs --- ChangeLog.md | 2 +- .../paired_samples.txt | 5 - scripts/observation_metadata_correlation.py | 115 +++++++++--------- 3 files changed, 61 insertions(+), 61 deletions(-) delete mode 100644 qiime_test_data/observation_metadata_correlation/paired_samples.txt diff --git a/ChangeLog.md b/ChangeLog.md index 578f5f14c7..269f73926e 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,6 +1,6 @@ QIIME 1.8.0-dev (changes since 1.8.0 go here) ============================================= -* ``observation_metadata_correlation.py`` has been added. This script allows the calculation of correlations between feature abundances and continuous-valued metadata. This script replaces the continuous-valued correlation function that was in ``otu_category_significance.py`` in QIIME 1.7 and earlier. +* Added new ``observation_metadata_correlation.py`` script. This script allows the calculation of correlations between feature abundances and continuous-valued metadata. This script replaces the continuous-valued correlation functionality that was in ``otu_category_significance.py`` in QIIME 1.7.0 and earlier. * ``split_otu_table.py`` now allows multiple fields to be passed to split a biom table, and optionally a mapping file. Check out the new documentation for the naming conventions (which have changed slightly) and an example. 
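The help text added in PATCH 09 describes bootstrapping as shuffling one of the two sequences, recomputing the correlation, and counting how often the permuted score is at least as extreme as the observed one. The following sketch shows that idea in isolation; it uses scipy.stats.spearmanr as a stand-in correlation function and is not the assign_correlation_pval / perm_test_fn machinery used by the script.

    import numpy as np
    from scipy.stats import spearmanr

    def bootstrapped_pval(v1, v2, permutations=1000):
        # Illustrative sketch, not the QIIME implementation.
        # Observed score; spearmanr also returns a parametric p-value, ignored here.
        obs, _ = spearmanr(v1, v2)
        hits = 0
        for _ in range(permutations):
            # Shuffle one vector, recompute, and count scores at least as extreme.
            perm, _ = spearmanr(np.random.permutation(v1), v2)
            if abs(perm) >= abs(obs):
                hits += 1
        return hits / float(permutations)

With permutations=1000 the smallest attainable p-value is 1/1000, and every permutation recomputes the full correlation statistic, which is one reason the script's notes warn that bootstrapped Kendall's Tau p-values are very slow to compute.
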
diff --git a/qiime_test_data/observation_metadata_correlation/paired_samples.txt b/qiime_test_data/observation_metadata_correlation/paired_samples.txt deleted file mode 100644 index 4e40751eef..0000000000 --- a/qiime_test_data/observation_metadata_correlation/paired_samples.txt +++ /dev/null @@ -1,5 +0,0 @@ -s1 s9 -s0 s5 -s2 s3 -s4 s7 -s6 s8 \ No newline at end of file diff --git a/scripts/observation_metadata_correlation.py b/scripts/observation_metadata_correlation.py index 7fa6455496..cacf9dd428 100755 --- a/scripts/observation_metadata_correlation.py +++ b/scripts/observation_metadata_correlation.py @@ -3,7 +3,7 @@ __author__ = "Will Van Treuren" __copyright__ = "Copyright 2014, The QIIME project" __credits__ = ["Will Van Treuren", "Luke Ursell", "Catherine Lozupone", - "Jesse Stombaugh", "Doug Wendel", "Dan Knights", "Greg Caporaso", + "Jesse Stombaugh", "Doug Wendel", "Dan Knights", "Greg Caporaso", "Jai Ram Rideout"] __license__ = "GPL" __version__ = "1.8.0-dev" @@ -24,11 +24,11 @@ filtration_error_text = '''The biom table does not have enough samples after filtration to perform correlations (it has 3 or fewer). The filtration steps that have occurred are: -1. Removing samples that were not found in the mapping file. +1. Removing samples that were not found in the mapping file. 2. Removing samples that were found in the mapping file but whose metadata for the given category was not convertable to float.''' -cscore_error_text = '''The only supported metric for P-value assignment with the +cscore_error_text = '''The only supported metric for p-value assignment with the C-score is bootstrapping. For more information on the C-score, read Stone and Roberts 1990 Oecologea paper 85: 74-79.''' @@ -39,35 +39,35 @@ 'kendall': kendall, 'cscore': cscore} script_info = {} -script_info['brief_description'] = """This script calculates correlations between feature abundances and continuous-valued metadata.""" +script_info['brief_description'] = """Correlation between observation abundances and continuous-valued metadata""" script_info['script_description'] = """ This script calculates correlations between feature (aka observation) abundances (relative or absolute) and numeric metadata. Several methods are provided to allow the user to correlate features to sample metadata values including -Spearmans Rho, Pearson, Kendall's Tau, and the C or checkerboard score. -References for these methods are numerous, but good descriptions may be found in +Spearman's Rho, Pearson, Kendall's Tau, and the C or checkerboard score. +References for these methods are numerous, but good descriptions may be found in 'Biometry' by Sokal and Rolhf. A brief description of the available tests follows: -- Pearson score: The Pearson score aka Pearson's Product Moment correlation, is +- Pearson score: The Pearson score, aka Pearson's Product Moment correlation, is a scaled measure of the degree to which two sequences of numbers co-vary. For 'correlated' sequences, Pearson > 0, and for 'anticorrelated' sequences Pearson < 0 (uncorrelated implies Pearson = 0). Pearson is a paramateric and linear measure of correlation. -- Spearmans Rho: The Spearman correlation is a non-paramateric measure of +- Spearman's Rho: The Spearman correlation is a non-paramateric measure of correlation between two sequences of numbers. 
Spearman correlation is appropriate for data where the values of the observations are not necessarily - accurate, but for which their relative magnitudes are (see Biometry for more + accurate, but for which their relative magnitudes are (see Biometry for more details). -- Kendalls Tau: Kendall's Tau is an alternative method of calculating +- Kendall's Tau: Kendall's Tau is an alternative method of calculating correlation between two sequences of numbers. It is slower and less widely utilized than Spearman or Pearson scores. - Cscore: The c-score or 'checkerboard score' is a measure of covariation between two sequences that is derived from traditional ecology (Stone and - Roberts. 1990, Oecologia 85:74-79). + Roberts. 1990, Oecologia 85:74-79). Raw correlation statistics alone reflect only the degree of association between two sequences of numbers or vectors. Assigning a likelihood to these score via @@ -81,84 +81,88 @@ correlation score. The p-value is then the number of times (out of the given number of permutations) that the score of the permuted sequence pair was more extreme than the observed pair. Bootstrapping is good when the underlying - properties of the distributions are unknown. + properties of the distributions are unknown. - Parametric t distribution: the traditional method for calculating the significance of a correlation score, this method assumes that the scores are - normally distributed and computes a t statistic for each correlation score in - conjunction with the length of the sequences being correlated. + normally distributed and computes a t statistic for each correlation score in + conjunction with the length of the sequences being correlated. - Fisher Z transform: Fisher's Z transform is a way to make the distribution of - correlation scores (especially when there are many highly correlation scores) + correlation scores (especially when there are many highly correlated scores) look more normal. It is not to be confused with Fisher's transformation for the combination of p-values. -- Kendalls Tau: for the Kendalls Tau calculation, the specific Kendalls Tau - pvalue is provided. +- Kendall's Tau: for the Kendall's Tau calculation, the specific Kendall's Tau + p-value is provided. Notes: -The only supported metric for P-value assignment with the C-score is -bootstrapping. For more information on the C-score, read Stone and Roberts 1990 -Oecologea paper 85: 74-79. If you fail to pass -pval_assignment_method='bootstrapped' while you have -s cscore, the script will -error. -Assigning pvalues to Kendall's Tau scores with the bootstrapping method is -very slow. +- The only supported metric for p-value assignment with the C-score is + bootstrapping. For more information on the C-score, read Stone and Roberts + 1990 Oecologea paper 85: 74-79. If you don't pass + pval_assignment_method='bootstrapped' while you have -s cscore, the script + will raise an error. + +- Assigning p-values to Kendall's Tau scores with the bootstrapping method is + very slow. 
""" script_info['script_usage'] = [] script_info['script_usage'].append( - ("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came:", - "", + ("Example 1:", + "Calculate the correlation between OTUs in the table and the pH of the samples from whence they came:", "%prog -i otu_table.biom -m map.txt -c pH -s spearman -o spearman_otu_gradient.txt")) script_info['script_usage'].append( - ("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came using bootstrapping and pearon correlation:", - "", + ("Example 2:", + "Calculate the correlation between OTUs in the table and the pH of the samples from whence they came using bootstrapping and pearson correlation:", "%prog -i otu_table.biom -m map.txt -c pH -s pearson --pval_assignment_method bootstrapped --permutations 100 -o pearson_bootstrapped.txt")) script_info['output_description']= """ -The output will be a tab delimited file with the following headers. Each row -will record the values calculated for a fiven featue: -- Feature ID: ID of the features being correlated. If these are OTUs, then they - will take the form of GreenGenes identifiers or de-novo identifiers. -- Test-Statistic: the value of the test statistic for the given test. -- P: the raw P value returned by the given test. -- FDR_P: the P value corrected by the Benjamini-Hochberg FDR procedure for +The output will be a tab-delimited file with the following headers. Each row +will record the values calculated for a given feature: + +- Feature ID: ID of the features being correlated. These are the observation IDs + in the BIOM table. +- Test stat.: the value of the test statistic for the given test. +- pval: the raw p-value returned by the given test. +- pval_fdr: the p-value corrected by the Benjamini-Hochberg FDR procedure for multiple comparisons. -- Bonferroni_P: the P value corrected by the Bonferroni procedure for multiple +- pval_bon: the p-value corrected by the Bonferroni procedure for multiple comparisons. -- Metadata - this column will be present only if the biom table contained - metadata information for your features. If these are OTUs, and taxonomy is - present in the biom table, this category will contain that taxonomy (or other - metadata). +- [metadata]: this column will be present only if the BIOM table contained + metadata information for your features. For example, if these are OTUs, and + taxonomy is present in the BIOM table, this column will contain OTU + taxonomy and will be named 'taxonomy'. + """ script_info['required_options']=[ make_option('-i','--otu_table_fp', - help='path to input biom format table', + help='path to input BIOM table', type='existing_path'), make_option('-o', '--output_fp', type='new_filepath', help='path to the output file to be created'), make_option('-m','--mapping_fp', type='existing_filepath', - help='path to category mapping file'), - make_option('-c', '--category', type='string', - help='name of the category over which to run the analysis')] + help='path to metadata mapping file'), + make_option('-c', '--category', + help='name of the category in the metadata mapping file over which to ' + 'run the analysis')] script_info['optional_options']=[ - make_option('-s', '--test', type="choice", + make_option('-s', '--test', type="choice", choices=correlation_assignment_choices, default='spearman', help='Correlation method to use. 
Choices are: %s' % (', '.join(correlation_assignment_choices)) + ' [default: %default]'), - make_option('--pval_assignment_method', type="choice", + make_option('--pval_assignment_method', type="choice", choices=pvalue_assignment_choices, - default='fisher_z_transform', help='Pvalue method to use. Choices are: ' + default='fisher_z_transform', help='p-value method to use. Choices are: ' '%s' % (', '.join(pvalue_assignment_choices)) + ' [default: %default]'), - make_option('--metadata_key', default='taxonomy', type=str, - help='Key to extract metadata from biom table. [default: %default]'), - make_option('--permutations', default=1000, type=int, - help='Number of permutations to use for bootstrapped tests.' - ' [default: %default]')] + make_option('--metadata_key', default='taxonomy', + help='Key to extract metadata from BIOM table. [default: %default]'), + make_option('--permutations', default=1000, type=int, + help='Number of permutations to use for bootstrapped p-value ' + 'calculations. [default: %default]')] script_info['version'] = __version__ @@ -167,7 +171,7 @@ def main(): if opts.test == 'cscore' and opts.pval_assignment_method != 'bootstrapped': option_parser.error(cscore_error_text) - + bt = load_table(opts.otu_table_fp) pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp) @@ -193,7 +197,7 @@ def main(): # metadata that converts to float bt.filter(ids_to_keep = samples_to_correlate, axis='sample') - # sort the biom table so that feature values are retrieved in the same + # sort the biom table so that feature values are retrieved in the same # order as the metadata in the samples they correspond to bt.sort(sort_f = lambda _: samples_to_correlate, axis='sample') @@ -227,7 +231,8 @@ def main(): o = open(opts.output_fp, 'w') o.writelines('\n'.join(lines)) + o.write('\n') o.close() if __name__ == "__main__": - main() \ No newline at end of file + main()
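
The script defaults to pval_assignment_method='fisher_z_transform'. As a point of reference, the standard Fisher z test of H0: rho = 0 transforms the correlation coefficient with arctanh and compares it against a normal distribution with standard error 1/sqrt(n - 3); the n - 3 term is the usual small-sample correction for the variance of the transformed coefficient. The sketch below is only an illustration of that formula, using SciPy's normal distribution, and is not the assign_correlation_pval implementation in qiime.stats, which may differ in detail.

    import numpy as np
    from scipy.stats import norm

    def fisher_z_pval(r, n):
        # Illustrative sketch, not qiime.stats.assign_correlation_pval.
        # Fisher's z transform: z = arctanh(r) = 0.5 * ln((1 + r) / (1 - r)).
        z = np.arctanh(r)
        # Approximate standard error of z for n paired observations.
        se = 1.0 / np.sqrt(n - 3)
        # Two-tailed p-value, assuming z / se is approximately standard normal.
        return 2 * norm.sf(abs(z) / se)

    # Example: fisher_z_pval(0.6, 25) -> ~0.0012 (two-tailed)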