Merge pull request #1807 from jairideout/correlations

ENH: observation_metadata_correlation.py
biocore · Dec 23, 2014 · 402c1db · 402c1db
2 parents 53ff511 + 9d68237
commit 402c1db
Show file tree

Hide file tree

Showing 7 changed files with 443 additions and 57 deletions.
diff --git a/ChangeLog.md b/ChangeLog.md
@@ -1,5 +1,6 @@
 QIIME 1.8.0-dev (changes since 1.8.0 go here)
 =============================================
+* Added new ``observation_metadata_correlation.py`` script. This script allows the calculation of correlations between feature abundances and continuous-valued metadata. This script replaces the continuous-valued correlation functionality that was in ``otu_category_significance.py`` in QIIME 1.7.0 and earlier.
 * ``split_otu_table.py`` now allows multiple fields to be passed to split a biom table, and
 optionally a mapping file. Check out the new documentation for the naming conventions
 (which have changed slightly) and an example.

diff --git a/qiime/otu_significance.py b/qiime/otu_significance.py
@@ -12,7 +12,7 @@
 
 from qiime.parse import parse_mapping_file_to_dict
 from numpy import (array, argsort, vstack, isnan, inf, nan, apply_along_axis,
-                   mean, zeros)
+                   mean, zeros, isinf, logical_or)
 
 from qiime.stats import (fisher_population_correlation,
                                   pearson, spearman, g_fit, ANOVA_one_way, 
@@ -329,61 +329,70 @@ def run_correlation_test(data_generator, test, test_choices,
     return corr_coefs, pvals
 
 
-def correlation_output_formatter(bt, corr_coefs, pvals, fdr_pvals, bon_pvals,
-                                 md_key):
-    """Format the output of the correlations for easy writing.
-
-    """
-    header = [
-        'OTU',
-        'Correlation Coef',
-        'pval',
-        'pval_fdr',
-        'pval_bon',
-        md_key]
-    num_lines = len(corr_coefs)
+def correlate_output_formatter(bt, test_stats, pvals, fdr_pvals, bon_pvals,
+                               md_key):
+    '''Produce lines for a tab delimited text file for correlations.py.
+
+    Paramaters
+    ----------
+    bt : biom table object
+    test_stats : array-like
+        Floats representing correlation coefficients or paired t test
+        statistics.
+    pvals : array-like
+        Floats representing pvalues for given correlation coefficients.
+    fdr_pvals : array-like
+        Floats representing FDR corrected pvals.
+    bon_pvals : array-like
+        Floats representing Bonferroni corrected pvals.
+    md_key : str or None
+        Key for extracting feature metadata from biom table.
+
+    Returns
+    -------
+    list of strs
+    '''
+    header = ['Feature ID', 'Test stat.', 'pval', 'pval_fdr', 'pval_bon',
+              md_key]
+    num_lines = len(test_stats)
     lines = ['\t'.join(header)]
     for i in range(num_lines):
-        tmp = [bt.ids(axis='observation')[i], corr_coefs[i], pvals[i], fdr_pvals[i],
-               bon_pvals[i]]
+        tmp = [bt.ids(axis='observation')[i], test_stats[i], pvals[i],
+               fdr_pvals[i], bon_pvals[i]]
         lines.append('\t'.join(map(str, tmp)))
     nls = _add_metadata(bt, md_key, lines)
     return nls
 
+def run_paired_t(bt, s1, s2):
+    '''Perform a paired t test between samples.
+
+    Parameters
+    ----------
+    bt : biom table object
+        Table containing data for samples in s1 and s2.
+    s1 : list
+        List of sample ids found in bt (strs).
+    s2 : list
+        List of sample ids found in bt (strs).
+
+    Returns
+    -------
+    test_stats : list
+        Floats representing the test statistic for each paired comparison. 
+    pvals : list
+        Floats representing the p-values of the reported test statistics.
+    '''
+    test_stats = []
+    pvals = []
+    s1_indices = [bt.index(i, axis='sample') for i in s1]
+    s2_indices = [bt.index(i, axis='sample') for i in s2]
 
-def paired_t_generator(bt, s_before, s_after):
-    """Produce a generator to run paired t tests on each OTU."""
-    b_data = vstack([bt.sampleData(i) for i in s_before]).T
-    a_data = vstack([bt.sampleData(i) for i in s_after]).T
-    return ((b_data[i], a_data[i]) for i in range(len(bt.ids(axis='observation'))))
-
-
-def run_paired_t(data_generator):
-    """Run paired t test on data."""
-    test_stats, pvals = [], []
-    for b_data, a_data in data_generator:
-        test_stat, pval = t_paired(b_data, a_data)
+    for data in bt.iter_data(axis='observation'):
+        test_stat, pval = t_paired(data.take(s1_indices), data.take(s2_indices))
         test_stats.append(test_stat)
         pvals.append(pval)
     return test_stats, pvals
 
-
-def paired_t_output_formatter(bt, test_stats, pvals, fdr_pvals, bon_pvals,
-                              md_key):
-    """Format the output for all tests so it can be easily written."""
-    header = ['OTU', 'Test-Statistic', 'P', 'FDR_P', 'Bonferroni_P', md_key]
-    num_lines = len(pvals)
-    lines = ['\t'.join(header)]
-    for i in range(num_lines):
-        tmp = [bt.ids(axis='observation')[i], test_stats[i], pvals[i], fdr_pvals[i],
-               bon_pvals[i]]
-        lines.append('\t'.join(map(str, tmp)))
-    nls = _add_metadata(bt, md_key, lines)
-    return nls
-
-# Functions used by both scripts
-
-
 def sort_by_pval(lines, ind):
     """Sort lines with pvals in descending order.
 
@@ -396,10 +405,7 @@ def _nan_safe_sort(line):
             return val
         else:
             return inf
-    return (
-        # header+sorted lines
-        [lines[0]] + sorted(lines[1:], key=_nan_safe_sort)
-    )
+    return [lines[0]] + sorted(lines[1:], key=_nan_safe_sort)
 
 
 def _add_metadata(bt, md_key, lines):
@@ -413,3 +419,22 @@ def _add_metadata(bt, md_key, lines):
     else:  # remove md_header from the first line
         nls = ['\t'.join(lines[0].split('\t')[:-1])] + lines[1:]
         return nls
+
+def is_computable_float(v):
+    '''Return float if v can be converted to float excluding nan and inf.
+
+    Parameters
+    ----------
+    v : variable
+        Value to be converted to float if possible.
+
+    Returns
+    -------
+    If v can be converted to a float that is not nan or inf, return v.
+    Otherwise return False.
+    '''
+    tmp = float(v)
+    if not logical_or(isnan(tmp), isinf(tmp)):
+        return tmp 
+    else:
+        return False
diff --git a/qiime/stats.py b/qiime/stats.py
@@ -1629,10 +1629,13 @@ def assign_correlation_pval(corr, n, method, permutations=None,
             raise ValueError('You must specify vectors, permutation '
                              'function, and number of permutations to calc '
                              'bootstrapped pvalues. Cant continue.')
-        r = empty(permutations)
-        for i in range(permutations):
-            r[i] = perm_test_fn(v1, permutation(v2))
-        return (abs(r) >= abs(corr)).sum() / float(permutations)
+        if any([isnan(corr), isinf(corr)]):
+            return nan
+        else:
+            r = empty(permutations)
+            for i in range(permutations):
+                r[i] = perm_test_fn(v1, permutation(v2))
+            return (abs(r) >= abs(corr)).sum() / float(permutations)
     elif method == 'kendall':
         return kendall_pval(corr, n)
     else:
@@ -2425,3 +2428,32 @@ def cscore(v1, v2):
     v2_b = v2.astype(bool)
     sij = (v1_b * v2_b).sum()
     return (v1_b.sum() - sij) * (v2_b.sum() - sij)
+
+def correlate(v1, v2, method):
+    '''Correlate vectors using method.
+
+    Parameters
+    ----------
+    v1 : array-like
+        List or array of ints or floats to be correlated.
+    v2 : array-like
+        List or array of ints or floats to be correlated.
+    method : str
+        One of 'spearman', 'pearson', 'kendall', 'cscore'.
+
+    Returns
+    -------
+    rho : float
+        Correlation between the vectors.
+    '''
+    if method == 'pearson':
+        corr_fn = pearson
+    elif method == 'spearman':
+        corr_fn = spearman
+    elif method == 'kendall':
+        corr_fn = kendall
+    elif method == 'cscore':
+        corr_fn = cscore
+    else:
+        raise ValueError('Correlation function not recognized.')
+    return corr_fn(v1, v2)
diff --git a/qiime_test_data/observation_metadata_correlation/map.txt b/qiime_test_data/observation_metadata_correlation/map.txt
@@ -0,0 +1,11 @@
+#SampleID	pH
+s0	1.1
+s1	2.1
+s2	1.7
+s3	4.7
+s4	300
+s5	-12
+s6	89.242251
+s7	‘SKJDKJSD’
+s8	nan
+s9	100
diff --git a/qiime_test_data/observation_metadata_correlation/otu_table.biom b/qiime_test_data/observation_metadata_correlation/otu_table.biom