Skip to content
This repository has been archived by the owner on Nov 9, 2023. It is now read-only.

Commit

Permalink
Merge pull request #1807 from jairideout/correlations
Browse files Browse the repository at this point in the history
ENH: observation_metadata_correlation.py
  • Loading branch information
gregcaporaso committed Dec 23, 2014
2 parents 53ff511 + 9d68237 commit 402c1db
Show file tree
Hide file tree
Showing 7 changed files with 443 additions and 57 deletions.
1 change: 1 addition & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
QIIME 1.8.0-dev (changes since 1.8.0 go here)
=============================================
* Added new ``observation_metadata_correlation.py`` script. This script allows the calculation of correlations between feature abundances and continuous-valued metadata. This script replaces the continuous-valued correlation functionality that was in ``otu_category_significance.py`` in QIIME 1.7.0 and earlier.
* ``split_otu_table.py`` now allows multiple fields to be passed to split a biom table, and
optionally a mapping file. Check out the new documentation for the naming conventions
(which have changed slightly) and an example.
Expand Down
123 changes: 74 additions & 49 deletions qiime/otu_significance.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from qiime.parse import parse_mapping_file_to_dict
from numpy import (array, argsort, vstack, isnan, inf, nan, apply_along_axis,
mean, zeros)
mean, zeros, isinf, logical_or)

from qiime.stats import (fisher_population_correlation,
pearson, spearman, g_fit, ANOVA_one_way,
Expand Down Expand Up @@ -329,61 +329,70 @@ def run_correlation_test(data_generator, test, test_choices,
return corr_coefs, pvals


def correlation_output_formatter(bt, corr_coefs, pvals, fdr_pvals, bon_pvals,
md_key):
"""Format the output of the correlations for easy writing.
"""
header = [
'OTU',
'Correlation Coef',
'pval',
'pval_fdr',
'pval_bon',
md_key]
num_lines = len(corr_coefs)
def correlate_output_formatter(bt, test_stats, pvals, fdr_pvals, bon_pvals,
md_key):
'''Produce lines for a tab delimited text file for correlations.py.
Paramaters
----------
bt : biom table object
test_stats : array-like
Floats representing correlation coefficients or paired t test
statistics.
pvals : array-like
Floats representing pvalues for given correlation coefficients.
fdr_pvals : array-like
Floats representing FDR corrected pvals.
bon_pvals : array-like
Floats representing Bonferroni corrected pvals.
md_key : str or None
Key for extracting feature metadata from biom table.
Returns
-------
list of strs
'''
header = ['Feature ID', 'Test stat.', 'pval', 'pval_fdr', 'pval_bon',
md_key]
num_lines = len(test_stats)
lines = ['\t'.join(header)]
for i in range(num_lines):
tmp = [bt.ids(axis='observation')[i], corr_coefs[i], pvals[i], fdr_pvals[i],
bon_pvals[i]]
tmp = [bt.ids(axis='observation')[i], test_stats[i], pvals[i],
fdr_pvals[i], bon_pvals[i]]
lines.append('\t'.join(map(str, tmp)))
nls = _add_metadata(bt, md_key, lines)
return nls

def run_paired_t(bt, s1, s2):
'''Perform a paired t test between samples.
Parameters
----------
bt : biom table object
Table containing data for samples in s1 and s2.
s1 : list
List of sample ids found in bt (strs).
s2 : list
List of sample ids found in bt (strs).
Returns
-------
test_stats : list
Floats representing the test statistic for each paired comparison.
pvals : list
Floats representing the p-values of the reported test statistics.
'''
test_stats = []
pvals = []
s1_indices = [bt.index(i, axis='sample') for i in s1]
s2_indices = [bt.index(i, axis='sample') for i in s2]

def paired_t_generator(bt, s_before, s_after):
"""Produce a generator to run paired t tests on each OTU."""
b_data = vstack([bt.sampleData(i) for i in s_before]).T
a_data = vstack([bt.sampleData(i) for i in s_after]).T
return ((b_data[i], a_data[i]) for i in range(len(bt.ids(axis='observation'))))


def run_paired_t(data_generator):
"""Run paired t test on data."""
test_stats, pvals = [], []
for b_data, a_data in data_generator:
test_stat, pval = t_paired(b_data, a_data)
for data in bt.iter_data(axis='observation'):
test_stat, pval = t_paired(data.take(s1_indices), data.take(s2_indices))
test_stats.append(test_stat)
pvals.append(pval)
return test_stats, pvals


def paired_t_output_formatter(bt, test_stats, pvals, fdr_pvals, bon_pvals,
md_key):
"""Format the output for all tests so it can be easily written."""
header = ['OTU', 'Test-Statistic', 'P', 'FDR_P', 'Bonferroni_P', md_key]
num_lines = len(pvals)
lines = ['\t'.join(header)]
for i in range(num_lines):
tmp = [bt.ids(axis='observation')[i], test_stats[i], pvals[i], fdr_pvals[i],
bon_pvals[i]]
lines.append('\t'.join(map(str, tmp)))
nls = _add_metadata(bt, md_key, lines)
return nls

# Functions used by both scripts


def sort_by_pval(lines, ind):
"""Sort lines with pvals in descending order.
Expand All @@ -396,10 +405,7 @@ def _nan_safe_sort(line):
return val
else:
return inf
return (
# header+sorted lines
[lines[0]] + sorted(lines[1:], key=_nan_safe_sort)
)
return [lines[0]] + sorted(lines[1:], key=_nan_safe_sort)


def _add_metadata(bt, md_key, lines):
Expand All @@ -413,3 +419,22 @@ def _add_metadata(bt, md_key, lines):
else: # remove md_header from the first line
nls = ['\t'.join(lines[0].split('\t')[:-1])] + lines[1:]
return nls

def is_computable_float(v):
'''Return float if v can be converted to float excluding nan and inf.
Parameters
----------
v : variable
Value to be converted to float if possible.
Returns
-------
If v can be converted to a float that is not nan or inf, return v.
Otherwise return False.
'''
tmp = float(v)
if not logical_or(isnan(tmp), isinf(tmp)):
return tmp
else:
return False
40 changes: 36 additions & 4 deletions qiime/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -1629,10 +1629,13 @@ def assign_correlation_pval(corr, n, method, permutations=None,
raise ValueError('You must specify vectors, permutation '
'function, and number of permutations to calc '
'bootstrapped pvalues. Cant continue.')
r = empty(permutations)
for i in range(permutations):
r[i] = perm_test_fn(v1, permutation(v2))
return (abs(r) >= abs(corr)).sum() / float(permutations)
if any([isnan(corr), isinf(corr)]):
return nan
else:
r = empty(permutations)
for i in range(permutations):
r[i] = perm_test_fn(v1, permutation(v2))
return (abs(r) >= abs(corr)).sum() / float(permutations)
elif method == 'kendall':
return kendall_pval(corr, n)
else:
Expand Down Expand Up @@ -2425,3 +2428,32 @@ def cscore(v1, v2):
v2_b = v2.astype(bool)
sij = (v1_b * v2_b).sum()
return (v1_b.sum() - sij) * (v2_b.sum() - sij)

def correlate(v1, v2, method):
'''Correlate vectors using method.
Parameters
----------
v1 : array-like
List or array of ints or floats to be correlated.
v2 : array-like
List or array of ints or floats to be correlated.
method : str
One of 'spearman', 'pearson', 'kendall', 'cscore'.
Returns
-------
rho : float
Correlation between the vectors.
'''
if method == 'pearson':
corr_fn = pearson
elif method == 'spearman':
corr_fn = spearman
elif method == 'kendall':
corr_fn = kendall
elif method == 'cscore':
corr_fn = cscore
else:
raise ValueError('Correlation function not recognized.')
return corr_fn(v1, v2)
11 changes: 11 additions & 0 deletions qiime_test_data/observation_metadata_correlation/map.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#SampleID pH
s0 1.1
s1 2.1
s2 1.7
s3 4.7
s4 300
s5 -12
s6 89.242251
s7 ‘SKJDKJSD’
s8 nan
s9 100
Binary file not shown.
Loading

0 comments on commit 402c1db

Please sign in to comment.