From 99597f358dd7abc51c886a16df6ebe4d480d7b2f Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Sun, 7 Dec 2014 21:20:02 -0800 Subject: [PATCH 01/12] Initial commit Provides baseline functionality without tests. --- qiime/otu_significance.py | 31 ++++-- qiime/stats.py | 30 ++++++ scripts/correlations.py | 221 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 272 insertions(+), 10 deletions(-) create mode 100644 scripts/correlations.py diff --git a/qiime/otu_significance.py b/qiime/otu_significance.py index 0cf6e37df7..8a1f9ddc3d 100644 --- a/qiime/otu_significance.py +++ b/qiime/otu_significance.py @@ -331,16 +331,27 @@ def run_correlation_test(data_generator, test, test_choices, def correlation_output_formatter(bt, corr_coefs, pvals, fdr_pvals, bon_pvals, md_key): - """Format the output of the correlations for easy writing. - - """ - header = [ - 'OTU', - 'Correlation Coef', - 'pval', - 'pval_fdr', - 'pval_bon', - md_key] + '''Produce lines for a tab delimited text file for correlations.py. + + Paramaters + ---------- + bt : biom table object + corr_coefs : array-like + Floats representing correlation coefficients. + pvals : array-like + Floats representing pvalues for given correlation coefficients. + fdr_pvals : array-like + Floats representing FDR corrected pvals. + bon_pvals : array-like + Floats representing Bonferroni corrected pvals. + md_key : str or None + Key for extracting feature metadata from biom table. + + Returns + ------- + list of strs + ''' + header = ['Feature ID', 'Corr coef', 'pval', 'pval_fdr', 'pval_bon', md_key] num_lines = len(corr_coefs) lines = ['\t'.join(header)] for i in range(num_lines): diff --git a/qiime/stats.py b/qiime/stats.py index ded20cb96b..cd98129f89 100644 --- a/qiime/stats.py +++ b/qiime/stats.py @@ -2599,3 +2599,33 @@ def cscore(v1, v2): v2_b = v2.astype(bool) sij = (v1_b * v2_b).sum() return (v1_b.sum() - sij) * (v2_b.sum() - sij) + +def correlate(v1, v2, method): + '''Correlate vectors using method. + + Parameters + ---------- + v1 : array-like + List or array of ints or floats to be correlated. + v2 : array-like + List or array of ints or floats to be correlated. + method : str + One of 'spearman', 'pearson', 'kendall', 'cscore'. + + Returns + ------- + rho : float + Correlation between the vectors. + ''' + if method is 'pearson': + corr_fn = pearson + elif method is 'spearman': + corr_fn = spearman + elif method is 'kendall': + corr_fn = kendall + elif method is 'cscore': + corr_fn = cscore + else: + raise ValueError('Correlation function not recognized.') + + return corr_fn(v1, v2) diff --git a/scripts/correlations.py b/scripts/correlations.py new file mode 100644 index 0000000000..55a331c5f2 --- /dev/null +++ b/scripts/correlations.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python + +__author__ = "Will Van Treuren, Luke Ursell" +__copyright__ = "Copyright 2014, The QIIME project" +__credits__ = ["Will Van Treuren"] +__license__ = "GPL" +__version__ = "1.8.0-dev" +__maintainer__ = "Will Van Treuren" +__email__ = "wdwvt1@gmail.com" + +from qiime.util import (parse_command_line_parameters, make_option, + sync_biom_and_mf) +from qiime.stats import (benjamini_hochberg_step_down, bonferroni_correction, + assign_correlation_pval, correlate, + correlation_output_formatter, sort_by_pval, pearson, + spearman, kendall, cscore) +from qiime.parse import parse_mapping_file_to_dict +from biom import load_table + +filtration_error_text = '''The biom table does not have enough samples after +filtration to perform correlations (it has 3 or fewer). 
The filtration steps +that have occurred are: +1. Removing samples that were not found in the mapping file. +2. Removing samples that were found in the mapping file but whose metadata for +the given category was not convertable to float.''' + +correlation_assignemnt_choices = ['spearman', 'pearson', 'kendall', 'cscore'] +pvalue_assignment_choices = ['fisher_z_transform':, 'parametric_t_distribution', + 'bootstrapped', 'kendall'] +bootstrap_functions = {'spearman': spearman, 'pearson': pearson, + 'kendall': kendall, 'cscore': cscore} + +script_info = {} +script_info['brief_description'] = """ +This script allows the calculation of: + 1. Correlations between feature abundances (relative or absolute) and + numeric metadata. + 2. Paired t-tests between two groups of samples. +The available methods for calculating correlations are Spearmans +Rho, Pearson, Kendall's Tau, and the C or checkerboard score. The available +methods for assigning p-values to the calculated correlation scores are +bootstrapping, Fisher's Z transformation, a parametric t-distribution, and +a Kendall's Tau specific p-value calculation. +""" +script_info['script_description'] = """ +This script calculates the correlation between feature values and a gradient of +sample data. Several methods are provided to allow the user to correlate +features to sample metadata values. This script also allows paired t testing. +The paired t test is accomplished by passing a paired mapping file which is just +a two column (tab separation) table with the samples that should be paired in +each row. It should not have a header. +The available correlations tests are Kendall's Tau, Spearmans rank correlation, +Pearsons product moment correlation, and the C-score (or checkerboard score). +This script generates a tab separated output file which differs based on which +test you have chosen. If you have chosen simple correlation or paired_t then you +will see the following headers: +Feature ID - ID of the features being correlated. If these are OTUs, then they + will take the form of GreenGenes identifiers or de-novo identifiers. +Test-Statistic - the value of the test statistic for the given test +P - the raw P value returned by the given test. +FDR_P - the P value corrected by the Benjamini-Hochberg FDR procedure for + multiple comparisons. +Bonferroni_P - the P value corrected by the Bonferroni procedure for multiple + comparisons. +Metadata - this column will be present only if the biom table contained metadata + information for your features. If these are OTUs, and taxonomy is present in + the biom table, this category will contain that taxonomy (or other metadata). + +Warnings: +The only supported metric for P-value assignment with the C-score is +bootstrapping. For more information on the C-score, read Stone and Roberts 1990 +Oecologea paper 85: 74-79. If you fail to pass +pval_assignment_method='bootstrapped' while you have -s cscore, the script will +error. + +Assigning pvalues to Kendall's Tau scores with the bootstrapping method is +very slow. 
+ + +""" +script_info['script_usage'] = [] +script_info['script_usage'].append(("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came:", "", "%prog -i otu_table.biom -m map.txt -c pH -s spearman -o spearman_otu_gradient.txt")) +script_info['script_usage'].append(("Calculate paired t values for a before and after group of samples:", "", "%prog -i otu_table.biom --paired_t_fp=paired_samples.txt -o kendall_longitudinal_otu_gradient.txt")) + +script_info['output_description']= """ +If you have chosen simple correlation or paired_t then you +will see the following headers: +Feature ID - ID of the features being correlated. If these are OTUs, then they + will take the form of GreenGenes identifiers or de-novo identifiers. +Test-Statistic - the value of the test statistic for the given test +P - the raw P value returned by the given test. +FDR_P - the P value corrected by the Benjamini-Hochberg FDR procedure for + multiple comparisons. +Bonferroni_P - the P value corrected by the Bonferroni procedure for multiple + comparisons. +Metadata - this column will be present only if the biom table contained metadata + information for your features. If these are OTUs, and taxonomy is present in + the biom table, this category will contain that taxonomy (or other metadata). +""" +script_info['required_options']=[ + make_option('-i','--otu_table_fp', + help='path to input biom format table', + type='existing_path'), + make_option('-o', '--output_fp', type='new_filepath', + help='path to the output file to be created')] + +script_info['optional_options']=[ + make_option('-m','--mapping_fp', type='existing_filepath', + help='path to category mapping file'), + make_option('-c', '--category', type='string', + help='name of the category over which to run the analysis'), + make_option('-s', '--test', type="choice", + choices=correlation_assignment_choices, + default='spearman', help='Test to use. Choices are:\n%s' % \ + (', '.join(correlation_function_choices)+'\n\t' + \ + '[default: %default]')), + make_option('--pval_assignment_method', type="choice", + choices=pvalue_assignment_choices, + default='fisher_z_transform', help='Test to use. Choices are:\n%s' % \ + (', '.join(pvalue_assignment_choices))+'\n\t' + \ + '[default: %default]'), + make_option('--metadata_key', default='taxonomy', type=str, + help='Key to extract metadata from biom table. default: %default]'), + make_option('--paired_t_fp', type='existing_filepath', default=None, + help='Pass a paired sample map as described in help to test with a '+\ + 'paired_t_two_sample test. Overrides all other options. A '+\ + 'paired sample map must be two columns without header that are '+\ + 'tab separated. 
Each row contains samples which should be paired.'), + make_option('--permutations', default=1000, type=int, + help='Number of permutations to use for bootstrapped tests.'+\ + '[default: %default]')] + +script_info['version'] = __version__ + +def main(): + option_parser, opts, args = parse_command_line_parameters(**script_info) + + bt = parse_biom_table(open(opts.otu_table_fp)) + + if opts.paired_t_fp is not None: #user wants to conduct paired t_test + o = open(opts.paired_t_fp, 'U') + lines = o.readlines() + o.close() + b_samples = [] + a_samples = [] + for i in lines: + a,b = i.strip().split('\t') + a_samples.append(a) + b_samples.append(b) + data_feed = paired_t_generator(bt, b_samples, a_samples) + test_stats, pvals = run_paired_t(data_feed) + # calculate corrected pvals + fdr_pvals = array(benjamini_hochberg_step_down(pvals)) + bon_pvals = bonferroni_correction(pvals) + # write output results after sorting + lines = paired_t_output_formatter(bt, test_stats, pvals, fdr_pvals, + bon_pvals, md_key=opts.metadata_key) + lines = sort_by_pval(lines, ind=2) + o = open(opts.output_fp, 'w') + o.writelines('\n'.join(lines)) + o.close() + + else: # user wants normal correlation analysis + pmf, _ = parse_mapping_file_to_dict(mf_fp) + category = opts.category + + samples_to_correlate = [] + md_values_to_correlate = [] + bt_sample_ids = bt.ids(axis='sample') + + for sample_id, sample_md in pmf.items(): + if sample_id in bt_sample_ids: + try: + v = float(sample_md[category]) + samples_to_correlate.append(sample_id) + md_values_to_correlate.append(v) + except ValueError: + pass # sample in both biom and mf, but no float md in mf + else: + pass # sample in mf, but not bt + + # remove samples which are not found in the mapping file or do not have + # metadata that converts to float + bt.filter(ids_to_keep = samples_to_correlate, axis='sample') + + # sort the biom table so that feature values are retrieved in the same + # order as the metadata in the samples they correspond to + bt.sort(sort_f = lambda _: samples_to_correlate, axis='sample') + + if bt.shape[1] <= 3: + raise ValueError(filtration_error_text) + + rhos = [] + pvals = [] + for feature_vector in bt.iter_data(axis='observation'): + rho = correlate(feature_vector, md_values_to_correlate, + method=opts.test) + pval = assign_correlation_pval(corr, len(feature_vector), + method=opts.pval_assignment_method, + permutations=opts.permutations, + perm_test_fn=bootstrap_functions[opts.pval_assignment_method] + v1=feature_vector, + v2=md_values_to_correlate) + rhos.append(rho) + pvals.append(pval) + + fdr_pvals = benjamini_hochberg_step_down(pvals) + bon_pvals = bonferroni_correction(pvals) + + lines = correlation_output_formatter(bt, rhos, pvals, fdr_pvals, + bon_pvals, opts.metadata_key) + lines = sort_by_pval(lines, ind=2) + + o = open(opts.output_fp, 'w') + o.writelines('\n'.join(lines)) + o.close() + +if __name__ == "__main__": + main() + + From 9958edada8ac2de388512ba31532190fd9327072 Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Sun, 7 Dec 2014 21:25:16 -0800 Subject: [PATCH 02/12] Updated function names/calls. 
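A minimal, standalone sketch of what the 'bootstrapped' p-value option wired up in these first two patches amounts to: permute one vector, recompute the correlation each time, and report the two-tailed fraction of permuted statistics at least as extreme as the observed one. This is illustrative only, not code from the patch series; the helper name is invented, and scipy.stats.spearmanr stands in for the pearson/spearman/kendall/cscore functions in qiime.stats.

import numpy as np
from scipy.stats import spearmanr

def bootstrapped_corr_pval(v1, v2, permutations=1000, seed=0):
    """Two-tailed permutation p-value for the correlation of v1 and v2."""
    rng = np.random.RandomState(seed)
    v1 = np.asarray(v1, dtype=float)
    v2 = np.asarray(v2, dtype=float)
    observed = spearmanr(v1, v2)[0]
    # correlate v1 against shuffled copies of v2 and count how often the
    # permuted statistic is at least as extreme as the observed one
    permuted = np.empty(permutations)
    for i in range(permutations):
        permuted[i] = spearmanr(v1, rng.permutation(v2))[0]
    return (np.abs(permuted) >= np.abs(observed)).sum() / float(permutations)

# e.g. bootstrapped_corr_pval([1, 2, 3, 4, 5], [2.1, 1.9, 3.5, 4.8, 5.0],
#      permutations=100) returns a small p-value for a strongly monotonic pair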
--- scripts/correlations.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) mode change 100644 => 100755 scripts/correlations.py diff --git a/scripts/correlations.py b/scripts/correlations.py old mode 100644 new mode 100755 index 55a331c5f2..7ab3fdcd0c --- a/scripts/correlations.py +++ b/scripts/correlations.py @@ -11,9 +11,11 @@ from qiime.util import (parse_command_line_parameters, make_option, sync_biom_and_mf) from qiime.stats import (benjamini_hochberg_step_down, bonferroni_correction, - assign_correlation_pval, correlate, - correlation_output_formatter, sort_by_pval, pearson, + assign_correlation_pval, correlate, pearson, spearman, kendall, cscore) +from qiime.otu_significance import (correlation_output_formatter, sort_by_pval, + paired_t_output_formatter, + paired_t_generator, run_paired_t) from qiime.parse import parse_mapping_file_to_dict from biom import load_table @@ -24,8 +26,8 @@ 2. Removing samples that were found in the mapping file but whose metadata for the given category was not convertable to float.''' -correlation_assignemnt_choices = ['spearman', 'pearson', 'kendall', 'cscore'] -pvalue_assignment_choices = ['fisher_z_transform':, 'parametric_t_distribution', +correlation_assignment_choices = ['spearman', 'pearson', 'kendall', 'cscore'] +pvalue_assignment_choices = ['fisher_z_transform', 'parametric_t_distribution', 'bootstrapped', 'kendall'] bootstrap_functions = {'spearman': spearman, 'pearson': pearson, 'kendall': kendall, 'cscore': cscore} @@ -74,10 +76,9 @@ error. Assigning pvalues to Kendall's Tau scores with the bootstrapping method is -very slow. +very slow.""" -""" script_info['script_usage'] = [] script_info['script_usage'].append(("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came:", "", "%prog -i otu_table.biom -m map.txt -c pH -s spearman -o spearman_otu_gradient.txt")) script_info['script_usage'].append(("Calculate paired t values for a before and after group of samples:", "", "%prog -i otu_table.biom --paired_t_fp=paired_samples.txt -o kendall_longitudinal_otu_gradient.txt")) @@ -112,7 +113,7 @@ make_option('-s', '--test', type="choice", choices=correlation_assignment_choices, default='spearman', help='Test to use. Choices are:\n%s' % \ - (', '.join(correlation_function_choices)+'\n\t' + \ + (', '.join(correlation_assignment_choices)+'\n\t' + \ '[default: %default]')), make_option('--pval_assignment_method', type="choice", choices=pvalue_assignment_choices, @@ -198,7 +199,7 @@ def main(): pval = assign_correlation_pval(corr, len(feature_vector), method=opts.pval_assignment_method, permutations=opts.permutations, - perm_test_fn=bootstrap_functions[opts.pval_assignment_method] + perm_test_fn=bootstrap_functions[opts.pval_assignment_method], v1=feature_vector, v2=md_values_to_correlate) rhos.append(rho) From 6b33a60501b84f0be4423d9eade9aec4d0b0da44 Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Sun, 7 Dec 2014 21:34:18 -0800 Subject: [PATCH 03/12] small update removed a useless function, renamed a function, figured out which tests were missing. 
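For context on the sort_by_pval cleanup in this commit: the formatter functions produce tab-delimited rows with a header line first, and sort_by_pval orders the data rows by the p-value column while pushing rows whose p-value is nan to the end. A standalone sketch of that behaviour follows (not the QIIME function itself; the helper name is invented for illustration).

from numpy import isnan, inf

def sort_rows_by_pval(lines, ind=2):
    """Order tab-delimited rows by column `ind`; header stays first, nan rows sort last."""
    def _nan_safe_key(line):
        val = float(line.split('\t')[ind])
        return inf if isnan(val) else val
    return [lines[0]] + sorted(lines[1:], key=_nan_safe_key)

# e.g.
# rows = ['Feature ID\tTest stat.\tpval',
#         'OTU2\t2.78\t0.034',
#         'OTU3\tnan\tnan',
#         'OTU1\t3.56\t0.001']
# sort_rows_by_pval(rows) keeps the header first, then OTU1, OTU2, OTU3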
--- qiime/otu_significance.py | 24 ++---------------------- scripts/correlations.py | 10 +++++----- 2 files changed, 7 insertions(+), 27 deletions(-) diff --git a/qiime/otu_significance.py b/qiime/otu_significance.py index 8a1f9ddc3d..696aa59d53 100644 --- a/qiime/otu_significance.py +++ b/qiime/otu_significance.py @@ -329,7 +329,7 @@ def run_correlation_test(data_generator, test, test_choices, return corr_coefs, pvals -def correlation_output_formatter(bt, corr_coefs, pvals, fdr_pvals, bon_pvals, +def correlate_output_formatter(bt, corr_coefs, pvals, fdr_pvals, bon_pvals, md_key): '''Produce lines for a tab delimited text file for correlations.py. @@ -378,23 +378,6 @@ def run_paired_t(data_generator): pvals.append(pval) return test_stats, pvals - -def paired_t_output_formatter(bt, test_stats, pvals, fdr_pvals, bon_pvals, - md_key): - """Format the output for all tests so it can be easily written.""" - header = ['OTU', 'Test-Statistic', 'P', 'FDR_P', 'Bonferroni_P', md_key] - num_lines = len(pvals) - lines = ['\t'.join(header)] - for i in range(num_lines): - tmp = [bt.ids(axis='observation')[i], test_stats[i], pvals[i], fdr_pvals[i], - bon_pvals[i]] - lines.append('\t'.join(map(str, tmp))) - nls = _add_metadata(bt, md_key, lines) - return nls - -# Functions used by both scripts - - def sort_by_pval(lines, ind): """Sort lines with pvals in descending order. @@ -407,10 +390,7 @@ def _nan_safe_sort(line): return val else: return inf - return ( - # header+sorted lines - [lines[0]] + sorted(lines[1:], key=_nan_safe_sort) - ) + return [lines[0]] + sorted(lines[1:], key=_nan_safe_sort)) def _add_metadata(bt, md_key, lines): diff --git a/scripts/correlations.py b/scripts/correlations.py index 7ab3fdcd0c..141b4a033a 100755 --- a/scripts/correlations.py +++ b/scripts/correlations.py @@ -13,7 +13,7 @@ from qiime.stats import (benjamini_hochberg_step_down, bonferroni_correction, assign_correlation_pval, correlate, pearson, spearman, kendall, cscore) -from qiime.otu_significance import (correlation_output_formatter, sort_by_pval, +from qiime.otu_significance import (correlate_output_formatter, sort_by_pval, paired_t_output_formatter, paired_t_generator, run_paired_t) from qiime.parse import parse_mapping_file_to_dict @@ -154,8 +154,8 @@ def main(): fdr_pvals = array(benjamini_hochberg_step_down(pvals)) bon_pvals = bonferroni_correction(pvals) # write output results after sorting - lines = paired_t_output_formatter(bt, test_stats, pvals, fdr_pvals, - bon_pvals, md_key=opts.metadata_key) + lines = correlate_output_formatter(bt, test_stats, pvals, fdr_pvals, + bon_pvals, md_key=opts.metadata_key) lines = sort_by_pval(lines, ind=2) o = open(opts.output_fp, 'w') o.writelines('\n'.join(lines)) @@ -208,8 +208,8 @@ def main(): fdr_pvals = benjamini_hochberg_step_down(pvals) bon_pvals = bonferroni_correction(pvals) - lines = correlation_output_formatter(bt, rhos, pvals, fdr_pvals, - bon_pvals, opts.metadata_key) + lines = correlate_output_formatter(bt, rhos, pvals, fdr_pvals, + bon_pvals, opts.metadata_key) lines = sort_by_pval(lines, ind=2) o = open(opts.output_fp, 'w') From 8ba13767bc5131a0b4c8b9164f9b6febf67c4698 Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Sun, 7 Dec 2014 23:16:07 -0800 Subject: [PATCH 04/12] Add script usage tests + modifications This commit adds script usage tests and modifies the help text as well as minor tweaks to functionality for correlations.py --- qiime/otu_significance.py | 79 +++++++++++++----- qiime/stats.py | 20 +++-- qiime_test_data/correlations/map.txt | 11 +++ 
qiime_test_data/correlations/otu_table.biom | Bin 0 -> 37016 bytes .../correlations/paired_samples.txt | 5 ++ scripts/correlations.py | 45 ++++++---- 6 files changed, 117 insertions(+), 43 deletions(-) create mode 100644 qiime_test_data/correlations/map.txt create mode 100644 qiime_test_data/correlations/otu_table.biom create mode 100644 qiime_test_data/correlations/paired_samples.txt diff --git a/qiime/otu_significance.py b/qiime/otu_significance.py index 696aa59d53..6a547d8efa 100644 --- a/qiime/otu_significance.py +++ b/qiime/otu_significance.py @@ -12,7 +12,7 @@ from qiime.parse import parse_mapping_file_to_dict from numpy import (array, argsort, vstack, isnan, inf, nan, apply_along_axis, - mean, zeros) + mean, zeros, isinf, logical_or) from qiime.stats import (fisher_population_correlation, pearson, spearman, g_fit, ANOVA_one_way, @@ -329,15 +329,16 @@ def run_correlation_test(data_generator, test, test_choices, return corr_coefs, pvals -def correlate_output_formatter(bt, corr_coefs, pvals, fdr_pvals, bon_pvals, - md_key): +def correlate_output_formatter(bt, test_stats, pvals, fdr_pvals, bon_pvals, + md_key): '''Produce lines for a tab delimited text file for correlations.py. Paramaters ---------- bt : biom table object - corr_coefs : array-like - Floats representing correlation coefficients. + test_stats : array-like + Floats representing correlation coefficients or paired t test + statistics. pvals : array-like Floats representing pvalues for given correlation coefficients. fdr_pvals : array-like @@ -346,34 +347,50 @@ def correlate_output_formatter(bt, corr_coefs, pvals, fdr_pvals, bon_pvals, Floats representing Bonferroni corrected pvals. md_key : str or None Key for extracting feature metadata from biom table. + type_of_stat : str + Type of statistic - used to modify header to report either 'Co Returns ------- list of strs ''' - header = ['Feature ID', 'Corr coef', 'pval', 'pval_fdr', 'pval_bon', md_key] - num_lines = len(corr_coefs) + header = ['Feature ID', 'Test stat.', 'pval', 'pval_fdr', 'pval_bon', + md_key] + num_lines = len(test_stats) lines = ['\t'.join(header)] for i in range(num_lines): - tmp = [bt.ids(axis='observation')[i], corr_coefs[i], pvals[i], fdr_pvals[i], - bon_pvals[i]] + tmp = [bt.ids(axis='observation')[i], test_stats[i], pvals[i], + fdr_pvals[i], bon_pvals[i]] lines.append('\t'.join(map(str, tmp))) nls = _add_metadata(bt, md_key, lines) return nls +def run_paired_t(bt, s1, s2): + '''Perform a paired t test between samples. -def paired_t_generator(bt, s_before, s_after): - """Produce a generator to run paired t tests on each OTU.""" - b_data = vstack([bt.sampleData(i) for i in s_before]).T - a_data = vstack([bt.sampleData(i) for i in s_after]).T - return ((b_data[i], a_data[i]) for i in range(len(bt.ids(axis='observation')))) + Parameters + ---------- + bt : biom table object + Table containing data for samples in s1 and s2. + s1 : list + List of sample ids found in bt (strs). + s2 : list + List of sample ids found in bt (strs). + Returns + ------- + test_stats : list + Floats representing the test statistic for each paired comparison. + pvals : list + Floats representing the p-values of the reported test statistics. 
+ ''' + test_stats = [] + pvals = [] + s1_indices = [bt.index(i, axis='sample') for i in s1] + s2_indices = [bt.index(i, axis='sample') for i in s2] -def run_paired_t(data_generator): - """Run paired t test on data.""" - test_stats, pvals = [], [] - for b_data, a_data in data_generator: - test_stat, pval = t_paired(b_data, a_data) + for data in bt.iter_data(axis='observation'): + test_stat, pval = t_paired(data.take(s1_indices), data.take(s2_indices)) test_stats.append(test_stat) pvals.append(pval) return test_stats, pvals @@ -390,7 +407,7 @@ def _nan_safe_sort(line): return val else: return inf - return [lines[0]] + sorted(lines[1:], key=_nan_safe_sort)) + return [lines[0]] + sorted(lines[1:], key=_nan_safe_sort) def _add_metadata(bt, md_key, lines): @@ -404,3 +421,25 @@ def _add_metadata(bt, md_key, lines): else: # remove md_header from the first line nls = ['\t'.join(lines[0].split('\t')[:-1])] + lines[1:] return nls + +def is_computable_float(v): + '''Return float if v can be converted to float excluding nan and inf. + + Parameters + ---------- + v : variable + Value to be converted to float if possible. + + Returns + ------- + If v can be converted to a float that is not nan or inf, return v. + Otherwise return False. + ''' + try: + tmp = float(v) + if not logical_or(isnan(tmp), isinf(tmp)): + return tmp + else: + return False + except ValueError: + return False diff --git a/qiime/stats.py b/qiime/stats.py index cd98129f89..5bab8ee0ca 100644 --- a/qiime/stats.py +++ b/qiime/stats.py @@ -1701,10 +1701,13 @@ def assign_correlation_pval(corr, n, method, permutations=None, raise ValueError('You must specify vectors, permutation ' 'function, and number of permutations to calc ' 'bootstrapped pvalues. Cant continue.') - r = empty(permutations) - for i in range(permutations): - r[i] = perm_test_fn(v1, permutation(v2)) - return (abs(r) >= abs(corr)).sum() / float(permutations) + if any([isnan(corr), isinf(corr)]): + return nan + else: + r = empty(permutations) + for i in range(permutations): + r[i] = perm_test_fn(v1, permutation(v2)) + return (abs(r) >= abs(corr)).sum() / float(permutations) elif method == 'kendall': return kendall_pval(corr, n) else: @@ -2617,15 +2620,14 @@ def correlate(v1, v2, method): rho : float Correlation between the vectors. 
''' - if method is 'pearson': + if method == 'pearson': corr_fn = pearson - elif method is 'spearman': + elif method == 'spearman': corr_fn = spearman - elif method is 'kendall': + elif method == 'kendall': corr_fn = kendall - elif method is 'cscore': + elif method == 'cscore': corr_fn = cscore else: raise ValueError('Correlation function not recognized.') - return corr_fn(v1, v2) diff --git a/qiime_test_data/correlations/map.txt b/qiime_test_data/correlations/map.txt new file mode 100644 index 0000000000..72323e1438 --- /dev/null +++ b/qiime_test_data/correlations/map.txt @@ -0,0 +1,11 @@ +#SampleID pH +s0 1.1 +s1 2.1 +s2 1.7 +s3 4.7 +s4 300 +s5 -12 +s6 89.242251 +s7 ‘SKJDKJSD’ +s8 nan +s9 100 \ No newline at end of file diff --git a/qiime_test_data/correlations/otu_table.biom b/qiime_test_data/correlations/otu_table.biom new file mode 100644 index 0000000000000000000000000000000000000000..0072684d06915922b4f7ecbfa91b7ce5914d86d9 GIT binary patch literal 37016 zcmeHQU2IfE6rS5{w_DoM1^H_Mxj`T`TDmPOzX28~P=u5!O(Yn~?zUUn*zK;n3l>8Z zFbc##`aqNi6X*j04aO#7tRyJPUlVx%BC&}kR*lgIV?ZGR(csLTb8gvLZWrQi=(cAF zJ9o~UduGlzKb$!;bJ<&6wW4rD$p|Kb!@;td(|E+6yATd`ND8Jg-wO{K#Csq-w)#%yd1bFh|1DX^FqBu0?< zE6a2MpUltakynz8{C+t(hMd^8Ft06R$>azglp`Q0VAX=eW#$`lnnIDFFIKiC(rS>C z${WYoP@Wi?1xkk=3i7>!@n$b~+Dypnt5<;2Oe8+u2ZeMW;ksJ^03 zBFG6<)X!z4H9oTzaKzf$UNzDQ@(e0tq8&ugj6ePJCicVAqLMuH<;pnOyxBZS^3l90 zaTspD>bz;re=r9j9(=Nhw6rrmk3|hqFc9_mS&5_WSa9KYU$Fq(M7V*kuYu6aO zVnXBr-xKqo2sI(S&gX9p=&P!XG|n4Fg8zV-V2+FCSS&ohyxiXs3YN(^X=W(0neovj z1IDkkv>G!X3>PNGxd@f63U^sWWtnSEU1jBbSH*nSteNgP^IR^s%(u&kVLd{#Bmst7 zKOOiHMw7o%^5>~?DkK5t1;pVc;6pr+9tL^=#7J*|bfH2=IU^N1$|+LlC zO5sO2qZK;JdCY=7Mxmpeu?ii_#rY5Mj6Ne7#{<7fpDhVEubK2Yl3uLxL&wK{VDitC zj1yIUXm^-C&32DMyH8g6q1_)>>Co;|RJscY5cOe0$CD{V2oM5<03kpK5CVh%AwUQa z0)#-uAh51xZI#AXPq_kTgq|x`+($?705Qs7!hhvd`{~fqf9#~&F4`j1a zc%%A7`GDg)4C#)Il8%0^3mS6?I3o|=Kn87#RP%ay)5 zP*j7RRos6;{~5+Mo-bHO>5I+%8h1W(-v!5N4yTIzESnUY&vfGfwsZKi$k$k7ps7{t zsGxt<0dj3wS*OA0C5pu6V58wt$Ud~_%{m7ANj#a-Am%$!Kin_ISbCEai<*la5V8;_ zv*L-b5QPvR1PB2_fDj-AG8_SEhuTa-r3*LmEc8MkEzRpTF*`JMiTBetNXg_kQ z(|vK_?Ai@qYHa$&0-un>jh@|I>P-ymb9o z`JtY&C~&)lAIem~;yof00)zk|KnM^5gh1vZkj#FI;-61PB2_fDj-A2!V`8AesGyb^ykB4$A6&c?$ecJ~$7)D4y#`c!3Cn03kpK5CVh%A&~J1B(tCLK|_qwct8Fa99k7cndV30InMQI z#oX$A=MxsC1{M1WaWWOKc#p`003kpK5CVh%A&|KUB(tCJ{~^XW{r6$Ae%mGC6h)%j z@R8q-FSlso4#j>#oXm<_yhmh0fDj-A2mwNX5Xf8vtj?3F=i!o_=cn_encD|aQFHY? 
rDW1pNp`540bD6cND3r^v>c8qaQtH1c0(^*yvFg8`j(CMr-GBcBt|aue literal 0 HcmV?d00001 diff --git a/qiime_test_data/correlations/paired_samples.txt b/qiime_test_data/correlations/paired_samples.txt new file mode 100644 index 0000000000..4e40751eef --- /dev/null +++ b/qiime_test_data/correlations/paired_samples.txt @@ -0,0 +1,5 @@ +s1 s9 +s0 s5 +s2 s3 +s4 s7 +s6 s8 \ No newline at end of file diff --git a/scripts/correlations.py b/scripts/correlations.py index 141b4a033a..057c7d8a49 100755 --- a/scripts/correlations.py +++ b/scripts/correlations.py @@ -1,8 +1,10 @@ #!/usr/bin/env python -__author__ = "Will Van Treuren, Luke Ursell" +__author__ = "Will Van Treuren" __copyright__ = "Copyright 2014, The QIIME project" -__credits__ = ["Will Van Treuren"] +__credits__ = ["Will Van Treuren", "Luke Ursell", "Catherine Lozupone", + "Jesse Stombaugh", "Doug Wendel", "Dan Knights", "Greg Caporaso", + "Jai Ram Rideout"] __license__ = "GPL" __version__ = "1.8.0-dev" __maintainer__ = "Will Van Treuren" @@ -14,10 +16,10 @@ assign_correlation_pval, correlate, pearson, spearman, kendall, cscore) from qiime.otu_significance import (correlate_output_formatter, sort_by_pval, - paired_t_output_formatter, - paired_t_generator, run_paired_t) + run_paired_t, is_computable_float) from qiime.parse import parse_mapping_file_to_dict from biom import load_table +from numpy import array, where filtration_error_text = '''The biom table does not have enough samples after filtration to perform correlations (it has 3 or fewer). The filtration steps @@ -26,6 +28,10 @@ 2. Removing samples that were found in the mapping file but whose metadata for the given category was not convertable to float.''' +cscore_error_text = '''The only supported metric for P-value assignment with the +C-score is bootstrapping. 
For more information on the C-score, read Stone and +Roberts 1990 Oecologea paper 85: 74-79.''' + correlation_assignment_choices = ['spearman', 'pearson', 'kendall', 'cscore'] pvalue_assignment_choices = ['fisher_z_transform', 'parametric_t_distribution', 'bootstrapped', 'kendall'] @@ -81,7 +87,8 @@ script_info['script_usage'] = [] script_info['script_usage'].append(("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came:", "", "%prog -i otu_table.biom -m map.txt -c pH -s spearman -o spearman_otu_gradient.txt")) -script_info['script_usage'].append(("Calculate paired t values for a before and after group of samples:", "", "%prog -i otu_table.biom --paired_t_fp=paired_samples.txt -o kendall_longitudinal_otu_gradient.txt")) +script_info['script_usage'].append(("Calculate paired t values for a before and after group of samples:", "", "%prog -i otu_table.biom --paired_t_fp=paired_samples.txt -o paired.txt")) +script_info['script_usage'].append(("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came using bootstrapping and pearon correlation:", "", "%prog -i otu_table.biom -m map.txt -c pH -s pearson --pval_assignment_method bootstrapped --permutations 100 -o pearson_bootstrapped.txt")) script_info['output_description']= """ If you have chosen simple correlation or paired_t then you @@ -136,7 +143,10 @@ def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) - bt = parse_biom_table(open(opts.otu_table_fp)) + if opts.test == 'cscore' and opts.pval_assignment_method != 'bootstrapped': + raise ValueError(cscore_error_text) + + bt = load_table(opts.otu_table_fp) if opts.paired_t_fp is not None: #user wants to conduct paired t_test o = open(opts.paired_t_fp, 'U') @@ -148,11 +158,13 @@ def main(): a,b = i.strip().split('\t') a_samples.append(a) b_samples.append(b) - data_feed = paired_t_generator(bt, b_samples, a_samples) - test_stats, pvals = run_paired_t(data_feed) + test_stats, pvals = run_paired_t(bt, a_samples, b_samples) # calculate corrected pvals fdr_pvals = array(benjamini_hochberg_step_down(pvals)) bon_pvals = bonferroni_correction(pvals) + # correct for cases where values above 1.0 due to correction + fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals) + bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals) # write output results after sorting lines = correlate_output_formatter(bt, test_stats, pvals, fdr_pvals, bon_pvals, md_key=opts.metadata_key) @@ -162,7 +174,7 @@ def main(): o.close() else: # user wants normal correlation analysis - pmf, _ = parse_mapping_file_to_dict(mf_fp) + pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp) category = opts.category samples_to_correlate = [] @@ -172,11 +184,12 @@ def main(): for sample_id, sample_md in pmf.items(): if sample_id in bt_sample_ids: try: - v = float(sample_md[category]) + v = is_computable_float(sample_md[category]) samples_to_correlate.append(sample_id) md_values_to_correlate.append(v) - except ValueError: - pass # sample in both biom and mf, but no float md in mf + except KeyError: + raise ValueError(('The category (%s)' % opts.category) +\ + ' was not found in the mapping file.') else: pass # sample in mf, but not bt @@ -196,10 +209,11 @@ def main(): for feature_vector in bt.iter_data(axis='observation'): rho = correlate(feature_vector, md_values_to_correlate, method=opts.test) - pval = assign_correlation_pval(corr, len(feature_vector), + pval = assign_correlation_pval(rho, len(feature_vector), 
method=opts.pval_assignment_method, permutations=opts.permutations, - perm_test_fn=bootstrap_functions[opts.pval_assignment_method], + perm_test_fn=\ + bootstrap_functions[opts.test], v1=feature_vector, v2=md_values_to_correlate) rhos.append(rho) @@ -207,6 +221,9 @@ def main(): fdr_pvals = benjamini_hochberg_step_down(pvals) bon_pvals = bonferroni_correction(pvals) + # correct for cases where values above 1.0 due to correction + fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals) + bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals) lines = correlate_output_formatter(bt, rhos, pvals, fdr_pvals, bon_pvals, opts.metadata_key) From f58e3146c0c56dc8961a40558dd4cbf912890216 Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Tue, 9 Dec 2014 15:44:03 -0800 Subject: [PATCH 05/12] Addressed formatting comments --- ChangeLog.md | 1 + scripts/correlations.py | 121 +++++++++++++++++++--------------------- 2 files changed, 59 insertions(+), 63 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 0f9e8ceb16..ee3a9467ae 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,5 +1,6 @@ QIIME 1.8.0-dev (changes since 1.8.0 go here) ============================================= +* ``correlations.py`` has been added. This script allows the calculation of correlations between feature abundances and continuous-valued metadata. It also reintroduces the paired-t test functionality that has been missing in 1.8. * ``split_otu_table.py`` now allows multiple fields to be passed to split a biom table, and optionally a mapping file. Check out the new documentation for the naming conventions (which have changed slightly) and an example. diff --git a/scripts/correlations.py b/scripts/correlations.py index 057c7d8a49..f938ab69fd 100755 --- a/scripts/correlations.py +++ b/scripts/correlations.py @@ -39,42 +39,27 @@ 'kendall': kendall, 'cscore': cscore} script_info = {} -script_info['brief_description'] = """ +script_info['brief_description'] = """This script calculates correlations between feature abundances and continuous-valued metadata.""" +script_info['script_description'] = """ This script allows the calculation of: 1. Correlations between feature abundances (relative or absolute) and numeric metadata. 2. Paired t-tests between two groups of samples. -The available methods for calculating correlations are Spearmans -Rho, Pearson, Kendall's Tau, and the C or checkerboard score. The available -methods for assigning p-values to the calculated correlation scores are -bootstrapping, Fisher's Z transformation, a parametric t-distribution, and -a Kendall's Tau specific p-value calculation. -""" -script_info['script_description'] = """ -This script calculates the correlation between feature values and a gradient of -sample data. Several methods are provided to allow the user to correlate -features to sample metadata values. This script also allows paired t testing. -The paired t test is accomplished by passing a paired mapping file which is just -a two column (tab separation) table with the samples that should be paired in -each row. It should not have a header. -The available correlations tests are Kendall's Tau, Spearmans rank correlation, -Pearsons product moment correlation, and the C-score (or checkerboard score). -This script generates a tab separated output file which differs based on which -test you have chosen. If you have chosen simple correlation or paired_t then you -will see the following headers: -Feature ID - ID of the features being correlated. 
If these are OTUs, then they - will take the form of GreenGenes identifiers or de-novo identifiers. -Test-Statistic - the value of the test statistic for the given test -P - the raw P value returned by the given test. -FDR_P - the P value corrected by the Benjamini-Hochberg FDR procedure for - multiple comparisons. -Bonferroni_P - the P value corrected by the Bonferroni procedure for multiple - comparisons. -Metadata - this column will be present only if the biom table contained metadata - information for your features. If these are OTUs, and taxonomy is present in - the biom table, this category will contain that taxonomy (or other metadata). - -Warnings: + +Several methods are provided to allow the user to correlate +features to sample metadata values including Spearmans Rho, Pearson, Kendall's +Tau, and the C or checkerboard score. + +The available methods for assigning p-values to the calculated correlation +scores are bootstrapping, Fisher's Z transformation, a parametric +t-distribution, and a Kendall's Tau specific p-value calculation. + +This script also allows paired t testing. The paired t test is accomplished by +passing a paired mapping file which is just a two column (tab separation) table +with the samples that should be paired in each row. It should not have a header. + +Notes: + The only supported metric for P-value assignment with the C-score is bootstrapping. For more information on the C-score, read Stone and Roberts 1990 Oecologea paper 85: 74-79. If you fail to pass @@ -82,28 +67,39 @@ error. Assigning pvalues to Kendall's Tau scores with the bootstrapping method is -very slow.""" +very slow. +""" script_info['script_usage'] = [] -script_info['script_usage'].append(("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came:", "", "%prog -i otu_table.biom -m map.txt -c pH -s spearman -o spearman_otu_gradient.txt")) -script_info['script_usage'].append(("Calculate paired t values for a before and after group of samples:", "", "%prog -i otu_table.biom --paired_t_fp=paired_samples.txt -o paired.txt")) -script_info['script_usage'].append(("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came using bootstrapping and pearon correlation:", "", "%prog -i otu_table.biom -m map.txt -c pH -s pearson --pval_assignment_method bootstrapped --permutations 100 -o pearson_bootstrapped.txt")) +script_info['script_usage'].append( + ("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came:", + "", + "%prog -i otu_table.biom -m map.txt -c pH -s spearman -o spearman_otu_gradient.txt")) +script_info['script_usage'].append( + ("Calculate paired t values for a before and after group of samples:", + "", + "%prog -i otu_table.biom --paired_t_fp=paired_samples.txt -o paired.txt")) +script_info['script_usage'].append( + ("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came using bootstrapping and pearon correlation:", + "", + "%prog -i otu_table.biom -m map.txt -c pH -s pearson --pval_assignment_method bootstrapped --permutations 100 -o pearson_bootstrapped.txt")) script_info['output_description']= """ -If you have chosen simple correlation or paired_t then you -will see the following headers: -Feature ID - ID of the features being correlated. If these are OTUs, then they - will take the form of GreenGenes identifiers or de-novo identifiers. 
-Test-Statistic - the value of the test statistic for the given test
-P - the raw P value returned by the given test.
-FDR_P - the P value corrected by the Benjamini-Hochberg FDR procedure for
- multiple comparisons.
-Bonferroni_P - the P value corrected by the Bonferroni procedure for multiple
- comparisons.
-Metadata - this column will be present only if the biom table contained metadata
- information for your features. If these are OTUs, and taxonomy is present in
- the biom table, this category will contain that taxonomy (or other metadata).
+The output will be a tab delimited file with the following headers. Each row
+will record the values calculated for a given feature:
+- Feature ID: ID of the features being correlated. If these are OTUs, then they
+  will take the form of GreenGenes identifiers or de-novo identifiers.
+- Test-Statistic: the value of the test statistic for the given test.
+- P: the raw P value returned by the given test.
+- FDR_P: the P value corrected by the Benjamini-Hochberg FDR procedure for
+  multiple comparisons.
+- Bonferroni_P: the P value corrected by the Bonferroni procedure for multiple
+  comparisons.
+- Metadata: this column will be present only if the biom table contained
+  metadata information for your features. If these are OTUs, and taxonomy is
+  present in the biom table, this category will contain that taxonomy (or other
+  metadata).
 """
 script_info['required_options']=[
     make_option('-i','--otu_table_fp',
@@ -119,24 +115,23 @@
         help='name of the category over which to run the analysis'),
     make_option('-s', '--test', type="choice",
         choices=correlation_assignment_choices,
-        default='spearman', help='Test to use. Choices are:\n%s' % \
-            (', '.join(correlation_assignment_choices)+'\n\t' + \
-            '[default: %default]')),
+        default='spearman', help='Correlation method to use. Choices are: %s' %
+        (', '.join(correlation_assignment_choices)) + ' [default: %default]'),
     make_option('--pval_assignment_method', type="choice",
         choices=pvalue_assignment_choices,
-        default='fisher_z_transform', help='Test to use. Choices are:\n%s' % \
-            (', '.join(pvalue_assignment_choices))+'\n\t' + \
-            '[default: %default]'),
+        default='fisher_z_transform', help='Pvalue method to use. Choices are: '
+        '%s' % (', '.join(pvalue_assignment_choices)) + ' [default: %default]'),
     make_option('--metadata_key', default='taxonomy', type=str,
-        help='Key to extract metadata from biom table. default: %default]'),
+        help='Key to extract metadata from biom table. [default: %default]'),
     make_option('--paired_t_fp', type='existing_filepath', default=None,
-        help='Pass a paired sample map as described in help to test with a '+\
-            'paired_t_two_sample test. Overrides all other options. A '+\
-            'paired sample map must be two columns without header that are '+\
-            'tab separated. Each row contains samples which should be paired.'),
+        help='Pass a paired sample map as described in help to test with a '
+        'paired_t_two_sample test. Overrides all other options. A '
+        'paired sample map must be two columns without header that are '
+        'tab separated. Each row contains samples which should be paired.'
+        ' [default: %default]'),
     make_option('--permutations', default=1000, type=int,
-        help='Number of permutations to use for bootstrapped tests.'+\
-            '[default: %default]')]
+        help='Number of permutations to use for bootstrapped tests.'
+        ' [default: %default]')]

 script_info['version'] = __version__

@@ -188,7 +183,7 @@ def main():
                 samples_to_correlate.append(sample_id)
                 md_values_to_correlate.append(v)
             except KeyError:
-                raise ValueError(('The category (%s)' % opts.category) +\
+                raise ValueError('The category (%s)' % opts.category +
                     ' was not found in the mapping file.')
         else:
             pass # sample in mf, but not bt

From 12a2f638f4fc65245ef17cc13ba4ce2766c7a0b3 Mon Sep 17 00:00:00 2001
From: Will Van Treuren
Date: Tue, 9 Dec 2014 16:06:23 -0800
Subject: [PATCH 06/12] Updated Changelog message

---
 ChangeLog.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ChangeLog.md b/ChangeLog.md
index ee3a9467ae..ec6e2b7ea9 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,6 +1,6 @@
 QIIME 1.8.0-dev (changes since 1.8.0 go here)
 =============================================
-* ``correlations.py`` has been added. This script allows the calculation of correlations between feature abundances and continuous-valued metadata. It also reintroduces the paired-t test functionality that has been missing in 1.8.
+* ``correlations.py`` has been added. This script allows the calculation of correlations between feature abundances and continuous-valued metadata. It also reintroduces the paired-t test functionality that has been missing in 1.8. This script also replaces the functions that were in ``otu_category_significance.py`` in QIIME 1.7 and earlier.
 * ``split_otu_table.py`` now allows multiple fields to be passed to split a biom
 table, and optionally a mapping file. Check out the new documentation for the
 naming conventions (which have changed slightly) and an example.

From 3013f211317c5310ef9c890bfae632c26d57556d Mon Sep 17 00:00:00 2001
From: Will Van Treuren
Date: Wed, 10 Dec 2014 15:59:19 -0800
Subject: [PATCH 07/12] Updates after review

Updates the name of the script and adds tests to several untested
functions. Adds to help documentation. Removes the paired t
functionality.
---
 qiime/otu_significance.py                     |  11 +-
 .../map.txt                                   |   0
 .../otu_table.biom                            | Bin
 .../paired_samples.txt                        |   0
 ...py => observation_metadata_correlation.py} | 184 +++++++-----------
 tests/test_otu_significance.py                |  87 ++++++++-
 6 files changed, 158 insertions(+), 124 deletions(-)
 rename qiime_test_data/{correlations => observation_metadata_correlation}/map.txt (100%)
 rename qiime_test_data/{correlations => observation_metadata_correlation}/otu_table.biom (100%)
 rename qiime_test_data/{correlations => observation_metadata_correlation}/paired_samples.txt (100%)

diff --git a/qiime/otu_significance.py b/qiime/otu_significance.py
index 6a547d8efa..1a5cd3d172 100644
--- a/qiime/otu_significance.py
+++ b/qiime/otu_significance.py
@@ -435,11 +435,8 @@ def is_computable_float(v):
     If v can be converted to a float that is not nan or inf, return v.
     Otherwise return False.
 '''
-    try:
-        tmp = float(v)
-        if not logical_or(isnan(tmp), isinf(tmp)):
-            return tmp
-        else:
-            return False
-    except ValueError:
-        return False
+    tmp = float(v)
+    if not logical_or(isnan(tmp), isinf(tmp)):
+        return tmp
+    else:
+        return False
diff --git a/qiime_test_data/correlations/map.txt b/qiime_test_data/observation_metadata_correlation/map.txt
similarity index 100%
rename from qiime_test_data/correlations/map.txt
rename to qiime_test_data/observation_metadata_correlation/map.txt
diff --git a/qiime_test_data/correlations/otu_table.biom b/qiime_test_data/observation_metadata_correlation/otu_table.biom
similarity index 100%
rename from qiime_test_data/correlations/otu_table.biom
rename to qiime_test_data/observation_metadata_correlation/otu_table.biom
diff --git a/qiime_test_data/correlations/paired_samples.txt b/qiime_test_data/observation_metadata_correlation/paired_samples.txt
similarity index 100%
rename from qiime_test_data/correlations/paired_samples.txt
rename to qiime_test_data/observation_metadata_correlation/paired_samples.txt
diff --git a/scripts/correlations.py b/scripts/observation_metadata_correlation.py
similarity index 53%
rename from scripts/correlations.py
rename to scripts/observation_metadata_correlation.py
index f938ab69fd..1d71d0cf60 100755
--- a/scripts/correlations.py
+++ b/scripts/observation_metadata_correlation.py
@@ -41,25 +41,19 @@
 script_info = {}
 script_info['brief_description'] = """This script calculates correlations between feature abundances and continuous-valued metadata."""
 script_info['script_description'] = """
-This script allows the calculation of:
-    1. Correlations between feature abundances (relative or absolute) and
-    numeric metadata.
-    2. Paired t-tests between two groups of samples.
-
-Several methods are provided to allow the user to correlate
-features to sample metadata values including Spearmans Rho, Pearson, Kendall's
-Tau, and the C or checkerboard score.
+This script calculates correlations between feature (aka observation) abundances
+(relative or absolute) and numeric metadata. Several methods are provided to
+allow the user to correlate features to sample metadata values including
+Spearman's Rho, Pearson, Kendall's Tau, and the C or checkerboard score.
+References for these methods are numerous, but good descriptions may be found in
+'Biometry' by Sokal and Rohlf.

 The available methods for assigning p-values to the calculated correlation
 scores are bootstrapping, Fisher's Z transformation, a parametric
-t-distribution, and a Kendall's Tau specific p-value calculation.
-
-This script also allows paired t testing. The paired t test is accomplished by
-passing a paired mapping file which is just a two column (tab separation) table
-with the samples that should be paired in each row. It should not have a header.
+t-distribution, and a Kendall's Tau specific p-value calculation. These methods
+are also described in 'Biometry'.

 Notes:
-
 The only supported metric for P-value assignment with the C-score is
 bootstrapping. For more information on the C-score, read Stone and Roberts 1990
 Oecologea paper 85: 74-79. 
If you fail to pass @@ -76,10 +70,6 @@ ("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came:", "", "%prog -i otu_table.biom -m map.txt -c pH -s spearman -o spearman_otu_gradient.txt")) -script_info['script_usage'].append( - ("Calculate paired t values for a before and after group of samples:", - "", - "%prog -i otu_table.biom --paired_t_fp=paired_samples.txt -o paired.txt")) script_info['script_usage'].append( ("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came using bootstrapping and pearon correlation:", "", @@ -106,13 +96,13 @@ help='path to input biom format table', type='existing_path'), make_option('-o', '--output_fp', type='new_filepath', - help='path to the output file to be created')] - -script_info['optional_options']=[ + help='path to the output file to be created'), make_option('-m','--mapping_fp', type='existing_filepath', help='path to category mapping file'), make_option('-c', '--category', type='string', - help='name of the category over which to run the analysis'), + help='name of the category over which to run the analysis')] + +script_info['optional_options']=[ make_option('-s', '--test', type="choice", choices=correlation_assignment_choices, default='spearman', help='Correlation method to use. Choices are: %s' % @@ -123,12 +113,6 @@ '%s' % (', '.join(pvalue_assignment_choices)) + ' [default: %default]'), make_option('--metadata_key', default='taxonomy', type=str, help='Key to extract metadata from biom table. [default: %default]'), - make_option('--paired_t_fp', type='existing_filepath', default=None, - help='Pass a paired sample map as described in help to test with a ' - 'paired_t_two_sample test. Overrides all other options. A ' - 'paired sample map must be two columns without header that are ' - 'tab separated. Each row contains samples which should be paired.' - ' [default: %default]'), make_option('--permutations', default=1000, type=int, help='Number of permutations to use for bootstrapped tests.' 
' [default: %default]')] @@ -142,91 +126,65 @@ def main(): raise ValueError(cscore_error_text) bt = load_table(opts.otu_table_fp) - - if opts.paired_t_fp is not None: #user wants to conduct paired t_test - o = open(opts.paired_t_fp, 'U') - lines = o.readlines() - o.close() - b_samples = [] - a_samples = [] - for i in lines: - a,b = i.strip().split('\t') - a_samples.append(a) - b_samples.append(b) - test_stats, pvals = run_paired_t(bt, a_samples, b_samples) - # calculate corrected pvals - fdr_pvals = array(benjamini_hochberg_step_down(pvals)) - bon_pvals = bonferroni_correction(pvals) - # correct for cases where values above 1.0 due to correction - fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals) - bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals) - # write output results after sorting - lines = correlate_output_formatter(bt, test_stats, pvals, fdr_pvals, - bon_pvals, md_key=opts.metadata_key) - lines = sort_by_pval(lines, ind=2) - o = open(opts.output_fp, 'w') - o.writelines('\n'.join(lines)) - o.close() - - else: # user wants normal correlation analysis - pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp) - category = opts.category - - samples_to_correlate = [] - md_values_to_correlate = [] - bt_sample_ids = bt.ids(axis='sample') - - for sample_id, sample_md in pmf.items(): - if sample_id in bt_sample_ids: - try: - v = is_computable_float(sample_md[category]) - samples_to_correlate.append(sample_id) - md_values_to_correlate.append(v) - except KeyError: - raise ValueError('The category (%s)' % opts.category + - ' was not found in the mapping file.') - else: - pass # sample in mf, but not bt - - # remove samples which are not found in the mapping file or do not have - # metadata that converts to float - bt.filter(ids_to_keep = samples_to_correlate, axis='sample') - - # sort the biom table so that feature values are retrieved in the same - # order as the metadata in the samples they correspond to - bt.sort(sort_f = lambda _: samples_to_correlate, axis='sample') - - if bt.shape[1] <= 3: - raise ValueError(filtration_error_text) - - rhos = [] - pvals = [] - for feature_vector in bt.iter_data(axis='observation'): - rho = correlate(feature_vector, md_values_to_correlate, - method=opts.test) - pval = assign_correlation_pval(rho, len(feature_vector), - method=opts.pval_assignment_method, - permutations=opts.permutations, - perm_test_fn=\ - bootstrap_functions[opts.test], - v1=feature_vector, - v2=md_values_to_correlate) - rhos.append(rho) - pvals.append(pval) - - fdr_pvals = benjamini_hochberg_step_down(pvals) - bon_pvals = bonferroni_correction(pvals) - # correct for cases where values above 1.0 due to correction - fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals) - bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals) - - lines = correlate_output_formatter(bt, rhos, pvals, fdr_pvals, - bon_pvals, opts.metadata_key) - lines = sort_by_pval(lines, ind=2) - - o = open(opts.output_fp, 'w') - o.writelines('\n'.join(lines)) - o.close() + pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp) + + samples_to_correlate = [] + md_values_to_correlate = [] + bt_sample_ids = bt.ids(axis='sample') + + for sample_id, sample_md in pmf.items(): + if sample_id in bt_sample_ids: + try: + v = is_computable_float(sample_md[opts.category]) + samples_to_correlate.append(sample_id) + md_values_to_correlate.append(v) + except KeyError: + option_parser.error('The category (%s)' % opts.category + + ' was not found in the mapping file.') + except ValueError: + pass # value couldn't be converted 
to float, ignore this sample + else: + pass # sample in mf, but not bt + + # remove samples which are not found in the mapping file or do not have + # metadata that converts to float + bt.filter(ids_to_keep = samples_to_correlate, axis='sample') + + # sort the biom table so that feature values are retrieved in the same + # order as the metadata in the samples they correspond to + bt.sort(sort_f = lambda _: samples_to_correlate, axis='sample') + + if bt.shape[1] <= 3: + raise ValueError(filtration_error_text) + + rhos = [] + pvals = [] + for feature_vector in bt.iter_data(axis='observation'): + rho = correlate(feature_vector, md_values_to_correlate, + method=opts.test) + pval = assign_correlation_pval(rho, len(feature_vector), + method=opts.pval_assignment_method, + permutations=opts.permutations, + perm_test_fn=\ + bootstrap_functions[opts.test], + v1=feature_vector, + v2=md_values_to_correlate) + rhos.append(rho) + pvals.append(pval) + + fdr_pvals = benjamini_hochberg_step_down(pvals) + bon_pvals = bonferroni_correction(pvals) + # correct for cases where values above 1.0 due to correction + fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals) + bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals) + + lines = correlate_output_formatter(bt, rhos, pvals, fdr_pvals, + bon_pvals, opts.metadata_key) + lines = sort_by_pval(lines, ind=2) + + o = open(opts.output_fp, 'w') + o.writelines('\n'.join(lines)) + o.close() if __name__ == "__main__": main() diff --git a/tests/test_otu_significance.py b/tests/test_otu_significance.py index d09c170b3a..0a2b3b5a3c 100755 --- a/tests/test_otu_significance.py +++ b/tests/test_otu_significance.py @@ -17,10 +17,11 @@ GROUP_TEST_CHOICES, grouped_correlation_row_generator, run_grouped_correlation, CORRELATION_TEST_CHOICES, grouped_correlation_formatter, correlation_row_generator, - run_correlation_test) -from qiime.stats import (assign_correlation_pval, fisher, - fisher_population_correlation) -from numpy import array, hstack, corrcoef, asarray, nan + run_correlation_test, is_computable_float, + correlate_output_formatter, _add_metadata) +from qiime.stats import (assign_correlation_pval, fisher, + fisher_population_correlation) +from numpy import array, hstack, corrcoef, asarray, nan, inf from numpy.random import seed from numpy.testing import assert_almost_equal from os import remove @@ -959,9 +960,84 @@ def test_run_correlation_test(self): permutations=1000) assert_almost_equal(exp_bootstrapped_pvals, obs_pvals) + def test_is_computable_float(self): + '''Test that an arbitrary input can be converted to float.''' + self.assertRaises(ValueError, is_computable_float, 'adkfjsdkfj') + assert is_computable_float(nan) == False + assert is_computable_float(inf) == False + assert is_computable_float(-inf) == False + assert is_computable_float("3.5679") == 3.5679 + + def test_correlate_output_formatter(self): + '''Test that output lines are correctly generated.''' + # use BT_IN_1 + bt = parse_biom_table(BT_IN_1) + test_stats = [3.56, 2.78, nan, nan, nan, -4.78] + pvals = [.001, .034, nan, nan, nan, .00001] + fdr_pvals = [.001, .001, nan, nan, nan, .001] + bon_pvals = [.006, .192, nan, nan, nan, .006] + md_key = 'taxonomy' + type_of_stat = 'T' + exp = [['Feature ID', 'Test stat.', 'pval', 'pval_fdr', 'pval_bon', + md_key], + ['OTU1', '3.56', '0.001', '0.001', '0.006', 'k__One'], + ['OTU2', '2.78', '0.034', '0.001', '0.192', 'k__Two'], + ['OTU3', 'nan', 'nan', 'nan', 'nan', 'k__Three'], + ['OTU4', 'nan', 'nan', 'nan', 'nan', 'k__Four'], + ['OTU5', 'nan', 'nan', 
'nan', 'nan', 'k__Five'], + ['OTU6', '-4.78', '1e-05', '0.001', '0.006', 'k__Six']] + exp = ['\t'.join(i) for i in exp] + obs = correlate_output_formatter(bt, test_stats, pvals, fdr_pvals, + bon_pvals, md_key) + assert obs == exp + + def test_add_metadata(self): + '''Test metadata is added appropriately to output lines.''' + # test first with taxonomy + exp = \ + ['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon\ttaxonomy', + 'OTU1\t3.56\t0.001\t0.001\t0.006\tk__One', + 'OTU2\t2.78\t0.034\t0.001\t0.192\tk__Two', + 'OTU3\tnan\tnan\tnan\tnan\tk__Three', + 'OTU4\tnan\tnan\tnan\tnan\tk__Four', + 'OTU5\tnan\tnan\tnan\tnan\tk__Five', + 'OTU6\t-4.78\t1e-05\t0.001\t0.006\tk__Six'] + inp_lines = \ + ['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon\ttaxonomy', + 'OTU1\t3.56\t0.001\t0.001\t0.006', + 'OTU2\t2.78\t0.034\t0.001\t0.192', + 'OTU3\tnan\tnan\tnan\tnan', + 'OTU4\tnan\tnan\tnan\tnan', + 'OTU5\tnan\tnan\tnan\tnan', + 'OTU6\t-4.78\t1e-05\t0.001\t0.006'] + bt = parse_biom_table(BT_IN_1) + obs = _add_metadata(bt, 'taxonomy', inp_lines) + assert obs == exp + # test without taxonomy or other metadata + exp = \ + ['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon', + 'OTU1\t3.56\t0.001\t0.001\t0.006', + 'OTU2\t2.78\t0.034\t0.001\t0.192', + 'OTU3\tnan\tnan\tnan\tnan', + 'OTU4\tnan\tnan\tnan\tnan', + 'OTU5\tnan\tnan\tnan\tnan', + 'OTU6\t-4.78\t1e-05\t0.001\t0.006'] + inp_lines = \ + ['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon\ttaxonomy', + 'OTU1\t3.56\t0.001\t0.001\t0.006', + 'OTU2\t2.78\t0.034\t0.001\t0.192', + 'OTU3\tnan\tnan\tnan\tnan', + 'OTU4\tnan\tnan\tnan\tnan', + 'OTU5\tnan\tnan\tnan\tnan', + 'OTU6\t-4.78\t1e-05\t0.001\t0.006'] + bt = parse_biom_table(BT_IN_1_no_md) + obs = _add_metadata(bt, 'taxonomy', inp_lines) + assert obs == exp + # globals used by certain tests. 
BT_IN_1 = '{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "testCode","date": "2013-08-20T15:48:21.166180","matrix_type": "sparse","matrix_element_type": "float","shape": [6, 6],"data": [[0,0,28.0],[0,1,52.0],[0,2,51.0],[0,3,78.0],[0,4,16.0],[0,5,77.0],[1,0,25.0],[1,1,14.0],[1,2,11.0],[1,3,32.0],[1,4,48.0],[1,5,63.0],[2,0,31.0],[2,1,2.0],[2,2,15.0],[2,3,69.0],[2,4,64.0],[2,5,27.0],[3,0,36.0],[3,1,68.0],[3,2,70.0],[3,3,65.0],[3,4,33.0],[3,5,62.0],[4,0,16.0],[4,1,41.0],[4,2,59.0],[4,3,40.0],[4,4,15.0],[4,5,3.0],[5,0,32.0],[5,1,8.0],[5,2,54.0],[5,3,98.0],[5,4,29.0],[5,5,50.0]],"rows": [{"id": "OTU1", "metadata": {"taxonomy": ["k__One"]}},{"id": "OTU2", "metadata": {"taxonomy": ["k__Two"]}},{"id": "OTU3", "metadata": {"taxonomy": ["k__Three"]}},{"id": "OTU4", "metadata": {"taxonomy": ["k__Four"]}},{"id": "OTU5", "metadata": {"taxonomy": ["k__Five"]}},{"id": "OTU6", "metadata": {"taxonomy": ["k__Six"]}}],"columns": [{"id": "Sample1", "metadata": null},{"id": "Sample2", "metadata": null},{"id": "Sample3", "metadata": null},{"id": "Sample4", "metadata": null},{"id": "Sample5", "metadata": null},{"id": "Sample6", "metadata": null}]}' BT_IN_2 = '{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "testCode","date": "2013-08-20T15:48:21.166180","matrix_type": "sparse","matrix_element_type": "float","shape": [6, 6],"data": [[0,0,28.0],[0,1,28.0],[0,2,28.0],[0,3,28.0],[0,4,28.0],[0,5,28.0],[1,0,28.0],[1,1,14.0],[1,2,11.0],[1,3,32.0],[1,4,48.0],[1,5,63.0],[2,0,31.0],[2,1,2.0],[2,2,15.0],[2,3,69.0],[2,4,64.0],[2,5,27.0],[3,0,36.0],[3,1,68.0],[3,2,70.0],[3,3,65.0],[3,4,33.0],[3,5,62.0],[4,0,16.0],[4,1,41.0],[4,2,59.0],[4,3,40.0],[4,4,15.0],[4,5,3.0],[5,0,32.0],[5,1,8.0],[5,2,54.0],[5,3,98.0],[5,4,29.0],[5,5,50.0]],"rows": [{"id": "OTU1", "metadata": {"taxonomy": ["k__One"]}},{"id": "OTU2", "metadata": {"taxonomy": ["k__Two"]}},{"id": "OTU3", "metadata": {"taxonomy": ["k__Three"]}},{"id": "OTU4", "metadata": {"taxonomy": ["k__Four"]}},{"id": "OTU5", "metadata": {"taxonomy": ["k__Five"]}},{"id": "OTU6", "metadata": {"taxonomy": ["k__Six"]}}],"columns": [{"id": "Sample1", "metadata": null},{"id": "Sample2", "metadata": null},{"id": "Sample3", "metadata": null},{"id": "Sample4", "metadata": null},{"id": "Sample5", "metadata": null},{"id": "Sample6", "metadata": null}]}' +BT_IN_1_no_md = '{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "testCode","date": "2013-08-20T15:48:21.166180","matrix_type": "sparse","matrix_element_type": "float","shape": [6, 6],"data": [[0,0,28.0],[0,1,52.0],[0,2,51.0],[0,3,78.0],[0,4,16.0],[0,5,77.0],[1,0,25.0],[1,1,14.0],[1,2,11.0],[1,3,32.0],[1,4,48.0],[1,5,63.0],[2,0,31.0],[2,1,2.0],[2,2,15.0],[2,3,69.0],[2,4,64.0],[2,5,27.0],[3,0,36.0],[3,1,68.0],[3,2,70.0],[3,3,65.0],[3,4,33.0],[3,5,62.0],[4,0,16.0],[4,1,41.0],[4,2,59.0],[4,3,40.0],[4,4,15.0],[4,5,3.0],[5,0,32.0],[5,1,8.0],[5,2,54.0],[5,3,98.0],[5,4,29.0],[5,5,50.0]],"rows": [{"id": "OTU1", "metadata": null},{"id": "OTU2", "metadata": null},{"id": "OTU3", "metadata": null},{"id": "OTU4", "metadata": null},{"id": "OTU5", "metadata": null},{"id": "OTU6", "metadata": null}] ,"columns": [{"id": "Sample1", "metadata": null},{"id": "Sample2", "metadata": null},{"id": "Sample3", "metadata": null},{"id": "Sample4", "metadata": null},{"id": "Sample5", "metadata": null},{"id": 
"Sample6", "metadata": null}]}' MF_IN_1 = ['#SampleID\ttest_cat\ttest_corr', 'Sample1\tcat1\t1', @@ -974,6 +1050,9 @@ def test_run_correlation_test(self): 'NotInOtuTable2\tcat5\t8'] BT_4 = """{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "BIOM-Format 1.1.2","date": "2013-08-16T15:23:02.872397","matrix_type": "sparse","matrix_element_type": "float","shape": [6, 8],"data": [[0,0,28.0],[0,1,52.0],[0,2,51.0],[0,3,78.0],[0,4,16.0],[0,5,77.0],[0,6,73.0],[0,7,6.0],[1,0,25.0],[1,1,14.0],[1,2,11.0],[1,3,32.0],[1,4,48.0],[1,5,63.0],[1,6,27.0],[1,7,38.0],[2,0,31.0],[2,1,2.0],[2,2,15.0],[2,3,69.0],[2,4,64.0],[2,5,27.0],[2,6,64.0],[2,7,54.0],[3,0,36.0],[3,1,68.0],[3,2,70.0],[3,3,65.0],[3,4,33.0],[3,5,62.0],[3,6,60.0],[3,7,23.0],[4,0,16.0],[4,1,41.0],[4,2,59.0],[4,3,40.0],[4,4,15.0],[4,5,3.0],[4,6,35.0],[4,7,5.0],[5,0,32.0],[5,1,8.0],[5,2,54.0],[5,3,98.0],[5,4,29.0],[5,5,50.0],[5,6,93.0],[5,7,19.0]],"rows": [{"id": "OTU1", "metadata": {"taxonomy": "k__One"}},{"id": "OTU2", "metadata": {"taxonomy": "k__Two"}},{"id": "OTU3", "metadata": {"taxonomy": "k__Three"}},{"id": "OTU4", "metadata": {"taxonomy": "k__Four"}},{"id": "OTU5", "metadata": {"taxonomy": "k__Five"}},{"id": "OTU6", "metadata": {"taxonomy": "k__Six"}}],"columns": [{"id": "Sample1", "metadata": null},{"id": "Sample2", "metadata": null},{"id": "Sample3", "metadata": null},{"id": "Sample4", "metadata": null},{"id": "Sample5", "metadata": null},{"id": "Sample6", "metadata": null},{"id": "Sample7", "metadata": null},{"id": "Sample8", "metadata": null}]}""" +['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon\ttaxonomy', 'OTU1\t3.56\t0.001\t0.001\t0.006\tk__One', 'OTU2\t2.78\t0.034\t0.001\t0.192\tk__Two', 'OTU3\tnan\tnan\tnan\tnan\tk__Three', 'OTU4\tnan\tnan\tnan\tnan\tk__Four', 'OTU5\tnan\tnan\tnan\tnan\tk__Five', 'OTU6\t-4.78\t1e-05\t0.001\t0.006\tk__Six'] +['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon', 'OTU1\t3.56\t0.001\t0.001\t0.006', 'OTU2\t2.78\t0.034\t0.001\t0.192', 'OTU3\tnan\tnan\tnan\tnan', 'OTU4\tnan\tnan\tnan\tnan', 'OTU5\tnan\tnan\tnan\tnan', 'OTU6\t-4.78\t1e-05\t0.001\t0.006'] + # run unit tests if run from command-line if __name__ == '__main__': main() From 43a79e6e5ba5bd7722c43220552edfbfc6ae6e9e Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Wed, 10 Dec 2014 16:00:51 -0800 Subject: [PATCH 08/12] changelog update --- ChangeLog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog.md b/ChangeLog.md index e0523e6893..dc34777ced 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,6 +1,6 @@ QIIME 1.8.0-dev (changes since 1.8.0 go here) ============================================= -* ``correlations.py`` has been added. This script allows the calculation of correlations between feature abundances and continuous-valued metadata. It also reintroduces the paired-t test functionality that has been missing in 1.8. This script also replaces the functions that were in ``otu_category_significance.py`` in QIIME 1.7 and earlier. +* ``observation_metadata_correlation.py`` has been added. This script allows the calculation of correlations between feature abundances and continuous-valued metadata.This script replaces the functions that were in ``otu_category_significance.py`` in QIIME 1.7 and earlier. * ``split_otu_table.py`` now allows multiple fields to be passed to split a biom table, and optionally a mapping file. Check out the new documentation for the naming conventions (which have changed slightly) and an example. 
From 0b7b6f6f51523473cac88755e925e6e55f113822 Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Wed, 10 Dec 2014 17:06:13 -0800 Subject: [PATCH 09/12] documentation addition added a lot of help documentation to mimic group_significance.py. --- scripts/observation_metadata_correlation.py | 55 ++++++++++++++++++--- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/scripts/observation_metadata_correlation.py b/scripts/observation_metadata_correlation.py index 1d71d0cf60..292cb8afec 100755 --- a/scripts/observation_metadata_correlation.py +++ b/scripts/observation_metadata_correlation.py @@ -46,12 +46,55 @@ allow the user to correlate features to sample metadata values including Spearmans Rho, Pearson, Kendall's Tau, and the C or checkerboard score. References for these methods are numerous, but good descriptions may be found in -'Biometry' by Sokal and Rolhf. - -The available methods for assigning p-values to the calculated correlation -scores are bootstrapping, Fisher's Z transformation, a parametric -t-distribution, and a Kendall's Tau specific p-value calculation. These methods -are also described in 'Biometry'. +'Biometry' by Sokal and Rolhf. A brief description of the available tests +follows: + +- Pearson score: The Pearson score aka Pearson's Product Moment correlation, is + a scaled measure of the degree to which two sequences of numbers co-vary. For + 'correlated' sequences, Pearson > 0, and for 'anticorrelated' sequences + Pearson < 0 (uncorrelated implies Pearson = 0). Pearson is a paramateric + and linear measure of correlation. + +- Spearmans Rho: The Spearman correlation is a non-paramateric measure of + correlation between two sequences of numbers. Spearman correlation is + appropriate for data where the values of the observations are not necessarily + accurate, but for which their relative magnitudes are (see Biometry for more + details.) + +- Kendalls Tau: Kendall's Tau is an alternative method of calculating + correlation between two sequences of numbers. It is slower and less widely + utilized than Spearman or Pearson scores. + +- Cscore: The c-score or 'checkerboard score' is a measure of covariation + between two sequences that is derived from traditional ecology (Stone and + Roberts. 1990, Oecologia 85:74-79). + +Raw correlation statistics alone reflect only the degree of association between +two sequences of numbers or vectors. Assigning a likelihood to these score via +a p-value can be done with several methods depending on the particular +assumptions that are used. This script allows four methods for calculating +p-values: + +- Bootrapping: Bootstrapping is the most robust, but slowest procedure for + calculating the p-value of a given correlation score. Bootstrapping takes the + input sequences, shuffles the order of one, and then recomputes the + correlation score. The p-value is then the number of times (out of the given + number of permutations) that the score of the permuted sequence pair was more + extreme than the observed pair. Bootstrapping is good when the underlying + properties of the distributions are unknown. + +- Parametric t distribution: the traditional method for calculating the + significance of a correlation score, this method assumes that the scores are + normally distributed and computes a t statistic for each correlation score in + conjunction with the length of the sequences being correlated. 
+ +- Fisher Z transform: Fisher's Z transform is a way to make the distribution of + correlation scores (especially when there are many highly correlation scores) + look more normal. It is not to be confused with Fisher's transformation for + the combination of p-values. + +- Kendalls Tau: for the Kendalls Tau calculation, the specific Kendalls Tau + pvalue is provided. Notes: The only supported metric for P-value assignment with the C-score is From c7e66aed373cf5c4549136ede990d16cc098f83f Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Fri, 19 Dec 2014 11:52:18 -0500 Subject: [PATCH 10/12] Addressed comments --- ChangeLog.md | 2 +- qiime/otu_significance.py | 2 -- scripts/observation_metadata_correlation.py | 10 ++++------ 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 459ed68c44..f2da44499e 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,6 +1,6 @@ QIIME 1.8.0-dev (changes since 1.8.0 go here) ============================================= -* ``observation_metadata_correlation.py`` has been added. This script allows the calculation of correlations between feature abundances and continuous-valued metadata.This script replaces the functions that were in ``otu_category_significance.py`` in QIIME 1.7 and earlier. +* ``observation_metadata_correlation.py`` has been added. This script allows the calculation of correlations between feature abundances and continuous-valued metadata. This script replaces the continuous-valued correlation function that was in ``otu_category_significance.py`` in QIIME 1.7 and earlier. * ``split_otu_table.py`` now allows multiple fields to be passed to split a biom table, and optionally a mapping file. Check out the new documentation for the naming conventions (which have changed slightly) and an example. diff --git a/qiime/otu_significance.py b/qiime/otu_significance.py index 1a5cd3d172..be0e6c2abc 100644 --- a/qiime/otu_significance.py +++ b/qiime/otu_significance.py @@ -347,8 +347,6 @@ def correlate_output_formatter(bt, test_stats, pvals, fdr_pvals, bon_pvals, Floats representing Bonferroni corrected pvals. md_key : str or None Key for extracting feature metadata from biom table. - type_of_stat : str - Type of statistic - used to modify header to report either 'Co Returns ------- diff --git a/scripts/observation_metadata_correlation.py b/scripts/observation_metadata_correlation.py index 292cb8afec..827332f87f 100755 --- a/scripts/observation_metadata_correlation.py +++ b/scripts/observation_metadata_correlation.py @@ -42,7 +42,7 @@ script_info['brief_description'] = """This script calculates correlations between feature abundances and continuous-valued metadata.""" script_info['script_description'] = """ This script calculates correlations between feature (aka observation) abundances -(relative or absolute) and numeric metadata.Several methods are provided to +(relative or absolute) and numeric metadata. Several methods are provided to allow the user to correlate features to sample metadata values including Spearmans Rho, Pearson, Kendall's Tau, and the C or checkerboard score. References for these methods are numerous, but good descriptions may be found in @@ -59,7 +59,7 @@ correlation between two sequences of numbers. Spearman correlation is appropriate for data where the values of the observations are not necessarily accurate, but for which their relative magnitudes are (see Biometry for more - details.) + details). 
- Kendalls Tau: Kendall's Tau is an alternative method of calculating correlation between two sequences of numbers. It is slower and less widely @@ -166,7 +166,7 @@ def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) if opts.test == 'cscore' and opts.pval_assignment_method != 'bootstrapped': - raise ValueError(cscore_error_text) + option_parser.error(cscore_error_text) bt = load_table(opts.otu_table_fp) pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp) @@ -230,6 +230,4 @@ def main(): o.close() if __name__ == "__main__": - main() - - + main() \ No newline at end of file From 0f5bf05a12fa0bef8fa03a9823e3f254c1f74235 Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Fri, 19 Dec 2014 12:05:16 -0500 Subject: [PATCH 11/12] Resolved remaining comment. --- scripts/observation_metadata_correlation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/observation_metadata_correlation.py b/scripts/observation_metadata_correlation.py index 827332f87f..7fa6455496 100755 --- a/scripts/observation_metadata_correlation.py +++ b/scripts/observation_metadata_correlation.py @@ -198,7 +198,7 @@ def main(): bt.sort(sort_f = lambda _: samples_to_correlate, axis='sample') if bt.shape[1] <= 3: - raise ValueError(filtration_error_text) + option_parser.error(filtration_error_text) rhos = [] pvals = [] From 9d68237db23d2782c4acb8dd63fe06c58e5b9247 Mon Sep 17 00:00:00 2001 From: Jai Ram Rideout Date: Tue, 23 Dec 2014 10:56:20 -0700 Subject: [PATCH 12/12] DOC: update obseration_metadata_correlation.py script docs --- ChangeLog.md | 2 +- .../paired_samples.txt | 5 - scripts/observation_metadata_correlation.py | 115 +++++++++--------- 3 files changed, 61 insertions(+), 61 deletions(-) delete mode 100644 qiime_test_data/observation_metadata_correlation/paired_samples.txt diff --git a/ChangeLog.md b/ChangeLog.md index 578f5f14c7..269f73926e 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,6 +1,6 @@ QIIME 1.8.0-dev (changes since 1.8.0 go here) ============================================= -* ``observation_metadata_correlation.py`` has been added. This script allows the calculation of correlations between feature abundances and continuous-valued metadata. This script replaces the continuous-valued correlation function that was in ``otu_category_significance.py`` in QIIME 1.7 and earlier. +* Added new ``observation_metadata_correlation.py`` script. This script allows the calculation of correlations between feature abundances and continuous-valued metadata. This script replaces the continuous-valued correlation functionality that was in ``otu_category_significance.py`` in QIIME 1.7.0 and earlier. * ``split_otu_table.py`` now allows multiple fields to be passed to split a biom table, and optionally a mapping file. Check out the new documentation for the naming conventions (which have changed slightly) and an example. 
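The help text added in PATCH 09 describes bootstrapping as shuffling one of the two sequences, recomputing the correlation, and counting how often the permuted score is at least as extreme as the observed one. The following sketch shows that idea in isolation; it uses scipy.stats.spearmanr as a stand-in correlation function and is not the assign_correlation_pval / perm_test_fn machinery used by the script.

    import numpy as np
    from scipy.stats import spearmanr

    def bootstrapped_pval(v1, v2, permutations=1000):
        # Illustrative sketch, not the QIIME implementation.
        # Observed score; spearmanr also returns a parametric p-value, ignored here.
        obs, _ = spearmanr(v1, v2)
        hits = 0
        for _ in range(permutations):
            # Shuffle one vector, recompute, and count scores at least as extreme.
            perm, _ = spearmanr(np.random.permutation(v1), v2)
            if abs(perm) >= abs(obs):
                hits += 1
        return hits / float(permutations)

With permutations=1000 the smallest attainable p-value is 1/1000, and every permutation recomputes the full correlation statistic, which is one reason the script's notes warn that bootstrapped Kendall's Tau p-values are very slow to compute.
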
diff --git a/qiime_test_data/observation_metadata_correlation/paired_samples.txt b/qiime_test_data/observation_metadata_correlation/paired_samples.txt deleted file mode 100644 index 4e40751eef..0000000000 --- a/qiime_test_data/observation_metadata_correlation/paired_samples.txt +++ /dev/null @@ -1,5 +0,0 @@ -s1 s9 -s0 s5 -s2 s3 -s4 s7 -s6 s8 \ No newline at end of file diff --git a/scripts/observation_metadata_correlation.py b/scripts/observation_metadata_correlation.py index 7fa6455496..cacf9dd428 100755 --- a/scripts/observation_metadata_correlation.py +++ b/scripts/observation_metadata_correlation.py @@ -3,7 +3,7 @@ __author__ = "Will Van Treuren" __copyright__ = "Copyright 2014, The QIIME project" __credits__ = ["Will Van Treuren", "Luke Ursell", "Catherine Lozupone", - "Jesse Stombaugh", "Doug Wendel", "Dan Knights", "Greg Caporaso", + "Jesse Stombaugh", "Doug Wendel", "Dan Knights", "Greg Caporaso", "Jai Ram Rideout"] __license__ = "GPL" __version__ = "1.8.0-dev" @@ -24,11 +24,11 @@ filtration_error_text = '''The biom table does not have enough samples after filtration to perform correlations (it has 3 or fewer). The filtration steps that have occurred are: -1. Removing samples that were not found in the mapping file. +1. Removing samples that were not found in the mapping file. 2. Removing samples that were found in the mapping file but whose metadata for the given category was not convertable to float.''' -cscore_error_text = '''The only supported metric for P-value assignment with the +cscore_error_text = '''The only supported metric for p-value assignment with the C-score is bootstrapping. For more information on the C-score, read Stone and Roberts 1990 Oecologea paper 85: 74-79.''' @@ -39,35 +39,35 @@ 'kendall': kendall, 'cscore': cscore} script_info = {} -script_info['brief_description'] = """This script calculates correlations between feature abundances and continuous-valued metadata.""" +script_info['brief_description'] = """Correlation between observation abundances and continuous-valued metadata""" script_info['script_description'] = """ This script calculates correlations between feature (aka observation) abundances (relative or absolute) and numeric metadata. Several methods are provided to allow the user to correlate features to sample metadata values including -Spearmans Rho, Pearson, Kendall's Tau, and the C or checkerboard score. -References for these methods are numerous, but good descriptions may be found in +Spearman's Rho, Pearson, Kendall's Tau, and the C or checkerboard score. +References for these methods are numerous, but good descriptions may be found in 'Biometry' by Sokal and Rolhf. A brief description of the available tests follows: -- Pearson score: The Pearson score aka Pearson's Product Moment correlation, is +- Pearson score: The Pearson score, aka Pearson's Product Moment correlation, is a scaled measure of the degree to which two sequences of numbers co-vary. For 'correlated' sequences, Pearson > 0, and for 'anticorrelated' sequences Pearson < 0 (uncorrelated implies Pearson = 0). Pearson is a paramateric and linear measure of correlation. -- Spearmans Rho: The Spearman correlation is a non-paramateric measure of +- Spearman's Rho: The Spearman correlation is a non-paramateric measure of correlation between two sequences of numbers. 
Spearman correlation is appropriate for data where the values of the observations are not necessarily - accurate, but for which their relative magnitudes are (see Biometry for more + accurate, but for which their relative magnitudes are (see Biometry for more details). -- Kendalls Tau: Kendall's Tau is an alternative method of calculating +- Kendall's Tau: Kendall's Tau is an alternative method of calculating correlation between two sequences of numbers. It is slower and less widely utilized than Spearman or Pearson scores. - Cscore: The c-score or 'checkerboard score' is a measure of covariation between two sequences that is derived from traditional ecology (Stone and - Roberts. 1990, Oecologia 85:74-79). + Roberts. 1990, Oecologia 85:74-79). Raw correlation statistics alone reflect only the degree of association between two sequences of numbers or vectors. Assigning a likelihood to these score via @@ -81,84 +81,88 @@ correlation score. The p-value is then the number of times (out of the given number of permutations) that the score of the permuted sequence pair was more extreme than the observed pair. Bootstrapping is good when the underlying - properties of the distributions are unknown. + properties of the distributions are unknown. - Parametric t distribution: the traditional method for calculating the significance of a correlation score, this method assumes that the scores are - normally distributed and computes a t statistic for each correlation score in - conjunction with the length of the sequences being correlated. + normally distributed and computes a t statistic for each correlation score in + conjunction with the length of the sequences being correlated. - Fisher Z transform: Fisher's Z transform is a way to make the distribution of - correlation scores (especially when there are many highly correlation scores) + correlation scores (especially when there are many highly correlated scores) look more normal. It is not to be confused with Fisher's transformation for the combination of p-values. -- Kendalls Tau: for the Kendalls Tau calculation, the specific Kendalls Tau - pvalue is provided. +- Kendall's Tau: for the Kendall's Tau calculation, the specific Kendall's Tau + p-value is provided. Notes: -The only supported metric for P-value assignment with the C-score is -bootstrapping. For more information on the C-score, read Stone and Roberts 1990 -Oecologea paper 85: 74-79. If you fail to pass -pval_assignment_method='bootstrapped' while you have -s cscore, the script will -error. -Assigning pvalues to Kendall's Tau scores with the bootstrapping method is -very slow. +- The only supported metric for p-value assignment with the C-score is + bootstrapping. For more information on the C-score, read Stone and Roberts + 1990 Oecologea paper 85: 74-79. If you don't pass + pval_assignment_method='bootstrapped' while you have -s cscore, the script + will raise an error. + +- Assigning p-values to Kendall's Tau scores with the bootstrapping method is + very slow. 
""" script_info['script_usage'] = [] script_info['script_usage'].append( - ("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came:", - "", + ("Example 1:", + "Calculate the correlation between OTUs in the table and the pH of the samples from whence they came:", "%prog -i otu_table.biom -m map.txt -c pH -s spearman -o spearman_otu_gradient.txt")) script_info['script_usage'].append( - ("Calculate the correlation between OTUs in the table and the pH of the samples from mich they came using bootstrapping and pearon correlation:", - "", + ("Example 2:", + "Calculate the correlation between OTUs in the table and the pH of the samples from whence they came using bootstrapping and pearson correlation:", "%prog -i otu_table.biom -m map.txt -c pH -s pearson --pval_assignment_method bootstrapped --permutations 100 -o pearson_bootstrapped.txt")) script_info['output_description']= """ -The output will be a tab delimited file with the following headers. Each row -will record the values calculated for a fiven featue: -- Feature ID: ID of the features being correlated. If these are OTUs, then they - will take the form of GreenGenes identifiers or de-novo identifiers. -- Test-Statistic: the value of the test statistic for the given test. -- P: the raw P value returned by the given test. -- FDR_P: the P value corrected by the Benjamini-Hochberg FDR procedure for +The output will be a tab-delimited file with the following headers. Each row +will record the values calculated for a given feature: + +- Feature ID: ID of the features being correlated. These are the observation IDs + in the BIOM table. +- Test stat.: the value of the test statistic for the given test. +- pval: the raw p-value returned by the given test. +- pval_fdr: the p-value corrected by the Benjamini-Hochberg FDR procedure for multiple comparisons. -- Bonferroni_P: the P value corrected by the Bonferroni procedure for multiple +- pval_bon: the p-value corrected by the Bonferroni procedure for multiple comparisons. -- Metadata - this column will be present only if the biom table contained - metadata information for your features. If these are OTUs, and taxonomy is - present in the biom table, this category will contain that taxonomy (or other - metadata). +- [metadata]: this column will be present only if the BIOM table contained + metadata information for your features. For example, if these are OTUs, and + taxonomy is present in the BIOM table, this column will contain OTU + taxonomy and will be named 'taxonomy'. + """ script_info['required_options']=[ make_option('-i','--otu_table_fp', - help='path to input biom format table', + help='path to input BIOM table', type='existing_path'), make_option('-o', '--output_fp', type='new_filepath', help='path to the output file to be created'), make_option('-m','--mapping_fp', type='existing_filepath', - help='path to category mapping file'), - make_option('-c', '--category', type='string', - help='name of the category over which to run the analysis')] + help='path to metadata mapping file'), + make_option('-c', '--category', + help='name of the category in the metadata mapping file over which to ' + 'run the analysis')] script_info['optional_options']=[ - make_option('-s', '--test', type="choice", + make_option('-s', '--test', type="choice", choices=correlation_assignment_choices, default='spearman', help='Correlation method to use. 
Choices are: %s' % (', '.join(correlation_assignment_choices)) + ' [default: %default]'), - make_option('--pval_assignment_method', type="choice", + make_option('--pval_assignment_method', type="choice", choices=pvalue_assignment_choices, - default='fisher_z_transform', help='Pvalue method to use. Choices are: ' + default='fisher_z_transform', help='p-value method to use. Choices are: ' '%s' % (', '.join(pvalue_assignment_choices)) + ' [default: %default]'), - make_option('--metadata_key', default='taxonomy', type=str, - help='Key to extract metadata from biom table. [default: %default]'), - make_option('--permutations', default=1000, type=int, - help='Number of permutations to use for bootstrapped tests.' - ' [default: %default]')] + make_option('--metadata_key', default='taxonomy', + help='Key to extract metadata from BIOM table. [default: %default]'), + make_option('--permutations', default=1000, type=int, + help='Number of permutations to use for bootstrapped p-value ' + 'calculations. [default: %default]')] script_info['version'] = __version__ @@ -167,7 +171,7 @@ def main(): if opts.test == 'cscore' and opts.pval_assignment_method != 'bootstrapped': option_parser.error(cscore_error_text) - + bt = load_table(opts.otu_table_fp) pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp) @@ -193,7 +197,7 @@ def main(): # metadata that converts to float bt.filter(ids_to_keep = samples_to_correlate, axis='sample') - # sort the biom table so that feature values are retrieved in the same + # sort the biom table so that feature values are retrieved in the same # order as the metadata in the samples they correspond to bt.sort(sort_f = lambda _: samples_to_correlate, axis='sample') @@ -227,7 +231,8 @@ def main(): o = open(opts.output_fp, 'w') o.writelines('\n'.join(lines)) + o.write('\n') o.close() if __name__ == "__main__": - main() \ No newline at end of file + main()
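
The script defaults to pval_assignment_method='fisher_z_transform'. As a point of reference, the standard Fisher z test of H0: rho = 0 transforms the correlation coefficient with arctanh and compares it against a normal distribution with standard error 1/sqrt(n - 3); the n - 3 term is the usual small-sample correction for the variance of the transformed coefficient. The sketch below is only an illustration of that formula, using SciPy's normal distribution, and is not the assign_correlation_pval implementation in qiime.stats, which may differ in detail.

    import numpy as np
    from scipy.stats import norm

    def fisher_z_pval(r, n):
        # Illustrative sketch, not qiime.stats.assign_correlation_pval.
        # Fisher's z transform: z = arctanh(r) = 0.5 * ln((1 + r) / (1 - r)).
        z = np.arctanh(r)
        # Approximate standard error of z for n paired observations.
        se = 1.0 / np.sqrt(n - 3)
        # Two-tailed p-value, assuming z / se is approximately standard normal.
        return 2 * norm.sf(abs(z) / se)

    # Example: fisher_z_pval(0.6, 25) -> ~0.0012 (two-tailed)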