diff --git a/ChangeLog.md b/ChangeLog.md
index e92b269fa4..59fbbed641 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,5 +1,6 @@
 QIIME 1.8.0-dev (changes since 1.8.0 go here)
 =============================================
+* Added new ``observation_metadata_correlation.py`` script. This script allows the calculation of correlations between feature abundances and continuous-valued metadata. It replaces the continuous-valued correlation functionality that was in ``otu_category_significance.py`` in QIIME 1.7.0 and earlier.
 * ``split_otu_table.py`` now allows multiple fields to be passed to split a biom table, and optionally a mapping file. Check out the new documentation for the naming conventions (which have changed slightly) and an example.
diff --git a/qiime/otu_significance.py b/qiime/otu_significance.py
index 0cf6e37df7..be0e6c2abc 100644
--- a/qiime/otu_significance.py
+++ b/qiime/otu_significance.py
@@ -12,7 +12,7 @@
 from qiime.parse import parse_mapping_file_to_dict
 from numpy import (array, argsort, vstack, isnan, inf, nan, apply_along_axis,
-                   mean, zeros)
+                   mean, zeros, isinf, logical_or)
 from qiime.stats import (fisher_population_correlation,
                          pearson, spearman, g_fit, ANOVA_one_way,
@@ -329,61 +329,70 @@ def run_correlation_test(data_generator, test, test_choices,
     return corr_coefs, pvals
 
 
-def correlation_output_formatter(bt, corr_coefs, pvals, fdr_pvals, bon_pvals,
-                                 md_key):
-    """Format the output of the correlations for easy writing.
-
-    """
-    header = [
-        'OTU',
-        'Correlation Coef',
-        'pval',
-        'pval_fdr',
-        'pval_bon',
-        md_key]
-    num_lines = len(corr_coefs)
+def correlate_output_formatter(bt, test_stats, pvals, fdr_pvals, bon_pvals,
+                               md_key):
+    '''Produce lines for the tab-delimited output of observation_metadata_correlation.py.
+
+    Parameters
+    ----------
+    bt : biom table object
+    test_stats : array-like
+        Floats representing correlation coefficients or paired t test
+        statistics.
+    pvals : array-like
+        Floats representing pvalues for given correlation coefficients.
+    fdr_pvals : array-like
+        Floats representing FDR corrected pvals.
+    bon_pvals : array-like
+        Floats representing Bonferroni corrected pvals.
+    md_key : str or None
+        Key for extracting feature metadata from biom table.
+
+    Returns
+    -------
+    list of strs
+    '''
+    header = ['Feature ID', 'Test stat.', 'pval', 'pval_fdr', 'pval_bon',
+              md_key]
+    num_lines = len(test_stats)
     lines = ['\t'.join(header)]
     for i in range(num_lines):
-        tmp = [bt.ids(axis='observation')[i], corr_coefs[i], pvals[i], fdr_pvals[i],
-               bon_pvals[i]]
+        tmp = [bt.ids(axis='observation')[i], test_stats[i], pvals[i],
+               fdr_pvals[i], bon_pvals[i]]
         lines.append('\t'.join(map(str, tmp)))
     nls = _add_metadata(bt, md_key, lines)
     return nls
 
 
+def run_paired_t(bt, s1, s2):
+    '''Perform a paired t test between samples.
+
+    Parameters
+    ----------
+    bt : biom table object
+        Table containing data for samples in s1 and s2.
+    s1 : list
+        List of sample ids found in bt (strs).
+    s2 : list
+        List of sample ids found in bt (strs).
+
+    Returns
+    -------
+    test_stats : list
+        Floats representing the test statistic for each paired comparison.
+    pvals : list
+        Floats representing the p-values of the reported test statistics.
+    '''
+    test_stats = []
+    pvals = []
+    s1_indices = [bt.index(i, axis='sample') for i in s1]
+    s2_indices = [bt.index(i, axis='sample') for i in s2]
 
-def paired_t_generator(bt, s_before, s_after):
-    """Produce a generator to run paired t tests on each OTU."""
-    b_data = vstack([bt.sampleData(i) for i in s_before]).T
-    a_data = vstack([bt.sampleData(i) for i in s_after]).T
-    return ((b_data[i], a_data[i]) for i in range(len(bt.ids(axis='observation'))))
-
-
-def run_paired_t(data_generator):
-    """Run paired t test on data."""
-    test_stats, pvals = [], []
-    for b_data, a_data in data_generator:
-        test_stat, pval = t_paired(b_data, a_data)
+    for data in bt.iter_data(axis='observation'):
+        test_stat, pval = t_paired(data.take(s1_indices), data.take(s2_indices))
         test_stats.append(test_stat)
         pvals.append(pval)
     return test_stats, pvals
 
-
-def paired_t_output_formatter(bt, test_stats, pvals, fdr_pvals, bon_pvals,
-                              md_key):
-    """Format the output for all tests so it can be easily written."""
-    header = ['OTU', 'Test-Statistic', 'P', 'FDR_P', 'Bonferroni_P', md_key]
-    num_lines = len(pvals)
-    lines = ['\t'.join(header)]
-    for i in range(num_lines):
-        tmp = [bt.ids(axis='observation')[i], test_stats[i], pvals[i], fdr_pvals[i],
-               bon_pvals[i]]
-        lines.append('\t'.join(map(str, tmp)))
-    nls = _add_metadata(bt, md_key, lines)
-    return nls
-
-# Functions used by both scripts
-
-
 def sort_by_pval(lines, ind):
     """Sort lines with pvals in descending order.
 
@@ -396,10 +405,7 @@ def _nan_safe_sort(line):
             return val
         else:
             return inf
-    return (
-        # header+sorted lines
-        [lines[0]] + sorted(lines[1:], key=_nan_safe_sort)
-    )
+    return [lines[0]] + sorted(lines[1:], key=_nan_safe_sort)
 
 
 def _add_metadata(bt, md_key, lines):
@@ -413,3 +419,22 @@
     else:  # remove md_header from the first line
         nls = ['\t'.join(lines[0].split('\t')[:-1])] + lines[1:]
     return nls
+
+def is_computable_float(v):
+    '''Return float if v can be converted to float, excluding nan and inf.
+
+    Parameters
+    ----------
+    v : variable
+        Value to be converted to float if possible.
+
+    Returns
+    -------
+    If v can be converted to a float that is not nan or inf, return float(v).
+    Otherwise return False.
+    '''
+    tmp = float(v)
+    if not logical_or(isnan(tmp), isinf(tmp)):
+        return tmp
+    else:
+        return False
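# Illustrative sketch (not part of the patch): the intended behavior of
# is_computable_float, as documented in its docstring above and exercised by
# the tests added to tests/test_otu_significance.py below.
from numpy import nan, inf
from qiime.otu_significance import is_computable_float

assert is_computable_float('3.5679') == 3.5679  # finite values come back as floats
assert is_computable_float(nan) is False        # nan is excluded
assert is_computable_float(inf) is False        # inf is excluded
try:
    is_computable_float('SKJDKJSD')             # non-numeric values raise ValueError
except ValueError:
    pass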
diff --git a/qiime/stats.py b/qiime/stats.py
index 01d19d42bb..37b2cd1ac8 100644
--- a/qiime/stats.py
+++ b/qiime/stats.py
@@ -1629,10 +1629,13 @@ def assign_correlation_pval(corr, n, method, permutations=None,
             raise ValueError('You must specify vectors, permutation '
                              'function, and number of permutations to calc '
                              'bootstrapped pvalues. Cant continue.')
-        r = empty(permutations)
-        for i in range(permutations):
-            r[i] = perm_test_fn(v1, permutation(v2))
-        return (abs(r) >= abs(corr)).sum() / float(permutations)
+        if any([isnan(corr), isinf(corr)]):
+            return nan
+        else:
+            r = empty(permutations)
+            for i in range(permutations):
+                r[i] = perm_test_fn(v1, permutation(v2))
+            return (abs(r) >= abs(corr)).sum() / float(permutations)
     elif method == 'kendall':
         return kendall_pval(corr, n)
     else:
@@ -2425,3 +2428,32 @@ def cscore(v1, v2):
     v2_b = v2.astype(bool)
     sij = (v1_b * v2_b).sum()
     return (v1_b.sum() - sij) * (v2_b.sum() - sij)
+
+def correlate(v1, v2, method):
+    '''Correlate vectors using method.
+
+    Parameters
+    ----------
+    v1 : array-like
+        List or array of ints or floats to be correlated.
+    v2 : array-like
+        List or array of ints or floats to be correlated.
+    method : str
+        One of 'spearman', 'pearson', 'kendall', 'cscore'.
+
+    Returns
+    -------
+    rho : float
+        Correlation between the vectors.
+    '''
+    if method == 'pearson':
+        corr_fn = pearson
+    elif method == 'spearman':
+        corr_fn = spearman
+    elif method == 'kendall':
+        corr_fn = kendall
+    elif method == 'cscore':
+        corr_fn = cscore
+    else:
+        raise ValueError('Correlation function not recognized.')
+    return corr_fn(v1, v2)
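# Illustrative sketch (not part of the patch): how the new correlate dispatcher
# and assign_correlation_pval are meant to be used together, mirroring the call
# pattern in scripts/observation_metadata_correlation.py below. The toy vectors
# are made up for illustration only.
from numpy import array
from qiime.stats import correlate, assign_correlation_pval, spearman

v1 = array([1.0, 2.0, 3.0, 4.0, 5.0])   # e.g. abundances of one feature
v2 = array([2.1, 1.9, 3.5, 4.2, 6.0])   # e.g. pH of the same samples
rho = correlate(v1, v2, method='spearman')
pval = assign_correlation_pval(rho, len(v1), method='bootstrapped',
                               permutations=100, perm_test_fn=spearman,
                               v1=v1, v2=v2)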
diff --git a/qiime_test_data/observation_metadata_correlation/map.txt b/qiime_test_data/observation_metadata_correlation/map.txt
new file mode 100644
index 0000000000..72323e1438
--- /dev/null
+++ b/qiime_test_data/observation_metadata_correlation/map.txt
@@ -0,0 +1,11 @@
+#SampleID	pH
+s0	1.1
+s1	2.1
+s2	1.7
+s3	4.7
+s4	300
+s5	-12
+s6	89.242251
+s7	‘SKJDKJSD’
+s8	nan
+s9	100
\ No newline at end of file
diff --git a/qiime_test_data/observation_metadata_correlation/otu_table.biom b/qiime_test_data/observation_metadata_correlation/otu_table.biom
new file mode 100644
index 0000000000..0072684d06
Binary files /dev/null and b/qiime_test_data/observation_metadata_correlation/otu_table.biom differ
diff --git a/scripts/observation_metadata_correlation.py b/scripts/observation_metadata_correlation.py
new file mode 100755
index 0000000000..cacf9dd428
--- /dev/null
+++ b/scripts/observation_metadata_correlation.py
@@ -0,0 +1,238 @@
+#!/usr/bin/env python
+
+__author__ = "Will Van Treuren"
+__copyright__ = "Copyright 2014, The QIIME project"
+__credits__ = ["Will Van Treuren", "Luke Ursell", "Catherine Lozupone",
+               "Jesse Stombaugh", "Doug Wendel", "Dan Knights", "Greg Caporaso",
+               "Jai Ram Rideout"]
+__license__ = "GPL"
+__version__ = "1.8.0-dev"
+__maintainer__ = "Will Van Treuren"
+__email__ = "wdwvt1@gmail.com"
+
+from qiime.util import (parse_command_line_parameters, make_option,
+                        sync_biom_and_mf)
+from qiime.stats import (benjamini_hochberg_step_down, bonferroni_correction,
+                         assign_correlation_pval, correlate, pearson,
+                         spearman, kendall, cscore)
+from qiime.otu_significance import (correlate_output_formatter, sort_by_pval,
+                                    run_paired_t, is_computable_float)
+from qiime.parse import parse_mapping_file_to_dict
+from biom import load_table
+from numpy import array, where
+
+filtration_error_text = '''The biom table does not have enough samples after
+filtration to perform correlations (it has 3 or fewer). The filtration steps
+that have occurred are:
+1. Removing samples that were not found in the mapping file.
+2. Removing samples that were found in the mapping file but whose metadata for
+the given category was not convertible to float.'''
+
+cscore_error_text = '''The only supported method for p-value assignment with
+the C-score is bootstrapping. For more information on the C-score, see Stone
+and Roberts (1990), Oecologia 85:74-79.'''
+
+correlation_assignment_choices = ['spearman', 'pearson', 'kendall', 'cscore']
+pvalue_assignment_choices = ['fisher_z_transform', 'parametric_t_distribution',
+                             'bootstrapped', 'kendall']
+bootstrap_functions = {'spearman': spearman, 'pearson': pearson,
+                       'kendall': kendall, 'cscore': cscore}
+
+script_info = {}
+script_info['brief_description'] = """Correlation between observation abundances and continuous-valued metadata"""
+script_info['script_description'] = """
+This script calculates correlations between feature (aka observation)
+abundances (relative or absolute) and numeric metadata. Several methods are
+provided to allow the user to correlate features to sample metadata values,
+including Spearman's Rho, Pearson, Kendall's Tau, and the C or checkerboard
+score. References for these methods are numerous, but good descriptions may be
+found in 'Biometry' by Sokal and Rohlf. A brief description of the available
+tests follows:
+
+- Pearson score: The Pearson score, aka Pearson's Product Moment correlation,
+  is a scaled measure of the degree to which two sequences of numbers co-vary.
+  For 'correlated' sequences, Pearson > 0, and for 'anticorrelated' sequences
+  Pearson < 0 (uncorrelated implies Pearson = 0). Pearson is a parametric and
+  linear measure of correlation.
+
+- Spearman's Rho: The Spearman correlation is a non-parametric measure of
+  correlation between two sequences of numbers. Spearman correlation is
+  appropriate for data where the values of the observations are not
+  necessarily accurate, but for which their relative magnitudes are (see
+  Biometry for more details).
+
+- Kendall's Tau: Kendall's Tau is an alternative method of calculating
+  correlation between two sequences of numbers. It is slower and less widely
+  utilized than Spearman or Pearson scores.
+
+- Cscore: The c-score or 'checkerboard score' is a measure of covariation
+  between two sequences that is derived from traditional ecology (Stone and
+  Roberts, 1990, Oecologia 85:74-79).
+
+Raw correlation statistics alone reflect only the degree of association
+between two sequences of numbers or vectors. Assigning a likelihood to these
+scores via a p-value can be done with several methods, depending on the
+assumptions that are made. This script allows four methods for calculating
+p-values:
+
+- Bootstrapping: Bootstrapping is the most robust, but slowest, procedure for
+  calculating the p-value of a given correlation score. Bootstrapping takes
+  the input sequences, shuffles the order of one, and then recomputes the
+  correlation score. The p-value is the fraction of the permutations in which
+  the score of the shuffled pair was at least as extreme as the observed
+  score. Bootstrapping is good when the underlying properties of the
+  distributions are unknown.
+
+- Parametric t distribution: the traditional method for calculating the
+  significance of a correlation score, this method assumes that the scores are
+  normally distributed and computes a t statistic for each correlation score
+  in conjunction with the length of the sequences being correlated.
+
+- Fisher Z transform: Fisher's Z transform is a way to make the distribution
+  of correlation scores (especially when there are many highly correlated
+  scores) look more normal. It is not to be confused with Fisher's
+  transformation for the combination of p-values.
+
+- Kendall's Tau: for Kendall's Tau scores, the exact Kendall's Tau p-value is
+  provided.
+
+Notes:
+
+- The only supported method for p-value assignment with the C-score is
+  bootstrapping. For more information on the C-score, see Stone and Roberts
+  (1990), Oecologia 85:74-79. If -s cscore is passed without
+  --pval_assignment_method bootstrapped, the script will raise an error.
+
+- Assigning p-values to Kendall's Tau scores with the bootstrapping method is
+  very slow.
+
+"""
+
+script_info['script_usage'] = []
+script_info['script_usage'].append(
+    ("Example 1:",
+     "Calculate the correlation between OTUs in the table and the pH of the samples from which they came:",
+     "%prog -i otu_table.biom -m map.txt -c pH -s spearman -o spearman_otu_gradient.txt"))
+script_info['script_usage'].append(
+    ("Example 2:",
+     "Calculate the correlation between OTUs in the table and the pH of the samples from which they came, using bootstrapping and Pearson correlation:",
+     "%prog -i otu_table.biom -m map.txt -c pH -s pearson --pval_assignment_method bootstrapped --permutations 100 -o pearson_bootstrapped.txt"))
+
+script_info['output_description'] = """
+The output will be a tab-delimited file with the following headers. Each row
+will record the values calculated for a given feature:
+
+- Feature ID: ID of the feature being correlated. These are the observation
+  IDs in the BIOM table.
+- Test stat.: the value of the test statistic for the given test.
+- pval: the raw p-value returned by the given test.
+- pval_fdr: the p-value corrected by the Benjamini-Hochberg FDR procedure for
+  multiple comparisons.
+- pval_bon: the p-value corrected by the Bonferroni procedure for multiple
+  comparisons.
+- [metadata]: this column will be present only if the BIOM table contained
+  metadata information for your features. For example, if these are OTUs, and
+  taxonomy is present in the BIOM table, this column will contain OTU
+  taxonomy and will be named 'taxonomy'.
+
+"""
+script_info['required_options'] = [
+    make_option('-i', '--otu_table_fp',
+                help='path to input BIOM table',
+                type='existing_path'),
+    make_option('-o', '--output_fp', type='new_filepath',
+                help='path to the output file to be created'),
+    make_option('-m', '--mapping_fp', type='existing_filepath',
+                help='path to metadata mapping file'),
+    make_option('-c', '--category',
+                help='name of the category in the metadata mapping file over '
+                'which to run the analysis')]
+
+script_info['optional_options'] = [
+    make_option('-s', '--test', type="choice",
+                choices=correlation_assignment_choices,
+                default='spearman', help='Correlation method to use. Choices are: %s' %
+                (', '.join(correlation_assignment_choices)) + ' [default: %default]'),
+    make_option('--pval_assignment_method', type="choice",
+                choices=pvalue_assignment_choices,
+                default='fisher_z_transform', help='p-value method to use. Choices are: '
+                '%s' % (', '.join(pvalue_assignment_choices)) + ' [default: %default]'),
+    make_option('--metadata_key', default='taxonomy',
+                help='Key to extract metadata from BIOM table. [default: %default]'),
+    make_option('--permutations', default=1000, type=int,
+                help='Number of permutations to use for bootstrapped p-value '
+                'calculations. [default: %default]')]
+
+script_info['version'] = __version__
+
+
+def main():
+    option_parser, opts, args = parse_command_line_parameters(**script_info)
+
+    if opts.test == 'cscore' and opts.pval_assignment_method != 'bootstrapped':
+        option_parser.error(cscore_error_text)
+
+    bt = load_table(opts.otu_table_fp)
+    pmf, _ = parse_mapping_file_to_dict(opts.mapping_fp)
+
+    samples_to_correlate = []
+    md_values_to_correlate = []
+    bt_sample_ids = bt.ids(axis='sample')
+
+    for sample_id, sample_md in pmf.items():
+        if sample_id in bt_sample_ids:
+            try:
+                v = is_computable_float(sample_md[opts.category])
+                if v is not False:  # nan/inf metadata values are excluded
+                    samples_to_correlate.append(sample_id)
+                    md_values_to_correlate.append(v)
+            except KeyError:
+                option_parser.error('The category (%s)' % opts.category +
+                                    ' was not found in the mapping file.')
+            except ValueError:
+                pass  # value couldn't be converted to float, ignore this sample
+        else:
+            pass  # sample is in the mapping file but not in the biom table
+
+    # keep only samples that were found in the mapping file and whose metadata
+    # for the given category converts to a finite float
+    bt.filter(ids_to_keep=samples_to_correlate, axis='sample')
+
+    # sort the biom table so that feature values are retrieved in the same
+    # order as the metadata of the samples they correspond to
+    bt.sort(sort_f=lambda _: samples_to_correlate, axis='sample')
+
+    if bt.shape[1] <= 3:
+        option_parser.error(filtration_error_text)
+
+    rhos = []
+    pvals = []
+    for feature_vector in bt.iter_data(axis='observation'):
+        rho = correlate(feature_vector, md_values_to_correlate,
+                        method=opts.test)
+        pval = assign_correlation_pval(rho, len(feature_vector),
+                                       method=opts.pval_assignment_method,
+                                       permutations=opts.permutations,
+                                       perm_test_fn=bootstrap_functions[opts.test],
+                                       v1=feature_vector,
+                                       v2=md_values_to_correlate)
+        rhos.append(rho)
+        pvals.append(pval)
+
+    fdr_pvals = benjamini_hochberg_step_down(pvals)
+    bon_pvals = bonferroni_correction(pvals)
+    # clamp corrected p-values that exceed 1.0
+    fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals)
+    bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals)
+
+    lines = correlate_output_formatter(bt, rhos, pvals, fdr_pvals,
+                                       bon_pvals, opts.metadata_key)
+    lines = sort_by_pval(lines, ind=2)
+
+    o = open(opts.output_fp, 'w')
+    o.writelines('\n'.join(lines))
+    o.write('\n')
+    o.close()
+
+
+if __name__ == "__main__":
+    main()
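# Illustrative sketch (not part of the patch): main() above orders its output
# with sort_by_pval(lines, ind=2), i.e. by the raw p-value column; rows whose
# p-value is nan sort last because _nan_safe_sort maps them to inf. The toy
# rows below are made up for illustration only.
from qiime.otu_significance import sort_by_pval

lines = ['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon',
         'OTU1\t3.56\t0.034\t0.04\t0.2',
         'OTU2\t2.78\t0.001\t0.01\t0.006',
         'OTU3\tnan\tnan\tnan\tnan']
print(sort_by_pval(lines, ind=2))
# -> header first, then OTU2 (pval 0.001), OTU1 (0.034), and OTU3 (nan) last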
diff --git a/tests/test_otu_significance.py b/tests/test_otu_significance.py
index d09c170b3a..0a2b3b5a3c 100755
--- a/tests/test_otu_significance.py
+++ b/tests/test_otu_significance.py
@@ -17,10 +17,11 @@
                                     GROUP_TEST_CHOICES, grouped_correlation_row_generator,
                                     run_grouped_correlation, CORRELATION_TEST_CHOICES,
                                     grouped_correlation_formatter, correlation_row_generator,
-                                    run_correlation_test)
-from qiime.stats import (assign_correlation_pval, fisher,
-                         fisher_population_correlation)
-from numpy import array, hstack, corrcoef, asarray, nan
+                                    run_correlation_test, is_computable_float,
+                                    correlate_output_formatter, _add_metadata)
+from qiime.stats import (assign_correlation_pval, fisher,
+                         fisher_population_correlation)
+from numpy import array, hstack, corrcoef, asarray, nan, inf
 from numpy.random import seed
 from numpy.testing import assert_almost_equal
 from os import remove
@@ -959,9 +960,84 @@ def test_run_correlation_test(self):
                                          permutations=1000)
             assert_almost_equal(exp_bootstrapped_pvals, obs_pvals)
+
+    def test_is_computable_float(self):
+        '''Test that values are converted to finite floats when possible.'''
+        self.assertRaises(ValueError, is_computable_float, 'adkfjsdkfj')
+        assert is_computable_float(nan) == False
+        assert is_computable_float(inf) == False
+        assert is_computable_float(-inf) == False
+        assert is_computable_float("3.5679") == 3.5679
+
+    def test_correlate_output_formatter(self):
+        '''Test that output lines are correctly generated.'''
+        # use BT_IN_1
+        bt = parse_biom_table(BT_IN_1)
+        test_stats = [3.56, 2.78, nan, nan, nan, -4.78]
+        pvals = [.001, .034, nan, nan, nan, .00001]
+        fdr_pvals = [.001, .001, nan, nan, nan, .001]
+        bon_pvals = [.006, .192, nan, nan, nan, .006]
+        md_key = 'taxonomy'
+        type_of_stat = 'T'
+        exp = [['Feature ID', 'Test stat.', 'pval', 'pval_fdr', 'pval_bon',
+                md_key],
+               ['OTU1', '3.56', '0.001', '0.001', '0.006', 'k__One'],
+               ['OTU2', '2.78', '0.034', '0.001', '0.192', 'k__Two'],
+               ['OTU3', 'nan', 'nan', 'nan', 'nan', 'k__Three'],
+               ['OTU4', 'nan', 'nan', 'nan', 'nan', 'k__Four'],
+               ['OTU5', 'nan', 'nan', 'nan', 'nan', 'k__Five'],
+               ['OTU6', '-4.78', '1e-05', '0.001', '0.006', 'k__Six']]
+        exp = ['\t'.join(i) for i in exp]
+        obs = correlate_output_formatter(bt, test_stats, pvals, fdr_pvals,
+                                         bon_pvals, md_key)
+        assert obs == exp
+
+    def test_add_metadata(self):
+        '''Test metadata is added appropriately to output lines.'''
+        # test first with taxonomy
+        exp = \
+            ['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon\ttaxonomy',
+             'OTU1\t3.56\t0.001\t0.001\t0.006\tk__One',
+             'OTU2\t2.78\t0.034\t0.001\t0.192\tk__Two',
+             'OTU3\tnan\tnan\tnan\tnan\tk__Three',
+             'OTU4\tnan\tnan\tnan\tnan\tk__Four',
+             'OTU5\tnan\tnan\tnan\tnan\tk__Five',
+             'OTU6\t-4.78\t1e-05\t0.001\t0.006\tk__Six']
+        inp_lines = \
+            ['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon\ttaxonomy',
+             'OTU1\t3.56\t0.001\t0.001\t0.006',
+             'OTU2\t2.78\t0.034\t0.001\t0.192',
+             'OTU3\tnan\tnan\tnan\tnan',
+             'OTU4\tnan\tnan\tnan\tnan',
+             'OTU5\tnan\tnan\tnan\tnan',
+             'OTU6\t-4.78\t1e-05\t0.001\t0.006']
+        bt = parse_biom_table(BT_IN_1)
+        obs = _add_metadata(bt, 'taxonomy', inp_lines)
+        assert obs == exp
+        # test without taxonomy or other metadata
+        exp = \
+            ['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon',
+             'OTU1\t3.56\t0.001\t0.001\t0.006',
+             'OTU2\t2.78\t0.034\t0.001\t0.192',
+             'OTU3\tnan\tnan\tnan\tnan',
+             'OTU4\tnan\tnan\tnan\tnan',
+             'OTU5\tnan\tnan\tnan\tnan',
+             'OTU6\t-4.78\t1e-05\t0.001\t0.006']
+        inp_lines = \
+            ['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon\ttaxonomy',
+             'OTU1\t3.56\t0.001\t0.001\t0.006',
+             'OTU2\t2.78\t0.034\t0.001\t0.192',
+             'OTU3\tnan\tnan\tnan\tnan',
+             'OTU4\tnan\tnan\tnan\tnan',
+             'OTU5\tnan\tnan\tnan\tnan',
+             'OTU6\t-4.78\t1e-05\t0.001\t0.006']
+        bt = parse_biom_table(BT_IN_1_no_md)
+        obs = _add_metadata(bt, 'taxonomy', inp_lines)
+        assert obs == exp
+
+
 # globals used by certain tests.
BT_IN_1 = '{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "testCode","date": "2013-08-20T15:48:21.166180","matrix_type": "sparse","matrix_element_type": "float","shape": [6, 6],"data": [[0,0,28.0],[0,1,52.0],[0,2,51.0],[0,3,78.0],[0,4,16.0],[0,5,77.0],[1,0,25.0],[1,1,14.0],[1,2,11.0],[1,3,32.0],[1,4,48.0],[1,5,63.0],[2,0,31.0],[2,1,2.0],[2,2,15.0],[2,3,69.0],[2,4,64.0],[2,5,27.0],[3,0,36.0],[3,1,68.0],[3,2,70.0],[3,3,65.0],[3,4,33.0],[3,5,62.0],[4,0,16.0],[4,1,41.0],[4,2,59.0],[4,3,40.0],[4,4,15.0],[4,5,3.0],[5,0,32.0],[5,1,8.0],[5,2,54.0],[5,3,98.0],[5,4,29.0],[5,5,50.0]],"rows": [{"id": "OTU1", "metadata": {"taxonomy": ["k__One"]}},{"id": "OTU2", "metadata": {"taxonomy": ["k__Two"]}},{"id": "OTU3", "metadata": {"taxonomy": ["k__Three"]}},{"id": "OTU4", "metadata": {"taxonomy": ["k__Four"]}},{"id": "OTU5", "metadata": {"taxonomy": ["k__Five"]}},{"id": "OTU6", "metadata": {"taxonomy": ["k__Six"]}}],"columns": [{"id": "Sample1", "metadata": null},{"id": "Sample2", "metadata": null},{"id": "Sample3", "metadata": null},{"id": "Sample4", "metadata": null},{"id": "Sample5", "metadata": null},{"id": "Sample6", "metadata": null}]}' BT_IN_2 = '{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "testCode","date": "2013-08-20T15:48:21.166180","matrix_type": "sparse","matrix_element_type": "float","shape": [6, 6],"data": [[0,0,28.0],[0,1,28.0],[0,2,28.0],[0,3,28.0],[0,4,28.0],[0,5,28.0],[1,0,28.0],[1,1,14.0],[1,2,11.0],[1,3,32.0],[1,4,48.0],[1,5,63.0],[2,0,31.0],[2,1,2.0],[2,2,15.0],[2,3,69.0],[2,4,64.0],[2,5,27.0],[3,0,36.0],[3,1,68.0],[3,2,70.0],[3,3,65.0],[3,4,33.0],[3,5,62.0],[4,0,16.0],[4,1,41.0],[4,2,59.0],[4,3,40.0],[4,4,15.0],[4,5,3.0],[5,0,32.0],[5,1,8.0],[5,2,54.0],[5,3,98.0],[5,4,29.0],[5,5,50.0]],"rows": [{"id": "OTU1", "metadata": {"taxonomy": ["k__One"]}},{"id": "OTU2", "metadata": {"taxonomy": ["k__Two"]}},{"id": "OTU3", "metadata": {"taxonomy": ["k__Three"]}},{"id": "OTU4", "metadata": {"taxonomy": ["k__Four"]}},{"id": "OTU5", "metadata": {"taxonomy": ["k__Five"]}},{"id": "OTU6", "metadata": {"taxonomy": ["k__Six"]}}],"columns": [{"id": "Sample1", "metadata": null},{"id": "Sample2", "metadata": null},{"id": "Sample3", "metadata": null},{"id": "Sample4", "metadata": null},{"id": "Sample5", "metadata": null},{"id": "Sample6", "metadata": null}]}' +BT_IN_1_no_md = '{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "testCode","date": "2013-08-20T15:48:21.166180","matrix_type": "sparse","matrix_element_type": "float","shape": [6, 6],"data": [[0,0,28.0],[0,1,52.0],[0,2,51.0],[0,3,78.0],[0,4,16.0],[0,5,77.0],[1,0,25.0],[1,1,14.0],[1,2,11.0],[1,3,32.0],[1,4,48.0],[1,5,63.0],[2,0,31.0],[2,1,2.0],[2,2,15.0],[2,3,69.0],[2,4,64.0],[2,5,27.0],[3,0,36.0],[3,1,68.0],[3,2,70.0],[3,3,65.0],[3,4,33.0],[3,5,62.0],[4,0,16.0],[4,1,41.0],[4,2,59.0],[4,3,40.0],[4,4,15.0],[4,5,3.0],[5,0,32.0],[5,1,8.0],[5,2,54.0],[5,3,98.0],[5,4,29.0],[5,5,50.0]],"rows": [{"id": "OTU1", "metadata": null},{"id": "OTU2", "metadata": null},{"id": "OTU3", "metadata": null},{"id": "OTU4", "metadata": null},{"id": "OTU5", "metadata": null},{"id": "OTU6", "metadata": null}] ,"columns": [{"id": "Sample1", "metadata": null},{"id": "Sample2", "metadata": null},{"id": "Sample3", "metadata": null},{"id": "Sample4", "metadata": null},{"id": "Sample5", "metadata": null},{"id": 
"Sample6", "metadata": null}]}' MF_IN_1 = ['#SampleID\ttest_cat\ttest_corr', 'Sample1\tcat1\t1', @@ -974,6 +1050,9 @@ def test_run_correlation_test(self): 'NotInOtuTable2\tcat5\t8'] BT_4 = """{"id": "None","format": "Biological Observation Matrix 1.0.0","format_url": "http://biom-format.org","type": "OTU table","generated_by": "BIOM-Format 1.1.2","date": "2013-08-16T15:23:02.872397","matrix_type": "sparse","matrix_element_type": "float","shape": [6, 8],"data": [[0,0,28.0],[0,1,52.0],[0,2,51.0],[0,3,78.0],[0,4,16.0],[0,5,77.0],[0,6,73.0],[0,7,6.0],[1,0,25.0],[1,1,14.0],[1,2,11.0],[1,3,32.0],[1,4,48.0],[1,5,63.0],[1,6,27.0],[1,7,38.0],[2,0,31.0],[2,1,2.0],[2,2,15.0],[2,3,69.0],[2,4,64.0],[2,5,27.0],[2,6,64.0],[2,7,54.0],[3,0,36.0],[3,1,68.0],[3,2,70.0],[3,3,65.0],[3,4,33.0],[3,5,62.0],[3,6,60.0],[3,7,23.0],[4,0,16.0],[4,1,41.0],[4,2,59.0],[4,3,40.0],[4,4,15.0],[4,5,3.0],[4,6,35.0],[4,7,5.0],[5,0,32.0],[5,1,8.0],[5,2,54.0],[5,3,98.0],[5,4,29.0],[5,5,50.0],[5,6,93.0],[5,7,19.0]],"rows": [{"id": "OTU1", "metadata": {"taxonomy": "k__One"}},{"id": "OTU2", "metadata": {"taxonomy": "k__Two"}},{"id": "OTU3", "metadata": {"taxonomy": "k__Three"}},{"id": "OTU4", "metadata": {"taxonomy": "k__Four"}},{"id": "OTU5", "metadata": {"taxonomy": "k__Five"}},{"id": "OTU6", "metadata": {"taxonomy": "k__Six"}}],"columns": [{"id": "Sample1", "metadata": null},{"id": "Sample2", "metadata": null},{"id": "Sample3", "metadata": null},{"id": "Sample4", "metadata": null},{"id": "Sample5", "metadata": null},{"id": "Sample6", "metadata": null},{"id": "Sample7", "metadata": null},{"id": "Sample8", "metadata": null}]}""" +['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon\ttaxonomy', 'OTU1\t3.56\t0.001\t0.001\t0.006\tk__One', 'OTU2\t2.78\t0.034\t0.001\t0.192\tk__Two', 'OTU3\tnan\tnan\tnan\tnan\tk__Three', 'OTU4\tnan\tnan\tnan\tnan\tk__Four', 'OTU5\tnan\tnan\tnan\tnan\tk__Five', 'OTU6\t-4.78\t1e-05\t0.001\t0.006\tk__Six'] +['Feature ID\tTest stat.\tpval\tpval_fdr\tpval_bon', 'OTU1\t3.56\t0.001\t0.001\t0.006', 'OTU2\t2.78\t0.034\t0.001\t0.192', 'OTU3\tnan\tnan\tnan\tnan', 'OTU4\tnan\tnan\tnan\tnan', 'OTU5\tnan\tnan\tnan\tnan', 'OTU6\t-4.78\t1e-05\t0.001\t0.006'] + # run unit tests if run from command-line if __name__ == '__main__': main()