From 4ac94e10821727b9a401a4bf2dcd15f2575a5bc5 Mon Sep 17 00:00:00 2001 From: Will Van Treuren Date: Thu, 14 May 2015 21:58:14 -0700 Subject: [PATCH 1/8] Fixes issue #2009 --- scripts/observation_metadata_correlation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/observation_metadata_correlation.py b/scripts/observation_metadata_correlation.py index f4670c1eb3..0ec3abd0e0 100755 --- a/scripts/observation_metadata_correlation.py +++ b/scripts/observation_metadata_correlation.py @@ -199,14 +199,14 @@ def main(): # sort the biom table so that feature values are retrieved in the same # order as the metadata in the samples they correspond to - bt.sort(sort_f = lambda _: samples_to_correlate, axis='sample') + nbt = bt.sort(sort_f = lambda _: samples_to_correlate, axis='sample') - if bt.shape[1] <= 3: + if nbt.shape[1] <= 3: option_parser.error(filtration_error_text) rhos = [] pvals = [] - for feature_vector in bt.iter_data(axis='observation'): + for feature_vector in nbt.iter_data(axis='observation'): rho = correlate(feature_vector, md_values_to_correlate, method=opts.test) pval = assign_correlation_pval(rho, len(feature_vector), @@ -225,7 +225,7 @@ def main(): fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals) bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals) - lines = correlate_output_formatter(bt, rhos, pvals, fdr_pvals, + lines = correlate_output_formatter(nbt, rhos, pvals, fdr_pvals, bon_pvals, opts.metadata_key) lines = sort_by_pval(lines, ind=2) From 4ddf91c348debd153da17e94b5c381125e11d7d0 Mon Sep 17 00:00:00 2001 From: Jai Ram Rideout Date: Fri, 22 May 2015 10:34:42 -0700 Subject: [PATCH 2/8] MAINT: remove unused qiime.beta_diversity.single_object_beta function --- qiime/beta_diversity.py | 99 ---------------------------------- tests/test_beta_diversity.py | 102 ++--------------------------------- 2 files changed, 3 insertions(+), 198 deletions(-) diff --git a/qiime/beta_diversity.py b/qiime/beta_diversity.py index c774ed1c21..413302d750 100644 --- a/qiime/beta_diversity.py +++ b/qiime/beta_diversity.py @@ -304,105 +304,6 @@ def single_file_beta(input_path, metrics, tree_path, output_dir, f.close() -def single_object_beta(otu_table, metrics, tr, rowids=None, - full_tree=False): - """mod of single_file_beta to recieve and return otu obj, tree str - - uses name in metrics to name output beta diversity files - assumes input tree is already trimmed to contain only otus present - in otu_table, doesn't call getSubTree() - inputs: - otu_table -- a otu_table in the biom format - metrics -- metrics (str, comma delimited if more than 1 metric) - tr -- a phylonode cogent tree object if needed by the chosen beta - diversity metric - rowids -- comma seperated string - """ - otumtx = asarray([v for v in otu_table.iter_sample_data()]) - - if tr: - tree = tr - else: - tree = None - - metrics_list = metrics.split(',') - - for metric in metrics_list: - try: - metric_f = get_nonphylogenetic_metric(metric) - is_phylogenetic = False - except AttributeError: - try: - metric_f = get_phylogenetic_metric(metric) - is_phylogenetic = True - if tree is None: - stderr.write("metric %s requires a tree, but none found\n" - % (metric,)) - exit(1) - except AttributeError: - stderr.write("Could not find metric %s.\n\nKnown metrics are: %s\n" - % (metric, ', '.join(list_known_metrics()))) - exit(1) - if rowids is None: - # standard, full way - if is_phylogenetic: - dissims = metric_f(otumtx, otu_table.ids(axis='observation'), - tree, otu_table.ids(), - make_subtree=(not full_tree)) - else: - dissims = metric_f(otumtx) - - return ( - format_distance_matrix( - otu_table.ids(), - dissims).split( - '\n') - ) - else: - # only calc d(rowid1, *) for each rowid - rowids_list = rowids.split(',') - row_dissims = [] # same order as rowids_list - for rowid in rowids_list: - rowidx = otu_table.index(rowid) - - # first test if we can the dissim is a fn of only the pair - # if not, just calc the whole matrix - if metric_f.__name__ == 'dist_chisq' or \ - metric_f.__name__ == 'dist_gower' or \ - metric_f.__name__ == 'dist_hellinger' or\ - metric_f.__name__ == 'binary_dist_chisq': - warnings.warn('dissimilarity ' + metric_f.__name__ + - ' is not parallelized, calculating the whole matrix...') - row_dissims.append(metric_f(otumtx)[rowidx]) - else: - try: - row_metric = get_phylogenetic_row_metric(metric) - except AttributeError: - # do element by element - dissims = [] - sample_ids = otu_table.ids() - observation_ids = otu_table.ids(axis='observation') - for i in range(len(sample_ids)): - if is_phylogenetic: - dissim = metric_f( - otumtx[[rowidx, i], :], observation_ids, - tree, [sample_ids[rowidx], sample_ids[i]], - make_subtree=(not full_tree))[0, 1] - else: - dissim = metric_f(otumtx[[rowidx, i], :])[0, 1] - dissims.append(dissim) - row_dissims.append(dissims) - else: - # do whole row at once - dissims = row_metric(otumtx, - otu_table.ids(axis='observation'), - tree, otu_table.ids(), rowid, - make_subtree=(not full_tree)) - row_dissims.append(dissims) - - return format_matrix(row_dissims, rowids_list, otu_table.ids()) - - def multiple_file_beta(input_path, output_dir, metrics, tree_path, rowids=None, full_tree=False): """ runs beta diversity for each input file in the input directory diff --git a/tests/test_beta_diversity.py b/tests/test_beta_diversity.py index dac929c19c..91927b5dd0 100644 --- a/tests/test_beta_diversity.py +++ b/tests/test_beta_diversity.py @@ -2,7 +2,8 @@ __author__ = "Justin Kuczynski" __copyright__ = "Copyright 2011, The QIIME Project" -__credits__ = ["Justin Kuczynski", "Rob Knight", "Jose Antonio Navas Molina"] +__credits__ = ["Justin Kuczynski", "Rob Knight", "Jose Antonio Navas Molina", + "Jai Ram Rideout"] __license__ = "GPL" __version__ = "1.9.0-dev" __maintainer__ = "Justin Kuczynski" @@ -27,8 +28,7 @@ from qiime.util import get_qiime_temp_dir, write_biom_table from qiime.parse import parse_newick, parse_distmat, parse_matrix from qiime.beta_diversity import BetaDiversityCalc, single_file_beta,\ - list_known_nonphylogenetic_metrics, list_known_phylogenetic_metrics,\ - single_object_beta + list_known_nonphylogenetic_metrics, list_known_phylogenetic_metrics from qiime.beta_metrics import dist_unweighted_unifrac @@ -266,102 +266,6 @@ def test_single_file_beta_missin_metric_list(self): self.single_file_beta(missing_otu_table, missing_tree, missing_sams=['M'], use_metric_list=True) - def single_object_beta(self, otu_table, metric, tree_string, - missing_sams=None): - """ running single_file_beta should give same result using --rows""" - if missing_sams is None: - missing_sams = [] - - metrics = list_known_nonphylogenetic_metrics() - metrics.extend(list_known_phylogenetic_metrics()) - - # new metrics that don't trivially parallelize must be dealt with - # carefully - warnings.filterwarnings('ignore', 'dissimilarity binary_dist_chisq is\ - not parallelized, calculating the whole matrix...') - warnings.filterwarnings('ignore', 'dissimilarity dist_chisq is not\ - parallelized, calculating the whole matrix...') - warnings.filterwarnings('ignore', 'dissimilarity dist_gower is not\ - parallelized, calculating the whole matrix...') - warnings.filterwarnings('ignore', 'dissimilarity dist_hellinger is\ - not parallelized, calculating the whole matrix...') - warnings.filterwarnings('ignore', 'unifrac had no information for\ - sample M*') - - for metric in metrics: - # do it - beta_out = single_object_beta(otu_table, metric, - tree_string, rowids=None, - full_tree=False) - - sams, dmtx = parse_distmat(beta_out) - - # do it by rows - for i in range(len(sams)): - if sams[i] in missing_sams: - continue - rows = sams[i] - # row_outname = output_dir + '/' + metric + '_' +\ - # in_fname - r_out = single_object_beta(otu_table, metric, - tree_string, rowids=rows, - full_tree=False) - col_sams, row_sams, row_dmtx = parse_matrix(r_out) - - self.assertEqual(row_dmtx.shape, (len(rows.split(',')), - len(sams))) - - # make sure rows same as full - for j in range(len(rows.split(','))): - for k in range(len(sams)): - row_v1 = row_dmtx[j, k] - full_v1 =\ - dmtx[sams.index(row_sams[j]), - sams.index(col_sams[k])] - npt.assert_almost_equal(row_v1, full_v1) - - # full tree run: - if 'full_tree' in str(metric).lower(): - continue - # do it by rows with full tree - for i in range(len(sams)): - if sams[i] in missing_sams: - continue - rows = sams[i] - - #~ row_outname = output_dir + '/ft/' + metric + '_' +\ - #~ in_fname - r_out = single_object_beta(otu_table, metric, - tree_string, rowids=None, - full_tree=True) - col_sams, row_sams, row_dmtx = parse_matrix(r_out) - - self.assertEqual(row_dmtx.shape, (len(rows.split(',')), - len(sams))) - - # make sure rows same as full - for j in range(len(rows.split(','))): - for k in range(len(sams)): - row_v1 = row_dmtx[j, k] - full_v1 =\ - dmtx[sams.index(row_sams[j]), - sams.index(col_sams[k])] - npt.assert_almost_equal(row_v1, full_v1) - - # do it with full tree - r_out = single_object_beta(otu_table, metric, - tree_string, rowids=None, - full_tree=True) - sams_ft, dmtx_ft = parse_distmat(r_out) - self.assertEqual(sams_ft, sams) - npt.assert_almost_equal(dmtx_ft, dmtx) - - def test_single_object_beta(self): - self.single_file_beta(l19_otu_table, l19_tree) - - def test_single_object_beta_missing(self): - self.single_file_beta(missing_otu_table, missing_tree, - missing_sams=['M']) l19_otu_table = """{"rows": [{"id": "tax1", "metadata": {}}, {"id": "tax2",\ "metadata": {}}, {"id": "tax3", "metadata": {}}, {"id": "tax4", "metadata":\ From 7dfe532e5baa3bd64ffaf1cbc640b8668bed4da5 Mon Sep 17 00:00:00 2001 From: Jai Ram Rideout Date: Fri, 22 May 2015 10:54:41 -0700 Subject: [PATCH 3/8] MAINT: remove unused qiime.beta_diversity.BetaDiversityCalc class --- qiime/beta_diversity.py | 68 ------------------------------------ tests/test_beta_diversity.py | 41 ++-------------------- 2 files changed, 2 insertions(+), 107 deletions(-) diff --git a/qiime/beta_diversity.py b/qiime/beta_diversity.py index 413302d750..06b958a987 100644 --- a/qiime/beta_diversity.py +++ b/qiime/beta_diversity.py @@ -122,74 +122,6 @@ def list_known_metrics(): list_known_phylogenetic_metrics() -class BetaDiversityCalc(FunctionWithParams): - - """A BetaDiversityCalc takes taxon x sample counts, returns distance mat. - - Some BetaDiversityCalcs also require a tree. - - typical usage is: - calc = BetaDiversityCalc(my_metric, 'metric name', \ - IsPhylogenetic=False) - calc(data_path=stuff, result_path=stuff2) - """ - _tracked_properties = ['IsPhylogenetic', 'Metric'] - - def __init__(self, metric, name, is_phylogenetic=False, params=None): - """Return new BetaDiversityCalc object with specified params. - - Note: metric must conform to the following interface: - must return 2d sample distance matrix with sample order preserved. - * phylogenetic methods: take samples by taxa array, taxon_names, - cogent PhyloNode tree - * nonphylo methods: take only samples by taxa array, numpy 2d - - Not sure what params does - """ - self.Metric = metric # should be f(table, tree) -> dist matrix - self.Name = name - self.IsPhylogenetic = is_phylogenetic - self.Params = params or {} - - def getResult(self, data_path, tree_path=None): - """Returns distance matrix from (indcidence matrix and optionally tree). - - Parameters: - - data_path: path to data file, matrix (samples = cols, taxa = rows) - in tab-delimited text format - - tree_path: path or object. - if method is phylogenetic, must supply tree_path. - if path, path to - Newick-format tree file where taxon ids match taxon ids in the - input data file. - - returns 2d dist matrix, list of sample names ordered as in dist mtx - """ - # if it's a phylogenetic metric, read the tree - if self.IsPhylogenetic: - tree = self.getTree(tree_path) - else: - tree = None - - otu_table = load_table(data_path) - otumtx = asarray([v for v in otu_table.iter_data(axis='sample')]) - - # get the 2d dist matrix from beta diversity analysis - if self.IsPhylogenetic: - return (self.Metric(otumtx, otu_table.ids(axis='observation'), - tree, otu_table.ids()), - list(otu_table.ids())) - else: - return self.Metric(otumtx), list(otu_table.ids()) - - def formatResult(self, result): - """Generate formatted distance matrix. result is (data, sample_names)""" - data, sample_names = result - return format_distance_matrix(sample_names, data) - - def single_file_beta(input_path, metrics, tree_path, output_dir, rowids=None, full_tree=False): """ does beta diversity calc on a single otu table diff --git a/tests/test_beta_diversity.py b/tests/test_beta_diversity.py index 91927b5dd0..397302cb80 100644 --- a/tests/test_beta_diversity.py +++ b/tests/test_beta_diversity.py @@ -27,15 +27,12 @@ from qiime.util import get_qiime_temp_dir, write_biom_table from qiime.parse import parse_newick, parse_distmat, parse_matrix -from qiime.beta_diversity import BetaDiversityCalc, single_file_beta,\ +from qiime.beta_diversity import single_file_beta,\ list_known_nonphylogenetic_metrics, list_known_phylogenetic_metrics from qiime.beta_metrics import dist_unweighted_unifrac -class BetaDiversityCalcTests(TestCase): - - """Tests of the BetaDiversityCalc class""" - +class BetaDiversityTests(TestCase): def setUp(self): self.tmp_dir = get_qiime_temp_dir() @@ -98,40 +95,6 @@ def tearDown(self): for folder in self.folders_to_remove: shutil.rmtree(folder) - def test_l19_chi(self): - """beta calc run should return same values as directly calling metric""" - beta_calc_chisq = BetaDiversityCalc(dist_chisq, 'chi square', False) - matrix, labels = beta_calc_chisq(data_path=self.l19_fp, tree_path=None) - self.assertEqual(labels, self.l19_sample_names) - npt.assert_almost_equal(matrix, dist_chisq(self.l19_data)) - - def test_l19_unifrac(self): - """beta calc run should also work for phylo metric""" - beta_calc = BetaDiversityCalc(dist_unweighted_unifrac, 'unifrac', True) - matrix, labels = beta_calc(data_path=self.l19_fp, - tree_path=self.l19_tree, result_path=None, log_path=None) - self.assertEqual(labels, self.l19_sample_names) - - def test_l19_unifrac_escaped_names(self): - """beta calc works for unifrac when tips names are escaped in newick - """ - beta_calc = BetaDiversityCalc(dist_unweighted_unifrac, 'unifrac', True) - non_escaped_result = beta_calc(data_path=self.l19_fp, - tree_path=self.l19_tree, result_path=None, log_path=None) - - l19_tree_str = "(((('tax7':0.1,'tax3':0.2):.98,tax8:.3, 'tax4':.3):.4,\ - (('ta_x1':0.3, tax6:.09):0.43,tax2:0.4):0.5):.2, (tax9:0.3, 'endbigtaxon':.08));" - - fd, tree_fp = mkstemp(prefix='Beta_div_tests', suffix='.tre') - os.close(fd) - open(tree_fp, 'w').write(l19_tree_str) - self.files_to_remove.append(tree_fp) - escaped_result = beta_calc(data_path=self.l19_w_underscore_fp, - tree_path=tree_fp, result_path=None, - log_path=None) - npt.assert_almost_equal(escaped_result[0], non_escaped_result[0]) - self.assertEqual(escaped_result[1], non_escaped_result[1]) - def single_file_beta( self, otu_table_string, tree_string, missing_sams=None, use_metric_list=False): From ac035f360ae789b82fe99ebbeddcda05645fc0f3 Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Fri, 22 May 2015 12:30:19 -0700 Subject: [PATCH 4/8] MAINT: addressed @ElDeveloper's comment --- scripts/observation_metadata_correlation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/observation_metadata_correlation.py b/scripts/observation_metadata_correlation.py index 0ec3abd0e0..eab5699128 100755 --- a/scripts/observation_metadata_correlation.py +++ b/scripts/observation_metadata_correlation.py @@ -199,14 +199,14 @@ def main(): # sort the biom table so that feature values are retrieved in the same # order as the metadata in the samples they correspond to - nbt = bt.sort(sort_f = lambda _: samples_to_correlate, axis='sample') + bt = bt.sort(sort_f = lambda _: samples_to_correlate, axis='sample') - if nbt.shape[1] <= 3: + if bt.shape[1] <= 3: option_parser.error(filtration_error_text) rhos = [] pvals = [] - for feature_vector in nbt.iter_data(axis='observation'): + for feature_vector in bt.iter_data(axis='observation'): rho = correlate(feature_vector, md_values_to_correlate, method=opts.test) pval = assign_correlation_pval(rho, len(feature_vector), @@ -225,7 +225,7 @@ def main(): fdr_pvals = where(array(fdr_pvals) > 1.0, 1.0, fdr_pvals) bon_pvals = where(array(bon_pvals) > 1.0, 1.0, bon_pvals) - lines = correlate_output_formatter(nbt, rhos, pvals, fdr_pvals, + lines = correlate_output_formatter(bt, rhos, pvals, fdr_pvals, bon_pvals, opts.metadata_key) lines = sort_by_pval(lines, ind=2) From 1db053f580fcfaf18f2265bb18f589ab05be1d36 Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Fri, 22 May 2015 12:37:25 -0700 Subject: [PATCH 5/8] REL: added description of fix --- ChangeLog.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ChangeLog.md b/ChangeLog.md index 97d6020774..0671e30067 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -12,7 +12,8 @@ Bug fixes * Forced BIOM table type to "OTU table" for all tables written with QIIME. This fixes [#1928](https://github.com/biocore/qiime/issues/1928). * The ``--similarity`` option in ``pick_otus.py`` now only accepts sequence similarity thresholds between 0.0 and 1.0 (inclusive). Previous behavior would allow values outside this range, which would cause uninformative error messages to be raised by the external tools that ``pick_otus.py`` wraps ([#1979](https://github.com/biocore/qiime/issues/1979)). * ``split_libraries_fastq.py`` now explicitly disallows ``-p 0``. This could lead to empty sequences being written to the resulting output file ([#1984](https://github.com/biocore/qiime/issues/1984)). -* Fixed issued where ``filter_samples_from_otu_table.py`` could only filter the mapping file when ``--valid_states`` was passed as the filtering method ([#2003](https://github.com/biocore/qiime/issues/2003)). +* Fixed issued where ``filter_samples_from_otu_table.py`` could only filter the mapping file when ``--valid_states`` was passed as the filtering method ([#2003](https://github.com/biocore/qiime/issues/2003)). +* Fixed serious bug in ``observation_metadata_correlation.py``, described in [#2009](https://github.com/biocore/qiime/issues/2009). **All previous output generated with ``observation_metadata_correlation.py`` was incorrect, and analyses using those results should be re-run.** This most commonly would have resulted in Type 2 error (false negatives), where observations whose abundance is correlated with metadata are not reported. Usability enhancements ---------------------- From b2a8759c16d2cfb853875d09085e10b2caa8493e Mon Sep 17 00:00:00 2001 From: Greg Caporaso Date: Fri, 22 May 2015 12:46:51 -0700 Subject: [PATCH 6/8] REL: emphasizing important bugs --- ChangeLog.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 0671e30067..46ffa8fc9d 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -4,16 +4,17 @@ QIIME 1.9.0-dev Bug fixes --------- -* Updated minimum required version of the [qiime-default-reference](http://github.com/biocore/qiime-default-reference) package to 0.1.2. This release includes an important bug fix described in more detail in [this QIIME blog post](https://qiime.wordpress.com/2015/04/15/qiime-1-9-0-bug-affecting-pynast-alignment-of-16s-amplicons-generated-with-non-515f806r-primers/) and in [biocore/qiime-default-reference#14](https://github.com/biocore/qiime-default-reference/issues/14). +* Updated minimum required version of the [qiime-default-reference](http://github.com/biocore/qiime-default-reference) package to 0.1.2. **This release includes an important bug fix described in more detail in [this QIIME blog post](https://qiime.wordpress.com/2015/04/15/qiime-1-9-0-bug-affecting-pynast-alignment-of-16s-amplicons-generated-with-non-515f806r-primers/) and in [biocore/qiime-default-reference#14](https://github.com/biocore/qiime-default-reference/issues/14).** +* Fixed bug in ``differential_abundance.py`` fitZIG algorithm ([#1960](https://github.com/biocore/qiime/pull/1960)). **This was a serious bug that was encountered when users would call ``differential_abundance.py -a metagenomeSeq_fitZIG``. Any results previosuly generated with that command should be re-run.** +* Fixed serious bug in ``observation_metadata_correlation.py``, described in [#2009](https://github.com/biocore/qiime/issues/2009). **All previous output generated with ``observation_metadata_correlation.py`` was incorrect, and analyses using those results should be re-run.** This most commonly would have resulted in massive Type 2 error (false negatives), where observations whose abundance is correlated with metadata are not reported, though Type 1 error (false positives) are also possible. * ``count_seqs.py`` no longer fails on empty files. [#1991](https://github.com/biocore/qiime/issues/1991) -* Fixed bug in ``differential_abundance.py`` fitZIG algorithm ([#1960](https://github.com/biocore/qiime/pull/1960)). This was a serious bug that was encountered when users would call ``differential_abundance.py -a metagenomeSeq_fitZIG``. Any results generated with that command in QIIME 1.9.0 should be re-run. * Updated minimum required version of [biom-format](http://github.com/biocore/biom-format) package to 2.1.4. This is a bug fix release. Details are available in the [biom-format ChangeLog](https://github.com/biocore/biom-format/blob/master/ChangeLog.md). * Updated minimum required version of [Emperor](http://github.com/biocore/emperor) package to 0.9.51. * Forced BIOM table type to "OTU table" for all tables written with QIIME. This fixes [#1928](https://github.com/biocore/qiime/issues/1928). * The ``--similarity`` option in ``pick_otus.py`` now only accepts sequence similarity thresholds between 0.0 and 1.0 (inclusive). Previous behavior would allow values outside this range, which would cause uninformative error messages to be raised by the external tools that ``pick_otus.py`` wraps ([#1979](https://github.com/biocore/qiime/issues/1979)). * ``split_libraries_fastq.py`` now explicitly disallows ``-p 0``. This could lead to empty sequences being written to the resulting output file ([#1984](https://github.com/biocore/qiime/issues/1984)). * Fixed issued where ``filter_samples_from_otu_table.py`` could only filter the mapping file when ``--valid_states`` was passed as the filtering method ([#2003](https://github.com/biocore/qiime/issues/2003)). -* Fixed serious bug in ``observation_metadata_correlation.py``, described in [#2009](https://github.com/biocore/qiime/issues/2009). **All previous output generated with ``observation_metadata_correlation.py`` was incorrect, and analyses using those results should be re-run.** This most commonly would have resulted in Type 2 error (false negatives), where observations whose abundance is correlated with metadata are not reported. + Usability enhancements ---------------------- From fe29687cc7e62fdd0900255822ce23daed1e8e3b Mon Sep 17 00:00:00 2001 From: Jai Ram Rideout Date: Fri, 22 May 2015 12:49:14 -0700 Subject: [PATCH 7/8] HACK: add convert_matching_names_to_zero to qiime.format.format_matrix Updated format_distance_matrix to call this. See the parameter's docstring for details. --- qiime/beta_diversity.py | 13 ++++--------- qiime/format.py | 30 +++++++++++++++++++++++++----- qiime/parallel/beta_diversity.py | 3 +-- tests/test_format.py | 11 +++++++++++ 4 files changed, 41 insertions(+), 16 deletions(-) diff --git a/qiime/beta_diversity.py b/qiime/beta_diversity.py index 06b958a987..4371794ef2 100644 --- a/qiime/beta_diversity.py +++ b/qiime/beta_diversity.py @@ -225,15 +225,10 @@ def single_file_beta(input_path, metrics, tree_path, output_dir, make_subtree=(not full_tree)) row_dissims.append(dissims) - # rows_outfilepath = os.path.join(output_dir, metric + '_' +\ - # '_'.join(rowids_list) + '_' + os.path.split(input_path)[1]) - f = open(outfilepath, 'w') - f.write( - format_matrix( - row_dissims, - rowids_list, - otu_table.ids())) - f.close() + with open(outfilepath, 'w') as f: + f.write(format_matrix(row_dissims, rowids_list, + otu_table.ids(), + convert_matching_names_to_zero=True)) def multiple_file_beta(input_path, output_dir, metrics, tree_path, diff --git a/qiime/format.py b/qiime/format.py index ec8fe66d2c..d326e7257f 100755 --- a/qiime/format.py +++ b/qiime/format.py @@ -87,7 +87,7 @@ def format_qiime_parameters(params, header="#QIIME parameters"): qiime_params.append(full_line) return qiime_params - + def format_add_taxa_summary_mapping(summary, tax_order, mapping, header, delimiter=';'): @@ -275,16 +275,30 @@ def format_otu_map(otu_map, otu_id_prefix): def format_distance_matrix(labels, data): """Writes distance matrix as tab-delimited text, uses format_matrix""" - return format_matrix(data, labels, labels) + return format_matrix(data, labels, labels, + convert_matching_names_to_zero=True) -def format_matrix(data, row_names, col_names): +def format_matrix(data, row_names, col_names, + convert_matching_names_to_zero=False): """Writes matrix as tab-delimited text. format is rows: samples, cols: metrics/ confidence levels, etc data: array or 2d list. + + convert_matching_names_to_zero : bool + If ``True``, each element in `data` with a matching row name and column + name will be converted to 0.0 if the original value is close to zero + (as determined by ``numpy.isclose(value, 0.0)``). This is a necessary + evil to handle distance matrices with diagonals that are close to zero. + We have to perform this conversion here instead of simply checking the + distance matrix's diagonal because parallelized beta diversity + calculations will compute subsets of the entire distance matrix and we + don't know where the true diagonal will fall. See + https://github.com/biocore/qiime/issues/1933 for details. + """ len_col = len(col_names) try: @@ -308,8 +322,14 @@ def format_matrix(data, row_names, col_names): col_names = map(str, col_names) # just in case they weren't strings initially lines.append('\t'.join([''] + col_names)) - for sam, vals in zip(row_names, data): - lines.append('\t'.join([sam] + map(str, vals))) + for row_name, values in zip(row_names, data): + line = [row_name] + for col_name, value in zip(col_names, values): + if convert_matching_names_to_zero and col_name == row_name and \ + numpy.isclose(value, 0.0): + value = 0.0 + line.append(str(value)) + lines.append('\t'.join(line)) return '\n'.join(lines) diff --git a/qiime/parallel/beta_diversity.py b/qiime/parallel/beta_diversity.py index ce21ba37d7..9d70e0b3e5 100644 --- a/qiime/parallel/beta_diversity.py +++ b/qiime/parallel/beta_diversity.py @@ -244,7 +244,6 @@ def assemble_distance_matrix(dm_components): """ assemble distance matrix components into a complete dm string """ - print "I get called." data = {} # iterate over compenents for c in dm_components: @@ -269,7 +268,7 @@ def assemble_distance_matrix(dm_components): dm = [] # construct the dm one row at a time for l1 in labels: - dm.append([data[l1][l2] for l2 in labels]) + dm.append([float(data[l1][l2]) for l2 in labels]) # create the dm string and return it dm = format_distance_matrix(labels, dm) return dm diff --git a/tests/test_format.py b/tests/test_format.py index 9f695fdd8b..d376f633de 100755 --- a/tests/test_format.py +++ b/tests/test_format.py @@ -319,6 +319,17 @@ def test_format_distance_matrix(self): '\t11\t22\t33\n11\t1\t2\t3\n22\t4\t5\t6\n33\t7\t8\t9') self.assertRaises(ValueError, format_distance_matrix, labels[:2], a) + def test_format_distance_matrix_almost_zero_diagonal(self): + # only diagonal values should be converted to 0.0 if they are close to + # zero. other values in the matrix should not be changed. + a = array([[0.00001, 1, 0.0000000000001], + [1.0, 0.0000000000001, 3], + [0.0000000000001, 3.0, 0.0]]) + res = format_distance_matrix(['foo', 'bar', 'baz'], a) + self.assertEqual(res, + '\tfoo\tbar\tbaz\nfoo\t1e-05\t1.0\t1e-13\nbar\t1.0' + '\t0.0\t3.0\nbaz\t1e-13\t3.0\t0.0') + def test_format_matrix(self): """format_matrix should return tab-delimited mat""" a = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] From 2c2110965c4a6d2811458a573c2bafd3b8eed74d Mon Sep 17 00:00:00 2001 From: Jai Ram Rideout Date: Fri, 22 May 2015 12:56:12 -0700 Subject: [PATCH 8/8] DOC: add changelog message --- ChangeLog.md | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog.md b/ChangeLog.md index 46ffa8fc9d..1f20d12609 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -14,6 +14,7 @@ Bug fixes * The ``--similarity`` option in ``pick_otus.py`` now only accepts sequence similarity thresholds between 0.0 and 1.0 (inclusive). Previous behavior would allow values outside this range, which would cause uninformative error messages to be raised by the external tools that ``pick_otus.py`` wraps ([#1979](https://github.com/biocore/qiime/issues/1979)). * ``split_libraries_fastq.py`` now explicitly disallows ``-p 0``. This could lead to empty sequences being written to the resulting output file ([#1984](https://github.com/biocore/qiime/issues/1984)). * Fixed issued where ``filter_samples_from_otu_table.py`` could only filter the mapping file when ``--valid_states`` was passed as the filtering method ([#2003](https://github.com/biocore/qiime/issues/2003)). +* Fixed bug where distance matrix files generated by QIIME (e.g., using ``beta_diversity.py``) could have diagonals with values that were close to zero in rare cases (depending on input data, machine architecture, installed dependencies, etc.). These files could not be loaded by QIIME scripts that accepted distance matrix files as input (e.g., ``principal_coordinates.py``) and would result in an error message stating that the distance matrix was not hollow. Values on the diagonal that are close to zero are now set to 0.0 ([#1933](https://github.com/biocore/qiime/issues/1933)). Usability enhancements