update readme, labels, Dockerfile
fernandomeyer committed May 17, 2023
1 parent 21c93de commit 17ee29b
Showing 9 changed files with 90 additions and 126 deletions.
1 change: 0 additions & 1 deletion Dockerfile
@@ -1,6 +1,5 @@
FROM python:3.11.3-slim

ADD image /usr/local
ADD *.py /usr/local/bin/
ADD src /usr/local/bin/src
ADD src/utils /usr/local/bin/src/utils
2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -1,2 +1,2 @@
include requirements/*
include requirements.txt
include LICENSE
73 changes: 55 additions & 18 deletions README.md
@@ -34,13 +34,31 @@ AMBER is an evaluation package for the comparative assessment of genome reconstr

AMBER 2.0.4 has been tested with Python 3.11.

AMBER 2.0.3 requires Python 3.7

See [requirements.txt](requirements.txt) for all dependencies.

## Steps
## Installation options

There are several options to install AMBER:

* [Bioconda](#bioconda)
* [Python pip](#python-pip)
* [Docker](#docker)

## Bioconda

Install and configure [Bioconda](https://bioconda.github.io/index.html) if not already installed. Then use the following command to create a Conda environment and install AMBER:

~~~BASH
conda create --name amber cami-amber
~~~

Activate the Conda environment with:

~~~BASH
conda activate amber
~~~

You can run [AMBER using Docker (see below)](#running-amberpy-using-docker) or install it as follows.
## Python pip

Install pip if not already installed (tested on Linux Ubuntu 22.04):

@@ -57,7 +75,7 @@ sudo apt update
Then run:

~~~BASH
pip3 install cami-amber
pip install cami-amber
~~~

Make sure to add AMBER to your PATH:
@@ -67,6 +85,31 @@ echo 'PATH=$PATH:${HOME}/.local/bin' >> ~/.bashrc
source ~/.bashrc
~~~

Alternatively, download or git-clone AMBER from GitHub. In AMBER's directory, install all requirements with the command:

~~~BASH
pip install -r requirements.txt
~~~

## Docker

You can pull a pre-built [AMBER Docker BioContainer](https://bioconda.github.io/recipes/cami-amber/README.html) as follows:

~~~BASH
docker pull quay.io/biocontainers/cami-amber:<tag>
~~~

See [valid values for &lt;tag&gt;](https://quay.io/repository/biocontainers/cami-amber?tab=tags).

Alternatively, download or git-clone AMBER from GitHub. In AMBER's directory, build the Docker image with the command:

~~~BASH
docker build -t amber .
~~~

See below an example of how to [run AMBER using Docker](#running-amberpy-using-docker).


# User guide

## Input
@@ -158,24 +201,18 @@ test/elated_franklin_0 \
## Running _amber.py_ using Docker
Download or git-clone AMBER from GitHub. In AMBER's directory, build the Docker image with the command:

~~~BASH
docker build -t amber:latest .
~~~

_amber.py_ can then be run with the `docker run` command. Example:
_amber.py_ can be run with the `docker run` command. Example:
~~~BASH
docker run -v /path/to/AMBER/test:/host amber:latest \
docker run -v $(pwd):/host amber \
amber.py \
-l "CONCOCT (CAMI), MaxBin 2.0.2 (CAMI)" \
-p 1 \
-r /host/unique_common.tsv \
-r /host/test/unique_common.tsv \
-k "circular element" \
-g /host/gsa_mapping.binning \
/host/goofy_hypatia_2 \
/host/naughty_carson_2 \
-g /host/test/gsa_mapping.binning \
/host/test/goofy_hypatia_2 \
/host/test/naughty_carson_2 \
-o /host/output_dir
~~~
@@ -263,7 +300,7 @@ tox
You can use all libraries that AMBER depends on by activating tox's virtual environment with the command:

~~~BASH
source <project_directory>/.tox/py37/bin/activate
source <project_directory>/.tox/py311/bin/activate
~~~

### Update GitHub page
26 changes: 0 additions & 26 deletions image/bin/evaluate.sh

This file was deleted.

1 change: 0 additions & 1 deletion image/share/Taskfile

This file was deleted.

48 changes: 0 additions & 48 deletions image/share/schema.yaml

This file was deleted.

38 changes: 18 additions & 20 deletions src/amber_html.py
@@ -153,21 +153,17 @@ def __call__(self, value, clip=None):
return np.ma.masked_array(np.interp(value, x, y))


def upper1(x):
return x[:1].upper() + x[1:]


def get_colors_and_ranges(name, all_values):
color1 = 'dodgerblue'
color2 = 'red'
hue1 = 12
hue2 = 240

metrics = [utils_labels.MISCLASSIFICATION_PER_BP, utils_labels.MISCLASSIFICATION_PER_SEQ]
if name in map(upper1, metrics + [x + utils_labels.UNFILTERED for x in metrics]):
if name in metrics + [x + utils_labels.UNFILTERED for x in metrics]:
return color2, color1, hue2, hue1, 0, 1
metrics = [utils_labels.UNIFRAC_BP, utils_labels.UNIFRAC_SEQ]
if name in map(upper1, metrics + [x + utils_labels.UNFILTERED for x in metrics]):
if name in metrics + [x + utils_labels.UNFILTERED for x in metrics]:
return color2, color1, hue2, hue1, 0, max(all_values)
return color1, color2, hue1, hue2, 0, 1

@@ -177,7 +173,7 @@ def get_heatmap_colors(pd_series, **args):

metrics = [utils_labels.AVG_PRECISION_BP_SEM, utils_labels.AVG_RECALL_BP_SEM, utils_labels.AVG_PRECISION_SEQ_SEM,
utils_labels.AVG_RECALL_SEQ_SEM, utils_labels.AVG_RECALL_BP_SEM_CAMI1, utils_labels.AVG_RECALL_SEQ_SEM_CAMI1]
if pd_series.name in map(upper1, metrics + [x + utils_labels.UNFILTERED for x in metrics]):
if pd_series.name in metrics + [x + utils_labels.UNFILTERED for x in metrics]:
return ['background-color: white' for x in values]

dropped_gs = False
@@ -435,6 +431,7 @@ def get_html_dict(metrics):
d_dict = {}
for tuple in metrics:
d_dict[tuple[0] + '<'] = '<div class="tooltip">{}<span class="tooltiptext">{}: {}</span></div><'.format(utils_labels.LABELS[tuple[0]], utils_labels.LABELS[tuple[0]], tuple[1])
d_dict[tuple[0] + utils_labels.UNFILTERED + '<'] = '<div class="tooltip">{}<span class="tooltiptext">{}: {}</span></div><'.format(utils_labels.LABELS[tuple[0]] + utils_labels.UNFILTERED, utils_labels.LABELS[tuple[0]], tuple[1])
return d_dict

d = get_html_dict(get_labels_taxonomic() if is_taxonomic else get_labels_genome())
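
The hunk above adds a second tooltip entry per metric so that the unfiltered variant of each metric also gets its own hover text. A self-contained toy version of the pattern; the key, label, and description strings are hypothetical stand-ins for the utils_labels constants:

~~~python
# Toy version of the tooltip dictionary built above; keys, labels, and the
# description text are hypothetical stand-ins for utils_labels values.
UNFILTERED = ' (unfiltered)'
LABELS = {'avg_precision_bp': 'Average purity (bp)'}

def get_html_dict(metrics):
    d_dict = {}
    tooltip = '<div class="tooltip">{}<span class="tooltiptext">{}: {}</span></div><'
    for key, description in metrics:
        # one entry for the metric itself...
        d_dict[key + '<'] = tooltip.format(LABELS[key], LABELS[key], description)
        # ...and one for its unfiltered variant
        d_dict[key + UNFILTERED + '<'] = tooltip.format(LABELS[key] + UNFILTERED, LABELS[key], description)
    return d_dict

d = get_html_dict([('avg_precision_bp', 'Average fraction of correctly assigned base pairs')])
~~~
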
@@ -521,8 +518,8 @@ def create_precision_recall_figure(df_summary, xname1, yname1, xname2, yname2, t
(xname2, '@{' + xname2 + '}'),
(yname2, '@{' + yname2 + '}')]
p = figure(title=title, width=580, height=400, x_range=(0, 1), y_range=(0, 1), toolbar_location="below")
p.xaxis.axis_label = upper1(xname1.split('(')[0])
p.yaxis.axis_label = upper1(yname1.split('(')[0])
p.xaxis.axis_label = utils_labels.LABELS[xname1]
p.yaxis.axis_label = utils_labels.LABELS[yname1]
p.xaxis.axis_label_text_font_style = 'normal'
p.yaxis.axis_label_text_font_style = 'normal'
for color, (index, row) in zip(bokeh_colors, df_summary.iterrows()):
@@ -613,7 +610,7 @@ def create_tax_figure(tool, df_summary, metrics_list, errors_list):
legend_it = []
for i, (metric, error, color) in enumerate(zip(metrics_list, errors_list, line_colors)):
pline = p.line(x='x', y=metric, line_color=color, source=source, line_width=2)
legend_it.append(LegendItem(label=metric, renderers=[pline]))
legend_it.append(LegendItem(label=utils_labels.LABELS[metric], renderers=[pline]))
if error:
band = Band(base='x', lower=metric + "lower", upper=metric + "upper", source=source, level='underlay',
fill_alpha=.3, line_width=1, line_color='black', fill_color=color)
@@ -643,18 +640,19 @@ def create_tax_figure(tool, df_summary, metrics_list, errors_list):


def create_rankings_table(df_summary, show_rank=False):
columns= [utils_labels.AVG_PRECISION_BP,
utils_labels.AVG_RECALL_BP,
utils_labels.PRECISION_PER_BP,
utils_labels.RECALL_PER_BP,
utils_labels.ARI_BY_SEQ,
utils_labels.ARI_BY_BP,
utils_labels.PERCENTAGE_ASSIGNED_BPS,
utils_labels.ACCURACY_PER_BP]
columns = [utils_labels.AVG_PRECISION_BP,
utils_labels.AVG_RECALL_BP,
utils_labels.PRECISION_PER_BP,
utils_labels.RECALL_PER_BP,
utils_labels.ARI_BY_SEQ,
utils_labels.ARI_BY_BP,
utils_labels.PERCENTAGE_ASSIGNED_BPS,
utils_labels.ACCURACY_PER_BP]
if show_rank:
columns.insert(0, utils_labels.RANK)
pd_rankings = df_summary[columns].rename(columns={utils_labels.RANK: 'Taxonomic rank'}).round(decimals=5).reset_index()
pd_rankings.columns = list(map(upper1, pd_rankings.columns))
labels_dict = utils_labels.LABELS.copy()
labels_dict[utils_labels.RANK] = 'Taxonomic rank'
pd_rankings = df_summary[columns].rename(columns=labels_dict).round(decimals=5).reset_index()

def create_table_column(field):
return TableColumn(title=field, field=field, width=100)
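
In the rankings-table hunk above, column headers now come from the central LABELS dictionary (with a per-call override for the rank column) instead of being capitalized with the removed upper1 helper. A minimal, self-contained sketch of that rename pattern, using hypothetical keys and display names:

~~~python
# Minimal sketch of the rename pattern above; keys and display names are hypothetical.
import pandas as pd

RANK = 'rank'
AVG_PRECISION_BP = 'avg_precision_bp'
LABELS = {RANK: 'rank', AVG_PRECISION_BP: 'Average purity (bp)'}

df_summary = pd.DataFrame({RANK: ['species'], AVG_PRECISION_BP: [0.91234567]})

labels_dict = LABELS.copy()
labels_dict[RANK] = 'Taxonomic rank'  # override only the rank header for this table
pd_rankings = (df_summary[[RANK, AVG_PRECISION_BP]]
               .rename(columns=labels_dict)
               .round(decimals=5)
               .reset_index())
print(list(pd_rankings.columns))  # ['index', 'Taxonomic rank', 'Average purity (bp)']
~~~
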
11 changes: 8 additions & 3 deletions src/plots.py
@@ -91,8 +91,13 @@ def plot_by_genome_coverage(pd_bins, pd_target_column, available_tools, output_d

axs.set_ylim([-0.01, 1.01])

axs.set_xticklabels(['{:,.1f}'.format(np.exp(x)) for x in axs.get_xticks()], fontsize=12)
axs.set_yticklabels(['{:3.0f}'.format(x * 100) for x in axs.get_yticks()], fontsize=12)
vals = axs.get_xticks()
axs.xaxis.set_major_locator(ticker.FixedLocator(vals))
axs.set_xticklabels(['{:,.1f}'.format(np.exp(x)) for x in vals], fontsize=12)

vals = axs.get_yticks()
axs.yaxis.set_major_locator(ticker.FixedLocator(vals))
axs.set_yticklabels(['{:3.0f}'.format(x * 100) for x in vals], fontsize=12)

axs.tick_params(axis='x', labelsize=12)
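
The change above pins the current tick positions with a FixedLocator before overriding the tick labels; newer Matplotlib versions warn (roughly, "FixedFormatter should only be used together with FixedLocator") when set_xticklabels is called on auto-located ticks. A standalone sketch of that pattern on toy data:

~~~python
# Standalone sketch of the FixedLocator pattern above, on made-up data.
import numpy as np
import matplotlib
matplotlib.use('Agg')  # headless backend for the example
import matplotlib.pyplot as plt
from matplotlib import ticker

fig, axs = plt.subplots()
axs.plot(np.log([1, 10, 100]), [0.2, 0.5, 0.9])

# Freeze the auto-chosen tick positions, then relabel them on the original scale.
vals = axs.get_xticks()
axs.xaxis.set_major_locator(ticker.FixedLocator(vals))
axs.set_xticklabels(['{:,.1f}'.format(np.exp(x)) for x in vals], fontsize=12)

fig.savefig('coverage_example.png')
~~~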

@@ -132,7 +137,7 @@ def get_pd_genomes_recall(sample_id_to_queries_list):

def plot_precision_recall_by_coverage(sample_id_to_queries_list, pd_bins_g, coverages_pd, available_tools, output_dir):
# compute average genome coverage if coverages for multiple samples were provided
coverages_pd = coverages_pd.groupby(['GENOMEID']).mean()
coverages_pd = coverages_pd.groupby(['GENOMEID']).mean(numeric_only=True)
coverages_pd.rename(columns={'GENOMEID': 'genome_id'})
coverages_pd = coverages_pd.sort_values(by=['COVERAGE'])
coverages_pd['rank'] = coverages_pd['COVERAGE'].rank()
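
Passing numeric_only=True makes the per-genome averaging skip non-numeric columns explicitly; recent pandas versions (2.x) raise an error instead of silently dropping them. A toy illustration (SAMPLEID is a hypothetical stand-in for any non-numeric column, while GENOMEID and COVERAGE appear in the diff):

~~~python
# Toy illustration of the groupby change above; SAMPLEID is a hypothetical
# non-numeric column, while GENOMEID and COVERAGE appear in the diff.
import pandas as pd

coverages_pd = pd.DataFrame({
    'GENOMEID': ['g1', 'g1', 'g2'],
    'SAMPLEID': ['s1', 's2', 's1'],   # non-numeric: mean() cannot aggregate it
    'COVERAGE': [10.0, 20.0, 5.0],
})

# In pandas 2.x, mean() without numeric_only=True raises a TypeError here;
# with numeric_only=True, COVERAGE is averaged and SAMPLEID is dropped.
mean_cov = coverages_pd.groupby(['GENOMEID']).mean(numeric_only=True)
print(mean_cov)
~~~
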
16 changes: 8 additions & 8 deletions src/utils/labels.py
@@ -210,13 +210,13 @@
def get_genome_bins_columns():
return OrderedDict([('BINID', 'Bin ID'),
('genome_id', 'Most abundant genome'),
('precision_bp', PRECISION_PER_BP),
('recall_bp', RECALL_PER_BP),
('precision_bp', LABELS[PRECISION_PER_BP]),
('recall_bp', LABELS[RECALL_PER_BP]),
('total_length', 'Bin size (bp)'),
('tp_length', 'True positives (bp)'),
('length_gs', 'True size of most abundant genome (bp)'),
('precision_seq', PRECISION_PER_SEQ),
('recall_seq', RECALL_PER_SEQ),
('precision_seq', LABELS[PRECISION_PER_SEQ]),
('recall_seq', LABELS[RECALL_PER_SEQ]),
('total_seq_counts', 'Bin size (seq)'),
('tp_seq_counts', 'True positives (seq)'),
('seq_counts_gs', 'True size of most abundant genome (seq)')])
@@ -226,13 +226,13 @@ def get_tax_bins_columns():
return OrderedDict([('TAXID', 'Taxon ID'),
('name', 'Scientific name'),
('rank', 'Taxonomic rank'),
('precision_bp', PRECISION_PER_BP),
('recall_bp', RECALL_PER_BP),
('precision_bp', LABELS[PRECISION_PER_BP]),
('recall_bp', LABELS[RECALL_PER_BP]),
('total_length', 'Bin size (bp)'),
('tp_length', 'True positives (bp)'),
('length_gs', 'True size (bp)'),
('precision_seq', PRECISION_PER_SEQ),
('recall_seq', RECALL_PER_SEQ),
('precision_seq', LABELS[PRECISION_PER_SEQ]),
('recall_seq', LABELS[RECALL_PER_SEQ]),
('total_seq_counts', 'Bin size (seq)'),
('tp_seq_counts', 'True positives (seq)'),
('seq_counts_gs', 'True size (seq)'),
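
In both column maps above, the raw metric constants are replaced by lookups into the LABELS dictionary so that table headers use human-readable names. A minimal sketch of that pattern; the constant values and display names below are illustrative, not copied from labels.py:

~~~python
# Minimal sketch of the LABELS lookup pattern above; constant values and
# display names are illustrative, not copied from labels.py.
from collections import OrderedDict

PRECISION_PER_BP = 'precision_per_bp'
RECALL_PER_BP = 'recall_per_bp'
LABELS = {PRECISION_PER_BP: 'Purity (bp)', RECALL_PER_BP: 'Completeness (bp)'}

def get_genome_bins_columns():
    return OrderedDict([('BINID', 'Bin ID'),
                        ('precision_bp', LABELS[PRECISION_PER_BP]),
                        ('recall_bp', LABELS[RECALL_PER_BP]),
                        ('total_length', 'Bin size (bp)')])

print(get_genome_bins_columns())
~~~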
