widdowquinn · baileythegreen · Oct 8, 2021 · Nov 2, 2021 · Nov 2, 2021 · Nov 2, 2021
@@ -0,0 +1,65 @@
+.. _pyani-subcmd-compare:
+
+=================
+``pyani compare``
+=================
+
+The ``compare`` subcommand compares ``pyani`` runs that may involve different methods or parameters, producing plots of the differences and summary files for each pair of runs. Runs for comparison are specified in the form of a set of runs to use as references, and a second set of runs as queries.
+
+.. code-block:: text
+
+
+    usage: pyani compare [-h] [-l LOGFILE] [-v] [--debug] [--disable_tqdm]
+                         [--version] [--citation] -o OUTDIR --ref_ids RUN_ID
+                         [RUN_ID ...] --run_ids RUN_ID [RUN_ID ...]
+                         [--dbpath DBPATH] [--formats FORMAT [FORMAT ...]]
+                         [--method [METHOD]] [--workers WORKERS]
+
+
+
+
+
+.. _SQLite3: https://www.sqlite.org/index.html
+
+-----------------
+Flagged arguments
+-----------------
+
+``--dbpath DBPATH``
+    Path to the location of the local ``pyani`` database to be used. Default: ``.pyani/pyanidb``
+
+``--debug``
+    Turn on debugging output.
+
+``--disable_tqdm``
+    Disable the ``tqdm`` progress bar while the comparison runs. This is useful when testing to avoid aesthetic problems with test output.
+
+``--formats FORMAT [FORMAT ...]``
+    Graphics output format(s); more than one can be specified. Valid options are: (pdf/png/svg/jpg). (default: png)
+
+
+``-h, --help``
+    Display usage information for ``pyani compare``.
+
+``-l LOGFILE, --logfile LOGFILE``
+    Provide the location ``LOGFILE`` to which a logfile of the comparison process will be written.
+
+``--method {seaborn,mpl,plotly}``
+    Graphics method to use for plotting. (default: seaborn)
+
+``-o OUTDIR, --outdir OUTDIR``
+    Path to a directory where comparison output files will be written.
+
+``--ref_ids RUN_ID [RUN_ID ...]``
+    Space-separated list of run_ids to use as reference(s)
+    for comparisons (default: None).
+
+``--run_ids RUN_ID [RUN_ID ...]``
+    Space-separated list of run_ids to compare to
+    reference(s) (default: None).
+
+``-v, --verbose``
+    Provide verbose output to ``STDOUT``.
+
+``--workers WORKERS``
+    Spawn WORKERS worker processes with the ``--scheduler multiprocessing`` option. Default: 0 (use all cores)
@@ -161,22 +161,32 @@ def params_mpl(dfm: pd.DataFrame) -> Dict[str, Tuple[str, Any, Any]]:
     DEPRECATED FROM v0.3 onwards
     """
     return {
-        "ANIb_alignment_lengths": ("afmhot", dfm.values.min(), dfm.values.max()),
-        "ANIb_percentage_identity": ("spbnd_BuRd", 0, 1),
-        "ANIb_alignment_coverage": ("BuRd", 0, 1),
-        "ANIb_hadamard": ("hadamard_BuRd", 0, 1),
-        "ANIb_similarity_errors": ("afmhot", dfm.values.min(), dfm.values.max()),
-        "ANIm_alignment_lengths": ("afmhot", dfm.values.min(), dfm.values.max()),
-        "ANIm_percentage_identity": ("spbnd_BuRd", 0, 1),
-        "ANIm_alignment_coverage": ("BuRd", 0, 1),
-        "ANIm_hadamard": ("hadamard_BuRd", 0, 1),
-        "ANIm_similarity_errors": ("afmhot", dfm.values.min(), dfm.values.max()),
-        "TETRA_correlations": ("spbnd_BuRd", 0, 1),
-        "ANIblastall_alignment_lengths": ("afmhot", dfm.values.min(), dfm.values.max()),
-        "ANIblastall_percentage_identity": ("spbnd_BuRd", 0, 1),
-        "ANIblastall_alignment_coverage": ("BuRd", 0, 1),
-        "ANIblastall_hadamard": ("hadamard_BuRd", 0, 1),
-        "ANIblastall_similarity_errors": ("afmhot", dfm.values.min(), dfm.values.max()),
+        "ANIb_alignment_lengths": ("afmhot", dfm.values.min(), dfm.values.max(), 0),
+        "ANIb_percentage_identity": ("spbnd_BuRd", 0, 1, 0),
+        "ANIb_alignment_coverage": ("BuRd", 0, 1, 0),
+        "ANIb_hadamard": ("hadamard_BuRd", 0, 1, 0),
+        "ANIb_similarity_errors": ("afmhot", dfm.values.min(), dfm.values.max(), 0),
+        "ANIm_alignment_lengths": ("afmhot", dfm.values.min(), dfm.values.max(), 0),
+        "ANIm_percentage_identity": ("spbnd_BuRd", 0, 1, 0),
+        "ANIm_alignment_coverage": ("BuRd", 0, 1, 0),
+        "ANIm_hadamard": ("hadamard_BuRd", 0, 1, 0),
+        "ANIm_similarity_errors": ("afmhot", dfm.values.min(), dfm.values.max(), 0),
+        "TETRA_correlations": ("spbnd_BuRd", 0, 1, 0),
+        "ANIblastall_alignment_lengths": (
+            "afmhot",
+            dfm.values.min(),
+            dfm.values.max(),
+            0,
+        ),
+        "ANIblastall_percentage_identity": ("spbnd_BuRd", 0, 1, 0),
+        "ANIblastall_alignment_coverage": ("BuRd", 0, 1, 0),
+        "ANIblastall_hadamard": ("hadamard_BuRd", 0, 1, 0),
+        "ANIblastall_similarity_errors": (
+            "afmhot",
+            dfm.values.min(),
+            dfm.values.max(),
+            0,
+        ),
     }
 
 

@@ -80,6 +80,7 @@ def __init__(
         self.cmap = plt.get_cmap(params[0])
         self.vmin = params[1]
         self.vmax = params[2]
+        self.center = params[3]
         self.labels = labels
         self.classes = classes
 

@@ -310,7 +310,8 @@ def heatmap(dfr, outfilename=None, title=None, params=None):
 
     # Layout figure grid and add title
     # Set figure size by the number of rows in the dataframe
-    figsize = max(8, dfr.shape[0] * 0.175)
+    figsize = max(8, dfr.shape[0] * 0.3)
+
     fig = plt.figure(figsize=(figsize, figsize))
     # if title:
     #     fig.suptitle(title)
@@ -406,3 +407,46 @@ def scatter(
     if outfilename:
         fig.savefig(outfilename)
     return fig
+
+
+def bland_altman(
+    dfr1, dfr2, outfilename, matname1, matname2, title=None, info=None, params=None
+):
+    """Return matplotlib Bland-Altman plot.
+
+    Each point plots the difference between corresponding cells of the two
+    dataframes (dfr1 - dfr2, y-axis) against their mean (x-axis).
+
+    :param dfr1:  pandas DataFrame with scores from the first run
+    :param dfr2:  pandas DataFrame with scores from the second run
+    :param outfilename:  path to output file (indicates output format);
+        nothing is written if this is falsy
+    :param matname1:  name of the scores; used in the title and axis labels
+    :param matname2:  name of the second dataset (currently unused)
+    :param title:  title for the plot (currently unused; the title is
+        built from matname1)
+    :param info:   information about the data in the plot (currently unused)
+    :param params:  plotting parameters (currently unused)
+
+    Returns the matplotlib figure.
+    """
+    # Make an empty dataframe to collect the input data in
+    data = pd.DataFrame()
+
+    # Add data: per-cell mean and difference of the two score matrices,
+    # flattened to one value per cell
+    # NOTE(review): assumes dfr1 and dfr2 share the same index and columns
+    data["avg"] = (dfr1 + dfr2).values.flatten() / 2
+    data["AminusB"] = (dfr1 - dfr2).values.flatten()
+
+    # Add label information, if available
+    # if params.labels:
+    #     hue = "labels"
+    #  combined['labels'] =   #  add labels to dataframe; unsure of their configuration at this point
+    # else:
+    hue = None
+
+    fig, ax = plt.subplots(figsize=(8, 8))
+    fig.suptitle(f"Bland-Altman plot for {matname1}")
+    ax.set_xlabel(f"Average of run {matname1} scores")
+    ax.set_ylabel(f"Difference between run {matname1} scores")
+
+    # pyplot-level call: draws on the current axes, i.e. the one created above
+    plt.scatter("avg", "AminusB", data=data, c=hue, s=2)
+
+    # Return figure output, and write, if required
+    plt.subplots_adjust(top=0.85)  # Leave room for title
+    fig.set_tight_layout(True)
+    if outfilename:
+        fig.savefig(outfilename)
+    return fig
@@ -42,6 +42,9 @@
 import matplotlib  # pylint: disable=C0411
 import pandas as pd
 import seaborn as sns
+import logging
+from typing import List, Dict
+from pyani import pyani_config
 
 matplotlib.use("Agg")
 import matplotlib.pyplot as plt  # noqa: E402,E501 # pylint: disable=wrong-import-position,wrong-import-order,ungrouped-imports
@@ -132,6 +135,7 @@ def get_clustermap(dfr, params, title=None, annot=True):
             cmap=params.cmap,
             vmin=params.vmin,
             vmax=params.vmax,
+            center=params.center,
             col_colors=params.colorbar,
             row_colors=params.colorbar,
             figsize=(params.figsize, params.figsize),
@@ -144,8 +148,7 @@ def get_clustermap(dfr, params, title=None, annot=True):
 
     fig.cax.yaxis.set_label_position("left")
     if title:
-        fig.cax.set_ylabel(title)
-
+        fig.ax_heatmap.set_title(title, pad=1000, fontdict={"fontsize": 75})
     # Return clustermap
     return fig
 
@@ -215,11 +218,15 @@ def distribution(dfr, outfilename, matname, title=None):
 
     # Modify axes after data is plotted
     for _ in axes:
-        if matname == "sim_errors":
+        if matname.endswith("absdiffs"):
+            _.set_xlim(0, _.get_xlim()[1])
+        elif matname.endswith("diffs"):
+            pass
+        elif matname.split("_")[0] == "sim_errors":
             _.set_xlim(0, _.get_xlim()[1])
-        elif matname in ["hadamard", "coverage"]:
+        elif matname.split("_")[0] in ["hadamard", "coverage"]:
             _.set_xlim(0, 1.01)
-        elif matname == "identity":
+        elif matname.split("_")[0] == "identity":
             _.set_xlim(0.75, 1.01)
 
     # Tidy figure
@@ -284,3 +291,52 @@ def scatter(
 
     # Return clustermap
     return fig
+
+
+def bland_altman(
+    dfr1,
+    dfr2,
+    outfilename,
+    matname1,
+    matname2,
+    run_ids,
+    title=None,
+    info=None,
+    params=None,
+):
+    """Return seaborn Bland-Altman plot.
+
+    Each point plots the difference between corresponding cells of the two
+    dataframes (dfr1 - dfr2, y-axis) against their mean (x-axis), with a
+    horizontal reference line drawn at zero difference.
+
+    :param dfr1:  pandas DataFrame with scores from the reference run
+    :param dfr2:  pandas DataFrame with scores from the query run
+    :param outfilename:  path to output file (indicates output format);
+        nothing is written if this is falsy
+    :param matname1:  name of the scores; used in the axis labels
+    :param matname2:  name of the second dataset (currently unused)
+    :param run_ids:   tuple of run_ids (ref, query)
+    :param title:  title for the plot
+    :param info:   information about the data in the plot (currently unused)
+    :param params:  plotting parameters (currently unused)
+
+    Returns the object produced by sns.lmplot (a seaborn grid, not a bare
+    matplotlib figure).
+    """
+    data = pd.DataFrame()
+    ref_id, query_id = run_ids
+    # Per-cell mean and difference of the two score matrices, flattened to
+    # one value per cell
+    # NOTE(review): assumes dfr1 and dfr2 share the same index and columns
+    data["avg"] = (dfr1 + dfr2).values.flatten() / 2
+    data["AminusB"] = (dfr1 - dfr2).values.flatten()
+
+    # fit_reg=False: plain scatter plot, no regression line
+    fig = sns.lmplot(
+        x="avg", y="AminusB", data=data, fit_reg=False, scatter_kws={"s": 2}, height=9
+    )
+
+    # Zero-difference reference line spanning the current x-axis bounds
+    fig.ax.hlines(0, fig.ax.get_xbound()[0], fig.ax.get_xbound()[1], linewidths=1)
+    # No horizontal padding; presumably so the zero line reaches both edges
+    fig.ax.margins(x=0)
+    # fig.figtext(1, .5, info)
+    fig.set(
+        xlabel=f"Average of run {matname1} scores",
+        ylabel=f"Difference between {matname1} scores (run {ref_id} - run {query_id})",
+    )
+    plt.title(title)
+
+    fig.tight_layout()
+
+    if outfilename:
+        plt.savefig(outfilename)
+
+    return fig
@@ -45,7 +45,7 @@
 import logging
 
 from pathlib import Path
-from typing import Any, Dict, List, NamedTuple, Optional, Tuple
+from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Set
 
 import numpy as np  # type: ignore
 import pandas as pd  # type: ignore
@@ -328,6 +328,13 @@ def __repr__(self) -> str:
         return "<Comparison(comparison_id={})>".format(self.comparison_id)
 
 
+# Convenience struct for the genomes used in a comparison
+# Used to contrast runs, et cetera
+class GenomePair(NamedTuple):
+    """Query/subject genome pair identifying a comparison independent of run.
+
+    Instances are hashable (NamedTuple), so they can be used as dictionary
+    keys and set members when contrasting the comparisons of two runs.
+    """
+
+    # NOTE(review): both fields are annotated as Genome, but
+    # get_genome_pair_dict() fills them from Comparison.query_id and
+    # Comparison.subject_id - presumably genome IDs rather than Genome
+    # objects; confirm and adjust the annotations if so.
+    query_id: Genome
+    subject_id: Genome
+
+
 def create_db(dbpath: Path) -> None:
     """Create an empty pyani SQLite3 database at the passed path.
 
@@ -370,6 +377,30 @@ def get_comparison_dict(session: Any) -> Dict[Tuple, Any]:
     }
 
 
+def get_genome_pair_dict(session: Any, run_id: int) -> Dict[Tuple, Any]:
+    """Return comparison IDs for a run, keyed by the genome pair compared.
+
+    :param session:  live SQLAlchemy session of pyani database
+    :param run_id:   ID of the run whose comparisons are collected
+
+    Returns comparison ids keyed by GenomePair objects (_.query_id, _.subject_id)
+    """
+    # Get comparison IDs from the runcomparison rows that match this run
+    comparisons = [
+        _.comparison_id
+        for _ in set(session.query(runcomparison).filter_by(run_id=run_id))
+    ]
+
+    # Get information from comparisons: one dictionary entry per Comparison
+    # row whose comparison_id belongs to the run
+    return {
+        GenomePair(_.query_id, _.subject_id): _.comparison_id
+        for _ in session.query(Comparison).filter(
+            Comparison.comparison_id.in_(comparisons)
+        )
+    }
+
+
 def get_matrix_labels_for_run(session: Any, run_id: int) -> Dict:
     """Return dictionary of genome labels, keyed by row/column ID.
 
@@ -496,6 +527,56 @@ def filter_existing_comparisons(
     return comparisons_to_run
 
 
+def filter_uncommon_comparisons(session, run_a_comps, run_b_comps) -> Tuple[Set, Set]:
+    """Return the comparison_ids of genome pairs that two runs have in common.
+
+    The method, parameters, version information may all differ between the
+    runs, so we ignore all of that here and only look at the genome IDs.
+    Input dictionaries are generated by `get_genome_pair_dict()`.
+
+    :param session:  live SQLAlchemy session of pyani database (currently unused)
+    :param run_a_comps:  dict containing comparison_ids keyed by GenomePairs for run_a
+    :param run_b_comps:  dict containing comparison_ids keyed by GenomePairs for run_b
+
+    Returns a Tuple of two sets: the comparison_ids of the common
+    query-subject pairs in run_a, and the corresponding comparison_ids in run_b.
+    """
+    # run_a_comps = get_genome_pair_dict(session, run_a)
+    # run_b_comps = get_genome_pair_dict(session, run_b)
+
+    # Find common query-subject pairs of genomes
+    common_genome_pairs = set(run_a_comps.keys()) & set(run_b_comps.keys())
+
+    # Each query-subject pair has a different comparison_id for each run.
+    # Look up, separately for each run, the comparison_ids that belong to
+    # the common query-subject pairs
+    run_a_common = set(run_a_comps[_] for _ in common_genome_pairs)
+    run_b_common = set(run_b_comps[_] for _ in common_genome_pairs)
+
+    return run_a_common, run_b_common
+
+
+def get_df_of_scores(session: Any, comparison_ids: List, cols: List):
+    """Extract scores for a set of comparisons from the database.
+
+    :param session:  live SQLAlchemy session of pyani database
+    :param comparison_ids:  List of comparison_ids to extract
+    :param cols:  List of columns to extract; must include query_id and
+        subject_id, because these are used as the index of the result
+
+    Returns a dataframe holding the requested columns for each comparison
+    in comparison_ids, with query_id and subject_id as indices.
+    """
+    # Query creation: select the requested columns, restricted to the
+    # given comparison_ids
+    scores = session.query(*cols).filter(Comparison.comparison_id.in_(comparison_ids))
+
+    # Read the query into pandas, using the connection bound to the session
+    scores_df = pd.read_sql(
+        scores.statement, scores.session.bind, index_col=["query_id", "subject_id"]
+    )
+
+    return scores_df
+
+
 def add_run(session, method, cmdline, date, status, name):
     """Create a new Run and add it to the session.