From 5a77d6dc9b7c65822edfa86eea0a6a0f413d8d27 Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Mon, 10 Nov 2025 10:46:36 -0800
Subject: [PATCH 01/18] Enable line chart in analyze

---
 lib/benchpark/cmd/analyze.py | 243 +++++++++++++++++++++++------------
 pyproject.toml               |   3 +-
 2 files changed, 160 insertions(+), 86 deletions(-)

diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py
index 582947818..6e7c7ae1b 100644
--- a/lib/benchpark/cmd/analyze.py
+++ b/lib/benchpark/cmd/analyze.py
@@ -3,21 +3,23 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import logging
 import os
 import re
-import shlex
-import shutil
+import logging
 import sys
+import shlex
 import tarfile
+import shutil
 import warnings
-from datetime import datetime
+from tqdm import tqdm
 from glob import glob
+from datetime import datetime
 
-import matplotlib as mpl
 import matplotlib.pyplot as plt
+import matplotlib as mpl
 import pandas as pd
 import thicket as th
+import seaborn
 
 # -----------------------------
 # Constants
@@ -110,10 +112,6 @@ def _validate_workspace_dir(workspace_dir):
         raise ValueError(
             f"Workspace dir '{workspace_dir}' does not exist or is not a directory"
         )
-    if ".ramble-workspace" not in os.listdir(workspace_dir):
-        raise ValueError(
-            f"Directory '{workspace_dir}' must be a valid ramble workspace (missing .ramble-workspace)"
-        )
     return os.path.abspath(workspace_dir)
 
 
@@ -169,9 +167,9 @@ def analyze_archive(analyze_dir, cali_files, output=None):
 # -----------------------------
 # Chart Generation
 # -----------------------------
-def make_stacked_line_chart(**kwargs):
+def make_chart(**kwargs):
     """
-    Generates a stacked area line chart based on Thicket DataFrame.
+    Generates a chart based on Thicket DataFrame.
 
     Args:
         df (pd.DataFrame): DataFrame to plot.
@@ -198,35 +196,73 @@ def make_stacked_line_chart(**kwargs):
 
     os.makedirs(kwargs["out_dir"], exist_ok=True)
 
-    tdf_calls = df[[(i, "Calls/rank (max)") for i in x_axis]].T.reset_index(
-        level=1, drop=True
-    )
-    calls_list = []
-    for column in tdf_calls.columns:
-        mx = max(tdf_calls[column])
-        val = int(mx) if mx > 0 else 0
-        calls_list.append((column, val))
+    # tdf_calls = df["Calls/rank (max)"].T.reset_index(
+    #     level=1, drop=True
+    # )
+    # calls_list = []
+    # for column in tdf_calls.columns:
+    #     mx = max(tdf_calls[column])
+    #     val = int(mx) if mx > 0 else 0
+    #     calls_list.append((column, val))
 
-    tdf = df[[(i, value) for i in x_axis]].T.reset_index(level=1, drop=True)
+    # tdf = df[[(i, value) for i in x_axis]].T.reset_index(level=1, drop=True)
     mpl.rcParams["axes.prop_cycle"] = mpl.cycler(color=COLOR_PALETTE)
     if kwargs.get("chart_fontsize"):
         mpl.rcParams.update({"font.size": kwargs.get("chart_fontsize")})
 
+    # tcol = tdf.columns[0]
+    # tdf["cluster"] = tdf.index.map(lambda x: x[-1]).map(mapping)
+    # tdf["profile"] = tdf.index.map(lambda x: ", ".join(str(i) for i in x[:-1]))
+    # tdf = tdf.reset_index(drop=True)
+
     xlabel = kwargs.get("chart_xlabel")
     if isinstance(xlabel, list):
         xlabel = ", ".join(NAME_REMAP[x] for x in xlabel)
     else:
         if xlabel in NAME_REMAP:
             xlabel = NAME_REMAP[xlabel]
-    fig, ax = plt.subplots()
-    tdf.plot(
-        kind="area",
-        title=kwargs.get("chart_title", ""),
-        xlabel=xlabel,
-        ylabel=y_label,
-        figsize=kwargs["chart_figsize"] if kwargs["chart_figsize"] else (12, 7),
+    fig, ax = plt.subplots(figsize=kwargs.get("chart_figsize", (12, 7)))
+    kind = kwargs.get("chart_kind", "line")
+    ax.set_title(kwargs.get("chart_title", ""))
+    ax.set_xlabel(xlabel)
+    ax.set_ylabel(y_label)
+    # plt.yscale("log", base=2)
+    plt.grid(True)
+    df = df.sort_values(by=x_axis)
+    plot_args = dict(
         ax=ax,
     )
+    if kind == "area":
+        plot_args["kind"] = "area"
+        df["xaxis"] = df.apply(lambda row: tuple(row[col] for col in x_axis), axis=1)
+    else:
+        plot_args["data"] = df
+        plot_args["x"] = "xaxis"
+        plot_args["y"] = yaxis_metric
+        df["xaxis"] = df.apply(
+            lambda row: ", ".join([str(row[col]) for col in x_axis]), axis=1
+        )
+    if kwargs["cluster"] == "multiple":
+        plot_args["hue"] = "cluster"
+    # Add marker only if line plot
+    if kind == "line":
+        plot_args["marker"] = "o"
+        seaborn.lineplot(**plot_args)
+    elif kind == "area":
+        tdf = (
+            df[[yaxis_metric, "name", "xaxis"]]
+            .reset_index(drop=True)
+            .sort_values("xaxis")
+        )
+        tdf = tdf.pivot(index="xaxis", columns="name", values=yaxis_metric)
+        tdf.plot(**plot_args)
+    elif kind == "scatter":
+        seaborn.scatterplot(**plot_args)
+    elif kind == "bar":
+        seaborn.barplot(**plot_args)
+    else:
+        raise NotImplementedError(f"Unknown plot kind {kind}")
+
     y_axis_limits = kwargs.get("chart_yaxis_limits")
     if y_axis_limits is not None:
         ax.set_ylim(y_axis_limits[0], y_axis_limits[1])
@@ -234,13 +270,13 @@ def make_stacked_line_chart(**kwargs):
     handles, labels = ax.get_legend_handles_labels()
     handles = list(reversed(handles))
     labels = list(reversed(labels))
-    calls_list = list(reversed(calls_list))
-    for i, label in enumerate(labels):
-        obj = calls_list[i][0]
-        name = obj if isinstance(obj, str) else obj[0].frame["name"]
-        if name not in label:
-            raise ValueError(f"Name '{name}' is not in label '{label}'")
-        labels[i] = str(name) + " (" + str(calls_list[i][1]) + ")"
+    # calls_list = list(reversed(calls_list))
+    # for i, label in enumerate(labels):
+    #     obj = calls_list[i][0]
+    #     name = obj if isinstance(obj, str) else obj[0].frame["name"]
+    #     if name not in label:
+    #         raise ValueError(f"Name '{name}' is not in label '{label}'")
+    #     labels[i] = str(name) + " (" + str(calls_list[i][1]) + ")"
     ax.legend(
         handles,
         labels,
@@ -248,13 +284,14 @@ def make_stacked_line_chart(**kwargs):
         loc="center left",
         title="Region (Calls/rank (max))",
     )
+    ax.set_xlabel(xlabel)
 
     fig.autofmt_xdate()
     plt.tight_layout()
 
     filename = os.path.join(kwargs["out_dir"], kwargs["chart_file_name"])
     logger.info(f"Saving figure data points to {filename}.csv")
-    tdf.to_csv(filename + ".csv")
+    df.to_csv(filename + ".csv")
     logger.info(f"Saving figure to {filename}.png")
     plt.savefig(filename + ".png")
     logger.info(
@@ -279,7 +316,13 @@ def prepare_data(**kwargs):
     tk = th.Thicket.from_caliperreader(
         files, intersection=intersection, disable_tqdm=True
     )
-    tk.update_inclusive_columns()
+    if kwargs["yaxis_metric"] in tk.inc_metrics and not kwargs["no_update_inc_cols"]:
+        pbar = tqdm(total=1, desc="Updating inclusive columns")
+        tk.update_inclusive_columns()
+        pbar.update(1)
+        pbar.close()
+
+    # cluster_to_ps = dict(zip(tk.metadata["cluster"], tk.metadata["total_problem_size"]))
 
     clean_tree = tk.tree(kwargs["tree_metric"], render_header=True)
     clean_tree = re.compile(r"\x1b\[([0-9;]*m)").sub("", clean_tree)
@@ -300,6 +343,15 @@ def prepare_data(**kwargs):
 
     # Remove singular roots if inclusive metric
     metric = kwargs["yaxis_metric"]
+
+    tk.dataframe["Bandwidth (GB/s)"] = (
+        tk.dataframe["Bytes/Rep"]
+        / tk.dataframe["Avg time/rank (exc)"]
+        / 10**9
+        * tk.dataframe["Reps"]
+        * tk.metadata["mpi.world.size"]
+    )
+
     if metric in tk.inc_metrics and len(tk.graph.roots) == 1:
         root_name = tk.graph.roots[0].frame["name"]
         logger.info(
@@ -366,15 +418,22 @@ def prepare_data(**kwargs):
         tk.dataframe = pd.concat([tk.dataframe.filter(like=p, axis=0) for p in prefix])
 
     # Group by varied parameters
-    grouped = tk.groupby(x_axis_metadata)
-    ctk = th.Thicket.concat_thickets(
-        list(grouped.values()), headers=list(grouped.keys()), axis="columns"
-    )
+    # grouped = tk.groupby(x_axis_metadata)
+    # print(grouped.keys())
+    # ctk = th.Thicket.concat_thickets(
+    #     list(grouped.values()), headers=list(grouped.keys()), axis="index"
+    # )
+
+    tk.metadata_columns_to_perfdata(["cluster"] + list(NAME_REMAP.keys()))
 
     cluster_col = "cluster" if "cluster" in tk.metadata.columns else "host.cluster"
     # Check these values are constant
     app = validate_single_metadata_value("application_name", tk)
-    cluster = validate_single_metadata_value(cluster_col, tk)
+    try:
+        cluster = validate_single_metadata_value(cluster_col, tk)
+    except ValueError:
+        print("Multiple clusters detected. Using multi-cluster mode.")
+        cluster = "multiple"
     version = validate_single_metadata_value("version", tk)
 
     # Find programming model from spec
@@ -389,12 +448,18 @@ def prepare_data(**kwargs):
         "weak": ["process_problem_size"],
         "throughput": ["n_resources", "n_nodes"],
     }[scaling]
-    constant_str = ", ".join(
-        f"{int(tk.metadata[key].iloc[0]):,} {NAME_REMAP[key]}" for key in constant_keys
+    constant_str = (
+        ", ".join(
+            f"{int(tk.metadata[key].iloc[0]):,} {NAME_REMAP[key]}"
+            for key in constant_keys
+        )
+        if cluster != "multiple"
+        else ""
     )
     # Check constant
-    for key in constant_keys:
-        validate_single_metadata_value(key, tk)
+    if cluster != "multiple":
+        for key in constant_keys:
+            validate_single_metadata_value(key, tk)
 
     if not kwargs.get("chart_title"):
         kwargs["chart_title"] = (
@@ -414,36 +479,27 @@ def prepare_data(**kwargs):
         f.write(clean_tree)
     logger.info(f"Saving Input Calltree to {tree_file}")
 
-    for key in grouped.keys():
-        ctk.dataframe[(key, "perc")] = (
-            ctk.dataframe[(key, metric)] / ctk.dataframe[(key, metric)].sum()
-        ) * 100
-
-    top_n = kwargs.get("top_n_regions", -1)
-    if top_n != -1:
-        num_nodes = len(ctk.graph)
-        if num_nodes < kwargs.get("top_n_regions", -1):
-            raise ValueError(
-                f"Value for '--top-n-regions' must be less than number of regions ({num_nodes})"
-            )
-        temp_df_idx = ctk.dataframe.nlargest(
-            top_n, [(list(grouped.keys())[0], metric)]
-        ).index
-        temp_df = ctk.dataframe[ctk.dataframe.index.isin(temp_df_idx)]
-        temp_df.loc["Sum(removed_regions)"] = 0
-        for p in ctk.profile:
-            diff = (
-                ctk.dataframe.loc[:, (p[1], metric)].sum()
-                - temp_df.loc[:, (p[1], metric)].sum()
-            )
-            if isinstance(diff, pd.Series):
-                assert len(diff) == 1
-                diff = diff.iloc[0]
-            temp_df.loc["Sum(removed_regions)", (p[1], metric)] = diff
-        ctk.dataframe = temp_df
-        logger.info(
-            f"Filtered top {top_n} regions for chart display. Added the sum of the regions that were removed as single region."
-        )
+    # for key in grouped.keys():
+    #     tk.dataframe["perc"] = tk.dataframe[tk.dataframe[g] == ]
+    #     ctk.dataframe[(key, "perc")] = (
+    #         ctk.dataframe[(key, metric)] / ctk.dataframe[(key, metric)].sum()
+    #     ) * 100
+
+    # top_n = kwargs.get("top_n_regions", -1)
+    # if top_n != -1:
+    #     temp_df_idx = tk.dataframe.nlargest(
+    #         top_n, metric).index
+    #     temp_df = tk.dataframe[tk.dataframe.index.isin(temp_df_idx)]
+    #     temp_df.loc["Sum(removed_regions)"] = 0
+    #     for p in tk.profile:
+    #         temp_df.loc["Sum(removed_regions)", metric] = (
+    #             tk.dataframe.loc[:, metric].sum()
+    #             - temp_df.loc[:, metric].sum()
+    #         )
+    #     tk.dataframe = temp_df
+    #     logger.info(
+    #         f"Filtered top {top_n} regions for chart display. Added the sum of the regions that were removed as single region."
+    #     )
 
     if not kwargs.get("chart_xlabel"):
         kwargs["chart_xlabel"] = x_axis_metadata
@@ -456,8 +512,10 @@ def prepare_data(**kwargs):
             raise ValueError(
                 f"Expected one scaling factor, found: {list(scaling_factors)}"
             )
+    # kwargs["cluster_to_ps"] = cluster_to_ps
+    kwargs["cluster"] = cluster
 
-    make_stacked_line_chart(df=ctk.dataframe, x_axis=list(grouped.keys()), **kwargs)
+    make_chart(df=tk.dataframe, x_axis=x_axis_metadata, **kwargs)
 
 
 def setup_parser(root_parser):
@@ -469,7 +527,7 @@ def setup_parser(root_parser):
         "--workspace-dir",
         required=True,
         type=str,
-        help="Directory of ramble workspace.",
+        help="Directory of Caliper files. Files will be found recursively.",
         metavar="RAMBLE_WORKSPACE_DIR",
     )
     root_parser.add_argument(
@@ -482,7 +540,10 @@ def setup_parser(root_parser):
     root_parser.add_argument(
         "--chart-type",
         default="raw",
-        choices=["raw", "percentage"],
+        choices=[
+            "raw",
+            # "percentage"
+        ],
         type=str,
         help="Specify processing on the metric. 'raw' does nothing, 'percentage' shows the metric values as a percentage relative to the total summation of all regions.",
     )
@@ -516,13 +577,13 @@ def setup_parser(root_parser):
         help="Query for one or more regions REGION. Includes children of region.",
         metavar="REGION",
     )
-    root_parser.add_argument(
-        "--top-n-regions",
-        default=-1,
-        type=int,
-        help="Filters only top N largest metric entries to be included in chart (based on the first profile).",
-        metavar="N",
-    )
+    # root_parser.add_argument(
+    #     "--top-n-regions",
+    #     default=-1,
+    #     type=int,
+    #     help="Filters only top N largest metric entries to be included in chart (based on the first profile).",
+    #     metavar="N",
+    # )
     root_parser.add_argument(
         "--group-regions-name",
         action="store_true",
@@ -596,6 +657,18 @@ def setup_parser(root_parser):
         default=None,
         help="With 'archive', path for the .tar.gz (defaults to CWD/<workspace>-<timestamp>.tar.gz)",
     )
+    root_parser.add_argument(
+        "--chart-kind",
+        type=str,
+        default="area",
+        choices=["area", "line", "bar", "scatter"],
+        help="Type of chart to generate",
+    )
+    root_parser.add_argument(
+        "--no-update-inc-cols",
+        action="store_true",
+        help="Don't call Thicket.update_inclusive_columns() which can take a while.",
+    )
 
 
 def command(args):
diff --git a/pyproject.toml b/pyproject.toml
index 6091dfd14..cb0a9becf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -73,6 +73,7 @@ explicit_start = false
 
 [project.optional-dependencies]
 analyze = [
-  "llnl-thicket[plotting]",
+  "llnl-hatchet==2024.1.3",
+  "llnl-thicket[plotting]==2025.1.0",
   "matplotlib"
 ]
\ No newline at end of file

From 8155d49cd590cad8f2b25d3c5527e7a813e6537a Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Mon, 8 Dec 2025 14:33:11 -0800
Subject: [PATCH 02/18] isort

---
 lib/benchpark/cmd/analyze.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py
index 6e7c7ae1b..a3e7e4ca2 100644
--- a/lib/benchpark/cmd/analyze.py
+++ b/lib/benchpark/cmd/analyze.py
@@ -3,23 +3,23 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import logging
 import os
 import re
-import logging
-import sys
 import shlex
-import tarfile
 import shutil
+import sys
+import tarfile
 import warnings
-from tqdm import tqdm
-from glob import glob
 from datetime import datetime
+from glob import glob
 
-import matplotlib.pyplot as plt
 import matplotlib as mpl
+import matplotlib.pyplot as plt
 import pandas as pd
-import thicket as th
 import seaborn
+import thicket as th
+from tqdm import tqdm
 
 # -----------------------------
 # Constants

From 0c7703cb270b98fed1503ae18e514310fdb84a15 Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Mon, 8 Dec 2025 14:56:57 -0800
Subject: [PATCH 03/18] Fix legend title and prevent multi-cluster area chart

---
 lib/benchpark/cmd/analyze.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py
index a3e7e4ca2..951e8b0c5 100644
--- a/lib/benchpark/cmd/analyze.py
+++ b/lib/benchpark/cmd/analyze.py
@@ -277,12 +277,13 @@ def make_chart(**kwargs):
     #     if name not in label:
     #         raise ValueError(f"Name '{name}' is not in label '{label}'")
     #     labels[i] = str(name) + " (" + str(calls_list[i][1]) + ")"
+    title = "Region (Calls/rank (max))" if kwargs["cluster"] != "multiple" else "Cluster"
     ax.legend(
         handles,
         labels,
         bbox_to_anchor=(1, 0.5),
         loc="center left",
-        title="Region (Calls/rank (max))",
+        title=title,
     )
     ax.set_xlabel(xlabel)
 
@@ -434,6 +435,8 @@ def prepare_data(**kwargs):
     except ValueError:
         print("Multiple clusters detected. Using multi-cluster mode.")
         cluster = "multiple"
+        if kwargs.get("chart_kind") == "area":
+            raise ValueError("Data from multiple workspaces (clusters) not allowed for 'area' chart type.")
     version = validate_single_metadata_value("version", tk)
 
     # Find programming model from spec

From f11a66c33cbdf345395a3966d1e3b27539ad2fff Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Mon, 8 Dec 2025 15:24:22 -0800
Subject: [PATCH 04/18] Fix percentage option

---
 lib/benchpark/cmd/analyze.py | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py
index 951e8b0c5..31ef0e3e2 100644
--- a/lib/benchpark/cmd/analyze.py
+++ b/lib/benchpark/cmd/analyze.py
@@ -189,10 +189,12 @@ def make_chart(**kwargs):
     x_axis = kwargs.get("x_axis")
     yaxis_metric = kwargs.get("yaxis_metric")
 
-    value = "perc" if chart_type == "percentage" else yaxis_metric
     y_label = kwargs.get("chart_ylabel") or (
         f"Percentage of {yaxis_metric}" if chart_type == "percentage" else yaxis_metric
     )
+    yaxis_metric = (
+        yaxis_metric + "-perc" if chart_type == "percentage" else yaxis_metric
+    )
 
     os.makedirs(kwargs["out_dir"], exist_ok=True)
 
@@ -277,7 +279,9 @@ def make_chart(**kwargs):
     #     if name not in label:
     #         raise ValueError(f"Name '{name}' is not in label '{label}'")
     #     labels[i] = str(name) + " (" + str(calls_list[i][1]) + ")"
-    title = "Region (Calls/rank (max))" if kwargs["cluster"] != "multiple" else "Cluster"
+    title = (
+        "Region (Calls/rank (max))" if kwargs["cluster"] != "multiple" else "Cluster"
+    )
     ax.legend(
         handles,
         labels,
@@ -436,7 +440,9 @@ def prepare_data(**kwargs):
         print("Multiple clusters detected. Using multi-cluster mode.")
         cluster = "multiple"
         if kwargs.get("chart_kind") == "area":
-            raise ValueError("Data from multiple workspaces (clusters) not allowed for 'area' chart type.")
+            raise ValueError(
+                "Data from multiple workspaces (clusters) not allowed for 'area' chart type."
+            )
     version = validate_single_metadata_value("version", tk)
 
     # Find programming model from spec
@@ -482,11 +488,15 @@ def prepare_data(**kwargs):
         f.write(clean_tree)
     logger.info(f"Saving Input Calltree to {tree_file}")
 
-    # for key in grouped.keys():
-    #     tk.dataframe["perc"] = tk.dataframe[tk.dataframe[g] == ]
-    #     ctk.dataframe[(key, "perc")] = (
-    #         ctk.dataframe[(key, metric)] / ctk.dataframe[(key, metric)].sum()
-    #     ) * 100
+    # Compute percentage
+    if kwargs.get("chart_type") == "percentage":
+        tk.dataframe[metric + "-perc"] = 0
+        for profile in tk.profile:
+            tk.dataframe.loc[(slice(None), profile), metric + "-perc"] = (
+                tk.dataframe.loc[(slice(None), profile), metric]
+                * 100
+                / tk.dataframe.loc[(slice(None), profile), metric].sum()
+            )
 
     # top_n = kwargs.get("top_n_regions", -1)
     # if top_n != -1:
@@ -543,10 +553,7 @@ def setup_parser(root_parser):
     root_parser.add_argument(
         "--chart-type",
         default="raw",
-        choices=[
-            "raw",
-            # "percentage"
-        ],
+        choices=["raw", "percentage"],
         type=str,
         help="Specify processing on the metric. 'raw' does nothing, 'percentage' shows the metric values as a percentage relative to the total summation of all regions.",
     )

From f5f81f462fb9a051251acab9d2d5a3c729409aa4 Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Mon, 8 Dec 2025 15:55:48 -0800
Subject: [PATCH 05/18] fix calls/rank in legend

---
 lib/benchpark/cmd/analyze.py | 39 ++++++++----------------------------
 1 file changed, 8 insertions(+), 31 deletions(-)

diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py
index 31ef0e3e2..4ccf26aca 100644
--- a/lib/benchpark/cmd/analyze.py
+++ b/lib/benchpark/cmd/analyze.py
@@ -198,25 +198,15 @@ def make_chart(**kwargs):
 
     os.makedirs(kwargs["out_dir"], exist_ok=True)
 
-    # tdf_calls = df["Calls/rank (max)"].T.reset_index(
-    #     level=1, drop=True
-    # )
-    # calls_list = []
-    # for column in tdf_calls.columns:
-    #     mx = max(tdf_calls[column])
-    #     val = int(mx) if mx > 0 else 0
-    #     calls_list.append((column, val))
+    # Calls/rank in legend
+    calls_list = []
+    for node in set(df.index.get_level_values("node")):
+        calls_list.append(df.loc[node, "Calls/rank (max)"].max())
 
-    # tdf = df[[(i, value) for i in x_axis]].T.reset_index(level=1, drop=True)
     mpl.rcParams["axes.prop_cycle"] = mpl.cycler(color=COLOR_PALETTE)
     if kwargs.get("chart_fontsize"):
         mpl.rcParams.update({"font.size": kwargs.get("chart_fontsize")})
 
-    # tcol = tdf.columns[0]
-    # tdf["cluster"] = tdf.index.map(lambda x: x[-1]).map(mapping)
-    # tdf["profile"] = tdf.index.map(lambda x: ", ".join(str(i) for i in x[:-1]))
-    # tdf = tdf.reset_index(drop=True)
-
     xlabel = kwargs.get("chart_xlabel")
     if isinstance(xlabel, list):
         xlabel = ", ".join(NAME_REMAP[x] for x in xlabel)
@@ -228,7 +218,6 @@ def make_chart(**kwargs):
     ax.set_title(kwargs.get("chart_title", ""))
     ax.set_xlabel(xlabel)
     ax.set_ylabel(y_label)
-    # plt.yscale("log", base=2)
     plt.grid(True)
     df = df.sort_values(by=x_axis)
     plot_args = dict(
@@ -272,13 +261,10 @@ def make_chart(**kwargs):
     handles, labels = ax.get_legend_handles_labels()
     handles = list(reversed(handles))
     labels = list(reversed(labels))
-    # calls_list = list(reversed(calls_list))
-    # for i, label in enumerate(labels):
-    #     obj = calls_list[i][0]
-    #     name = obj if isinstance(obj, str) else obj[0].frame["name"]
-    #     if name not in label:
-    #         raise ValueError(f"Name '{name}' is not in label '{label}'")
-    #     labels[i] = str(name) + " (" + str(calls_list[i][1]) + ")"
+    if kwargs["cluster"] != "multiple":
+        calls_list = list(reversed(calls_list))
+        for i, label in enumerate(labels):
+            labels[i] = str(label) + " (" + str(int(calls_list[i])) + ")"
     title = (
         "Region (Calls/rank (max))" if kwargs["cluster"] != "multiple" else "Cluster"
     )
@@ -327,8 +313,6 @@ def prepare_data(**kwargs):
         pbar.update(1)
         pbar.close()
 
-    # cluster_to_ps = dict(zip(tk.metadata["cluster"], tk.metadata["total_problem_size"]))
-
     clean_tree = tk.tree(kwargs["tree_metric"], render_header=True)
     clean_tree = re.compile(r"\x1b\[([0-9;]*m)").sub("", clean_tree)
 
@@ -422,13 +406,6 @@ def prepare_data(**kwargs):
     if prefix:
         tk.dataframe = pd.concat([tk.dataframe.filter(like=p, axis=0) for p in prefix])
 
-    # Group by varied parameters
-    # grouped = tk.groupby(x_axis_metadata)
-    # print(grouped.keys())
-    # ctk = th.Thicket.concat_thickets(
-    #     list(grouped.values()), headers=list(grouped.keys()), axis="index"
-    # )
-
     tk.metadata_columns_to_perfdata(["cluster"] + list(NAME_REMAP.keys()))
 
     cluster_col = "cluster" if "cluster" in tk.metadata.columns else "host.cluster"

From e476d0b9c1b83a106692c6f4ef25ee707f8a6939 Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Mon, 8 Dec 2025 17:01:27 -0800
Subject: [PATCH 06/18] Fix topn

---
 lib/benchpark/cmd/analyze.py | 69 ++++++++++++++++++++++--------------
 1 file changed, 42 insertions(+), 27 deletions(-)

diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py
index 4ccf26aca..a7132b999 100644
--- a/lib/benchpark/cmd/analyze.py
+++ b/lib/benchpark/cmd/analyze.py
@@ -18,6 +18,7 @@
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn
+import hatchet as ht
 import thicket as th
 from tqdm import tqdm
 
@@ -199,9 +200,11 @@ def make_chart(**kwargs):
     os.makedirs(kwargs["out_dir"], exist_ok=True)
 
     # Calls/rank in legend
-    calls_list = []
+    calls_dict = {}
     for node in set(df.index.get_level_values("node")):
-        calls_list.append(df.loc[node, "Calls/rank (max)"].max())
+        v = df.loc[node, "Calls/rank (max)"].max()
+        name = node.frame["name"] if isinstance(node, ht.node.Node) else node
+        calls_dict[name] = int(v) if pd.notna(v) else v
 
     mpl.rcParams["axes.prop_cycle"] = mpl.cycler(color=COLOR_PALETTE)
     if kwargs.get("chart_fontsize"):
@@ -262,9 +265,8 @@ def make_chart(**kwargs):
     handles = list(reversed(handles))
     labels = list(reversed(labels))
     if kwargs["cluster"] != "multiple":
-        calls_list = list(reversed(calls_list))
         for i, label in enumerate(labels):
-            labels[i] = str(label) + " (" + str(int(calls_list[i])) + ")"
+            labels[i] = str(label) + " (" + str(calls_dict[label]) + ")"
     title = (
         "Region (Calls/rank (max))" if kwargs["cluster"] != "multiple" else "Cluster"
     )
@@ -475,21 +477,35 @@ def prepare_data(**kwargs):
                 / tk.dataframe.loc[(slice(None), profile), metric].sum()
             )
 
-    # top_n = kwargs.get("top_n_regions", -1)
-    # if top_n != -1:
-    #     temp_df_idx = tk.dataframe.nlargest(
-    #         top_n, metric).index
-    #     temp_df = tk.dataframe[tk.dataframe.index.isin(temp_df_idx)]
-    #     temp_df.loc["Sum(removed_regions)"] = 0
-    #     for p in tk.profile:
-    #         temp_df.loc["Sum(removed_regions)", metric] = (
-    #             tk.dataframe.loc[:, metric].sum()
-    #             - temp_df.loc[:, metric].sum()
-    #         )
-    #     tk.dataframe = temp_df
-    #     logger.info(
-    #         f"Filtered top {top_n} regions for chart display. Added the sum of the regions that were removed as single region."
-    #     )
+    top_n = kwargs.get("top_n_regions", -1)
+    if top_n != -1:
+        chosen_profile = tk.profile[0]
+        temp_df_idx = (
+            tk.dataframe.loc[(slice(None), chosen_profile), :]
+            .nlargest(top_n, metric)
+            .index.get_level_values("node")
+        )
+        temp_df = tk.dataframe[
+            tk.dataframe.index.get_level_values("node").isin(temp_df_idx)
+        ]
+        for p in tk.profile:
+            temp_df.loc[("Sum(removed_regions)", p), metric] = (
+                tk.dataframe.loc[(slice(None), p), metric].sum()
+                - temp_df.loc[(slice(None), p), metric].sum()
+            )
+            for xp in kwargs["xaxis_parameter"]:
+                temp_df.loc[("Sum(removed_regions)", p), xp] = tk.dataframe.loc[
+                    (slice(None), p), xp
+                ].iloc[0]
+        temp_df.loc[("Sum(removed_regions)",), "name"] = "Sum(removed_regions)"
+        tk.dataframe = temp_df
+        logger.info(
+            f"Filtered top {top_n} regions for chart display (based on first profile in Thicket.profile). Added the sum of the regions that were removed as single region."
+        )
+
+    # Convert int-like columns to int
+    for col in kwargs["xaxis_parameter"]:
+        tk.dataframe[col] = tk.dataframe[col].astype(int)
 
     if not kwargs.get("chart_xlabel"):
         kwargs["chart_xlabel"] = x_axis_metadata
@@ -502,7 +518,6 @@ def prepare_data(**kwargs):
             raise ValueError(
                 f"Expected one scaling factor, found: {list(scaling_factors)}"
             )
-    # kwargs["cluster_to_ps"] = cluster_to_ps
     kwargs["cluster"] = cluster
 
     make_chart(df=tk.dataframe, x_axis=x_axis_metadata, **kwargs)
@@ -564,13 +579,13 @@ def setup_parser(root_parser):
         help="Query for one or more regions REGION. Includes children of region.",
         metavar="REGION",
     )
-    # root_parser.add_argument(
-    #     "--top-n-regions",
-    #     default=-1,
-    #     type=int,
-    #     help="Filters only top N largest metric entries to be included in chart (based on the first profile).",
-    #     metavar="N",
-    # )
+    root_parser.add_argument(
+        "--top-n-regions",
+        default=-1,
+        type=int,
+        help="Filters only top N largest metric entries to be included in chart (based on the first profile).",
+        metavar="N",
+    )
     root_parser.add_argument(
         "--group-regions-name",
         action="store_true",

From 24754d8ff93c193b7feed12c517514d33ba68838 Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Mon, 8 Dec 2025 19:09:05 -0800
Subject: [PATCH 07/18] Add class for derived metrics

---
 lib/benchpark/cmd/analyze.py | 37 +++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py
index a7132b999..f843a32e1 100644
--- a/lib/benchpark/cmd/analyze.py
+++ b/lib/benchpark/cmd/analyze.py
@@ -61,6 +61,32 @@
 logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s: %(message)s")
 
 
+class RAJAPerf:
+    def __init__(self, tk):
+        self.tk = tk
+        # Matches application_name column in metadata
+        self.name = "raja-perf"
+
+    def set_metrics(self):
+        self.tk.dataframe["Bandwidth (GB/s)"] = (
+            self.tk.dataframe["Bytes/Rep"]
+            / self.tk.dataframe["Avg time/rank (exc)"]
+            / 10**9
+            * self.tk.dataframe["Reps"]
+            * self.tk.metadata["mpi.world.size"]
+        )
+
+        self.tk.dataframe["FLOP Rate (GFLOPS)"] = (
+            self.tk.dataframe["Flops/Rep"]
+            / self.tk.dataframe["Avg time/rank (exc)"]
+            / 10**9
+            * self.tk.dataframe["Reps"]
+            * self.tk.metadata["mpi.world.size"]
+        )
+
+        return ["Bandwidth (GB/s)", "FLOP Rate (GFLOPS)"]
+
+
 # -----------------------------
 # Helper Functions
 # -----------------------------
@@ -335,13 +361,10 @@ def prepare_data(**kwargs):
     # Remove singular roots if inclusive metric
     metric = kwargs["yaxis_metric"]
 
-    tk.dataframe["Bandwidth (GB/s)"] = (
-        tk.dataframe["Bytes/Rep"]
-        / tk.dataframe["Avg time/rank (exc)"]
-        / 10**9
-        * tk.dataframe["Reps"]
-        * tk.metadata["mpi.world.size"]
-    )
+    known_applications = {"raja-perf": RAJAPerf}
+    for ta in tk.metadata["application_name"].unique():
+        added_mets = known_applications[ta](tk).set_metrics()
+        logger.info(f"Added the following derived metrics for app '{ta}': {added_mets}")
 
     if metric in tk.inc_metrics and len(tk.graph.roots) == 1:
         root_name = tk.graph.roots[0].frame["name"]

From d274a923ca2c4c0cf5c5de405f06261083c3216c Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Mon, 8 Dec 2025 19:11:16 -0800
Subject: [PATCH 08/18] Improve log

---
 lib/benchpark/cmd/analyze.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py
index f843a32e1..394670b58 100644
--- a/lib/benchpark/cmd/analyze.py
+++ b/lib/benchpark/cmd/analyze.py
@@ -364,7 +364,7 @@ def prepare_data(**kwargs):
     known_applications = {"raja-perf": RAJAPerf}
     for ta in tk.metadata["application_name"].unique():
         added_mets = known_applications[ta](tk).set_metrics()
-        logger.info(f"Added the following derived metrics for app '{ta}': {added_mets}")
+        logger.info(f"Added the following derived metrics for app '{ta}':\n\t{added_mets}\n\tUse them via the '--yaxis-metric' parameter.")
 
     if metric in tk.inc_metrics and len(tk.graph.roots) == 1:
         root_name = tk.graph.roots[0].frame["name"]

From 5f8d5c60de260899be7f29b9b6fab9672d711c33 Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Tue, 9 Dec 2025 15:48:23 -0800
Subject: [PATCH 09/18] Updates

---
 lib/benchpark/cmd/analyze.py | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py
index 394670b58..656bd65e1 100644
--- a/lib/benchpark/cmd/analyze.py
+++ b/lib/benchpark/cmd/analyze.py
@@ -68,7 +68,7 @@ def __init__(self, tk):
         self.name = "raja-perf"
 
     def set_metrics(self):
-        self.tk.dataframe["Bandwidth (GB/s)"] = (
+        self.tk.dataframe["Memory Bandwidth (GB/s)"] = (
             self.tk.dataframe["Bytes/Rep"]
             / self.tk.dataframe["Avg time/rank (exc)"]
             / 10**9
@@ -84,7 +84,7 @@ def set_metrics(self):
             * self.tk.metadata["mpi.world.size"]
         )
 
-        return ["Bandwidth (GB/s)", "FLOP Rate (GFLOPS)"]
+        return ["Memory Bandwidth (GB/s)", "FLOP Rate (GFLOPS)"]
 
 
 # -----------------------------
@@ -247,6 +247,8 @@ def make_chart(**kwargs):
     ax.set_title(kwargs.get("chart_title", ""))
     ax.set_xlabel(xlabel)
     ax.set_ylabel(y_label)
+    if kwargs["yaxis_log"]:
+        ax.set_yscale('log', base=2) 
     plt.grid(True)
     df = df.sort_values(by=x_axis)
     plot_args = dict(
@@ -660,6 +662,23 @@ def setup_parser(root_parser):
         default="Calls/rank (max)",
         help="Metric to show on the tree output",
     )
+    root_parser.add_argument(
+        "--chart-kind",
+        type=str,
+        default="area",
+        choices=["area", "line", "bar", "scatter"],
+        help="Type of chart to generate",
+    )
+    root_parser.add_argument(
+        "--no-update-inc-cols",
+        action="store_true",
+        help="Don't call Thicket.update_inclusive_columns() which can take a while.",
+    )
+    root_parser.add_argument(
+        "--yaxis-log",
+        action="store_true",
+        help="Change yaxis to log base 2."
+    )
 
     # Workspace commands
     root_parser.add_argument(
@@ -682,19 +701,6 @@ def setup_parser(root_parser):
         default=None,
         help="With 'archive', path for the .tar.gz (defaults to CWD/<workspace>-<timestamp>.tar.gz)",
     )
-    root_parser.add_argument(
-        "--chart-kind",
-        type=str,
-        default="area",
-        choices=["area", "line", "bar", "scatter"],
-        help="Type of chart to generate",
-    )
-    root_parser.add_argument(
-        "--no-update-inc-cols",
-        action="store_true",
-        help="Don't call Thicket.update_inclusive_columns() which can take a while.",
-    )
-
 
 def command(args):
     """

From 81c16aa476e1a6f3b646c3c55ac650075f28feae Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Wed, 10 Dec 2025 13:34:30 -0800
Subject: [PATCH 10/18] Enable using metadata columns

---
 lib/benchpark/cmd/analyze.py | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py
index 656bd65e1..16109881e 100644
--- a/lib/benchpark/cmd/analyze.py
+++ b/lib/benchpark/cmd/analyze.py
@@ -248,7 +248,7 @@ def make_chart(**kwargs):
     ax.set_xlabel(xlabel)
     ax.set_ylabel(y_label)
     if kwargs["yaxis_log"]:
-        ax.set_yscale('log', base=2) 
+        ax.set_yscale("log", base=2)
     plt.grid(True)
     df = df.sort_values(by=x_axis)
     plot_args = dict(
@@ -264,6 +264,8 @@ def make_chart(**kwargs):
         df["xaxis"] = df.apply(
             lambda row: ", ".join([str(row[col]) for col in x_axis]), axis=1
         )
+    if yaxis_metric not in df.columns:
+        raise KeyError(f"'{yaxis_metric}' not in the data. Choose from: {df.columns}")
     if kwargs["cluster"] == "multiple":
         plot_args["hue"] = "cluster"
     # Add marker only if line plot
@@ -276,7 +278,13 @@ def make_chart(**kwargs):
             .reset_index(drop=True)
             .sort_values("xaxis")
         )
-        tdf = tdf.pivot(index="xaxis", columns="name", values=yaxis_metric)
+        try:
+            tdf = tdf.pivot(index="xaxis", columns="name", values=yaxis_metric)
+        except ValueError:
+            print(
+                "Duplicate data points detected:\n\t(1) Check if you have duplicate caliper files per input configuration '--file-name-match'.\n\t(2) Duplicate regions can be grouped with '--group-regions-name'\n\t(3) MPI regions can be removed with '--no-mpi'\n\t(4) If your calltrees are disjoint, use '--calltree-unification intersection'"
+            )
+            raise
         tdf.plot(**plot_args)
     elif kind == "scatter":
         seaborn.scatterplot(**plot_args)
@@ -360,14 +368,17 @@ def prepare_data(**kwargs):
         )
         tk = tk.query(query)
 
-    # Remove singular roots if inclusive metric
     metric = kwargs["yaxis_metric"]
 
     known_applications = {"raja-perf": RAJAPerf}
     for ta in tk.metadata["application_name"].unique():
-        added_mets = known_applications[ta](tk).set_metrics()
-        logger.info(f"Added the following derived metrics for app '{ta}':\n\t{added_mets}\n\tUse them via the '--yaxis-metric' parameter.")
+        if ta in known_applications.keys():
+            added_mets = known_applications[ta](tk).set_metrics()
+            logger.info(
+                f"Added the following derived metrics for app '{ta}':\n\t{added_mets}\n\tUse them via the '--yaxis-metric' parameter."
+            )
 
+    # Remove singular roots if inclusive metric
     if metric in tk.inc_metrics and len(tk.graph.roots) == 1:
         root_name = tk.graph.roots[0].frame["name"]
         logger.info(
@@ -545,6 +556,12 @@ def prepare_data(**kwargs):
             )
     kwargs["cluster"] = cluster
 
+    if metric in tk.metadata.columns:
+        tk.metadata_columns_to_perfdata(metric)
+        logger.info(
+            f"Adding metadata column '{metric}' to the performance data from the metadata."
+        )
+
     make_chart(df=tk.dataframe, x_axis=x_axis_metadata, **kwargs)
 
 
@@ -675,9 +692,7 @@ def setup_parser(root_parser):
         help="Don't call Thicket.update_inclusive_columns() which can take a while.",
     )
     root_parser.add_argument(
-        "--yaxis-log",
-        action="store_true",
-        help="Change yaxis to log base 2."
+        "--yaxis-log", action="store_true", help="Change yaxis to log base 2."
     )
 
     # Workspace commands
@@ -702,6 +717,7 @@ def setup_parser(root_parser):
         help="With 'archive', path for the .tar.gz (defaults to CWD/<workspace>-<timestamp>.tar.gz)",
     )
 
+
 def command(args):
     """
     Implements either analysis (default) or the trailing `clean`/`archive` actions

From bbc00361431d782f93dae013f09cca3c2da50ba8 Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Wed, 10 Dec 2025 13:37:03 -0800
Subject: [PATCH 11/18] turning off legend

---
 lib/benchpark/cmd/analyze.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py
index 16109881e..2ba591d8a 100644
--- a/lib/benchpark/cmd/analyze.py
+++ b/lib/benchpark/cmd/analyze.py
@@ -306,13 +306,14 @@ def make_chart(**kwargs):
     title = (
         "Region (Calls/rank (max))" if kwargs["cluster"] != "multiple" else "Cluster"
     )
-    ax.legend(
-        handles,
-        labels,
-        bbox_to_anchor=(1, 0.5),
-        loc="center left",
-        title=title,
-    )
+    if not kwargs["disable_legend"]:
+        ax.legend(
+            handles,
+            labels,
+            bbox_to_anchor=(1, 0.5),
+            loc="center left",
+            title=title,
+        )
     ax.set_xlabel(xlabel)
 
     fig.autofmt_xdate()
@@ -694,6 +695,11 @@ def setup_parser(root_parser):
     root_parser.add_argument(
         "--yaxis-log", action="store_true", help="Change yaxis to log base 2."
     )
+    root_parser.add_argument(
+        "--disable-legend",
+        action="store_true",
+        help="Turn off the legend on the figure",
+    )
 
     # Workspace commands
     root_parser.add_argument(

From 185617eefa653800f622d9d8f5f64286d307a6d9 Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Thu, 11 Dec 2025 13:44:23 -0800
Subject: [PATCH 12/18] Fix bug

---
 lib/benchpark/cmd/analyze.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py
index 2ba591d8a..8d6e68286 100644
--- a/lib/benchpark/cmd/analyze.py
+++ b/lib/benchpark/cmd/analyze.py
@@ -445,9 +445,9 @@ def prepare_data(**kwargs):
     if prefix:
         tk.dataframe = pd.concat([tk.dataframe.filter(like=p, axis=0) for p in prefix])
 
-    tk.metadata_columns_to_perfdata(["cluster"] + list(NAME_REMAP.keys()))
-
     cluster_col = "cluster" if "cluster" in tk.metadata.columns else "host.cluster"
+    tk.metadata_columns_to_perfdata([cluster_col] + list(NAME_REMAP.keys()))
+
     # Check these values are constant
     app = validate_single_metadata_value("application_name", tk)
     try:

From 2ea29207457cfef909a0d98f6dc6d59e09992781 Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Thu, 11 Dec 2025 14:20:57 -0800
Subject: [PATCH 13/18] Enable duplicate regions again

---
 lib/benchpark/cmd/analyze.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py
index 8d6e68286..a47d16145 100644
--- a/lib/benchpark/cmd/analyze.py
+++ b/lib/benchpark/cmd/analyze.py
@@ -275,16 +275,11 @@ def make_chart(**kwargs):
     elif kind == "area":
         tdf = (
             df[[yaxis_metric, "name", "xaxis"]]
-            .reset_index(drop=True)
+            .reset_index()
             .sort_values("xaxis")
         )
-        try:
-            tdf = tdf.pivot(index="xaxis", columns="name", values=yaxis_metric)
-        except ValueError:
-            print(
-                "Duplicate data points detected:\n\t(1) Check if you have duplicate caliper files per input configuration '--file-name-match'.\n\t(2) Duplicate regions can be grouped with '--group-regions-name'\n\t(3) MPI regions can be removed with '--no-mpi'\n\t(4) If your calltrees are disjoint, use '--calltree-unification intersection'"
-            )
-            raise
+        tdf = tdf.pivot(index="xaxis", columns="node", values=yaxis_metric)
+        tdf = tdf.rename(columns={col: col.frame["name"] for col in tdf.columns})
         tdf.plot(**plot_args)
     elif kind == "scatter":
         seaborn.scatterplot(**plot_args)

From 7fbe877f4b3b9ac6e00305fa91b25e817ad46f75 Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Thu, 11 Dec 2025 15:07:15 -0800
Subject: [PATCH 14/18] Fix for topnregions

---
 lib/benchpark/cmd/analyze.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py
index a47d16145..0f799610f 100644
--- a/lib/benchpark/cmd/analyze.py
+++ b/lib/benchpark/cmd/analyze.py
@@ -278,6 +278,7 @@ def make_chart(**kwargs):
             .reset_index()
             .sort_values("xaxis")
         )
+        tdf["node"] = tdf["node"].apply(lambda i: ht.node.Node(ht.frame.Frame({"name": i})) if isinstance(i, str) else i)
         tdf = tdf.pivot(index="xaxis", columns="node", values=yaxis_metric)
         tdf = tdf.rename(columns={col: col.frame["name"] for col in tdf.columns})
         tdf.plot(**plot_args)

From f153c8cd35dde1ae8c6994d32eee36db5fbb7ad4 Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Fri, 12 Dec 2025 10:41:31 -0800
Subject: [PATCH 15/18] exclude regions

---
 lib/benchpark/cmd/analyze.py | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py
index 0f799610f..e7438e886 100644
--- a/lib/benchpark/cmd/analyze.py
+++ b/lib/benchpark/cmd/analyze.py
@@ -273,12 +273,12 @@ def make_chart(**kwargs):
         plot_args["marker"] = "o"
         seaborn.lineplot(**plot_args)
     elif kind == "area":
-        tdf = (
-            df[[yaxis_metric, "name", "xaxis"]]
-            .reset_index()
-            .sort_values("xaxis")
+        tdf = df[[yaxis_metric, "name", "xaxis"]].reset_index().sort_values("xaxis")
+        tdf["node"] = tdf["node"].apply(
+            lambda i: (
+                ht.node.Node(ht.frame.Frame({"name": i})) if isinstance(i, str) else i
+            )
         )
-        tdf["node"] = tdf["node"].apply(lambda i: ht.node.Node(ht.frame.Frame({"name": i})) if isinstance(i, str) else i)
         tdf = tdf.pivot(index="xaxis", columns="node", values=yaxis_metric)
         tdf = tdf.rename(columns={col: col.frame["name"] for col in tdf.columns})
         tdf.plot(**plot_args)
@@ -351,15 +351,23 @@ def prepare_data(**kwargs):
     clean_tree = tk.tree(kwargs["tree_metric"], render_header=True)
     clean_tree = re.compile(r"\x1b\[([0-9;]*m)").sub("", clean_tree)
 
-    # Remove MPI regions, if necesasry
+    exclude_regions = []
+    # Remove MPI regions, if necessary
     if kwargs.get("no_mpi"):
+        exclude_regions.append("MPI_")
+    if kwargs.get("exclude_regions"):
+        exclude_regions.extend(kwargs.get("exclude_regions"))
+    if len(exclude_regions) > 0:
+        logger.info(
+            f"Removing regions that match the following pattern: {exclude_regions}"
+        )
         query = th.query.Query().match(
             ".",
             lambda row: row["name"]
             .apply(
                 # 'n is None' avoid comparison for MPI in n (will cause error)
                 lambda n: n is None
-                or "MPI_" not in n
+                or all(excl not in n for excl in exclude_regions)
             )
             .all(),
         )
@@ -696,6 +704,12 @@ def setup_parser(root_parser):
         action="store_true",
         help="Turn off the legend on the figure",
     )
+    root_parser.add_argument(
+        "--exclude-regions",
+        nargs="+",
+        type=str,
+        help="One or more patterns to exclude based on region name",
+    )
 
     # Workspace commands
     root_parser.add_argument(

From 5f2cf663b5c9a18ee4d8e4d1ddbccab3079eb5fa Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Fri, 12 Dec 2025 17:13:01 -0800
Subject: [PATCH 16/18] Add docs

---
 docs/benchpark-analyze.rst | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/docs/benchpark-analyze.rst b/docs/benchpark-analyze.rst
index 4204719b4..97a8bb971 100644
--- a/docs/benchpark-analyze.rst
+++ b/docs/benchpark-analyze.rst
@@ -204,3 +204,41 @@ profile). We can also add the ``--no-mpi`` argument to filter out all ``MPI_*``
 .. figure:: _static/images/kripke_cuda_strong_raw_exc-2.png
     :width: 800
     :align: center
+
+*****************************************
+ Visualize Data From Multiple Workspaces
+*****************************************
+
+Data from multiple clusters will end up in seperate Ramble workspaces. Simply point at
+the Benchpark workspace instead of the Ramble workspace to include multiple Ramble
+workspaces in your analysis. This example uses the ``line`` chart functionality to
+visualize a single node memory bandwidth study. Other options are ``bar`` and
+``scatter``.
+
+.. note::
+
+    The ``area`` chart will not work for data from multiple Ramble workspaces.
+
+.. code-block:: console
+
+    $ benchpark analyze --workspace-dir wkp/ --query-regions-byname Stream_TRIAD --chart-kind line --file-name-match Base_Seq-default --yaxis-metric 'Memory Bandwidth (GB/s)' --chart-yaxis-limits 8 2048 --chart-figsize 12 7 --yaxis-log --no-mpi
+
+.. figure:: _static/images/raja-perf_mpi_strong_raw_exc.png
+    :width: 800
+    :align: center
+
+*****************************
+ Visualize a Metadata Column
+*****************************
+
+``benchpark analyze`` is not limited to performance data columns. Provide the name of a
+metadata column to visualize that instead. This is useful for metrics like FOMs, which
+only have one value per profile.
+
+.. code-block:: console
+
+    $ benchpark analyze --workspace-dir problem1/ --yaxis-metric Final-FOM --chart-kind line --disable-legend
+
+.. figure:: _static/images/amg2023_rocm_weak_raw_exc.png
+    :width: 800
+    :align: center

From b9bf2819eb5ee93452750365d48eb6d31a38dbb7 Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Fri, 12 Dec 2025 17:15:20 -0800
Subject: [PATCH 17/18] lint

---
 docs/benchpark-analyze.rst   | 2 +-
 lib/benchpark/cmd/analyze.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/benchpark-analyze.rst b/docs/benchpark-analyze.rst
index b202f1f55..d5a470d80 100644
--- a/docs/benchpark-analyze.rst
+++ b/docs/benchpark-analyze.rst
@@ -209,7 +209,7 @@ profile). We can also add the ``--no-mpi`` argument to filter out all ``MPI_*``
  Visualize Data From Multiple Workspaces
 *****************************************
 
-Data from multiple clusters will end up in seperate Ramble workspaces. Simply point at
+Data from multiple clusters will end up in separate Ramble workspaces. Simply point at
 the Benchpark workspace instead of the Ramble workspace to include multiple Ramble
 workspaces in your analysis. This example uses the ``line`` chart functionality to
 visualize a single node memory bandwidth study. Other options are ``bar`` and
diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py
index e7438e886..8eb9af5cb 100644
--- a/lib/benchpark/cmd/analyze.py
+++ b/lib/benchpark/cmd/analyze.py
@@ -287,7 +287,7 @@ def make_chart(**kwargs):
     elif kind == "bar":
         seaborn.barplot(**plot_args)
     else:
-        raise NotImplementedError(f"Uknown plot kind {kind}")
+        raise NotImplementedError(f"Unknown plot kind {kind}")
 
     y_axis_limits = kwargs.get("chart_yaxis_limits")
     if y_axis_limits is not None:

From eba355d4bbd8f1e5e9aa52bd03d3b5738932f924 Mon Sep 17 00:00:00 2001
From: Michael McKinsey <michaelmckinsey1@gmail.com>
Date: Fri, 12 Dec 2025 17:48:24 -0800
Subject: [PATCH 18/18] lint

---
 lib/benchpark/cmd/analyze.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/benchpark/cmd/analyze.py b/lib/benchpark/cmd/analyze.py
index 8eb9af5cb..bb8d5f568 100644
--- a/lib/benchpark/cmd/analyze.py
+++ b/lib/benchpark/cmd/analyze.py
@@ -14,11 +14,11 @@
 from datetime import datetime
 from glob import glob
 
+import hatchet as ht
 import matplotlib as mpl
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn
-import hatchet as ht
 import thicket as th
 from tqdm import tqdm