From 3bb871a3c59c0b35239de0f47d4734ac70674f48 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Fri, 9 Jan 2026 14:46:51 -0800 Subject: [PATCH 1/5] add helper scripts to process data --- scripts/run_kernels.sh | 25 ++ scripts/study_run_kernels.py | 483 +++++++++++++++++++++++++++++++++++ 2 files changed, 508 insertions(+) create mode 100644 scripts/run_kernels.sh create mode 100644 scripts/study_run_kernels.py diff --git a/scripts/run_kernels.sh b/scripts/run_kernels.sh new file mode 100644 index 000000000..df3e1bbaf --- /dev/null +++ b/scripts/run_kernels.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +#Clean directory +rm -rf *.csv *.txt + +# Collection of float factors between 0.5 and 10 +FACTORS=(0.5 1.0 2.5 5.0 7.5 10.0) + +# List of kernels to run +KERNELS=("MASS3DPA" "DEL_DOT_VEC_2D") + +for KERNEL_NAME in "${KERNELS[@]}"; do + echo "Running kernel: $KERNEL_NAME" + + for factor in "${FACTORS[@]}"; do + echo " Running with sizefact = $factor" + ./bin/raja-perf.exe \ + -k "$KERNEL_NAME" \ + --npasses 3 \ + --npasses-combiners Average Minimum Maximum \ + --outfile "${KERNEL_NAME}_factor_${factor}" \ + --sizefact "$factor" \ + --warmup-perfrun-same + done +done diff --git a/scripts/study_run_kernels.py b/scripts/study_run_kernels.py new file mode 100644 index 000000000..bf972aafd --- /dev/null +++ b/scripts/study_run_kernels.py @@ -0,0 +1,483 @@ +import os +import glob +import numpy as np +import pandas as pd + +# ============= Configuration ============= + +ROOT_DIR = "/usr/WS1/vargas45/RAJAPERF_DEV/RAJAPerf-stage/build_lc_toss4-amdclang-7.1.0-gfx942" # change if needed + +# Use "factor" instead of "mref" in file patterns +GLOB_PATTERNS = [ + "**/*factor*kernel-run-data.csv", # broad match +] + +# Optional filter to only keep specific kernels by substring match (case-insensitive) +# Leave empty to include all kernels discovered. +KERNEL_WHITELIST = [ + # "MASS3DPA", +] + +# Derivative reporting configuration +DERIV_USE_RELATIVE = True +DERIV_EPS_REL = 0.03 # relative threshold on |dy/dx| normalized by (y_range/x_range) +DERIV_EPS_ABS = 1e-4 # absolute threshold on |dy/dx|, only used if DERIV_USE_RELATIVE=False +DERIV_MIN_CONSEC = 3 # minimum consecutive points below threshold to consider a plateau run +DERIV_SMOOTH_WINDOW = 3 # moving average window for smoothing y before derivative +DERIV_MIN_FRAC_OF_MAX_Y = 0.9 # only search after reaching this fraction of max(y) +DERIV_REPORT_MAX_POINTS = 8 # limit how many points to print per series +DERIV_REPORT_ABS = True # print |dy/dx| if True, else print signed dy/dx + +# Output and plotting configuration +OUTPUT_DIR = "./results" +FIG_DIR = os.path.join(OUTPUT_DIR, "figures") +COMBINED_CSV_PATH = os.path.join(OUTPUT_DIR, "combined_table.csv") +FIG_DPI = 300 +SHOW_PLOTS = True # show interactive windows while also saving PNGs + +# Use a non-interactive backend only when not showing plots +if not SHOW_PLOTS: + import matplotlib + matplotlib.use("Agg") + +import matplotlib.pyplot as plt + + +# ============= Helper functions ============= + +def ensure_dir(path): + os.makedirs(path, exist_ok=True) + +def sanitize_filename(text): + return "".join(c if c.isalnum() or c in "-_." 
else "_" for c in str(text)) + +def find_csv_files(root_dir, patterns): + """Recursively find CSV files matching any of the given patterns.""" + all_files = [] + for pattern in patterns: + search_pattern = os.path.join(root_dir, pattern) + files = glob.glob(search_pattern, recursive=True) + all_files.extend(files) + all_files = sorted(set(all_files)) # Remove duplicates and sort + return all_files + +def _likely_header_score(line): + """ + Score a potential header line based on presence of common column tokens. + Higher is more likely to be the header. + """ + tokens = [ + "Kernel", + "Variant", + "Problem size", + "Problem Size", + "Mean flops", + "GFlop", + "GFLOP", + "GFLOPs", + "GFLOPS", + ] + score = 0 + for t in tokens: + if t in line: + score += 1 + return score + +def read_single_csv(path): + """ + Read one CSV, trying to detect the header row by locating a line + that contains key column names. Returns a DataFrame or None. + """ + try: + with open(path, "r", encoding="utf-8") as f: + lines = f.readlines() + except Exception as e: + print(f"Failed to read {path}: {e}") + return None + + header_idx = None + best_score = -1 + for i, line in enumerate(lines[:50]): # only inspect the first 50 lines + score = _likely_header_score(line) + if score > best_score: + best_score = score + header_idx = i + + if header_idx is None: + print(f"Could not find header in {path}, skipping.") + return None + + try: + df = pd.read_csv(path, header=header_idx) + except Exception as e: + print(f"Failed to parse CSV {path}: {e}") + return None + + df["__source_file__"] = path + return df + +def normalize_columns(df): + """ + Normalize common column names to a standard set if possible. + """ + candidates = { + # Standard name : possible variants + "Kernel": ["Kernel", "Kernel name", "Benchmark", "Test"], + "Variant": ["Variant", "Implementation", "Policy", "Config", "Backend", "Suite"], + "Problem size": [ + "Problem size", "Problem Size", "Size", "N", "DOF", "Elements", + "ProblemSize", "Problem-size" + ], + "Mean flops (gigaFLOP per sec.)": [ + "Mean flops (gigaFLOP per sec.)", + "Mean flops (GFlop/s)", + "Mean Flops (GFlop/s)", + "GFLOP/s", "GFLOPs/s", "GFLOPS", "GFlops/s", "GFlop/s", "GF/s", + "Mean GFLOP/s", "Mean GFLOPs/s" + ], + } + + new_col_map = {} + # strip whitespace from existing columns first + df = df.rename(columns={c: c.strip() for c in df.columns}) + + for standard_name, names in candidates.items(): + for c in names: + if c in df.columns: + new_col_map[c] = standard_name + break # first match wins + + df = df.rename(columns=new_col_map) + return df + +def _moving_average(y, window): + if window is None or window <= 1 or len(y) < 3: + return y + window = max(2, int(window)) + kernel = np.ones(window, dtype=float) / float(window) + return np.convolve(y, kernel, mode="same") + +def _find_first_run(mask, min_len): + """Return the start index and run length of the first run of True with length >= min_len.""" + run = 0 + for i, v in enumerate(mask): + if v: + run += 1 + if run >= min_len: + start = i - run + 1 + j = i + 1 + while j < len(mask) and mask[j]: + j += 1 + return start, j - start + else: + run = 0 + return None, 0 + +def classify_backend_from_variant(variant): + """ + Heuristic classification of backend based on the Variant string. + Captures common cases even if names do not end with specific suffixes. 
+ """ + s = str(variant).strip() + low = s.lower() + if "hip" in low: + return "HIP" + if "cuda" in low: + return "CUDA" + if "openmp" in low or low.endswith("_omp") or " omp" in low or low.startswith("omp"): + return "OpenMP" + if "seq" in low or "serial" in low or "baseline" in low or "sequential" in low: + return "Seq" + return "Unknown" + +def report_near_zero_derivative_points( + x, + y, + backend_label, + kernel, + variant, + use_relative=DERIV_USE_RELATIVE, + eps_rel=DERIV_EPS_REL, + eps_abs=DERIV_EPS_ABS, + min_consecutive=DERIV_MIN_CONSEC, + smooth_window=DERIV_SMOOTH_WINDOW, + min_frac_of_max_y=DERIV_MIN_FRAC_OF_MAX_Y, + max_points=DERIV_REPORT_MAX_POINTS, + report_abs=DERIV_REPORT_ABS, +): + """ + Prints lines "Problem size=, dy/dx=" for points with small enough derivative. + Uses either a relative threshold or an absolute slope threshold. + Focuses on the near-peak region to avoid early flat areas. + """ + x = np.asarray(x, dtype=float) + y = np.asarray(y, dtype=float) + + # Aggregate duplicate x values by averaging y + if len(x) != len(np.unique(x)): + tmp = pd.DataFrame({"x": x, "y": y}).groupby("x", as_index=False)["y"].mean() + x = tmp["x"].values + y = tmp["y"].values + + # Sort by x + order = np.argsort(x) + x = x[order] + y = y[order] + + if len(x) < max(3, min_consecutive): + print( + f"[DERIV] Backend={backend_label}, Kernel={kernel}, Variant={variant}: not enough points for derivative analysis" + ) + return + + # Optional smoothing + y_sm = _moving_average(y, smooth_window) + + x_range = float(x.max() - x.min()) + if x_range == 0.0: + print( + f"[DERIV] Backend={backend_label}, Kernel={kernel}, Variant={variant}: zero x-range, cannot compute derivative" + ) + return + + deriv = np.gradient(y_sm, x) # dy/dx, same length as x + + # Restrict to near-peak region if requested + search_mask = np.ones_like(deriv, dtype=bool) + y_range = float(y_sm.max() - y_sm.min()) + if min_frac_of_max_y is not None and 0.0 < min_frac_of_max_y < 1.0 and y_range > 0: + thresh_y = y_sm.max() * float(min_frac_of_max_y) + search_mask = y_sm >= thresh_y + + if use_relative: + # Normalize slope by typical scale y_range/x_range for a dimensionless measure + norm_factor = (y_range / x_range) if y_range > 0 else 1.0 + deriv_norm = np.abs(deriv) / norm_factor + near_zero_mask = (deriv_norm <= float(eps_rel)) & search_mask + else: + near_zero_mask = (np.abs(deriv) <= float(eps_abs)) & search_mask + + # Prefer the first sustained run of small derivatives + start_idx, run_len = _find_first_run(near_zero_mask, int(min_consecutive)) + + if start_idx is not None: + run_indices = np.arange(start_idx, start_idx + run_len) + # Downsample to at most max_points for readability + if len(run_indices) > max_points: + picks_rel = np.linspace(0, len(run_indices) - 1, num=max_points) + pick = run_indices[np.round(picks_rel).astype(int)] + else: + pick = run_indices + print( + f"[DERIV] Backend={backend_label}, Kernel={kernel}, Variant={variant}: sustained near-zero derivative region found, points={run_len}" + ) + else: + # Fallback: choose up to max_points with smallest slope in the search region + candidates = np.where(search_mask)[0] + if candidates.size == 0: + print( + f"[DERIV] Backend={backend_label}, Kernel={kernel}, Variant={variant}: no valid search region for derivative analysis" + ) + return + + if use_relative: + norm_factor = (y_range / x_range) if y_range > 0 else 1.0 + deriv_norm = np.abs(deriv) / (norm_factor if norm_factor > 0 else 1.0) + order_c = np.argsort(deriv_norm[candidates]) + else: + 
order_c = np.argsort(np.abs(deriv[candidates])) + pick = candidates[order_c[:max_points]] + print( + f"[DERIV] Backend={backend_label}, Kernel={kernel}, Variant={variant}: no sustained plateau, showing {len(pick)} smallest-slope points" + ) + + # Ensure sorted by x before printing + pick = np.array(sorted(pick.tolist())) + + # Print lines in the requested format + for idx in pick: + dy = deriv[idx] + dy_out = abs(dy) if report_abs else dy + print(f" Problem size={x[idx]:.6g}, dy/dx={dy_out:.6g}") + +def plot_backend(df_backend, backend_label, fig_dir, show_plots, fig_dpi): + """ + For a given backend: + - One figure per Kernel with solid line and markers per Variant. + - Prints derivative-based small-slope points per Variant. + - Saves each figure as PNG. + - Shows figures interactively if requested. + """ + if df_backend.empty: + print(f"\nNo data for backend {backend_label}.") + return + + kernels = sorted(df_backend["Kernel"].dropna().unique()) + variants = sorted(df_backend["Variant"].dropna().unique()) + + # Use a larger palette in case many variants exist + cmap = plt.cm.tab20 + colors = [cmap(i % cmap.N) for i in range(max(1, len(variants)))] + color_map = {v: colors[i % len(colors)] for i, v in enumerate(variants)} + + for kernel in kernels: + df_k = df_backend[df_backend["Kernel"] == kernel] + if df_k.empty: + continue + + fig = plt.figure(figsize=(10, 6)) + + for variant, g in df_k.groupby("Variant"): + g_sorted = g.sort_values("Problem size") + x = g_sorted["Problem size"].values + y = g_sorted["Mean flops (gigaFLOP per sec.)"].values + + color = color_map.get(variant, "black") + + # Actual data curve: solid line with markers + plt.plot( + x, + y, + marker="o", + linestyle="-", + color=color, + label=f"{variant}", + ) + + # Derivative-based report of small-slope points + report_near_zero_derivative_points( + x, + y, + backend_label, + kernel, + variant, + use_relative=DERIV_USE_RELATIVE, + eps_rel=DERIV_EPS_REL, + eps_abs=DERIV_EPS_ABS, + min_consecutive=DERIV_MIN_CONSEC, + smooth_window=DERIV_SMOOTH_WINDOW, + min_frac_of_max_y=DERIV_MIN_FRAC_OF_MAX_Y, + max_points=DERIV_REPORT_MAX_POINTS, + report_abs=DERIV_REPORT_ABS, + ) + + plt.xlabel("Problem size") + plt.ylabel("Mean flops (gigaFLOP per sec.)") + plt.title(f"{backend_label} backend, Kernel: {kernel}") + plt.grid(True) + plt.tight_layout() + plt.legend(fontsize="small", bbox_to_anchor=(1.05, 1), loc="upper left") + + # Save figure as PNG + kernel_safe = sanitize_filename(kernel) + fname = f"{backend_label}_Kernel-{kernel_safe}.png" + fig_path = os.path.join(fig_dir, fname) + plt.savefig(fig_path, dpi=fig_dpi, bbox_inches="tight") + print(f"[SAVE] Figure saved to: {fig_path}") + + if show_plots: + plt.show() + else: + plt.close(fig) + +# ============= Main logic ============= + +def main(): + ensure_dir(OUTPUT_DIR) + ensure_dir(FIG_DIR) + + files = find_csv_files(ROOT_DIR, GLOB_PATTERNS) + if not files: + print(f"No files matching patterns {GLOB_PATTERNS} found under '{ROOT_DIR}'") + return + + print("Found CSV files:") + for f in files: + print(" ", f) + + dfs = [] + for path in files: + df = read_single_csv(path) + if df is None: + continue + df = normalize_columns(df) + + # Verify required columns exist post-normalization, else report and skip + required_any = {"Kernel", "Variant", "Problem size", "Mean flops (gigaFLOP per sec.)"} + if not required_any.issubset(set(df.columns)): + print(f"[SKIP] {path} missing required columns after normalization.") + print(" Columns present:", list(df.columns)) + continue + + 
dfs.append(df) + + if not dfs: + print("No CSV files could be parsed with required columns.") + return + + combined_df = pd.concat(dfs, ignore_index=True) + + # Basic cleaning + combined_df["Kernel"] = combined_df["Kernel"].astype(str).str.strip() + combined_df["Variant"] = combined_df["Variant"].astype(str).str.strip() + + # Optional kernel filter + if KERNEL_WHITELIST: + wl = [w.lower() for w in KERNEL_WHITELIST] + combined_df = combined_df[ + combined_df["Kernel"].str.lower().apply(lambda k: any(w in k for w in wl)) + ] + if combined_df.empty: + print("After applying KERNEL_WHITELIST, no rows remain.") + return + + # Convert numeric columns + combined_df["Problem size"] = pd.to_numeric(combined_df["Problem size"], errors="coerce") + combined_df["Mean flops (gigaFLOP per sec.)"] = pd.to_numeric( + combined_df["Mean flops (gigaFLOP per sec.)"], errors="coerce" + ) + + # Drop rows without x or y + before_drop = len(combined_df) + combined_df = combined_df.dropna(subset=["Problem size", "Mean flops (gigaFLOP per sec.)"]) + dropped = before_drop - len(combined_df) + if dropped > 0: + print(f"[CLEAN] Dropped {dropped} rows with non-numeric Problem size or Mean flops.") + + # Save concatenated table to CSV + ensure_dir(os.path.dirname(COMBINED_CSV_PATH)) + combined_df.to_csv(COMBINED_CSV_PATH, index=False) + print(f"[SAVE] Combined table saved to: {COMBINED_CSV_PATH}") + + # Backend classification + combined_df["Backend"] = combined_df["Variant"].apply(classify_backend_from_variant) + + # Quick summary to help verify MASS3DPA is present + print("\nKernels discovered:") + print(sorted(combined_df["Kernel"].unique())) + + print("\nCounts by Kernel and Backend:") + summary = ( + combined_df.groupby(["Kernel", "Backend"]) + .size() + .reset_index(name="rows") + .sort_values(["Kernel", "Backend"]) + ) + for _, row in summary.iterrows(): + print(f" Kernel={row['Kernel']}, Backend={row['Backend']}: rows={row['rows']}") + + # Plot aggregated "All" view so kernels appear even if backend classification is Unknown + print("\n[Plot] Generating 'All' plots per kernel...") + plot_backend(combined_df, "All", FIG_DIR, SHOW_PLOTS, FIG_DPI) + + # Plot per requested backends, only if data exists + for b in ["CUDA", "HIP", "Seq", "OpenMP"]: + df_b = combined_df[combined_df["Backend"] == b] + if df_b.empty: + print(f"[Plot] Skipping backend {b}, no rows.") + continue + print(f"\n[Plot] Generating '{b}' plots per kernel...") + plot_backend(df_b, b, FIG_DIR, SHOW_PLOTS, FIG_DPI) + +if __name__ == "__main__": + main() From 77b86f2f4d4c699e4df0e674eb8fad0402f4992c Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Fri, 9 Jan 2026 14:58:06 -0800 Subject: [PATCH 2/5] minor --- scripts/study_run_kernels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/study_run_kernels.py b/scripts/study_run_kernels.py index bf972aafd..15972c4b5 100644 --- a/scripts/study_run_kernels.py +++ b/scripts/study_run_kernels.py @@ -5,7 +5,7 @@ # ============= Configuration ============= -ROOT_DIR = "/usr/WS1/vargas45/RAJAPERF_DEV/RAJAPerf-stage/build_lc_toss4-amdclang-7.1.0-gfx942" # change if needed +ROOT_DIR = "." 
# change if needed # Use "factor" instead of "mref" in file patterns GLOB_PATTERNS = [ From dcaa3aff1d2592519992819d99d6ffe8a531a2e1 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 13 Jan 2026 11:51:49 -0800 Subject: [PATCH 3/5] add new script that adds tunings --- scripts/study_run_kernel_tunings.py | 532 ++++++++++++++++++++++++++++ 1 file changed, 532 insertions(+) create mode 100644 scripts/study_run_kernel_tunings.py diff --git a/scripts/study_run_kernel_tunings.py b/scripts/study_run_kernel_tunings.py new file mode 100644 index 000000000..a978c7382 --- /dev/null +++ b/scripts/study_run_kernel_tunings.py @@ -0,0 +1,532 @@ +import os +import glob +import numpy as np +import pandas as pd + +# ============= Configuration ============= + +ROOT_DIR = "." # change if needed + +# Use "factor" instead of "mref" in file patterns +GLOB_PATTERNS = [ + "**/*factor*kernel-run-data.csv", # broad match +] + +# Optional filter to only keep specific kernels by substring match (case-insensitive) +# Leave empty to include all kernels discovered. +KERNEL_WHITELIST = [ + # "MASS3DPA", +] + +# Derivative reporting configuration +DERIV_USE_RELATIVE = True +DERIV_EPS_REL = 0.03 # relative threshold on |dy/dx| normalized by (y_range/x_range) +DERIV_EPS_ABS = 1e-4 # absolute threshold on |dy/dx|, only used if DERIV_USE_RELATIVE=False +DERIV_MIN_CONSEC = 3 # minimum consecutive points below threshold to consider a plateau run +DERIV_SMOOTH_WINDOW = 3 # moving average window for smoothing y before derivative +DERIV_MIN_FRAC_OF_MAX_Y = 0.9 # only search after reaching this fraction of max(y) +DERIV_REPORT_MAX_POINTS = 8 # limit how many points to print per series +DERIV_REPORT_ABS = True # print |dy/dx| if True, else print signed dy/dx + +# Output and plotting configuration +OUTPUT_DIR = "./results" +FIG_DIR = os.path.join(OUTPUT_DIR, "figures") +COMBINED_CSV_PATH = os.path.join(OUTPUT_DIR, "combined_table.csv") +FIG_DPI = 300 +SHOW_PLOTS = True # show interactive windows while also saving PNGs + +# Use a non-interactive backend only when not showing plots +if not SHOW_PLOTS: + import matplotlib + matplotlib.use("Agg") + +import matplotlib.pyplot as plt + + +# ============= Helper functions ============= + +def ensure_dir(path): + os.makedirs(path, exist_ok=True) + +def sanitize_filename(text): + return "".join(c if c.isalnum() or c in "-_." else "_" for c in str(text)) + +def find_csv_files(root_dir, patterns): + """Recursively find CSV files matching any of the given patterns.""" + all_files = [] + for pattern in patterns: + search_pattern = os.path.join(root_dir, pattern) + files = glob.glob(search_pattern, recursive=True) + all_files.extend(files) + all_files = sorted(set(all_files)) # Remove duplicates and sort + return all_files + +def _likely_header_score(line): + """ + Score a potential header line based on presence of common column tokens. + Higher is more likely to be the header. + """ + tokens = [ + "Kernel", + "Variant", + "Problem size", + "Problem Size", + "Mean flops", + "GFlop", + "GFLOP", + "GFLOPs", + "GFLOPS", + ] + score = 0 + for t in tokens: + if t in line: + score += 1 + return score + +def read_single_csv(path): + """ + Read one CSV, trying to detect the header row by locating a line + that contains key column names. Returns a DataFrame or None. 
+ """ + try: + with open(path, "r", encoding="utf-8") as f: + lines = f.readlines() + except Exception as e: + print(f"Failed to read {path}: {e}") + return None + + header_idx = None + best_score = -1 + for i, line in enumerate(lines[:50]): # only inspect the first 50 lines + score = _likely_header_score(line) + if score > best_score: + best_score = score + header_idx = i + + if header_idx is None: + print(f"Could not find header in {path}, skipping.") + return None + + try: + df = pd.read_csv(path, header=header_idx) + except Exception as e: + print(f"Failed to parse CSV {path}: {e}") + return None + + df["__source_file__"] = path + return df + +def normalize_columns(df): + """ + Normalize common column names to a standard set if possible. + """ + candidates = { + # Standard name : possible variants + "Kernel": ["Kernel", "Kernel name", "Benchmark", "Test"], + "Variant": ["Variant", "Implementation", "Policy", "Config", "Backend", "Suite"], + "Problem size": [ + "Problem size", "Problem Size", "Size", "N", "DOF", "Elements", + "ProblemSize", "Problem-size" + ], + "Mean flops (gigaFLOP per sec.)": [ + "Mean flops (gigaFLOP per sec.)", + "Mean flops (GFlop/s)", + "Mean Flops (GFlop/s)", + "GFLOP/s", "GFLOPs/s", "GFLOPS", "GFlops/s", "GFlop/s", "GF/s", + "Mean GFLOP/s", "Mean GFLOPs/s" + ], + } + + new_col_map = {} + # strip whitespace from existing columns first + df = df.rename(columns={c: c.strip() for c in df.columns}) + + for standard_name, names in candidates.items(): + for c in names: + if c in df.columns: + new_col_map[c] = standard_name + break # first match wins + + df = df.rename(columns=new_col_map) + return df + +def _moving_average(y, window): + if window is None or window <= 1 or len(y) < 3: + return y + window = max(2, int(window)) + kernel = np.ones(window, dtype=float) / float(window) + return np.convolve(y, kernel, mode="same") + +def _find_first_run(mask, min_len): + """Return the start index and run length of the first run of True with length >= min_len.""" + run = 0 + for i, v in enumerate(mask): + if v: + run += 1 + if run >= min_len: + start = i - run + 1 + j = i + 1 + while j < len(mask) and mask[j]: + j += 1 + return start, j - start + else: + run = 0 + return None, 0 + +def classify_backend_from_variant(variant): + """ + Heuristic classification of backend based on the Variant string. + Captures common cases even if names do not end with specific suffixes. + """ + s = str(variant).strip() + low = s.lower() + if "hip" in low: + return "HIP" + if "cuda" in low: + return "CUDA" + if "openmp" in low or low.endswith("_omp") or " omp" in low or low.startswith("omp"): + return "OpenMP" + if "seq" in low or "serial" in low or "baseline" in low or "sequential" in low: + return "Seq" + return "Unknown" + +# NEW: classify tuning; adjust logic to match your actual naming scheme +def classify_tuning(row): + """ + Return a tuning label for a row. + You can customize this to use any available columns. 
+ Examples: + - A dedicated 'Tuning' column + - Parsing 'Variant' into backend + tuning + - Using problem-size or factor strings + For now, use: + - If a 'Tuning' column exists, use that + - Else, if '__source_file__' used different settings, use its basename + - Else, return 'default' + """ + # If the CSV already has a Tuning column, use it + if "Tuning" in row and pd.notna(row["Tuning"]): + return str(row["Tuning"]).strip() + + # Otherwise, derive from source file name as a proxy for tuning + src = row.get("__source_file__", "") + if isinstance(src, str) and src: + return os.path.basename(src) + + return "default" + +def report_near_zero_derivative_points( + x, + y, + backend_label, + kernel, + variant, + tuning_label, + use_relative=DERIV_USE_RELATIVE, + eps_rel=DERIV_EPS_REL, + eps_abs=DERIV_EPS_ABS, + min_consecutive=DERIV_MIN_CONSEC, + smooth_window=DERIV_SMOOTH_WINDOW, + min_frac_of_max_y=DERIV_MIN_FRAC_OF_MAX_Y, + max_points=DERIV_REPORT_MAX_POINTS, + report_abs=DERIV_REPORT_ABS, +): + """ + Prints lines "Problem size=, dy/dx=" for points with small enough derivative. + Uses either a relative threshold or an absolute slope threshold. + Focuses on the near-peak region to avoid early flat areas. + """ + x = np.asarray(x, dtype=float) + y = np.asarray(y, dtype=float) + + # Aggregate duplicate x values by averaging y + if len(x) != len(np.unique(x)): + tmp = pd.DataFrame({"x": x, "y": y}).groupby("x", as_index=False)["y"].mean() + x = tmp["x"].values + y = tmp["y"].values + + # Sort by x + order = np.argsort(x) + x = x[order] + y = y[order] + + if len(x) < max(3, min_consecutive): + print( + f"[DERIV] Backend={backend_label}, Kernel={kernel}, Variant={variant}, Tuning={tuning_label}: not enough points for derivative analysis" + ) + return + + # Optional smoothing + y_sm = _moving_average(y, smooth_window) + + x_range = float(x.max() - x.min()) + if x_range == 0.0: + print( + f"[DERIV] Backend={backend_label}, Kernel={kernel}, Variant={variant}, Tuning={tuning_label}: zero x-range, cannot compute derivative" + ) + return + + deriv = np.gradient(y_sm, x) # dy/dx, same length as x + + # Restrict to near-peak region if requested + search_mask = np.ones_like(deriv, dtype=bool) + y_range = float(y_sm.max() - y_sm.min()) + if min_frac_of_max_y is not None and 0.0 < min_frac_of_max_y < 1.0 and y_range > 0: + thresh_y = y_sm.max() * float(min_frac_of_max_y) + search_mask = y_sm >= thresh_y + + if use_relative: + # Normalize slope by typical scale y_range/x_range for a dimensionless measure + norm_factor = (y_range / x_range) if y_range > 0 else 1.0 + deriv_norm = np.abs(deriv) / norm_factor + near_zero_mask = (deriv_norm <= float(eps_rel)) & search_mask + else: + near_zero_mask = (np.abs(deriv) <= float(eps_abs)) & search_mask + + # Prefer the first sustained run of small derivatives + start_idx, run_len = _find_first_run(near_zero_mask, int(min_consecutive)) + + if start_idx is not None: + run_indices = np.arange(start_idx, start_idx + run_len) + # Downsample to at most max_points for readability + if len(run_indices) > max_points: + picks_rel = np.linspace(0, len(run_indices) - 1, num=max_points) + pick = run_indices[np.round(picks_rel).astype(int)] + else: + pick = run_indices + print( + f"[DERIV] Backend={backend_label}, Kernel={kernel}, Variant={variant}, Tuning={tuning_label}: sustained near-zero derivative region found, points={run_len}" + ) + else: + # Fallback: choose up to max_points with smallest slope in the search region + candidates = np.where(search_mask)[0] + if candidates.size 
== 0: + print( + f"[DERIV] Backend={backend_label}, Kernel={kernel}, Variant={variant}, Tuning={tuning_label}: no valid search region for derivative analysis" + ) + return + + if use_relative: + norm_factor = (y_range / x_range) if y_range > 0 else 1.0 + deriv_norm = np.abs(deriv) / (norm_factor if norm_factor > 0 else 1.0) + order_c = np.argsort(deriv_norm[candidates]) + else: + order_c = np.argsort(np.abs(deriv[candidates])) + pick = candidates[order_c[:max_points]] + print( + f"[DERIV] Backend={backend_label}, Kernel={kernel}, Variant={variant}, Tuning={tuning_label}: no sustained plateau, showing {len(pick)} smallest-slope points" + ) + + # Ensure sorted by x before printing + pick = np.array(sorted(pick.tolist())) + + # Print lines in the requested format + for idx in pick: + dy = deriv[idx] + dy_out = abs(dy) if report_abs else dy + print(f" Problem size={x[idx]:.6g}, dy/dx={dy_out:.6g}") + +def plot_backend(df_backend, backend_label, fig_dir, show_plots, fig_dpi): + """ + For a given backend: + - One figure per Kernel with solid line and markers per (Variant, Tuning). + - Prints derivative-based small-slope points per (Variant, Tuning). + - Saves each figure as PNG. + - Shows figures interactively if requested. + """ + if df_backend.empty: + print(f"\nNo data for backend {backend_label}.") + return + + kernels = sorted(df_backend["Kernel"].dropna().unique()) + + # Unique (Variant, Tuning) combos for color/marker assignments + vt_pairs = sorted( + df_backend[["Variant", "Tuning"]] + .dropna() + .drop_duplicates() + .itertuples(index=False, name=None) + ) + + cmap = plt.cm.tab20 + num_pairs = max(1, len(vt_pairs)) + colors = [cmap(i % cmap.N) for i in range(num_pairs)] + markers = ["o", "s", "^", "D", "v", ">", "<", "P", "X"] # repeat if needed + + style_map = {} + for idx, (variant, tuning) in enumerate(vt_pairs): + color = colors[idx % len(colors)] + marker = markers[idx % len(markers)] + style_map[(variant, tuning)] = (color, marker) + + for kernel in kernels: + df_k = df_backend[df_backend["Kernel"] == kernel] + if df_k.empty: + continue + + fig = plt.figure(figsize=(10, 6)) + + # group by Variant and Tuning to get separate curves + for (variant, tuning), g in df_k.groupby(["Variant", "Tuning"]): + g_sorted = g.sort_values("Problem size") + x = g_sorted["Problem size"].values + y = g_sorted["Mean flops (gigaFLOP per sec.)"].values + + color, marker = style_map.get((variant, tuning), ("black", "o")) + + label = f"{variant} | {tuning}" + + plt.plot( + x, + y, + marker=marker, + linestyle="-", + color=color, + label=label, + ) + + # Derivative-based report of small-slope points + report_near_zero_derivative_points( + x, + y, + backend_label, + kernel, + variant, + tuning, + use_relative=DERIV_USE_RELATIVE, + eps_rel=DERIV_EPS_REL, + eps_abs=DERIV_EPS_ABS, + min_consecutive=DERIV_MIN_CONSEC, + smooth_window=DERIV_SMOOTH_WINDOW, + min_frac_of_max_y=DERIV_MIN_FRAC_OF_MAX_Y, + max_points=DERIV_REPORT_MAX_POINTS, + report_abs=DERIV_REPORT_ABS, + ) + + plt.xlabel("Problem size") + plt.ylabel("Mean flops (gigaFLOP per sec.)") + + # TITLE MODIFIED: explicitly list backend + plt.title(f"Kernel: {kernel} | Backend: {backend_label}") + + plt.grid(True) + plt.tight_layout() + plt.legend(fontsize="small", bbox_to_anchor=(1.05, 1), loc="upper left") + + # Save figure as PNG + kernel_safe = sanitize_filename(kernel) + backend_safe = sanitize_filename(backend_label) + fname = f"{backend_safe}_Kernel-{kernel_safe}.png" + fig_path = os.path.join(fig_dir, fname) + plt.savefig(fig_path, dpi=fig_dpi, 
bbox_inches="tight") + print(f"[SAVE] Figure saved to: {fig_path}") + + if show_plots: + plt.show() + else: + plt.close(fig) + +# ============= Main logic ============= + +def main(): + ensure_dir(OUTPUT_DIR) + ensure_dir(FIG_DIR) + + files = find_csv_files(ROOT_DIR, GLOB_PATTERNS) + if not files: + print(f"No files matching patterns {GLOB_PATTERNS} found under '{ROOT_DIR}'") + return + + print("Found CSV files:") + for f in files: + print(" ", f) + + dfs = [] + for path in files: + df = read_single_csv(path) + if df is None: + continue + df = normalize_columns(df) + + # Verify required columns exist post-normalization, else report and skip + required_any = {"Kernel", "Variant", "Problem size", "Mean flops (gigaFLOP per sec.)"} + if not required_any.issubset(set(df.columns)): + print(f"[SKIP] {path} missing required columns after normalization.") + print(" Columns present:", list(df.columns)) + continue + + dfs.append(df) + + if not dfs: + print("No CSV files could be parsed with required columns.") + return + + combined_df = pd.concat(dfs, ignore_index=True) + + # Basic cleaning + combined_df["Kernel"] = combined_df["Kernel"].astype(str).str.strip() + combined_df["Variant"] = combined_df["Variant"].astype(str).str.strip() + + # Optional kernel filter + if KERNEL_WHITELIST: + wl = [w.lower() for w in KERNEL_WHITELIST] + combined_df = combined_df[ + combined_df["Kernel"].str.lower().apply(lambda k: any(w in k for w in wl)) + ] + if combined_df.empty: + print("After applying KERNEL_WHITELIST, no rows remain.") + return + + # Convert numeric columns + combined_df["Problem size"] = pd.to_numeric(combined_df["Problem size"], errors="coerce") + combined_df["Mean flops (gigaFLOP per sec.)"] = pd.to_numeric( + combined_df["Mean flops (gigaFLOP per sec.)"], errors="coerce" + ) + + # Drop rows without x or y + before_drop = len(combined_df) + combined_df = combined_df.dropna(subset=["Problem size", "Mean flops (gigaFLOP per sec.)"]) + dropped = before_drop - len(combined_df) + if dropped > 0: + print(f"[CLEAN] Dropped {dropped} rows with non-numeric Problem size or Mean flops.") + + # Backend classification + combined_df["Backend"] = combined_df["Variant"].apply(classify_backend_from_variant) + + # NEW: derive Tuning column + combined_df["Tuning"] = combined_df.apply(classify_tuning, axis=1) + + # Save concatenated table to CSV, now including Backend and Tuning + ensure_dir(os.path.dirname(COMBINED_CSV_PATH)) + combined_df.to_csv(COMBINED_CSV_PATH, index=False) + print(f"[SAVE] Combined table saved to: {COMBINED_CSV_PATH}") + + # Quick summary to help verify MASS3DPA is present + print("\nKernels discovered:") + print(sorted(combined_df["Kernel"].unique())) + + print("\nCounts by Kernel and Backend:") + summary = ( + combined_df.groupby(["Kernel", "Backend"]) + .size() + .reset_index(name="rows") + .sort_values(["Kernel", "Backend"]) + ) + for _, row in summary.iterrows(): + print(f" Kernel={row['Kernel']}, Backend={row['Backend']}: rows={row['rows']}") + + # Plot aggregated "All" view so kernels appear even if backend classification is Unknown + print("\n[Plot] Generating 'All' plots per kernel...") + plot_backend(combined_df, "All", FIG_DIR, SHOW_PLOTS, FIG_DPI) + + # Plot per requested backends, only if data exists + for b in ["CUDA", "HIP", "Seq", "OpenMP"]: + df_b = combined_df[combined_df["Backend"] == b] + if df_b.empty: + print(f"[Plot] Skipping backend {b}, no rows.") + continue + print(f"\n[Plot] Generating '{b}' plots per kernel...") + plot_backend(df_b, b, FIG_DIR, SHOW_PLOTS, FIG_DPI) + 
+if __name__ == "__main__":
+    main()

From daefd53d6ee599532b83d1fee8ab713d69d24bff Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Tue, 13 Jan 2026 12:00:53 -0800
Subject: [PATCH 4/5] Move benchmarking scripts to subdirectory

---
 scripts/{ => benchmarking}/run_kernels.sh              | 0
 scripts/{ => benchmarking}/study_run_kernel_tunings.py | 0
 scripts/{ => benchmarking}/study_run_kernels.py        | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename scripts/{ => benchmarking}/run_kernels.sh (100%)
 rename scripts/{ => benchmarking}/study_run_kernel_tunings.py (100%)
 rename scripts/{ => benchmarking}/study_run_kernels.py (100%)

diff --git a/scripts/run_kernels.sh b/scripts/benchmarking/run_kernels.sh
similarity index 100%
rename from scripts/run_kernels.sh
rename to scripts/benchmarking/run_kernels.sh
diff --git a/scripts/study_run_kernel_tunings.py b/scripts/benchmarking/study_run_kernel_tunings.py
similarity index 100%
rename from scripts/study_run_kernel_tunings.py
rename to scripts/benchmarking/study_run_kernel_tunings.py
diff --git a/scripts/study_run_kernels.py b/scripts/benchmarking/study_run_kernels.py
similarity index 100%
rename from scripts/study_run_kernels.py
rename to scripts/benchmarking/study_run_kernels.py

From f5f8bc7fb6c49cd4485bc3337b9dc06b83330dd0 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Tue, 13 Jan 2026 15:48:31 -0800
Subject: [PATCH 5/5] Add new script to run full benchmark with MPI

---
 .../benchmarking/run_full_benchmark-mpi.sh       | 47 +++++++++++++++++++
 scripts/benchmarking/run_kernels.sh              |  0
 .../benchmarking/study_run_kernel_tunings.py     |  0
 scripts/benchmarking/study_run_kernels.py        |  0
 4 files changed, 47 insertions(+)
 create mode 100755 scripts/benchmarking/run_full_benchmark-mpi.sh
 mode change 100644 => 100755 scripts/benchmarking/run_kernels.sh
 mode change 100644 => 100755 scripts/benchmarking/study_run_kernel_tunings.py
 mode change 100644 => 100755 scripts/benchmarking/study_run_kernels.py

diff --git a/scripts/benchmarking/run_full_benchmark-mpi.sh b/scripts/benchmarking/run_full_benchmark-mpi.sh
new file mode 100755
index 000000000..69aa61b23
--- /dev/null
+++ b/scripts/benchmarking/run_full_benchmark-mpi.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+
+## Run the benchmark kernels listed below, GPU non-lambda variants only,
+## on 4 MPI ranks, and write the results to the directory named by OUTDIR.
+
+flux alloc -xN1 -t 20 bash -c '
+
+OUTDIR=RPBenchmarkTestMPI
+
+# Collection of problem size factors between 0.5 and 6
+FACTORS=(0.5 1.0 2.0 3.0 4.0 5.0 6.0)
+
+# List of kernels to run
+KERNELS=("CONVECTION3DPA"
+         "DEL_DOT_VEC_2D"
+         "DIFFUSION3DPA"
+         "EDGE3D"
+         "ENERGY"
+         "INTSC_HEXHEX"
+         "INTSC_HEXRECT"
+         "LTIMES"
+         "MASS3DEA"
+         "MASSVEC3DPA"
+         "MATVEC_3D_STENCIL"
+         "NODAL_ACCUMULATION_3D"
+         "VOL3D"
+         "MULTI_REDUCE"
+         "REDUCE_STRUCT"
+         "HALO_EXCHANGE_FUSED")
+
+for KERNEL_NAME in "${KERNELS[@]}"; do
+  echo "Running kernel: $KERNEL_NAME"
+
+  for factor in "${FACTORS[@]}"; do
+    echo "  Running with sizefact = $factor"
+    flux run -xN1 -n4 ./bin/raja-perf.exe \
+      -k "$KERNEL_NAME" \
+      --npasses 1 \
+      --npasses-combiners Average Minimum Maximum \
+      --outdir ${OUTDIR} \
+      --outfile "${KERNEL_NAME}_factor_${factor}" \
+      --sizefact "$factor" \
+      --warmup-perfrun-same \
+      -ev Seq Lambda
+  done
+done
+'
diff --git a/scripts/benchmarking/run_kernels.sh b/scripts/benchmarking/run_kernels.sh
old mode 100644
new mode 100755
diff --git a/scripts/benchmarking/study_run_kernel_tunings.py b/scripts/benchmarking/study_run_kernel_tunings.py
old mode 100644
new mode 100755
diff --git a/scripts/benchmarking/study_run_kernels.py b/scripts/benchmarking/study_run_kernels.py
old mode 100644
new mode 100755
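
Note on the analysis scripts (not part of the patches above): the plateau
detection in report_near_zero_derivative_points() can be hard to follow from
the diff alone. The short sketch below is a minimal, self-contained
illustration of the same idea: smooth y, take dy/dx with np.gradient,
normalize the slope by y_range/x_range, restrict the search to the near-peak
region, and look for the first sustained run of small slopes. The synthetic
curve is made up for illustration; the 0.03 threshold, 0.9 peak fraction, and
run length of 3 mirror DERIV_EPS_REL, DERIV_MIN_FRAC_OF_MAX_Y, and
DERIV_MIN_CONSEC in the scripts but are otherwise assumptions, not results
from a real benchmark run.

    import numpy as np

    # Synthetic throughput curve: rises with problem size, then saturates.
    x = np.linspace(1e5, 1e7, 25)            # problem sizes (illustrative)
    y = 900.0 * (1.0 - np.exp(-x / 1e6))     # GFLOP/s-like saturation curve

    # 1) Light smoothing, as DERIV_SMOOTH_WINDOW does in the scripts.
    window = 3
    y_sm = np.convolve(y, np.ones(window) / window, mode="same")

    # 2) Slope, normalized so the threshold is dimensionless (cf. DERIV_EPS_REL).
    deriv = np.gradient(y_sm, x)
    norm = (y_sm.max() - y_sm.min()) / (x.max() - x.min())
    small = np.abs(deriv) / norm <= 0.03

    # 3) Only search near the peak (cf. DERIV_MIN_FRAC_OF_MAX_Y).
    small &= y_sm >= 0.9 * y_sm.max()

    # 4) First sustained run of >= 3 small-slope points marks the plateau
    #    (cf. DERIV_MIN_CONSEC and _find_first_run in the scripts).
    run = 0
    for i, flag in enumerate(small):
        run = run + 1 if flag else 0
        if run >= 3:
            start = i - run + 1
            print(f"Plateau starts near problem size {x[start]:.3g}")
            break

Running the sketch prints a single plateau point near the knee of the
synthetic curve, which is the same kind of "problem size where throughput
stops improving" that the [DERIV] lines from study_run_kernels.py and
study_run_kernel_tunings.py report for each kernel, variant, and tuning.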