From 1d102a7743e470c411c8976becf68bce2802389d Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@lassen20.coral.llnl.gov>
Date: Thu, 8 Jun 2023 16:51:31 -0700
Subject: [PATCH 01/13] Implement Caliper from PR254 Without NAME.TUNING
 Sub-Nodes

---
 CMakeLists.txt               |  28 +++++
 src/CMakeLists.txt           |   1 +
 src/common/CMakeLists.txt    |   1 +
 src/common/Executor.cpp      | 117 ++++++++++++++++++
 src/common/Executor.hpp      |   4 +
 src/common/KernelBase.cpp    |  67 ++++++++++
 src/common/KernelBase.hpp    | 233 ++++++++++++++++++++++++++++++++++-
 src/common/RAJAPerfSuite.hpp |   6 +
 src/common/RunParams.cpp     |  29 +++++
 src/common/RunParams.hpp     |   8 ++
 src/rajaperf_config.hpp.in   |  39 +++---
 11 files changed, 510 insertions(+), 23 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f34947517..253d7d872 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -124,6 +124,34 @@ if ((ENABLE_HIP) AND (NOT ENABLE_KOKKOS))
   list(APPEND RAJA_PERFSUITE_DEPENDS blt::hip_runtime)
 endif()
 
+#
+# Are we using Caliper
+#
+set(RAJA_PERFSUITE_USE_CALIPER off CACHE BOOL "")
+if (RAJA_PERFSUITE_USE_CALIPER)
+  find_package(caliper REQUIRED)
+  list(APPEND RAJA_PERFSUITE_DEPENDS caliper)
+  add_definitions(-DRAJA_PERFSUITE_USE_CALIPER)
+  message(STATUS "Using Caliper")
+  find_package(adiak REQUIRED)
+  # use ${adiak_LIBRARIES} since version could have adiak vs adiak::adiak export
+  list(APPEND RAJA_PERFSUITE_DEPENDS ${adiak_LIBRARIES})
+  if (ENABLE_CUDA)
+    # Adiak propagates -pthread from Spectrum MPI when Caliper is installed via Spack with +mpi; this must be handled even if RAJAPerf is built as a non-MPI program.
+    # Ideally BLT would handle an unguarded -pthread coming from any dependency, but it currently does not.
+    set_target_properties(${adiak_LIBRARIES} PROPERTIES INTERFACE_COMPILE_OPTIONS "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-pthread>;$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-pthread>")
+    # the following for adiak-0.2.2
+    if (TARGET adiak::mpi)
+      set_target_properties(adiak::mpi PROPERTIES INTERFACE_COMPILE_OPTIONS "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-pthread>;$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-pthread>")
+    endif ()
+  endif ()
+  message(STATUS "Caliper includes : ${caliper_INCLUDE_DIR}")
+  message(STATUS "Adiak includes : ${adiak_INCLUDE_DIRS}")
+  include_directories(${caliper_INCLUDE_DIR})
+  include_directories(${adiak_INCLUDE_DIRS})
+endif ()
+
+
 set(RAJAPERF_BUILD_SYSTYPE $ENV{SYS_TYPE})
 set(RAJAPERF_BUILD_HOST $ENV{HOSTNAME})
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b0d8b31a9..036ec7a89 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -255,6 +255,7 @@ else()
 blt_add_executable(
   NAME raja-perf.exe
   SOURCES RAJAPerfSuiteDriver.cpp
+  INCLUDES ${PROJECT_BINARY_DIR}/include
   DEPENDS_ON ${RAJA_PERFSUITE_EXECUTABLE_DEPENDS}
   )
 install( TARGETS raja-perf.exe
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 47db79deb..9dff522bd 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -14,5 +14,6 @@ blt_add_library(
           OutputUtils.cpp 
           RAJAPerfSuite.cpp 
           RunParams.cpp
+  INCLUDES ${PROJECT_BINARY_DIR}/include/
   DEPENDS_ON ${RAJA_PERFSUITE_DEPENDS}
   )
diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp
index e62307e04..a94d2e8c4 100644
--- a/src/common/Executor.cpp
+++ b/src/common/Executor.cpp
@@ -26,6 +26,7 @@
 #include <list>
 #include <vector>
 #include <string>
+#include <regex>
 #include <unordered_map>
 
 #include <iostream>
@@ -42,6 +43,16 @@ namespace rajaperf {
 
 using namespace std;
 
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+// Split str at every match of regex_str; returns the pieces between matches.
+vector<string> split(const string& str, const string& regex_str)
+{
+  const regex delimiter(regex_str);
+  return vector<string>(sregex_token_iterator(str.begin(), str.end(), delimiter, -1),
+                        sregex_token_iterator());
+}
+#endif
+
 namespace {
 
 #if defined(RAJA_PERFSUITE_ENABLE_MPI)
@@ -105,6 +116,94 @@ Executor::Executor(int argc, char** argv)
     reference_vid(NumVariants),
     reference_tune_idx(KernelBase::getUnknownTuningIdx())
 {
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+  struct configuration cc;
+  adiak::init(NULL);
+  adiak::user();
+  adiak::launchdate();
+  adiak::libraries();
+  adiak::cmdline();
+  adiak::clustername();
+  adiak::value("perfsuite_version", cc.adiak_perfsuite_version);
+  adiak::value("raja_version", cc.adiak_raja_version);
+  adiak::value("cmake_build_type", cc.adiak_cmake_build_type);
+  adiak::value("cmake_cxx_flags", cc.adiak_cmake_cxx_flags);
+  adiak::value("cmake_exe_linker_flags", cc.adiak_cmake_exe_linker_flags);
+  adiak::value("rajaperf_compiler", cc.adiak_rajaperf_compiler);
+  adiak::value("rajaperf_compiler_options", cc.adiak_rajaperf_compiler_options);
+  adiak::value("compiler_version", cc.adiak_compiler_version);
+
+  auto tokens = split(cc.adiak_rajaperf_compiler, "/");
+  string compiler_exec = tokens.back();
+  string compiler = compiler_exec + "-" + cc.adiak_compiler_version;
+  cout << "Compiler: " << compiler << "\n";
+  adiak::value("compiler", compiler.c_str());
+  auto tsize = tokens.size();
+  if (tsize >= 3) {
+    // pickup path version <compiler-version-hash|date>/bin/exec
+    string path_version = tokens[tsize-3];
+    //cout << "Compiler path version: " << path_version << "\n";
+    auto s = split(path_version,"-");
+    if (s.size() >= 2) {
+      string path_version_short = s[0] + "-" + s[1];
+      //cout << "Compiler path version short: " << path_version_short << "\n";
+      adiak::value("Compiler_path_version",path_version_short.c_str());
+    } 
+  }
+
+  if (strlen(cc.adiak_cuda_compiler_version) > 0) {
+    adiak::value("cuda_compiler_version", cc.adiak_cuda_compiler_version);
+  }
+  if (strlen(cc.adiak_gpu_targets) > 0) {
+    adiak::value("gpu_targets", cc.adiak_gpu_targets);
+  }
+  if (strlen(cc.adiak_cmake_hip_architectures) > 0) {
+    adiak::value("cmake_hip_architectures", cc.adiak_cmake_hip_architectures);
+  }
+  if (strlen(cc.adiak_gpu_targets_block_sizes) > 0) {
+    adiak::value("gpu_targets_block_sizes", cc.adiak_gpu_targets_block_sizes);
+  }
+  if (strlen(cc.adiak_raja_hipcc_flags) > 0) {
+    adiak::value("raja_hipcc_flags", cc.adiak_raja_hipcc_flags);
+  }
+  if (strlen(cc.adiak_mpi_cxx_compiler) > 0) {
+    adiak::value("mpi_cxx_compiler", cc.adiak_mpi_cxx_compiler);
+  }
+  if (strlen(cc.adiak_systype_build) > 0) {
+    adiak::value("systype_build", cc.adiak_systype_build);
+  }
+  if (strlen(cc.adiak_machine_build) > 0) {
+    adiak::value("machine_build", cc.adiak_machine_build);
+  }
+
+  adiak::value("ProblemSizeRunParam",(double)1.0);
+  adiak::value("SizeMeaning",run_params.SizeMeaningToStr(run_params.getSizeMeaning()).c_str());
+  if (run_params.getSizeMeaning() == RunParams::SizeMeaning::Factor) {
+    adiak::value("ProblemSizeRunParam",(double)run_params.getSizeFactor());
+  } else if (run_params.getSizeMeaning() == RunParams::SizeMeaning::Direct) {
+    adiak::value("ProblemSizeRunParam",(double)run_params.getSize());
+  }
+
+  // Openmp section
+#if defined(_OPENMP)
+  std::string strval = "";
+  std::string test = std::to_string(_OPENMP);
+
+  std::unordered_map<unsigned,std::string> map{
+    {200505,"2.5"},{200805,"3.0"},{201107,"3.1"},{201307,"4.0"},{201511,"4.5"},{201611,"4.5"},{201811,"5.0"},{202011,"5.1"},{202111,"5.2"}};
+
+  try {
+    strval = map.at(_OPENMP);
+  } catch(...) {
+    strval="Version Not Detected";
+  }
+  std::cerr << "_OPENMP:" << test << " at version: " << strval << "\n";
+  adiak::value("omp_version",strval.c_str());
+  strval = std::to_string(omp_get_max_threads());
+  adiak::value("omp_max_threads",strval.c_str());
+#endif
+
+#endif
 }
 
 
@@ -113,6 +212,9 @@ Executor::~Executor()
   for (size_t ik = 0; ik < kernels.size(); ++ik) {
     delete kernels[ik];
   }
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+  adiak::fini();
+#endif
 }
 
 
@@ -607,6 +709,11 @@ void Executor::setupSuite()
       for (VIDset::iterator vid = run_var.begin();
            vid != run_var.end(); ++vid) {
         variant_ids.push_back( *vid );
+      #if defined(RAJA_PERFSUITE_USE_CALIPER)
+          KernelBase::setCaliperMgrVariant(*vid,
+                                            run_params.getOutputDirName(),
+                                            run_params.getAddToSpotConfig());
+      #endif
       }
 
       //
@@ -932,7 +1039,13 @@ void Executor::runSuite()
 
     for (size_t ik = 0; ik < warmup_kernels.size(); ++ik) {
       KernelBase* warmup_kernel = warmup_kernels[ik];
+      #if defined(RAJA_PERFSUITE_USE_CALIPER)
+        warmup_kernel->caliperOff();
+      #endif
       runKernel(warmup_kernel, true);
+      #if defined(RAJA_PERFSUITE_USE_CALIPER)
+        warmup_kernel->caliperOn();
+      #endif
       delete warmup_kernel;
       warmup_kernels[ik] = nullptr;
     }
@@ -954,6 +1067,10 @@ void Executor::runSuite()
 
   } // loop over passes through suite
 
+  #if defined(RAJA_PERFSUITE_USE_CALIPER)
+    // Flush Caliper data
+    KernelBase::setCaliperMgrFlush();
+  #endif
 }
 
 template < typename Kernel >
diff --git a/src/common/Executor.hpp b/src/common/Executor.hpp
index 6bca5a1d2..2aaeeb10a 100644
--- a/src/common/Executor.hpp
+++ b/src/common/Executor.hpp
@@ -12,6 +12,10 @@
 #include "common/RAJAPerfSuite.hpp"
 #include "common/RunParams.hpp"
 
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+#include "rajaperf_config.hpp"
+#endif
+
 #include <iosfwd>
 #include <streambuf>
 #include <memory>
diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp
index cbc2083dd..7aab4a9c1 100644
--- a/src/common/KernelBase.cpp
+++ b/src/common/KernelBase.cpp
@@ -44,6 +44,35 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params)
   running_tuning = getUnknownTuningIdx();
 
   checksum_scale_factor = 1.0;
+
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+  // Init Caliper column metadata attributes 
+  // Aggregatable attributes need to be initialized before manager.start()
+  ProblemSize_attr = cali_create_attribute("ProblemSize", CALI_TYPE_DOUBLE,
+                                           CALI_ATTR_ASVALUE | 
+                                           CALI_ATTR_AGGREGATABLE | 
+                                           CALI_ATTR_SKIP_EVENTS);
+  Reps_attr = cali_create_attribute("Reps", CALI_TYPE_DOUBLE,
+                                    CALI_ATTR_ASVALUE | 
+                                    CALI_ATTR_AGGREGATABLE | 
+                                    CALI_ATTR_SKIP_EVENTS);
+  Iters_Rep_attr = cali_create_attribute("Iterations/Rep", CALI_TYPE_DOUBLE,
+                                         CALI_ATTR_ASVALUE | 
+                                         CALI_ATTR_AGGREGATABLE | 
+                                         CALI_ATTR_SKIP_EVENTS);
+  Kernels_Rep_attr = cali_create_attribute("Kernels/Rep", CALI_TYPE_DOUBLE,
+                                           CALI_ATTR_ASVALUE | 
+                                           CALI_ATTR_AGGREGATABLE | 
+                                           CALI_ATTR_SKIP_EVENTS);
+  Bytes_Rep_attr = cali_create_attribute("Bytes/Rep", CALI_TYPE_DOUBLE, 
+                                         CALI_ATTR_ASVALUE | 
+                                         CALI_ATTR_AGGREGATABLE | 
+                                         CALI_ATTR_SKIP_EVENTS);
+  Flops_Rep_attr = cali_create_attribute("Flops/Rep", CALI_TYPE_DOUBLE,
+                                         CALI_ATTR_ASVALUE | 
+                                         CALI_ATTR_AGGREGATABLE | 
+                                         CALI_ATTR_SKIP_EVENTS);
+#endif
 }
 
 
@@ -156,6 +185,9 @@ void KernelBase::setVariantDefined(VariantID vid)
   min_time[vid].resize(variant_tuning_names[vid].size(), std::numeric_limits<double>::max());
   max_time[vid].resize(variant_tuning_names[vid].size(), -std::numeric_limits<double>::max());
   tot_time[vid].resize(variant_tuning_names[vid].size(), 0.0);
+#if defined(RAJA_PERFSUITE_USE_CALIPER)   
+  doCaliMetaOnce[vid].resize(variant_tuning_names[vid].size(), true);
+#endif
 }
 
 int KernelBase::getDataAlignment() const
@@ -242,6 +274,12 @@ void KernelBase::runKernel(VariantID vid, size_t tune_idx)
     return;
   }
 
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+  if (doCaliperTiming) {
+    KernelBase::setCaliperMgrStart(vid);
+  }
+#endif
+
   switch ( vid ) {
 
     case Base_Seq :
@@ -312,6 +350,11 @@ void KernelBase::runKernel(VariantID vid, size_t tune_idx)
     }
 
   }
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+  if (doCaliperTiming) {
+    setCaliperMgrStop(vid); 
+  }
+#endif
 }
 
 void KernelBase::print(std::ostream& os) const
@@ -382,4 +425,28 @@ void KernelBase::print(std::ostream& os) const
   os << std::endl;
 }
 
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+void KernelBase::doOnceCaliMetaBegin(VariantID vid, size_t tune_idx)
+{
+  // attributes are class variables initialized in ctor
+  if(doCaliMetaOnce[vid].at(tune_idx)) {
+    cali_set_double(ProblemSize_attr,(double)getActualProblemSize());
+    cali_set_double(Reps_attr,(double)getRunReps());
+    cali_set_double(Iters_Rep_attr,(double)getItsPerRep());
+    cali_set_double(Kernels_Rep_attr,(double)getKernelsPerRep());
+    cali_set_double(Bytes_Rep_attr,(double)getBytesPerRep());
+    cali_set_double(Flops_Rep_attr,(double)getFLOPsPerRep());
+  }
+}
+
+void KernelBase::doOnceCaliMetaEnd(VariantID vid, size_t tune_idx)
+{
+  if(doCaliMetaOnce[vid].at(tune_idx)) {
+    doCaliMetaOnce[vid].at(tune_idx) = false;
+  }
+}
+
+// initialize a KernelBase static 
+std::map<rajaperf::VariantID, cali::ConfigManager> KernelBase::mgr;
+#endif
 }  // closing brace for rajaperf namespace
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index 5b431f05f..7b0a32414 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -29,9 +29,41 @@
 #include <string>
 #include <vector>
 #include <iostream>
+#include <map>
 #include <limits>
 #include <utility>
 
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+
+#define CALI_START \
+    if (doCaliperTiming) { \
+      std::string kstr = getName(); \
+      std::string gstr = getGroupName(kstr); \
+      std::string vstr = getVariantName(running_variant); \
+      doOnceCaliMetaBegin(running_variant,running_tuning); \
+      CALI_MARK_BEGIN(vstr.c_str()); \
+      CALI_MARK_BEGIN(gstr.c_str()); \
+      CALI_MARK_BEGIN(kstr.c_str()); \
+    }
+
+#define CALI_STOP \
+    if (doCaliperTiming) { \
+      std::string kstr = getName(); \
+      std::string gstr = getGroupName(kstr); \
+      std::string vstr = getVariantName(running_variant); \
+      CALI_MARK_END(kstr.c_str()); \
+      CALI_MARK_END(gstr.c_str()); \
+      CALI_MARK_END(vstr.c_str()); \
+      doOnceCaliMetaEnd(running_variant,running_tuning); \
+    }
+
+#else
+
+#define CALI_START
+#define CALI_STOP
+
+#endif
+
 namespace rajaperf {
 
 /*!
@@ -308,6 +340,7 @@ class KernelBase
     MPI_Barrier(MPI_COMM_WORLD);
 #endif
     timer.start();
+    CALI_START;
   }
 
   void stopTimer()
@@ -316,7 +349,7 @@ class KernelBase
 #if defined(RAJA_PERFSUITE_ENABLE_MPI)
     MPI_Barrier(MPI_COMM_WORLD);
 #endif
-    timer.stop(); recordExecTime();
+    CALI_STOP; timer.stop(); recordExecTime();
   }
 
   void resetTimer() { timer.reset(); }
@@ -359,6 +392,188 @@ class KernelBase
   }
 #endif
 
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+  void caliperOn() { doCaliperTiming = true; }
+  void caliperOff() { doCaliperTiming = false; }
+  void doOnceCaliMetaBegin(VariantID vid, size_t tune_idx);
+  void doOnceCaliMetaEnd(VariantID vid, size_t tune_idx);
+  static void setCaliperMgrVariant(VariantID vid, const std::string& outdir, 
+                                   const std::string& addToConfig)
+  {
+    static bool ran_spot_config_check = false;
+    bool config_ok = true;
+    const std::string problem_size_json_spec = R"json(
+    {
+        "name"        : "problem_size", 
+        "type"        : "boolean",
+        "category"    : "metric",
+        "description" : "problem size",
+        "query" :
+        [
+            { "level"    : "local",
+              "select": { "expr": "any(max#ProblemSize)", "as": "ProblemSize" },
+            },
+            { "level"    : "cross",
+              "select": { "expr": "any(any#max#ProblemSize)", "as": "ProblemSize" },
+            }
+        ]
+    }
+)json";
+
+    const std::string reps_json_spec = R"json(
+    {
+        "name"        : "reps",
+        "type"        : "boolean",
+        "category"    : "metric",
+        "description" : "reps",
+        "query" :
+        [
+            { "level"    : "local",
+              "select": { "expr": "any(max#Reps)", "as": "Reps" },
+            },
+            { "level"    : "cross",
+              "select": { "expr": "any(any#max#Reps)", "as": "Reps" },
+            }
+        ]
+    }
+)json";
+
+    const std::string iters_json_spec = R"json(
+    {
+        "name"        : "iters_p_rep",
+        "type"        : "boolean",
+        "category"    : "metric",
+        "description" : "iterations per rep",
+        "query" :
+        [
+            { "level"    : "local",
+              "select": { "expr": "any(max#Iterations/Rep)", "as": "Iterations/Rep" },
+            },
+            { "level"    : "cross",
+              "select": { "expr": "any(any#max#Iterations/Rep)", "as": "Iterations/Rep" },
+            }
+        ]
+    }
+)json";
+
+    const std::string kernels_json_spec = R"json(
+    {
+        "name"        : "kernels_p_rep",
+        "type"        : "boolean",
+        "category"    : "metric",
+        "description" : "kernels per rep",
+        "query" :
+        [
+            { "level"    : "local",
+              "select": { "expr": "any(max#Kernels/Rep)", "as": "Kernels/Rep" },
+            },
+            { "level"    : "cross",
+              "select": { "expr": "any(any#max#Kernels/Rep)", "as": "Kernels/Rep" },
+            }
+        ]
+    }
+)json";
+
+    const std::string bytes_json_spec = R"json(
+    {
+        "name"        : "bytes_p_rep",
+        "type"        : "boolean",
+        "category"    : "metric",
+        "description" : "bytes per rep",
+        "query" :
+        [
+            { "level"    : "local",
+              "select": { "expr": "any(max#Bytes/Rep)", "as": "Bytes/Rep" },
+            },
+            { "level"    : "cross",
+              "select": { "expr": "any(any#max#Bytes/Rep)", "as": "Bytes/Rep" },
+            }
+        ]
+    }
+)json";
+
+    const std::string flops_rep_json_spec = R"json(
+    {
+        "name"        : "flops_p_rep",
+        "type"        : "boolean",
+        "category"    : "metric",
+        "description" : "flops per rep",
+        "query" :
+        [
+            { "level"    : "local",
+              "select": { "expr": "any(max#Flops/Rep)", "as": "Flops/Rep" },
+            },
+            { "level"    : "cross",
+              "select": { "expr": "any(any#max#Flops/Rep)", "as": "Flops/Rep" },
+            }
+        ]
+    }
+)json";
+
+    if(!ran_spot_config_check && (!addToConfig.empty())) {
+      cali::ConfigManager cm;
+      std::string check_profile = "spot()," + addToConfig;
+      std::string msg = cm.check(check_profile.c_str());
+      if(!msg.empty()) {
+        std::cerr << "Problem with Cali Config: " << check_profile << "\n";
+        std::cerr << "Check your command line argument: " << addToConfig << "\n";
+        config_ok = false;
+        exit(-1);
+      }
+      ran_spot_config_check = true;
+      std::cout << "Caliper ran Spot config check\n";
+    }
+
+    if(config_ok) {
+      cali::ConfigManager m;
+      mgr.insert(std::make_pair(vid, m));
+      std::string od("./");
+      if (outdir.size()) {
+        od = outdir + "/";
+      }
+      std::string vstr = getVariantName(vid);
+      std::string profile = "spot(output=" + od + vstr + ".cali)";
+      if(!addToConfig.empty()) {
+        profile += "," + addToConfig;
+      }
+      std::cout << "Profile: " << profile << std::endl;
+      mgr[vid].add_option_spec(problem_size_json_spec.c_str());
+      mgr[vid].set_default_parameter("problem_size", "true");
+      mgr[vid].add_option_spec(reps_json_spec.c_str());
+      mgr[vid].set_default_parameter("reps", "true");
+      mgr[vid].add_option_spec(iters_json_spec.c_str());
+      mgr[vid].set_default_parameter("iters_p_rep", "true");
+      mgr[vid].add_option_spec(kernels_json_spec.c_str());
+      mgr[vid].set_default_parameter("kernels_p_rep", "true");
+      mgr[vid].add_option_spec(bytes_json_spec.c_str());
+      mgr[vid].set_default_parameter("bytes_p_rep", "true");
+      mgr[vid].add_option_spec(flops_rep_json_spec.c_str());
+      mgr[vid].set_default_parameter("flops_p_rep", "true");
+      mgr[vid].add(profile.c_str());
+    }
+  }
+
+  static void setCaliperMgrStart(VariantID vid) { mgr[vid].start(); }
+  static void setCaliperMgrStop(VariantID vid) { mgr[vid].stop(); }
+  static void setCaliperMgrFlush() 
+  { // we're going to flush all the variants at once
+    std::cout << "flushing " << mgr.size() << " variants\n";
+    for(auto const &kv : mgr) {
+      // set Adiak key first
+      std::string variant=getVariantName(kv.first);
+      adiak::value("variant",variant.c_str());
+      mgr[kv.first].flush(); 
+    }
+  }
+
+  std::string getGroupName(const std::string &kname )
+  {
+    std::size_t found = kname.find("_");
+    return kname.substr(0,found);
+  }
+
+#endif
+
 protected:
   const RunParams& run_params;
 
@@ -404,6 +619,22 @@ class KernelBase
 
   RAJA::Timer timer;
 
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+  bool doCaliperTiming = true; // warmup can use this to exclude timing
+  std::vector<bool> doCaliMetaOnce[NumVariants];
+  cali_id_t ProblemSize_attr; // in ctor cali_create_attribute("ProblemSize",CALI_TYPE_DOUBLE,CALI_ATTR_ASVALUE | CALI_ATTR_AGGREGATABLE | CALI_ATTR_SKIP_EVENTS);
+  cali_id_t Reps_attr;
+  cali_id_t Iters_Rep_attr;
+  cali_id_t Kernels_Rep_attr;
+  cali_id_t Bytes_Rep_attr;
+  cali_id_t Flops_Rep_attr;
+
+
+      // we need a Caliper Manager object per variant
+// we can inline this with c++17
+  static std::map<rajaperf::VariantID, cali::ConfigManager> mgr;
+#endif
+
   std::vector<RAJA::Timer::ElapsedType> min_time[NumVariants];
   std::vector<RAJA::Timer::ElapsedType> max_time[NumVariants];
   std::vector<RAJA::Timer::ElapsedType> tot_time[NumVariants];
diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp
index 4c37f6847..e63ab9f8e 100644
--- a/src/common/RAJAPerfSuite.hpp
+++ b/src/common/RAJAPerfSuite.hpp
@@ -19,6 +19,12 @@
 #include <string>
 #include <ostream>
 
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+#include <caliper/cali.h>
+#include <caliper/cali-manager.h>
+#include <adiak.hpp>
+#endif
+
 namespace rajaperf
 {
 
diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp
index df30b5c38..66ecdee28 100644
--- a/src/common/RunParams.cpp
+++ b/src/common/RunParams.cpp
@@ -59,6 +59,9 @@ RunParams::RunParams(int argc, char** argv)
    invalid_npasses_combiner_input(),
    outdir(),
    outfile_prefix("RAJAPerf"),
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+   add_to_spot_config(),
+#endif
    disable_warmup(false)
 {
   parseCommandLineOptions(argc, argv);
@@ -115,6 +118,12 @@ void RunParams::print(std::ostream& str) const
   str << "\n outdir = " << outdir;
   str << "\n outfile_prefix = " << outfile_prefix;
 
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+  if (add_to_spot_config.length() > 0) {
+    str << "\n add_to_spot_config = " << add_to_spot_config;
+  }
+#endif
+
   str << "\n disable_warmup = " << disable_warmup;
 
   str << "\n seq data space = " << getDataSpaceName(seqDataSpace);
@@ -683,6 +692,19 @@ void RunParams::parseCommandLineOptions(int argc, char** argv)
         }
 
       }
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+    } else if ( std::string(argv[i]) == std::string("--add-to-spot-config") ||
+               std::string(argv[i]) == std::string("-atsc") ) {
+      i++;
+      if ( i < argc ) {
+        opt = std::string(argv[i]);
+        if ( opt.at(0) == '-' ) {
+          i--;
+        } else {
+          add_to_spot_config = std::string( argv[i] );
+        }
+      }
+#endif
 
     } else {
 
@@ -893,6 +915,13 @@ void RunParams::printHelpMessage(std::ostream& str) const
   str << "\t\t Example...\n"
       << "\t\t --checkrun 2 (run each kernel twice)\n\n";
 
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+  str << "\t --add-to-spot-config, -atsc <string> [Default is none]\n"
+      << "\t\t appends additional parameters to the built-in Caliper spot config\n";
+  str << "\t\t Example to include some PAPI counters (Intel arch)\n"
+      << "\t\t -atsc topdown.all\n\n";
+#endif
+
   str << std::endl;
   str.flush();
 }
diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp
index 3a8119c44..c3f9500eb 100644
--- a/src/common/RunParams.hpp
+++ b/src/common/RunParams.hpp
@@ -217,6 +217,10 @@ class RunParams {
   const std::string& getOutputDirName() const { return outdir; }
   const std::string& getOutputFilePrefix() const { return outfile_prefix; }
 
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+  const std::string& getAddToSpotConfig() const { return add_to_spot_config; }
+#endif
+
   bool getDisableWarmup() const { return disable_warmup; }
 
 //@}
@@ -303,6 +307,10 @@ class RunParams {
   std::string outdir;          /*!< Output directory name. */
   std::string outfile_prefix;  /*!< Prefix for output data file names. */
 
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+  std::string add_to_spot_config;
+#endif
+
   bool disable_warmup;
 
 };
diff --git a/src/rajaperf_config.hpp.in b/src/rajaperf_config.hpp.in
index 726868052..f43b43dca 100644
--- a/src/rajaperf_config.hpp.in
+++ b/src/rajaperf_config.hpp.in
@@ -40,28 +40,23 @@
 namespace rajaperf {
 
 struct configuration {
-#if 0
-// Version of RAJA Perf Suite (ex: 0.1.0)
-static const std::string perfsuite_version =
-"@RAJA_PERFSUITE_VERSION_MAJOR@" + std::string(".") +
-"@RAJA_PERFSUITE_VERSION_MINOR@" + std::string(".") +
-"@RAJA_PERFSUITE_VERSION_PATCHLEVEL@";
-
-// Version of RAJA used to build (ex: 0.2.4)
-static const std::string raja_version =
-std::to_string(RAJA::RAJA_VERSION_MAJOR) + std::string(".") +
-std::to_string(RAJA::RAJA_VERSION_MINOR) + std::string(".") +
-std::to_string(RAJA::RAJA_VERSION_PATCH_LEVEL);
-
-// Systype and machine code was built on (ex: chaos_5_x64_64, rzhasgpu18)
-static const std::string systype_build = "@RAJAPERF_BUILD_SYSTYPE@";
-static const std::string machine_build = "@RAJAPERF_BUILD_HOST@";
-
-// Compiler used to build (ex: gcc-4.9.3)
-static const std::string compiler = "@RAJAPERF_COMPILER@";
-
-// Command options used to build (ex: -Ofast -mavx)
-static const std::string compiler_options = "@RAJAPERF_COMPILER_OPTIONS@";
+#if defined(RAJA_PERFSUITE_USE_CALIPER)
+constexpr static const char* adiak_perfsuite_version = "@CMAKE_PROJECT_VERSION@";
+constexpr static const char* adiak_raja_version = "@RAJA_LOADED@";
+constexpr static const char* adiak_cmake_build_type = "@CMAKE_BUILD_TYPE@";
+constexpr static const char* adiak_cmake_cxx_flags = "@CMAKE_CXX_FLAGS@";
+constexpr static const char* adiak_cmake_exe_linker_flags = "@CMAKE_EXE_LINKER_FLAGS@";
+constexpr static const char* adiak_rajaperf_compiler = "@RAJAPERF_COMPILER@"; 
+constexpr static const char* adiak_rajaperf_compiler_options = "@RAJAPERF_COMPILER_OPTIONS@"; 
+constexpr static const char* adiak_compiler_version = "@CMAKE_CXX_COMPILER_VERSION@"; 
+constexpr static const char* adiak_cuda_compiler_version = "@CMAKE_CUDA_COMPILER_VERSION@"; 
+constexpr static const char* adiak_gpu_targets = "@GPU_TARGETS@"; 
+constexpr static const char* adiak_cmake_hip_architectures = "@CMAKE_HIP_ARCHITECTURES@"; // filled from CMake's CMAKE_HIP_ARCHITECTURES
+constexpr static const char* adiak_gpu_targets_block_sizes = "@RAJA_PERFSUITE_GPU_BLOCKSIZES@"; 
+constexpr static const char* adiak_raja_hipcc_flags = "@RAJA_HIPCC_FLAGS@"; 
+constexpr static const char* adiak_mpi_cxx_compiler = "@MPI_CXX_COMPILER@"; 
+constexpr static const char* adiak_systype_build = "@RAJAPERF_BUILD_SYSTYPE@"; 
+constexpr static const char* adiak_machine_build = "@RAJAPERF_BUILD_HOST@"; 
 #endif
 
 // helper alias to void trailing comma in no-arg case

From 33c8dc48a7ab3154f1b1299bbf1f19974868d430 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@quartz770.llnl.gov>
Date: Fri, 9 Jun 2023 09:55:22 -0700
Subject: [PATCH 02/13] Set top-level node statically. This information
 (variant) already exists in the metadata.

---
 src/common/KernelBase.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index 7b0a32414..bd129f6b1 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -39,7 +39,7 @@
     if (doCaliperTiming) { \
       std::string kstr = getName(); \
       std::string gstr = getGroupName(kstr); \
-      std::string vstr = getVariantName(running_variant); \
+      std::string vstr = "RAJAPerf"; \
       doOnceCaliMetaBegin(running_variant,running_tuning); \
       CALI_MARK_BEGIN(vstr.c_str()); \
       CALI_MARK_BEGIN(gstr.c_str()); \
@@ -50,7 +50,7 @@
     if (doCaliperTiming) { \
       std::string kstr = getName(); \
       std::string gstr = getGroupName(kstr); \
-      std::string vstr = getVariantName(running_variant); \
+      std::string vstr = "RAJAPerf"; \
       CALI_MARK_END(kstr.c_str()); \
       CALI_MARK_END(gstr.c_str()); \
       CALI_MARK_END(vstr.c_str()); \

From bf61a9696b27ccac950cb3035f056839c44d3e3d Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@lassen25.coral.llnl.gov>
Date: Wed, 14 Jun 2023 14:32:50 -0700
Subject: [PATCH 03/13] Add runtime gpu block size to adiak metadata

---
 src/common/Executor.cpp  | 4 ++++
 src/common/RunParams.hpp | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp
index a94d2e8c4..6c3073fc2 100644
--- a/src/common/Executor.cpp
+++ b/src/common/Executor.cpp
@@ -184,6 +184,10 @@ Executor::Executor(int argc, char** argv)
     adiak::value("ProblemSizeRunParam",(double)run_params.getSize());
   }
 
+  if (run_params.numValidGPUBlockSize() > 0) {
+    adiak::value("runtime_gpu_block_sizes", run_params.getGPUBlockSizeInput());
+  }
+
   // Openmp section
 #if defined(_OPENMP)
   std::string strval = "";
diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp
index c3f9500eb..2f70cec93 100644
--- a/src/common/RunParams.hpp
+++ b/src/common/RunParams.hpp
@@ -151,6 +151,9 @@ class RunParams {
 
   const std::string& getReferenceVariant() const { return reference_variant; }
 
+  const std::vector<size_t>& getGPUBlockSizeInput() const
+                                  { return gpu_block_sizes; }
+
   const std::vector<std::string>& getKernelInput() const
                                   { return kernel_input; }
   void setInvalidKernelInput( std::vector<std::string>& svec )

From 731ac0c091f432632347e3296dee3c58947c108c Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@lassen26.coral.llnl.gov>
Date: Wed, 14 Jun 2023 17:00:03 -0700
Subject: [PATCH 04/13] Add tuning to filename in the case of only 1 tuning

---
 src/common/Executor.cpp   | 3 ++-
 src/common/KernelBase.hpp | 9 +++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp
index 6c3073fc2..aea84fdb5 100644
--- a/src/common/Executor.cpp
+++ b/src/common/Executor.cpp
@@ -716,7 +716,8 @@ void Executor::setupSuite()
       #if defined(RAJA_PERFSUITE_USE_CALIPER)
           KernelBase::setCaliperMgrVariant(*vid,
                                             run_params.getOutputDirName(),
-                                            run_params.getAddToSpotConfig());
+                                            run_params.getAddToSpotConfig(),
+                                            run_params.getTuningInput());
       #endif
       }
 
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index bd129f6b1..befa6343d 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -398,7 +398,8 @@ class KernelBase
   void doOnceCaliMetaBegin(VariantID vid, size_t tune_idx);
   void doOnceCaliMetaEnd(VariantID vid, size_t tune_idx);
   static void setCaliperMgrVariant(VariantID vid, const std::string& outdir, 
-                                   const std::string& addToConfig)
+                                   const std::string& addToConfig,
+                                   const std::vector<std::string>& tuning_input)
   {
     static bool ran_spot_config_check = false;
     bool config_ok = true;
@@ -532,7 +533,11 @@ class KernelBase
         od = outdir + "/";
       }
       std::string vstr = getVariantName(vid);
-      std::string profile = "spot(output=" + od + vstr + ".cali)";
+      std::string tstr = "";
+      if (tuning_input.size() == 1) { // If only 1 tuning, add to file name
+        tstr = "-" + tuning_input.front();
+      }
+      std::string profile = "spot(output=" + od + vstr + tstr + ".cali)";
       if(!addToConfig.empty()) {
         profile += "," + addToConfig;
       }

From 13f5a085c40d8aceebd7270d269f43133ab97779 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@lassen16.coral.llnl.gov>
Date: Thu, 15 Jun 2023 16:59:57 -0700
Subject: [PATCH 05/13] Rename Cali metadata function and change functionality
 to run only once instead of per tuning

---
 src/common/KernelBase.cpp | 26 +++++++-------------------
 src/common/KernelBase.hpp |  7 ++-----
 2 files changed, 9 insertions(+), 24 deletions(-)

diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp
index 7aab4a9c1..bd5f47a59 100644
--- a/src/common/KernelBase.cpp
+++ b/src/common/KernelBase.cpp
@@ -185,9 +185,6 @@ void KernelBase::setVariantDefined(VariantID vid)
   min_time[vid].resize(variant_tuning_names[vid].size(), std::numeric_limits<double>::max());
   max_time[vid].resize(variant_tuning_names[vid].size(), -std::numeric_limits<double>::max());
   tot_time[vid].resize(variant_tuning_names[vid].size(), 0.0);
-#if defined(RAJA_PERFSUITE_USE_CALIPER)   
-  doCaliMetaOnce[vid].resize(variant_tuning_names[vid].size(), true);
-#endif
 }
 
 int KernelBase::getDataAlignment() const
@@ -426,24 +423,15 @@ void KernelBase::print(std::ostream& os) const
 }
 
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
-void KernelBase::doOnceCaliMetaBegin(VariantID vid, size_t tune_idx)
+void KernelBase::CaliMeta()
 {
   // attributes are class variables initialized in ctor
-  if(doCaliMetaOnce[vid].at(tune_idx)) {
-    cali_set_double(ProblemSize_attr,(double)getActualProblemSize());
-    cali_set_double(Reps_attr,(double)getRunReps());
-    cali_set_double(Iters_Rep_attr,(double)getItsPerRep());
-    cali_set_double(Kernels_Rep_attr,(double)getKernelsPerRep());
-    cali_set_double(Bytes_Rep_attr,(double)getBytesPerRep());
-    cali_set_double(Flops_Rep_attr,(double)getFLOPsPerRep());
-  }
-}
-
-void KernelBase::doOnceCaliMetaEnd(VariantID vid, size_t tune_idx)
-{
-  if(doCaliMetaOnce[vid].at(tune_idx)) {
-    doCaliMetaOnce[vid].at(tune_idx) = false;
-  }
+  cali_set_double(ProblemSize_attr,(double)getActualProblemSize());
+  cali_set_double(Reps_attr,(double)getRunReps());
+  cali_set_double(Iters_Rep_attr,(double)getItsPerRep());
+  cali_set_double(Kernels_Rep_attr,(double)getKernelsPerRep());
+  cali_set_double(Bytes_Rep_attr,(double)getBytesPerRep());
+  cali_set_double(Flops_Rep_attr,(double)getFLOPsPerRep());
 }
 
 // initialize a KernelBase static 
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index befa6343d..0d856ff9d 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -40,7 +40,7 @@
       std::string kstr = getName(); \
       std::string gstr = getGroupName(kstr); \
       std::string vstr = "RAJAPerf"; \
-      doOnceCaliMetaBegin(running_variant,running_tuning); \
+      CaliMeta(); \
       CALI_MARK_BEGIN(vstr.c_str()); \
       CALI_MARK_BEGIN(gstr.c_str()); \
       CALI_MARK_BEGIN(kstr.c_str()); \
@@ -54,7 +54,6 @@
       CALI_MARK_END(kstr.c_str()); \
       CALI_MARK_END(gstr.c_str()); \
       CALI_MARK_END(vstr.c_str()); \
-      doOnceCaliMetaEnd(running_variant,running_tuning); \
     }
 
 #else
@@ -395,8 +394,7 @@ class KernelBase
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
   void caliperOn() { doCaliperTiming = true; }
   void caliperOff() { doCaliperTiming = false; }
-  void doOnceCaliMetaBegin(VariantID vid, size_t tune_idx);
-  void doOnceCaliMetaEnd(VariantID vid, size_t tune_idx);
+  void CaliMeta();
   static void setCaliperMgrVariant(VariantID vid, const std::string& outdir, 
                                    const std::string& addToConfig,
                                    const std::vector<std::string>& tuning_input)
@@ -626,7 +624,6 @@ class KernelBase
 
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
   bool doCaliperTiming = true; // warmup can use this to exclude timing
-  std::vector<bool> doCaliMetaOnce[NumVariants];
   cali_id_t ProblemSize_attr; // in ctor cali_create_attribute("ProblemSize",CALI_TYPE_DOUBLE,CALI_ATTR_ASVALUE | CALI_ATTR_AGGREGATABLE | CALI_ATTR_SKIP_EVENTS);
   cali_id_t Reps_attr;
   cali_id_t Iters_Rep_attr;

From 6bbc26856636f45307978554ed875246275ac6f8 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@lassen26.coral.llnl.gov>
Date: Fri, 16 Jun 2023 10:59:30 -0700
Subject: [PATCH 06/13] Remove map of ConfigManager's. Only use 1 CM.

---
 src/common/Executor.cpp   |  7 +++---
 src/common/KernelBase.cpp |  8 +++----
 src/common/KernelBase.hpp | 47 +++++++++++++++++----------------------
 3 files changed, 27 insertions(+), 35 deletions(-)

diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp
index aea84fdb5..7a5499fd2 100644
--- a/src/common/Executor.cpp
+++ b/src/common/Executor.cpp
@@ -1072,10 +1072,6 @@ void Executor::runSuite()
 
   } // loop over passes through suite
 
-  #if defined(RAJA_PERFSUITE_USE_CALIPER)
-    // Flush Caliper data
-    KernelBase::setCaliperMgrFlush();
-  #endif
 }
 
 template < typename Kernel >
@@ -1117,6 +1113,9 @@ void Executor::runKernel(KernelBase* kernel, bool print_kernel_name)
         getCout() << "\t\tSkipping " << tuning_name << " tuning" << endl;
       }
     }
+    #if defined(RAJA_PERFSUITE_USE_CALIPER)
+      KernelBase::setCaliperMgrFlush(getVariantName(vid));
+    #endif
   } // loop over variants
 }
 
diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp
index bd5f47a59..1cb8007d0 100644
--- a/src/common/KernelBase.cpp
+++ b/src/common/KernelBase.cpp
@@ -273,7 +273,7 @@ void KernelBase::runKernel(VariantID vid, size_t tune_idx)
 
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
   if (doCaliperTiming) {
-    KernelBase::setCaliperMgrStart(vid);
+    KernelBase::setCaliperMgrStart();
   }
 #endif
 
@@ -349,7 +349,7 @@ void KernelBase::runKernel(VariantID vid, size_t tune_idx)
   }
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
   if (doCaliperTiming) {
-    setCaliperMgrStop(vid); 
+    setCaliperMgrStop();
   }
 #endif
 }
@@ -434,7 +434,7 @@ void KernelBase::CaliMeta()
   cali_set_double(Flops_Rep_attr,(double)getFLOPsPerRep());
 }
 
-// initialize a KernelBase static 
-std::map<rajaperf::VariantID, cali::ConfigManager> KernelBase::mgr;
+// initialize a KernelBase static
+cali::ConfigManager KernelBase::mgr;
 #endif
 }  // closing brace for rajaperf namespace
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index 0d856ff9d..4a52deeb4 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -524,8 +524,6 @@ class KernelBase
     }
 
     if(config_ok) {
-      cali::ConfigManager m;
-      mgr.insert(std::make_pair(vid, m));
       std::string od("./");
       if (outdir.size()) {
         od = outdir + "/";
@@ -540,33 +538,28 @@ class KernelBase
         profile += "," + addToConfig;
       }
       std::cout << "Profile: " << profile << std::endl;
-      mgr[vid].add_option_spec(problem_size_json_spec.c_str());
-      mgr[vid].set_default_parameter("problem_size", "true");
-      mgr[vid].add_option_spec(reps_json_spec.c_str());
-      mgr[vid].set_default_parameter("reps", "true");
-      mgr[vid].add_option_spec(iters_json_spec.c_str());
-      mgr[vid].set_default_parameter("iters_p_rep", "true");
-      mgr[vid].add_option_spec(kernels_json_spec.c_str());
-      mgr[vid].set_default_parameter("kernels_p_rep", "true");
-      mgr[vid].add_option_spec(bytes_json_spec.c_str());
-      mgr[vid].set_default_parameter("bytes_p_rep", "true");
-      mgr[vid].add_option_spec(flops_rep_json_spec.c_str());
-      mgr[vid].set_default_parameter("flops_p_rep", "true");
-      mgr[vid].add(profile.c_str());
+      mgr.add_option_spec(problem_size_json_spec.c_str());
+      mgr.set_default_parameter("problem_size", "true");
+      mgr.add_option_spec(reps_json_spec.c_str());
+      mgr.set_default_parameter("reps", "true");
+      mgr.add_option_spec(iters_json_spec.c_str());
+      mgr.set_default_parameter("iters_p_rep", "true");
+      mgr.add_option_spec(kernels_json_spec.c_str());
+      mgr.set_default_parameter("kernels_p_rep", "true");
+      mgr.add_option_spec(bytes_json_spec.c_str());
+      mgr.set_default_parameter("bytes_p_rep", "true");
+      mgr.add_option_spec(flops_rep_json_spec.c_str());
+      mgr.set_default_parameter("flops_p_rep", "true");
+      mgr.add(profile.c_str());
     }
   }
 
-  static void setCaliperMgrStart(VariantID vid) { mgr[vid].start(); }
-  static void setCaliperMgrStop(VariantID vid) { mgr[vid].stop(); }
-  static void setCaliperMgrFlush() 
-  { // we're going to flush all the variants at once
-    std::cout << "flushing " << mgr.size() << " variants\n";
-    for(auto const &kv : mgr) {
-      // set Adiak key first
-      std::string variant=getVariantName(kv.first);
-      adiak::value("variant",variant.c_str());
-      mgr[kv.first].flush(); 
-    }
+  static void setCaliperMgrStart() { mgr.start(); }
+  static void setCaliperMgrStop() { mgr.stop(); }
+  static void setCaliperMgrFlush(std::string variant_name)
+  {
+    adiak::value("variant",variant_name.c_str());
+    mgr.flush();
   }
 
   std::string getGroupName(const std::string &kname )
@@ -634,7 +627,7 @@ class KernelBase
 
       // we need a Caliper Manager object per variant
 // we can inline this with c++17
-  static std::map<rajaperf::VariantID, cali::ConfigManager> mgr;
+  static cali::ConfigManager mgr;
 #endif
 
   std::vector<RAJA::Timer::ElapsedType> min_time[NumVariants];

From 0f6a052b14201d1bbbdffc37b8740f8c51006c1a Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@lassen8.coral.llnl.gov>
Date: Fri, 16 Jun 2023 14:16:24 -0700
Subject: [PATCH 07/13] Add blocksize per kernel into performance data and
 implement block size into all kernels that contain boilerplate

---
 src/common/GPUUtils.hpp   |  1 +
 src/common/KernelBase.cpp |  5 +++++
 src/common/KernelBase.hpp | 23 +++++++++++++++++++++++
 3 files changed, 29 insertions(+)

diff --git a/src/common/GPUUtils.hpp b/src/common/GPUUtils.hpp
index bf966a0df..8d6012a6d 100644
--- a/src/common/GPUUtils.hpp
+++ b/src/common/GPUUtils.hpp
@@ -177,6 +177,7 @@ inline void seq_for(camp::int_seq<T, ts...> const&, Func&& func)
       if (run_params.numValidGPUBlockSize() == 0u ||                           \
           run_params.validGPUBlockSize(block_size)) {                          \
         if (tune_idx == t) {                                                   \
+          setBlockSize(block_size);                                            \
           run##variant##VariantImpl<block_size>(vid);                          \
         }                                                                      \
         t += 1;                                                                \
diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp
index 1cb8007d0..6822c1d15 100644
--- a/src/common/KernelBase.cpp
+++ b/src/common/KernelBase.cpp
@@ -72,6 +72,10 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params)
                                          CALI_ATTR_ASVALUE | 
                                          CALI_ATTR_AGGREGATABLE | 
                                          CALI_ATTR_SKIP_EVENTS);
+  BlockSize_attr = cali_create_attribute("BlockSize", CALI_TYPE_INT,
+                                           CALI_ATTR_ASVALUE |
+                                           CALI_ATTR_AGGREGATABLE |
+                                           CALI_ATTR_SKIP_EVENTS);
 #endif
 }
 
@@ -432,6 +436,7 @@ void KernelBase::CaliMeta()
   cali_set_double(Kernels_Rep_attr,(double)getKernelsPerRep());
   cali_set_double(Bytes_Rep_attr,(double)getBytesPerRep());
   cali_set_double(Flops_Rep_attr,(double)getFLOPsPerRep());
+  cali_set_int(BlockSize_attr, (int)getBlockSize());
 }
 
 // initialize a KernelBase static
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index 4a52deeb4..64f1bd9ea 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -98,6 +98,7 @@ class KernelBase
   void setKernelsPerRep(Index_type nkerns) { kernels_per_rep = nkerns; };
   void setBytesPerRep(Index_type bytes) { bytes_per_rep = bytes;}
   void setFLOPsPerRep(Index_type FLOPs) { FLOPs_per_rep = FLOPs; }
+  void setBlockSize(Index_type size) { kernel_block_size = size; }
 
   void setUsesFeature(FeatureID fid) { uses_feature[fid] = true; }
 
@@ -145,6 +146,7 @@ class KernelBase
   Index_type getKernelsPerRep() const { return kernels_per_rep; };
   Index_type getBytesPerRep() const { return bytes_per_rep; }
   Index_type getFLOPsPerRep() const { return FLOPs_per_rep; }
+  Index_type getBlockSize() const { return kernel_block_size; }
 
   Index_type getTargetProblemSize() const;
   Index_type getRunReps() const;
@@ -508,6 +510,23 @@ class KernelBase
         ]
     }
 )json";
+    const std::string block_size_json_spec = R"json(
+    {
+        "name"        : "block_size",
+        "type"        : "boolean",
+        "category"    : "metric",
+        "description" : "block size",
+        "query" :
+        [
+            { "level"    : "local",
+              "select": { "expr": "any(max#BlockSize)", "as": "BlockSize" },
+            },
+            { "level"    : "cross",
+              "select": { "expr": "any(any#max#BlockSize)", "as": "BlockSize" },
+            }
+        ]
+    }
+)json";
 
     if(!ran_spot_config_check && (!addToConfig.empty())) {
       cali::ConfigManager cm;
@@ -550,6 +569,8 @@ class KernelBase
       mgr.set_default_parameter("bytes_p_rep", "true");
       mgr.add_option_spec(flops_rep_json_spec.c_str());
       mgr.set_default_parameter("flops_p_rep", "true");
+      mgr.add_option_spec(block_size_json_spec.c_str());
+      mgr.set_default_parameter("block_size", "true");
       mgr.add(profile.c_str());
     }
   }
@@ -607,6 +628,7 @@ class KernelBase
   Index_type kernels_per_rep;
   Index_type bytes_per_rep;
   Index_type FLOPs_per_rep;
+  Index_type kernel_block_size;
 
   VariantID running_variant;
   size_t running_tuning;
@@ -623,6 +645,7 @@ class KernelBase
   cali_id_t Kernels_Rep_attr;
   cali_id_t Bytes_Rep_attr;
   cali_id_t Flops_Rep_attr;
+  cali_id_t BlockSize_attr;
 
 
       // we need a Caliper Manager object per variant

From e5885f9e0502a6f7290f42a6402efb41fd9e868f Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@corona212.llnl.gov>
Date: Fri, 23 Jun 2023 11:20:50 -0700
Subject: [PATCH 08/13] Add setter to kernels that don't contain boilerplate

---
 src/algorithm/MEMCPY-Cuda.cpp     | 2 +-
 src/algorithm/MEMCPY-Hip.cpp      | 2 +-
 src/algorithm/MEMSET-Cuda.cpp     | 2 +-
 src/algorithm/MEMSET-Hip.cpp      | 2 +-
 src/algorithm/REDUCE_SUM-Cuda.cpp | 2 +-
 src/algorithm/REDUCE_SUM-Hip.cpp  | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/algorithm/MEMCPY-Cuda.cpp b/src/algorithm/MEMCPY-Cuda.cpp
index 74d7ca34f..b86dd4df6 100644
--- a/src/algorithm/MEMCPY-Cuda.cpp
+++ b/src/algorithm/MEMCPY-Cuda.cpp
@@ -153,7 +153,7 @@ void MEMCPY::runCudaVariant(VariantID vid, size_t tune_idx)
         run_params.validGPUBlockSize(block_size)) {
 
       if (tune_idx == t) {
-
+        setBlockSize(block_size);
         runCudaVariantBlock<block_size>(vid);
 
       }
diff --git a/src/algorithm/MEMCPY-Hip.cpp b/src/algorithm/MEMCPY-Hip.cpp
index a1577e012..fa761d026 100644
--- a/src/algorithm/MEMCPY-Hip.cpp
+++ b/src/algorithm/MEMCPY-Hip.cpp
@@ -155,7 +155,7 @@ void MEMCPY::runHipVariant(VariantID vid, size_t tune_idx)
         run_params.validGPUBlockSize(block_size)) {
 
       if (tune_idx == t) {
-
+        setBlockSize(block_size);
         runHipVariantBlock<block_size>(vid);
 
       }
diff --git a/src/algorithm/MEMSET-Cuda.cpp b/src/algorithm/MEMSET-Cuda.cpp
index 7dfb5abac..3a7e049d1 100644
--- a/src/algorithm/MEMSET-Cuda.cpp
+++ b/src/algorithm/MEMSET-Cuda.cpp
@@ -155,7 +155,7 @@ void MEMSET::runCudaVariant(VariantID vid, size_t tune_idx)
         run_params.validGPUBlockSize(block_size)) {
 
       if (tune_idx == t) {
-
+        setBlockSize(block_size);
         runCudaVariantBlock<block_size>(vid);
 
       }
diff --git a/src/algorithm/MEMSET-Hip.cpp b/src/algorithm/MEMSET-Hip.cpp
index 8706d358c..1b1755dc4 100644
--- a/src/algorithm/MEMSET-Hip.cpp
+++ b/src/algorithm/MEMSET-Hip.cpp
@@ -155,7 +155,7 @@ void MEMSET::runHipVariant(VariantID vid, size_t tune_idx)
         run_params.validGPUBlockSize(block_size)) {
 
       if (tune_idx == t) {
-
+        setBlockSize(block_size);
         runHipVariantBlock<block_size>(vid);
 
       }
diff --git a/src/algorithm/REDUCE_SUM-Cuda.cpp b/src/algorithm/REDUCE_SUM-Cuda.cpp
index c5be07c41..81c54f2a9 100644
--- a/src/algorithm/REDUCE_SUM-Cuda.cpp
+++ b/src/algorithm/REDUCE_SUM-Cuda.cpp
@@ -206,7 +206,7 @@ void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx)
           run_params.validGPUBlockSize(block_size)) {
 
         if (tune_idx == t) {
-
+          setBlockSize(block_size);
           runCudaVariantBlock<block_size>(vid);
 
         }
diff --git a/src/algorithm/REDUCE_SUM-Hip.cpp b/src/algorithm/REDUCE_SUM-Hip.cpp
index 3327d1991..211c96a2e 100644
--- a/src/algorithm/REDUCE_SUM-Hip.cpp
+++ b/src/algorithm/REDUCE_SUM-Hip.cpp
@@ -232,7 +232,7 @@ void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx)
           run_params.validGPUBlockSize(block_size)) {
 
         if (tune_idx == t) {
-
+          setBlockSize(block_size);
           runHipVariantBlock<block_size>(vid);
 
         }

From b93fbf39329ead90f6874d4d68ce12e717bcc8bc Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@corona212.llnl.gov>
Date: Fri, 23 Jun 2023 14:10:26 -0700
Subject: [PATCH 09/13] Set default block size to NaN for non GPU Kernels

---
 src/common/KernelBase.cpp | 6 ++++--
 src/common/KernelBase.hpp | 4 ++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp
index 6822c1d15..7061591ef 100644
--- a/src/common/KernelBase.cpp
+++ b/src/common/KernelBase.cpp
@@ -72,7 +72,7 @@ KernelBase::KernelBase(KernelID kid, const RunParams& params)
                                          CALI_ATTR_ASVALUE | 
                                          CALI_ATTR_AGGREGATABLE | 
                                          CALI_ATTR_SKIP_EVENTS);
-  BlockSize_attr = cali_create_attribute("BlockSize", CALI_TYPE_INT,
+  BlockSize_attr = cali_create_attribute("BlockSize", CALI_TYPE_DOUBLE,
                                            CALI_ATTR_ASVALUE |
                                            CALI_ATTR_AGGREGATABLE |
                                            CALI_ATTR_SKIP_EVENTS);
@@ -436,7 +436,9 @@ void KernelBase::CaliMeta()
   cali_set_double(Kernels_Rep_attr,(double)getKernelsPerRep());
   cali_set_double(Bytes_Rep_attr,(double)getBytesPerRep());
   cali_set_double(Flops_Rep_attr,(double)getFLOPsPerRep());
-  cali_set_int(BlockSize_attr, (int)getBlockSize());
+  double const block_size = getBlockSize();
+  if (!isnan(block_size))
+    cali_set_double(BlockSize_attr, block_size);
 }
 
 // initialize a KernelBase static
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index 64f1bd9ea..4faf12be0 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -146,7 +146,7 @@ class KernelBase
   Index_type getKernelsPerRep() const { return kernels_per_rep; };
   Index_type getBytesPerRep() const { return bytes_per_rep; }
   Index_type getFLOPsPerRep() const { return FLOPs_per_rep; }
-  Index_type getBlockSize() const { return kernel_block_size; }
+  double getBlockSize() const { return kernel_block_size; }
 
   Index_type getTargetProblemSize() const;
   Index_type getRunReps() const;
@@ -628,7 +628,7 @@ class KernelBase
   Index_type kernels_per_rep;
   Index_type bytes_per_rep;
   Index_type FLOPs_per_rep;
-  Index_type kernel_block_size;
+  double kernel_block_size = nan(""); // Set default value for non GPU kernels
 
   VariantID running_variant;
   size_t running_tuning;

From 578051f53766d985503eb012c2897288d89537ff Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@corona212.llnl.gov>
Date: Fri, 23 Jun 2023 17:05:56 -0700
Subject: [PATCH 10/13] Add doc files from 254

---
 docs/sphinx/user_guide/build.rst  |  43 +++++++++++
 docs/sphinx/user_guide/output.rst |  80 ++++++++++++++++++++
 docs/sphinx/user_guide/run.rst    | 117 ++++++++++++++++++++++++++++++
 3 files changed, 240 insertions(+)

diff --git a/docs/sphinx/user_guide/build.rst b/docs/sphinx/user_guide/build.rst
index be1bde2fe..aaa216fda 100644
--- a/docs/sphinx/user_guide/build.rst
+++ b/docs/sphinx/user_guide/build.rst
@@ -210,3 +210,46 @@ sizes. The CMake option for this is
 
 will build versions of GPU kernels that use 64, 128, 256, 512, and 1024 threads
 per GPU thread-block.
+
+Building with Caliper
+---------------------
+
+RAJAPerf Suite may also use Caliper instrumentation, with per-variant and per-tuning output written to .cali files. Although Caliper's overhead is low,
+it is not zero, so it adds a small amount of timing skew to the data
+compared to an uninstrumented run. Caliper output enables the use of performance analysis tools like Hatchet and Thicket.
+For much more on Caliper, Hatchet and Thicket, read their documentation here:
+
+| - `Caliper Documentation <http://software.llnl.gov/Caliper/>`_ 
+| - `Hatchet User Guide <https://llnl-hatchet.readthedocs.io/en/latest/user_guide.html>`_ 
+| - `Thicket User Guide <https://thicket.readthedocs.io/en/latest/>`_ 
+
+
+Caliper *annotation* is in the following tree structure::
+
+  RAJAPerf
+    Group
+      Kernel
+
+| Build against these Caliper versions
+|
+|   **caliper@2.9.0** (preferred target)
+|   **caliper@master** (if using older Spack version)
+
+In CMake scripts add
+  **-DRAJA_PERFSUITE_USE_CALIPER=On** 
+
+Add to **-DCMAKE_PREFIX_PATH**
+  ;${CALIPER_PREFIX}/share/cmake/caliper;${ADIAK_PREFIX}/lib/cmake/adiak
+
+or use 
+  -Dcaliper_DIR -Dadiak_DIR package prefixes
+
+For Spack : raja_perf +caliper ^caliper@2.9.0
+
+For Uberenv: python3 scripts/uberenv/uberenv.py --spec +caliper ^caliper@2.9.0
+
+If you intend to pass nvtx or roctx annotations to NVIDIA or AMD profiling tools,
+build Caliper with +cuda cuda_arch=XX or +rocm, respectively. You can then specify
+an additional Caliper service for nvtx or roctx. For example, with roctx:
+
+CALI_SERVICES_ENABLE=roctx rocprof --roctx-trace --hip-trace raja-perf.exe 
diff --git a/docs/sphinx/user_guide/output.rst b/docs/sphinx/user_guide/output.rst
index edcbc9a57..f7de7fc50 100644
--- a/docs/sphinx/user_guide/output.rst
+++ b/docs/sphinx/user_guide/output.rst
@@ -159,3 +159,83 @@ storing the result in matrix A (N_i X N_j). Problem size could be chosen to be
 the maximum number of entries in matrix B or C. We choose the size of matrix 
 A (N_i * N_j), which is more closely aligned with the number of independent 
 operations (i.e., the amount of parallel work) in the kernels.
+
+
+===========================
+Caliper output files
+===========================
+
+If you've built RAJAPerf with Caliper support turned on, then in addition to the
+outputs mentioned above, we also save a .cali file for each variant run, such as:
+Base_OpenMP.cali, Lambda_OpenMP.cali, RAJA_OpenMP.cali, etc.
+
+Also, by using the `--variants` and `--tunings` flags when running, you can generate
+single variant/tuning runs. These work optimally with Hatchet/Thicket.
+
+There are several techniques to display the Caliper trees (Timing Hierarchy)
+
+| 1: Caliper's cali-query tool.
+| The first technique uses Caliper's own cali-query tool. Run it with
+| **-T** (or, equivalently, **--tree**) to display the tree.
+|
+| cali-query -T $HOME/data/default_problem_size/gcc/RAJA_Seq.cali
+
+2: Caliper's Python module *caliperreader*::
+
+  import os
+  import caliperreader as cr
+  DATA_DIR = os.getenv('HOME')+"/data/default_problem_size/gcc"
+  os.chdir(DATA_DIR)
+  r = cr.CaliperReader()
+  r.read("RAJA_Seq.cali")
+  metric = 'avg#inclusive#sum#time.duration'
+  for rec in r.records:
+    path = rec['path'] if 'path' in rec else 'UNKNOWN'
+    time = rec[metric] if metric in rec else '0'
+    if not 'UNKNOWN' in path:
+        if (isinstance(path, list)):
+            path = "/".join(path)
+        print("{0}: {1}".format(path, time))
+  
+You can add a couple of lines to view the metadata keys captured by Caliper/Adiak::
+
+  for g in r.globals:  
+    print(g)  
+
+You can also add a line to display metadata value in the dictionary **r.globals**
+
+For example print out the OpenMP Max Threads value recorded at runtime:: 
+
+  print('OMP Max Threads: ' + r.globals['omp_max_threads'])
+
+or the variant represented in this file::  
+  
+  print('Variant: ' + r.globals['variant'])
+ 
+
+.. note:: The script above was written using caliper-reader 0.3.0, 
+          but is fairly generic. Other version usage notes may be 
+          found at the link below
+
+`caliper-reader <https://pypi.org/project/caliper-reader/>`_ 
+
+
+3: Using the *Hatchet* Python module for single files::
+
+  import hatchet as ht
+  DATA_DIR = os.getenv('HOME')+"/data/default_problem_size/gcc"
+  os.chdir(DATA_DIR)
+  gf1 = ht.GraphFrame.from_caliperreader("RAJA_Seq.cali")
+  print(gf1.tree())
+
+`Find out more on hatchet <https://github.com/LLNL/hatchet>`_
+
+4: Using the *Thicket* Python module for multiple files::
+
+  import thicket as th
+  DATA_DIR = os.getenv('HOME')+"/data/default_problem_size/gcc"
+  os.chdir(DATA_DIR)
+  th1 = th.Thicket.from_caliperreader(["RAJA_Seq-default.cali", "Base_Seq-default.cali", "Base_CUDA-block_128.cali", "Base_CUDA-block_256.cali"])
+  print(th1.tree())
+
+`Find out more on thicket <https://github.com/LLNL/thicket>`_
diff --git a/docs/sphinx/user_guide/run.rst b/docs/sphinx/user_guide/run.rst
index 3d8473221..19a8917bd 100644
--- a/docs/sphinx/user_guide/run.rst
+++ b/docs/sphinx/user_guide/run.rst
@@ -138,3 +138,120 @@ was not possible for them to co-exist in the same executable as CUDA
 variants, for example. In the future, the build system may be reworked so 
 that the OpenMP target variants can be run from the same executable as the 
 other variants.
+
+============================
+Additional Caliper Use Cases
+============================
+
+If you specified building with Caliper (``-DRAJA_PERFSUITE_USE_CALIPER=On``),
+the generation of Caliper .cali files is automated for the most part.
+
+However, there are a couple of other supported use cases.
+
+Collecting PAPI topdown statistics on Intel Architectures
+---------------------------------------------------------
+
+On Intel systems, you can collect topdown PAPI counter statistics by using
+command line arguments
+
+``--add-to-spot-config, -atsc <string> [Default is none]``
+
+This appends additional parameters to the built-in Caliper spot config.
+
+To include some PAPI counters (Intel arch), add the following to the command 
+line
+
+``-atsc topdown.all``
+
+Caliper's topdown service generates derived metrics from raw PAPI counters; 
+a hierarchy of metrics to identify bottlenecks in out-of-order processors. 
+This is based on an approach described in Ahmad Yasin's paper
+*A Top-Down Method for Performance Analysis and Counters Architecture*. The 
+top level of the hierarchy has a reliable set of four derived metrics or 
+starting weights (sum to 1.0) which include:
+
+#. **Frontend Bound.** Stalls attributed to the front end which is responsible for fetching and decoding program code.    
+#. **Bad Speculation.** Fraction of the workload that is affected by incorrect execution paths, i.e. branch misprediction penalties
+#. **Retiring.** An increase in this category reflects a higher overall Instructions Per Cycle (IPC) fraction, which is good in general. However, a large retiring fraction for non-vectorized code could also be a hint to the user to vectorize their code (see Yasin's paper)
+#. **Backend Bound.** Memory Bound where execution stalls are related to the memory subsystem, or Core Bound where execution unit occupancy is sub-optimal lowering IPC (more compiler dependent)
+
+.. note:: Backend Bound = 1 - (Frontend Bound + Bad Speculation + Retiring)
+
+.. note:: Caveats: 
+
+          #. When collecting PAPI data in this way you'll be limited to running only one variant, since Caliper maintains only one PAPI context.
+          #. Small kernels should be run at large problem sizes to minimize 
+             anomalous readings.
+          #. Measured values are only relevant for the innermost level of the 
+             Caliper tree hierarchy, i.e. Kernel.Tuning under investigation.
+          #. Some lower level derived quantities may appear anomalous 
+             with negative values. Collecting raw counters can help identify 
+             the discrepancy.
+
+``-atsc topdown-counters.all``
+
+.. note:: Other caveats: Raw counter values are often noisy and require a lot 
+          of accommodation to collect accurate data including: 
+ 
+            * Turning off Hyperthreading
+            * Turning off Prefetch as is done in Intel's Memory Latency 
+              Checker (requires root access) 
+            * Adding LFENCE instruction to serialize and bracket code under 
+              test 
+            * Disabling preemption and hard interrupts 
+
+          See Andreas Abel's dissertation `Automatic Generation of Models of 
+          Microarchitectures` for more info on this and for a comprehensive 
+          look at the nanobench machinery.
+
+Some helpful references:
+
+`Yasin's Paper <https://www.researchgate.net/publication/269302126_A_Top-Down_method_for_performance_analysis_and_counters_architecture>`_
+
+`Vtune-cookbook topdown method <https://www.intel.com/content/www/us/en/develop/documentation/vtune-cookbook/top/methodologies/top-down-microarchitecture-analysis-method.html>`_
+
+`Automatic Generation of Models of Microarchitectures <https://uops.info/dissertation.pdf>`_
+
+Generating trace events (time-series) for viewing in chrome://tracing or Perfetto
+---------------------------------------------------------------------------------
+
+`Perfetto <https://ui.perfetto.dev/>`_
+
+Use Caliper's event trace service to collect timestamp info, where kernel 
+timing can be viewed using browser trace profile views. For example,
+
+``CALI_CONFIG=event-trace,event.timestamps ./raja-perf.exe -ek PI_ATOMIC INDEXLIST  -sp``
+
+This will produce a separate .cali file with a date prefix, which looks something
+like ``221108-100718_724_ZKrHC68b77Yd.cali``
+
+Then, we need to convert this .cali file to JSON records. But first, we need 
+to make sure Caliper's python reader is available in the ``PYTHONPATH`` 
+environment variable 
+
+``export PYTHONPATH=caliper-source-dir/python/caliper-reader``
+
+then run ``cali2traceevent.py``. For example,
+
+``python3 ~/workspace/Caliper/python/cali2traceevent.py 221108-102406_956_9WkZo6xvetnu.cali RAJAPerf.trace.json``
+
+You can then load the resulting JSON file either in Chrome by going to 
+``chrome://tracing`` or in ``Perfetto``.
+
+For CUDA, assuming you built Caliper with CUDA support, you can collect and 
+combine trace information for memcpy, kernel launch, synchronization, and 
+kernels. For example,
+
+``CALI_CONFIG="event-trace(event.timestamps,trace.cuda=true,cuda.activities)" ./raja-perf.exe -v RAJA_CUDA Base_CUDA -k Algorithm_REDUCE_SUM -sp``
+
+.. warning::
+  When you run cali2traceevent.py you need to add the ``--sort`` option before the filenames.
+  This is needed because the trace.cuda event records need to be sorted before processing.
+  Failing to do so may result in a Python traceback.
+  New versions of the Caliper Python package have this option built in by default to avoid this issue.
+
+``~/workspace/Caliper/python/cali2traceevent.py --sort file.cali file.json``
+
+For HIP, substitute ``rocm.activities`` for ``cuda.activities``.
+
+.. note:: Currently there is no ``trace.rocm`` analog of ``trace.cuda``.

From 886800a89010808f0cad71293e67d365e6c22ca9 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@lassen34.coral.llnl.gov>
Date: Fri, 14 Jul 2023 10:33:04 -0700
Subject: [PATCH 11/13] Revert tuning file generation changes and re-implement
 per variant ConfigManager

---
 src/common/Executor.cpp   |  2 +-
 src/common/KernelBase.cpp | 40 +++++++++++++++--------
 src/common/KernelBase.hpp | 67 +++++++++++++++++++++------------------
 3 files changed, 64 insertions(+), 45 deletions(-)

diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp
index 7a5499fd2..6f9ef5776 100644
--- a/src/common/Executor.cpp
+++ b/src/common/Executor.cpp
@@ -1114,7 +1114,7 @@ void Executor::runKernel(KernelBase* kernel, bool print_kernel_name)
       }
     }
     #if defined(RAJA_PERFSUITE_USE_CALIPER)
-      KernelBase::setCaliperMgrFlush(getVariantName(vid));
+      KernelBase::setCaliperMgrFlush();
     #endif
   } // loop over variants
 }
diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp
index 7061591ef..9a078c886 100644
--- a/src/common/KernelBase.cpp
+++ b/src/common/KernelBase.cpp
@@ -189,6 +189,9 @@ void KernelBase::setVariantDefined(VariantID vid)
   min_time[vid].resize(variant_tuning_names[vid].size(), std::numeric_limits<double>::max());
   max_time[vid].resize(variant_tuning_names[vid].size(), -std::numeric_limits<double>::max());
   tot_time[vid].resize(variant_tuning_names[vid].size(), 0.0);
+  #if defined(RAJA_PERFSUITE_USE_CALIPER)
+    doCaliMetaOnce[vid].resize(variant_tuning_names[vid].size(), true);
+  #endif
 }
 
 int KernelBase::getDataAlignment() const
@@ -277,7 +280,7 @@ void KernelBase::runKernel(VariantID vid, size_t tune_idx)
 
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
   if (doCaliperTiming) {
-    KernelBase::setCaliperMgrStart();
+    KernelBase::setCaliperMgrStart(vid);
   }
 #endif
 
@@ -353,7 +356,7 @@ void KernelBase::runKernel(VariantID vid, size_t tune_idx)
   }
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
   if (doCaliperTiming) {
-    setCaliperMgrStop();
+    KernelBase::setCaliperMgrStop(vid);
   }
 #endif
 }
@@ -427,21 +430,30 @@ void KernelBase::print(std::ostream& os) const
 }
 
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
-void KernelBase::CaliMeta()
+void KernelBase::doOnceCaliMetaBegin(VariantID vid, size_t tune_idx)
 {
-  // attributes are class variables initialized in ctor
-  cali_set_double(ProblemSize_attr,(double)getActualProblemSize());
-  cali_set_double(Reps_attr,(double)getRunReps());
-  cali_set_double(Iters_Rep_attr,(double)getItsPerRep());
-  cali_set_double(Kernels_Rep_attr,(double)getKernelsPerRep());
-  cali_set_double(Bytes_Rep_attr,(double)getBytesPerRep());
-  cali_set_double(Flops_Rep_attr,(double)getFLOPsPerRep());
-  double const block_size = getBlockSize();
-  if (!isnan(block_size))
-    cali_set_double(BlockSize_attr, block_size);
+  if(doCaliMetaOnce[vid].at(tune_idx)) {
+    // attributes are class variables initialized in ctor
+    cali_set_double(ProblemSize_attr,(double)getActualProblemSize());
+    cali_set_double(Reps_attr,(double)getRunReps());
+    cali_set_double(Iters_Rep_attr,(double)getItsPerRep());
+    cali_set_double(Kernels_Rep_attr,(double)getKernelsPerRep());
+    cali_set_double(Bytes_Rep_attr,(double)getBytesPerRep());
+    cali_set_double(Flops_Rep_attr,(double)getFLOPsPerRep());
+    double const block_size = getBlockSize();
+    if (!isnan(block_size))
+      cali_set_double(BlockSize_attr, block_size);
+  }
+}
+
+void KernelBase::doOnceCaliMetaEnd(VariantID vid, size_t tune_idx)
+{
+  if(doCaliMetaOnce[vid].at(tune_idx)) {
+    doCaliMetaOnce[vid].at(tune_idx) = false;
+  }
 }
 
 // initialize a KernelBase static
-cali::ConfigManager KernelBase::mgr;
+std::map<rajaperf::VariantID, cali::ConfigManager> KernelBase::mgr;
 #endif
 }  // closing brace for rajaperf namespace
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index 4faf12be0..ce84b4f50 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -40,7 +40,7 @@
       std::string kstr = getName(); \
       std::string gstr = getGroupName(kstr); \
       std::string vstr = "RAJAPerf"; \
-      CaliMeta(); \
+      doOnceCaliMetaBegin(running_variant, running_tuning); \
       CALI_MARK_BEGIN(vstr.c_str()); \
       CALI_MARK_BEGIN(gstr.c_str()); \
       CALI_MARK_BEGIN(kstr.c_str()); \
@@ -54,6 +54,7 @@
       CALI_MARK_END(kstr.c_str()); \
       CALI_MARK_END(gstr.c_str()); \
       CALI_MARK_END(vstr.c_str()); \
+      doOnceCaliMetaEnd(running_variant,running_tuning); \
     }
 
 #else
@@ -396,7 +397,8 @@ class KernelBase
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
   void caliperOn() { doCaliperTiming = true; }
   void caliperOff() { doCaliperTiming = false; }
-  void CaliMeta();
+  void doOnceCaliMetaBegin(VariantID vid, size_t tune_idx);
+  void doOnceCaliMetaEnd(VariantID vid, size_t tune_idx);
   static void setCaliperMgrVariant(VariantID vid, const std::string& outdir, 
                                    const std::string& addToConfig,
                                    const std::vector<std::string>& tuning_input)
@@ -543,44 +545,48 @@ class KernelBase
     }
 
     if(config_ok) {
+      cali::ConfigManager m;
+      mgr.insert(std::make_pair(vid, m));
       std::string od("./");
       if (outdir.size()) {
         od = outdir + "/";
       }
       std::string vstr = getVariantName(vid);
       std::string tstr = "";
-      if (tuning_input.size() == 1) { // If only 1 tuning, add to file name
-        tstr = "-" + tuning_input.front();
-      }
-      std::string profile = "spot(output=" + od + vstr + tstr + ".cali)";
+      std::string profile = "spot(output=" + od + vstr + ".cali)";
       if(!addToConfig.empty()) {
         profile += "," + addToConfig;
       }
       std::cout << "Profile: " << profile << std::endl;
-      mgr.add_option_spec(problem_size_json_spec.c_str());
-      mgr.set_default_parameter("problem_size", "true");
-      mgr.add_option_spec(reps_json_spec.c_str());
-      mgr.set_default_parameter("reps", "true");
-      mgr.add_option_spec(iters_json_spec.c_str());
-      mgr.set_default_parameter("iters_p_rep", "true");
-      mgr.add_option_spec(kernels_json_spec.c_str());
-      mgr.set_default_parameter("kernels_p_rep", "true");
-      mgr.add_option_spec(bytes_json_spec.c_str());
-      mgr.set_default_parameter("bytes_p_rep", "true");
-      mgr.add_option_spec(flops_rep_json_spec.c_str());
-      mgr.set_default_parameter("flops_p_rep", "true");
-      mgr.add_option_spec(block_size_json_spec.c_str());
-      mgr.set_default_parameter("block_size", "true");
-      mgr.add(profile.c_str());
+      mgr[vid].add_option_spec(problem_size_json_spec.c_str());
+      mgr[vid].set_default_parameter("problem_size", "true");
+      mgr[vid].add_option_spec(reps_json_spec.c_str());
+      mgr[vid].set_default_parameter("reps", "true");
+      mgr[vid].add_option_spec(iters_json_spec.c_str());
+      mgr[vid].set_default_parameter("iters_p_rep", "true");
+      mgr[vid].add_option_spec(kernels_json_spec.c_str());
+      mgr[vid].set_default_parameter("kernels_p_rep", "true");
+      mgr[vid].add_option_spec(bytes_json_spec.c_str());
+      mgr[vid].set_default_parameter("bytes_p_rep", "true");
+      mgr[vid].add_option_spec(flops_rep_json_spec.c_str());
+      mgr[vid].set_default_parameter("flops_p_rep", "true");
+      mgr[vid].add_option_spec(block_size_json_spec.c_str());
+      mgr[vid].set_default_parameter("block_size", "true");
+      mgr[vid].add(profile.c_str());
     }
   }
 
-  static void setCaliperMgrStart() { mgr.start(); }
-  static void setCaliperMgrStop() { mgr.stop(); }
-  static void setCaliperMgrFlush(std::string variant_name)
-  {
-    adiak::value("variant",variant_name.c_str());
-    mgr.flush();
+  static void setCaliperMgrStart(VariantID vid) { mgr[vid].start(); }
+  static void setCaliperMgrStop(VariantID vid) { mgr[vid].stop(); }
+  static void setCaliperMgrFlush()
+  { // we're going to flush all the variants at once
+    std::cout << "flushing " << mgr.size() << " variants\n";
+    for(auto const &kv : mgr) {
+      // set Adiak key first
+      std::string variant=getVariantName(kv.first);
+      adiak::value("variant",variant.c_str());
+      mgr[kv.first].flush();
+    }
   }
 
   std::string getGroupName(const std::string &kname )
@@ -639,6 +645,7 @@ class KernelBase
 
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
   bool doCaliperTiming = true; // warmup can use this to exclude timing
+  std::vector<bool> doCaliMetaOnce[NumVariants];
   cali_id_t ProblemSize_attr; // in ctor cali_create_attribute("ProblemSize",CALI_TYPE_DOUBLE,CALI_ATTR_ASVALUE | CALI_ATTR_AGGREGATABLE | CALI_ATTR_SKIP_EVENTS);
   cali_id_t Reps_attr;
   cali_id_t Iters_Rep_attr;
@@ -648,9 +655,9 @@ class KernelBase
   cali_id_t BlockSize_attr;
 
 
-      // we need a Caliper Manager object per variant
-// we can inline this with c++17
-  static cali::ConfigManager mgr;
+  // we need a Caliper Manager object per variant
+  // we can inline this with c++17
+  static std::map<rajaperf::VariantID, cali::ConfigManager> mgr;
 #endif
 
   std::vector<RAJA::Timer::ElapsedType> min_time[NumVariants];

From d8bc79b51029570679779bedd63de4ea7ad7afbc Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@lassen6.coral.llnl.gov>
Date: Fri, 14 Jul 2023 15:05:27 -0700
Subject: [PATCH 12/13] Generate separate cali file per tuning

---
 src/common/Executor.cpp   | 17 ++++++-----
 src/common/KernelBase.cpp | 10 +++----
 src/common/KernelBase.hpp | 63 ++++++++++++++++++++-------------------
 3 files changed, 46 insertions(+), 44 deletions(-)

diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp
index 6f9ef5776..509ea7439 100644
--- a/src/common/Executor.cpp
+++ b/src/common/Executor.cpp
@@ -713,12 +713,6 @@ void Executor::setupSuite()
       for (VIDset::iterator vid = run_var.begin();
            vid != run_var.end(); ++vid) {
         variant_ids.push_back( *vid );
-      #if defined(RAJA_PERFSUITE_USE_CALIPER)
-          KernelBase::setCaliperMgrVariant(*vid,
-                                            run_params.getOutputDirName(),
-                                            run_params.getAddToSpotConfig(),
-                                            run_params.getTuningInput());
-      #endif
       }
 
       //
@@ -779,7 +773,15 @@ void Executor::setupSuite()
 
           tuning_names[vid].resize(tuning_names_order_map.size());
           for (auto const& tuning_name_idx_pair : tuning_names_order_map) {
-            tuning_names[vid][tuning_name_idx_pair.second] = tuning_name_idx_pair.first;
+            size_t tid = tuning_name_idx_pair.second;
+            std::string tstr = tuning_name_idx_pair.first;
+            tuning_names[vid][tid] = tstr;
+            #if defined(RAJA_PERFSUITE_USE_CALIPER)
+              KernelBase::setCaliperMgrVariantTuning(vid,
+                                                tstr,
+                                                run_params.getOutputDirName(),
+                                                run_params.getAddToSpotConfig());
+            #endif
           }
           // reorder to put "default" first
           auto default_order_iter = tuning_names_order_map.find(KernelBase::getDefaultTuningName());
@@ -799,7 +801,6 @@ void Executor::setupSuite()
               tunings_set.emplace(tuning_name);
             }
           }
-          adiak::value("tunings", tunings_set);
         #endif
 
         //
diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp
index 9a078c886..4ccdbb915 100644
--- a/src/common/KernelBase.cpp
+++ b/src/common/KernelBase.cpp
@@ -280,7 +280,7 @@ void KernelBase::runKernel(VariantID vid, size_t tune_idx)
 
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
   if (doCaliperTiming) {
-    KernelBase::setCaliperMgrStart(vid);
+    KernelBase::setCaliperMgrStart(vid, getVariantTuningName(vid, tune_idx));
   }
 #endif
 
@@ -356,7 +356,7 @@ void KernelBase::runKernel(VariantID vid, size_t tune_idx)
   }
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
   if (doCaliperTiming) {
-    KernelBase::setCaliperMgrStop(vid);
+    KernelBase::setCaliperMgrStop(vid, getVariantTuningName(vid, tune_idx));
   }
 #endif
 }
@@ -440,9 +440,7 @@ void KernelBase::doOnceCaliMetaBegin(VariantID vid, size_t tune_idx)
     cali_set_double(Kernels_Rep_attr,(double)getKernelsPerRep());
     cali_set_double(Bytes_Rep_attr,(double)getBytesPerRep());
     cali_set_double(Flops_Rep_attr,(double)getFLOPsPerRep());
-    double const block_size = getBlockSize();
-    if (!isnan(block_size))
-      cali_set_double(BlockSize_attr, block_size);
+    cali_set_double(BlockSize_attr, getBlockSize());
   }
 }
 
@@ -454,6 +452,6 @@ void KernelBase::doOnceCaliMetaEnd(VariantID vid, size_t tune_idx)
 }
 
 // initialize a KernelBase static
-std::map<rajaperf::VariantID, cali::ConfigManager> KernelBase::mgr;
+std::map<rajaperf::VariantID, std::map<std::string, cali::ConfigManager>> KernelBase::mgr;
 #endif
 }  // closing brace for rajaperf namespace
diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
index ce84b4f50..a8a410068 100644
--- a/src/common/KernelBase.hpp
+++ b/src/common/KernelBase.hpp
@@ -399,9 +399,10 @@ class KernelBase
   void caliperOff() { doCaliperTiming = false; }
   void doOnceCaliMetaBegin(VariantID vid, size_t tune_idx);
   void doOnceCaliMetaEnd(VariantID vid, size_t tune_idx);
-  static void setCaliperMgrVariant(VariantID vid, const std::string& outdir, 
-                                   const std::string& addToConfig,
-                                   const std::vector<std::string>& tuning_input)
+  static void setCaliperMgrVariantTuning(VariantID vid,
+                                    std::string tstr,
+                                    const std::string& outdir,
+                                    const std::string& addToConfig)
   {
     static bool ran_spot_config_check = false;
     bool config_ok = true;
@@ -546,46 +547,48 @@ class KernelBase
 
     if(config_ok) {
       cali::ConfigManager m;
-      mgr.insert(std::make_pair(vid, m));
+      mgr[vid][tstr] = m;
       std::string od("./");
       if (outdir.size()) {
         od = outdir + "/";
       }
       std::string vstr = getVariantName(vid);
-      std::string tstr = "";
-      std::string profile = "spot(output=" + od + vstr + ".cali)";
+      std::string profile = "spot(output=" + od + vstr + "-" + tstr + ".cali)";
       if(!addToConfig.empty()) {
         profile += "," + addToConfig;
       }
       std::cout << "Profile: " << profile << std::endl;
-      mgr[vid].add_option_spec(problem_size_json_spec.c_str());
-      mgr[vid].set_default_parameter("problem_size", "true");
-      mgr[vid].add_option_spec(reps_json_spec.c_str());
-      mgr[vid].set_default_parameter("reps", "true");
-      mgr[vid].add_option_spec(iters_json_spec.c_str());
-      mgr[vid].set_default_parameter("iters_p_rep", "true");
-      mgr[vid].add_option_spec(kernels_json_spec.c_str());
-      mgr[vid].set_default_parameter("kernels_p_rep", "true");
-      mgr[vid].add_option_spec(bytes_json_spec.c_str());
-      mgr[vid].set_default_parameter("bytes_p_rep", "true");
-      mgr[vid].add_option_spec(flops_rep_json_spec.c_str());
-      mgr[vid].set_default_parameter("flops_p_rep", "true");
-      mgr[vid].add_option_spec(block_size_json_spec.c_str());
-      mgr[vid].set_default_parameter("block_size", "true");
-      mgr[vid].add(profile.c_str());
+      mgr[vid][tstr].add_option_spec(problem_size_json_spec.c_str());
+      mgr[vid][tstr].set_default_parameter("problem_size", "true");
+      mgr[vid][tstr].add_option_spec(reps_json_spec.c_str());
+      mgr[vid][tstr].set_default_parameter("reps", "true");
+      mgr[vid][tstr].add_option_spec(iters_json_spec.c_str());
+      mgr[vid][tstr].set_default_parameter("iters_p_rep", "true");
+      mgr[vid][tstr].add_option_spec(kernels_json_spec.c_str());
+      mgr[vid][tstr].set_default_parameter("kernels_p_rep", "true");
+      mgr[vid][tstr].add_option_spec(bytes_json_spec.c_str());
+      mgr[vid][tstr].set_default_parameter("bytes_p_rep", "true");
+      mgr[vid][tstr].add_option_spec(flops_rep_json_spec.c_str());
+      mgr[vid][tstr].set_default_parameter("flops_p_rep", "true");
+      mgr[vid][tstr].add_option_spec(block_size_json_spec.c_str());
+      mgr[vid][tstr].set_default_parameter("block_size", "true");
+      mgr[vid][tstr].add(profile.c_str());
     }
   }
 
-  static void setCaliperMgrStart(VariantID vid) { mgr[vid].start(); }
-  static void setCaliperMgrStop(VariantID vid) { mgr[vid].stop(); }
+  static void setCaliperMgrStart(VariantID vid, std::string tstr) { mgr[vid][tstr].start(); }
+  static void setCaliperMgrStop(VariantID vid, std::string tstr) { mgr[vid][tstr].stop(); }
   static void setCaliperMgrFlush()
   { // we're going to flush all the variants at once
-    std::cout << "flushing " << mgr.size() << " variants\n";
-    for(auto const &kv : mgr) {
-      // set Adiak key first
-      std::string variant=getVariantName(kv.first);
-      adiak::value("variant",variant.c_str());
-      mgr[kv.first].flush();
+    for(auto const &mp : mgr) {
+      for(auto const &kv : mp.second) {
+        // set Adiak key first
+        std::string variant = getVariantName(mp.first);
+        std::string tstr = kv.first;
+        adiak::value("variant",variant.c_str());
+        adiak::value("tuning", tstr.c_str());
+        mgr[mp.first][kv.first].flush();
+      }
     }
   }
 
@@ -657,7 +660,7 @@ class KernelBase
 
   // we need a Caliper Manager object per variant
   // we can inline this with c++17
-  static std::map<rajaperf::VariantID, cali::ConfigManager> mgr;
+  static std::map<rajaperf::VariantID, std::map<std::string, cali::ConfigManager>> mgr;
 #endif
 
   std::vector<RAJA::Timer::ElapsedType> min_time[NumVariants];

From d0125252fed010930cd42ae423420cfb77ab6e27 Mon Sep 17 00:00:00 2001
From: Michael Richard Mckinsey <mckinsey@quartz764.llnl.gov>
Date: Fri, 14 Jul 2023 15:45:48 -0700
Subject: [PATCH 13/13] Update doc

---
 docs/sphinx/user_guide/output.rst | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/docs/sphinx/user_guide/output.rst b/docs/sphinx/user_guide/output.rst
index f7de7fc50..a30a11431 100644
--- a/docs/sphinx/user_guide/output.rst
+++ b/docs/sphinx/user_guide/output.rst
@@ -166,11 +166,12 @@ Caliper output files
 ===========================
 
 If you've built RAJAPerf with Caliper support turned on, then in addition to the
-outputs mentioned above, we also save a .cali file for each variant run, such as:
-Base_OpenMP.cali, Lambda_OpenMP.cali, RAJA_OpenMP.cali, etc.
+outputs mentioned above, we also save a .cali file for each variant & tuning run,
+such as:
+Base_OpenMP-default.cali, Lambda_OpenMP-default.cali, Base_CUDA-block_128.cali, etc.
 
-Also, by using the `--variants` and `--tunings` flag when running, you can generate 
-single variant/tuning runs. These work optimally with Hatchet/Thicket.
+Also, by using the ``--variants`` and ``--tunings`` flags when running, you can specify
+which variants/tunings to run.
 
 There are several techniques to display the Caliper trees (Timing Hierarchy)