28 changes: 28 additions & 0 deletions CMakeLists.txt
@@ -124,6 +124,34 @@ if ((ENABLE_HIP) AND (NOT ENABLE_KOKKOS))
list(APPEND RAJA_PERFSUITE_DEPENDS blt::hip_runtime)
endif()

#
# Are we using Caliper?
#
set(RAJA_PERFSUITE_USE_CALIPER off CACHE BOOL "Enable Caliper instrumentation")
if (RAJA_PERFSUITE_USE_CALIPER)
find_package(caliper REQUIRED)
list(APPEND RAJA_PERFSUITE_DEPENDS caliper)
add_definitions(-DRAJA_PERFSUITE_USE_CALIPER)
message(STATUS "Using Caliper")
find_package(adiak REQUIRED)
# use ${adiak_LIBRARIES} since version could have adiak vs adiak::adiak export
list(APPEND RAJA_PERFSUITE_DEPENDS ${adiak_LIBRARIES})
if (ENABLE_CUDA)
# Adiak will propagate -pthread from Spectrum MPI when Caliper comes from a Spack install with +mpi; this needs to be handled even if RAJAPerf is a non-MPI program
# We should delegate to BLT to handle unguarded -pthread flags from dependencies, but currently BLT doesn't
set_target_properties(${adiak_LIBRARIES} PROPERTIES INTERFACE_COMPILE_OPTIONS "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-pthread>;$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-pthread>")
# the following for adiak-0.2.2
if (TARGET adiak::mpi)
set_target_properties(adiak::mpi PROPERTIES INTERFACE_COMPILE_OPTIONS "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-pthread>;$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-pthread>")
endif ()
endif ()
message(STATUS "Caliper includes : ${caliper_INCLUDE_DIR}")
message(STATUS "Adiak includes : ${adiak_INCLUDE_DIRS}")
include_directories(${caliper_INCLUDE_DIR})
include_directories(${adiak_INCLUDE_DIRS})
endif ()


set(RAJAPERF_BUILD_SYSTYPE $ENV{SYS_TYPE})
set(RAJAPERF_BUILD_HOST $ENV{HOSTNAME})

43 changes: 43 additions & 0 deletions docs/sphinx/user_guide/build.rst
@@ -210,3 +210,46 @@ sizes. The CMake option for this is

will build versions of GPU kernels that use 64, 128, 256, 512, and 1024 threads
per GPU thread-block.

Building with Caliper
---------------------

The RAJA Performance Suite may also be built with Caliper instrumentation, which
writes per-variant and per-tuning output to .cali files. While Caliper's
overhead is low, it is not zero, so its timing data will be slightly skewed
compared to an uninstrumented run. Caliper output enables the use of performance
analysis tools such as Hatchet and Thicket. For more on Caliper, Hatchet, and
Thicket, see their documentation:

| - `Caliper Documentation <http://software.llnl.gov/Caliper/>`_
| - `Hatchet User Guide <https://llnl-hatchet.readthedocs.io/en/latest/user_guide.html>`_
| - `Thicket User Guide <https://thicket.readthedocs.io/en/latest/>`_


Caliper *annotation* is in the following tree structure::

RAJAPerf
Group
Kernel

| Build against one of these Caliper versions:
|
| **caliper@2.9.0** (preferred target)
| **caliper@master** (if using an older Spack version)

To enable Caliper, add the CMake option
**-DRAJA_PERFSUITE_USE_CALIPER=On**

Also append the Caliper and Adiak CMake package directories to
**-DCMAKE_PREFIX_PATH**::

  ;${CALIPER_PREFIX}/share/cmake/caliper;${ADIAK_PREFIX}/lib/cmake/adiak

or pass the package prefixes directly with **-Dcaliper_DIR** and **-Dadiak_DIR**.

For Spack: ``raja_perf +caliper ^caliper@2.9.0``

For Uberenv: ``python3 scripts/uberenv/uberenv.py --spec +caliper ^caliper@2.9.0``

If you intend to pass nvtx or roctx annotations to NVIDIA or AMD profiling
tools, build Caliper with ``+cuda cuda_arch=XX`` or ``+rocm``, respectively.
You can then enable an additional Caliper service for nvtx or roctx. For
example, with roctx::

  CALI_SERVICES_ENABLE=roctx rocprof --roctx-trace --hip-trace raja-perf.exe
81 changes: 81 additions & 0 deletions docs/sphinx/user_guide/output.rst
@@ -159,3 +159,84 @@ storing the result in matrix A (N_i X N_j). Problem size could be chosen to be
the maximum number of entries in matrix B or C. We choose the size of matrix
A (N_i * N_j), which is more closely aligned with the number of independent
operations (i.e., the amount of parallel work) in the kernels.


===========================
Caliper output files
===========================

If you built RAJAPerf with Caliper support enabled, then in addition to the
outputs described above, a .cali file is saved for each variant and tuning run;
for example: ``Base_OpenMP-default.cali``, ``Lambda_OpenMP-default.cali``,
``Base_CUDA-block_128.cali``, etc.

As when running without Caliper, the ``--variants`` and ``--tunings`` options
control which variants and tunings are run.
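Since the variant and tuning are encoded in each file name, a small helper can
recover them when post-processing a directory of results. The sketch below is
illustrative only; it simply assumes the ``Variant-tuning.cali`` pattern shown
above:

```python
def parse_cali_name(filename):
    """Split a 'Variant-tuning.cali' file name into (variant, tuning).

    Assumes the naming pattern shown above, where the tuning follows
    the last '-' before the '.cali' extension.
    """
    stem = filename[:-len(".cali")] if filename.endswith(".cali") else filename
    variant, _, tuning = stem.rpartition("-")
    return variant, tuning

for name in ["Base_OpenMP-default.cali", "Base_CUDA-block_128.cali"]:
    print(parse_cali_name(name))  # → ('Base_OpenMP', 'default'), then ('Base_CUDA', 'block_128')
```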

There are several ways to display the Caliper call trees (timing hierarchy).

1: Caliper's ``cali-query`` tool. Run it with **-T** (or **--tree**) to
display the tree::

    cali-query -T $HOME/data/default_problem_size/gcc/RAJA_Seq.cali

2: Caliper's Python module *caliperreader*::

    import os

    import caliperreader as cr

    DATA_DIR = os.getenv('HOME') + "/data/default_problem_size/gcc"
    os.chdir(DATA_DIR)
    r = cr.CaliperReader()
    r.read("RAJA_Seq.cali")
    metric = 'avg#inclusive#sum#time.duration'
    for rec in r.records:
        path = rec.get('path', 'UNKNOWN')
        time = rec.get(metric, '0')
        if 'UNKNOWN' not in path:
            if isinstance(path, list):
                path = "/".join(path)
            print("{0}: {1}".format(path, time))
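The record-walking loop above can be extended to aggregate inclusive times by
kernel group. The sketch below runs on a hand-made list of records shaped like
those caliperreader returns; the metric name matches the one used above, but
the paths and timing values are invented for illustration:

```python
from collections import defaultdict

metric = 'avg#inclusive#sum#time.duration'

# Hand-made records shaped like caliperreader output (values invented).
records = [
    {'path': ['RAJAPerf', 'Algorithm', 'REDUCE_SUM'], metric: '0.42'},
    {'path': ['RAJAPerf', 'Algorithm', 'MEMCPY'],     metric: '0.13'},
    {'path': ['RAJAPerf', 'Basic',     'DAXPY'],      metric: '0.07'},
]

# Sum the metric over leaf records, keyed by the Group level of the
# RAJAPerf/Group/Kernel annotation tree.
totals = defaultdict(float)
for rec in records:
    path = rec.get('path', 'UNKNOWN')
    if path == 'UNKNOWN' or len(path) < 3:
        continue
    totals[path[1]] += float(rec.get(metric, 0.0))

for group, t in sorted(totals.items()):
    print("{0}: {1:.2f}".format(group, t))  # Algorithm: 0.55, Basic: 0.07
```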

You can add a couple of lines to view the metadata keys captured by
Caliper/Adiak::

    for g in r.globals:
        print(g)

You can also display a metadata value from the dictionary **r.globals**.

For example, to print the OpenMP max threads value recorded at runtime::

    print('OMP Max Threads: ' + r.globals['omp_max_threads'])

or the variant represented in this file::

    print('Variant: ' + r.globals['variant'])


.. note:: The script above was written using caliper-reader 0.3.0, but is
   fairly generic. Usage notes for other versions may be found at the link
   below.

`caliper-reader <https://pypi.org/project/caliper-reader/>`_


3: Using the *Hatchet* Python module for single files::

    import os

    import hatchet as ht

    DATA_DIR = os.getenv('HOME') + "/data/default_problem_size/gcc"
    os.chdir(DATA_DIR)
    gf1 = ht.GraphFrame.from_caliperreader("RAJA_Seq.cali")
    print(gf1.tree())

`Find out more on Hatchet <https://github.com/LLNL/hatchet>`_

4: Using the *Thicket* Python module for multiple files::

    import os

    import thicket as th

    DATA_DIR = os.getenv('HOME') + "/data/default_problem_size/gcc"
    os.chdir(DATA_DIR)
    th1 = th.Thicket.from_caliperreader(["RAJA_Seq-default.cali",
                                         "Base_Seq-default.cali",
                                         "Base_CUDA-block_128.cali",
                                         "Base_CUDA-block_256.cali"])
    print(th1.tree())

`Find out more on Thicket <https://github.com/LLNL/thicket>`_
117 changes: 117 additions & 0 deletions docs/sphinx/user_guide/run.rst
@@ -138,3 +138,120 @@ was not possible for them to co-exist in the same executable as CUDA
variants, for example. In the future, the build system may be reworked so
that the OpenMP target variants can be run from the same executable as the
other variants.

============================
Additional Caliper Use Cases
============================

If you built with Caliper support (``-DRAJA_PERFSUITE_USE_CALIPER=On``), the
generation of Caliper .cali files is largely automated.

However, there are a couple of other supported use cases.

Collecting PAPI topdown statistics on Intel Architectures
---------------------------------------------------------

On Intel systems, you can collect topdown PAPI counter statistics using the
command line argument

``--add-to-spot-config, -atsc <string> [Default is none]``

This appends additional parameters to the built-in Caliper spot config.

To include the PAPI topdown counters on Intel architectures, add the following
to the command line

``-atsc topdown.all``

Caliper's topdown service generates derived metrics from raw PAPI counters:
a hierarchy of metrics for identifying bottlenecks in out-of-order processors.
The approach is described in Ahmad Yasin's paper
*A Top-Down Method for Performance Analysis and Counters Architecture*. The
top level of the hierarchy is a reliable set of four derived metrics, or
starting weights (which sum to 1.0):

#. **Frontend Bound.** Stalls attributed to the front end, which is responsible for fetching and decoding program code.
#. **Bad Speculation.** The fraction of the workload affected by incorrect execution paths, i.e., branch misprediction penalties.
#. **Retiring.** An increase in this category reflects a higher Instructions Per Cycle (IPC) fraction, which is generally good. However, a large retiring fraction for non-vectorized code can be a hint to vectorize the code (see Yasin's paper).
#. **Backend Bound.** Either Memory Bound, where execution stalls are related to the memory subsystem, or Core Bound, where execution unit occupancy is sub-optimal, lowering IPC (more compiler dependent).

.. note:: Backend Bound = 1 - (Frontend Bound + Bad Speculation + Retiring)
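To make the arithmetic concrete, the sketch below computes the four top-level
weights from raw slot counts, following the formulas in Yasin's paper. The
counter values are made up for illustration; they are not measured data:

```python
# Top-level topdown breakdown (formulas follow Yasin's paper; the
# counter values below are invented, not measured).
issue_width = 4
clock_ticks = 1_000_000
total_slots = issue_width * clock_ticks

fetch_bubbles = 600_000      # slots where the front end delivered no uops
slots_issued = 3_000_000
slots_retired = 2_600_000
recovery_bubbles = 200_000   # slots lost to misprediction recovery

frontend_bound = fetch_bubbles / total_slots
bad_speculation = (slots_issued - slots_retired + recovery_bubbles) / total_slots
retiring = slots_retired / total_slots
# Backend Bound is derived from the other three, as in the note above.
backend_bound = 1.0 - (frontend_bound + bad_speculation + retiring)

for name, weight in [("Frontend Bound", frontend_bound),
                     ("Bad Speculation", bad_speculation),
                     ("Retiring", retiring),
                     ("Backend Bound", backend_bound)]:
    print("{0:16s}{1:.3f}".format(name, weight))
```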

.. note:: Caveats:

   #. When collecting PAPI data in this way, you are limited to running only
      one variant, since Caliper maintains only one PAPI context.
   #. Small kernels should be run at large problem sizes to minimize
      anomalous readings.
   #. Measured values are only relevant for the innermost level of the
      Caliper tree hierarchy, i.e., the Kernel.Tuning under investigation.
   #. Some lower-level derived quantities may appear anomalous, with negative
      values. Collecting raw counters can help identify the discrepancy:

``-atsc topdown-counters.all``

.. note:: Other caveats: raw counter values are often noisy and require
   careful system configuration to collect accurately, including:

   * Turning off hyperthreading
   * Turning off prefetch, as is done in Intel's Memory Latency Checker
     (requires root access)
   * Adding LFENCE instructions to serialize and bracket the code under test
   * Disabling preemption and hard interrupts

   See Andreas Abel's dissertation `Automatic Generation of Models of
   Microarchitectures` for more on these techniques and for a comprehensive
   look at the nanoBench machinery.

Some helpful references:

`Yasin's Paper <https://www.researchgate.net/publication/269302126_A_Top-Down_method_for_performance_analysis_and_counters_architecture>`_

`VTune Cookbook: Top-down Microarchitecture Analysis Method <https://www.intel.com/content/www/us/en/develop/documentation/vtune-cookbook/top/methodologies/top-down-microarchitecture-analysis-method.html>`_

`Automatic Generation of Models of Microarchitectures <https://uops.info/dissertation.pdf>`_

Generating trace events (time-series) for viewing in chrome://tracing or Perfetto
---------------------------------------------------------------------------------

`Perfetto <https://ui.perfetto.dev/>`_

Use Caliper's event-trace service to collect timestamp information; kernel
timings can then be viewed in a browser trace viewer. For example,

``CALI_CONFIG=event-trace,event.timestamps ./raja-perf.exe -ek PI_ATOMIC INDEXLIST -sp``

This produces a separate .cali file with a date prefix, named something like
``221108-100718_724_ZKrHC68b77Yd.cali``.

Next, convert this .cali file to JSON trace records. First, make sure
Caliper's Python reader is available on the ``PYTHONPATH`` environment
variable

``export PYTHONPATH=caliper-source-dir/python/caliper-reader``

then run ``cali2traceevent.py``. For example,

``python3 ~/workspace/Caliper/python/cali2traceevent.py 221108-102406_956_9WkZo6xvetnu.cali RAJAPerf.trace.json``

You can then load the resulting JSON file either in Chrome, by navigating to
``chrome://tracing``, or in Perfetto.

For CUDA, assuming you built Caliper with CUDA support, you can collect and
combine trace information for memcpy, kernel launch, synchronization, and
kernels. For example,

``CALI_CONFIG="event-trace(event.timestamps,trace.cuda=true,cuda.activities)" ./raja-perf.exe -v RAJA_CUDA Base_CUDA -k Algorithm_REDUCE_SUM -sp``

.. warning::
   When you run ``cali2traceevent.py``, add the ``--sort`` option before the
   filenames. The trace.cuda event records need to be sorted before
   processing; failing to do so may result in a Python traceback. Newer
   versions of the Caliper Python package enable this behavior by default.

``~/workspace/Caliper/python/cali2traceevent.py --sort file.cali file.json``
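If your copy of ``cali2traceevent.py`` predates the ``--sort`` option, the
emitted JSON can also be sorted after the fact. Chrome's trace-event format
is, at its core, a list of records with a ``ts`` timestamp field; the sketch
below builds a tiny out-of-order trace (with invented events and a
hypothetical output file name) and sorts it the way trace viewers expect:

```python
import json

# A tiny out-of-order trace in Chrome trace-event form (events invented;
# real files from cali2traceevent.py carry more fields per record).
events = [
    {"name": "kernel_b", "ph": "X", "ts": 200, "dur": 50, "pid": 1, "tid": 1},
    {"name": "kernel_a", "ph": "X", "ts": 100, "dur": 80, "pid": 1, "tid": 1},
]

events.sort(key=lambda e: e["ts"])  # viewers expect ascending timestamps

with open("RAJAPerf.trace.sorted.json", "w") as f:  # hypothetical file name
    json.dump(events, f, indent=2)

print([e["name"] for e in events])  # → ['kernel_a', 'kernel_b']
```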

For HIP, substitute ``rocm.activities`` for ``cuda.activities``.

.. note:: Currently there is no ``trace.rocm`` analog of ``trace.cuda``.
1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -255,6 +255,7 @@ else()
blt_add_executable(
NAME raja-perf.exe
SOURCES RAJAPerfSuiteDriver.cpp
INCLUDES ${PROJECT_BINARY_DIR}/include
DEPENDS_ON ${RAJA_PERFSUITE_EXECUTABLE_DEPENDS}
)
install( TARGETS raja-perf.exe
2 changes: 1 addition & 1 deletion src/algorithm/MEMCPY-Cuda.cpp
@@ -153,7 +153,7 @@ void MEMCPY::runCudaVariant(VariantID vid, size_t tune_idx)
run_params.validGPUBlockSize(block_size)) {

if (tune_idx == t) {

setBlockSize(block_size);
runCudaVariantBlock<block_size>(vid);

}
2 changes: 1 addition & 1 deletion src/algorithm/MEMCPY-Hip.cpp
@@ -155,7 +155,7 @@ void MEMCPY::runHipVariant(VariantID vid, size_t tune_idx)
run_params.validGPUBlockSize(block_size)) {

if (tune_idx == t) {

setBlockSize(block_size);
runHipVariantBlock<block_size>(vid);

}
2 changes: 1 addition & 1 deletion src/algorithm/MEMSET-Cuda.cpp
@@ -155,7 +155,7 @@ void MEMSET::runCudaVariant(VariantID vid, size_t tune_idx)
run_params.validGPUBlockSize(block_size)) {

if (tune_idx == t) {

setBlockSize(block_size);
runCudaVariantBlock<block_size>(vid);

}
2 changes: 1 addition & 1 deletion src/algorithm/MEMSET-Hip.cpp
@@ -155,7 +155,7 @@ void MEMSET::runHipVariant(VariantID vid, size_t tune_idx)
run_params.validGPUBlockSize(block_size)) {

if (tune_idx == t) {

setBlockSize(block_size);
runHipVariantBlock<block_size>(vid);

}
2 changes: 1 addition & 1 deletion src/algorithm/REDUCE_SUM-Cuda.cpp
@@ -206,7 +206,7 @@ void REDUCE_SUM::runCudaVariant(VariantID vid, size_t tune_idx)
run_params.validGPUBlockSize(block_size)) {

if (tune_idx == t) {

setBlockSize(block_size);
runCudaVariantBlock<block_size>(vid);

}
2 changes: 1 addition & 1 deletion src/algorithm/REDUCE_SUM-Hip.cpp
@@ -232,7 +232,7 @@ void REDUCE_SUM::runHipVariant(VariantID vid, size_t tune_idx)
run_params.validGPUBlockSize(block_size)) {

if (tune_idx == t) {

setBlockSize(block_size);
runHipVariantBlock<block_size>(vid);

}
1 change: 1 addition & 0 deletions src/common/CMakeLists.txt
@@ -14,5 +14,6 @@ blt_add_library(
OutputUtils.cpp
RAJAPerfSuite.cpp
RunParams.cpp
INCLUDES ${PROJECT_BINARY_DIR}/include/
DEPENDS_ON ${RAJA_PERFSUITE_DEPENDS}
)