Skip to content

Commit

Permalink
Adding timing metrics to CUDA and host executors (#842)
Browse files Browse the repository at this point in the history
* Adding timing metrics to CUDA and host executors
  • Loading branch information
cliffburdick authored Jan 30, 2025
1 parent 1777afc commit 2de8514
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 21 deletions.
38 changes: 38 additions & 0 deletions docs_input/basics/profiling.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
.. _profiling:

Profiling
#########

Profiling is a way to measure the performance of a program and to identify bottlenecks in your MatX application. Since
the method for profiling depends on the executor, each executor implements its own profiling mechanism. For example,
the CUDA executor can use events encapsulating the kernels it's profiling. The profiling is done through the executor
object rather than the `run` statement so that multiple `run`\s can be profiled together.

Profiling is done by calling the `start_timer()` method of the executor:

.. code-block:: cpp
exec.start_timer();
To stop the profiler, `stop_timer()` is called:

.. code-block:: cpp
exec.stop_timer();
Depending on the executor, `stop_timer()` may need to block for the operation to conplete on an asynchronous executor.

Once `stop_timer()` returns, the execution time between the timers can be retrieved by calling `get_time_ms()`:

.. code-block:: cpp
auto time = exec.get_time_ms();
In the above example `time` contains the runtime of everything executed between the `start_timer()` and `stop_timer()` calls. For
a CUDA executor this is the time between the beginning of the first kernel and the end of the last. For a CPU executor this is the CPU
time between the two calls.

.. note::
Profiling does not work a multi-threaded host executor currently

For a full example of profiling, see the `spectrogram` example.
7 changes: 3 additions & 4 deletions docs_input/build.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ Optional features of MatX that require downloading separate libraries use additi
be explicit about their requirements.

The MatX CMake build configuration is intented to help download any libraries for both the required and optional features.
The CPM_ build system is used to help with package management and version control. By default, CPM will fetch other packages
The CPM build system is used to help with package management and version control. By default, CPM will fetch other packages
from the internet. Alternatively, the option ``CPM_USE_LOCAL_PACKAGES`` can be used to point to local downloads in an air-gapped
or offline environment. Choosing local versions of packages uses the typical ``find_packages`` CMake search methods. Please see
the CPM_ documentation or the documentation for each package for more information.
the CPM documentation or the documentation for each package for more information.


System Requirements
Expand All @@ -27,8 +27,7 @@ for supported host compilers. Other requirements for optional components are lis
Required Third-party Dependencies
---------------------------------

- `CPM <https://github.com/cpm-cmake/CPM.cmake>`_ (* Included in the project source and does not require a separate download)
- `CCCL <https://github.com/NVIDIA/cccl>`_ 2.7.0+
- `CCCL <https://github.com/NVIDIA/cccl>`_ 2.7.0+ commit cbc6b9b or higher


Optional Third-party Dependencies
Expand Down
21 changes: 7 additions & 14 deletions examples/spectrogram.cu
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)

cudaStream_t stream;
cudaStreamCreate(&stream);

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaExecutor exec{stream};

float fs = 10000;
Expand Down Expand Up @@ -96,23 +91,23 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
(time = linspace<0>(num_samps, 0.0f, static_cast<float>(N) - 1.0f) / fs)
.run(exec);
// mod = 500 * np.cos(2*np.pi*0.25*time)
(modulation = 500 * cos(2 * M_PI * 0.25 * time)).run(exec);
(modulation = 500.f * cos(2.f * static_cast<typename complex::value_type>(M_PI) * 0.25f * time)).run(exec);
// carrier = amp * np.sin(2*np.pi*3e3*time + modulation)
(carrier = amp * sin(2 * M_PI * 3000 * time + modulation)).run(exec);
(carrier = amp * sin(2.f * static_cast<typename complex::value_type>(M_PI) * 3000.f * time + modulation)).run(exec);
// noise = 0.01 * fs / 2 * np.random.randn(time.shape)
(noise = sqrt(0.01 * fs / 2) * random<float>({N}, NORMAL)).run(exec);
(noise = sqrt(0.01f * fs / 2.f) * random<float>({N}, NORMAL)).run(exec);
// noise *= np.exp(-time/5)
(noise = noise * exp(-1.0f * time / 5.0f)).run(exec);
// x = carrier + noise
(x = carrier + noise).run(exec);

for (uint32_t i = 0; i < num_iterations; i++) {
if (i == 2) { // Start timer on third loop to allow generation of plot
cudaEventRecord(start, stream);
exec.start_timer();
}

// DFT Sample Frequencies (rfftfreq)
(freqs = (1.0 / (static_cast<float>(nfft) * 1 / fs)) *
(freqs = (1.0f / (static_cast<float>(nfft) * 1.f / fs)) *
linspace<0>(half_win, 0.0f, static_cast<float>(nfft) / 2.0f))
.run(exec);

Expand Down Expand Up @@ -143,15 +138,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)

}

cudaEventRecord(stop, stream);
exec.stop_timer();
exec.sync();
cudaEventElapsedTime(&time_ms, start, stop);
time_ms = exec.get_time_ms();

printf("Spectrogram Time Without Graphs = %.2fus per iteration\n",
time_ms * 1e3 / num_iterations);

cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaStreamDestroy(stream);

MATX_CUDA_CHECK_LAST_ERROR();
Expand Down
39 changes: 36 additions & 3 deletions include/matx/executors/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,24 @@ namespace matx
*
* @param stream CUDA stream
*/
cudaExecutor(cudaStream_t stream) : stream_(stream) {}
cudaExecutor(int stream) : stream_(reinterpret_cast<cudaStream_t>(stream)) {}
cudaExecutor(cudaStream_t stream) : stream_(stream) {
MATX_CUDA_CHECK(cudaEventCreate(&start_));
MATX_CUDA_CHECK(cudaEventCreate(&stop_));
}

cudaExecutor(int stream) : stream_(reinterpret_cast<cudaStream_t>(stream)) {
MATX_CUDA_CHECK(cudaEventCreate(&start_));
MATX_CUDA_CHECK(cudaEventCreate(&stop_));
}

/**
* @brief Construct a new cudaExecutor object using the default stream
*
*/
cudaExecutor() : stream_(0) {}
cudaExecutor() : stream_(0) {
MATX_CUDA_CHECK(cudaEventCreate(&start_));
MATX_CUDA_CHECK(cudaEventCreate(&stop_));
}

/**
* @brief Returns stream associated with executor
Expand All @@ -73,6 +83,27 @@ namespace matx
*
*/
void sync() { cudaStreamSynchronize(stream_); }

/**
* @brief Start a timer for profiling workload
*/
void start_timer() { cudaEventRecord(start_, stream_); }

/**
* @brief Stop a timer for profiling workload
*/
void stop_timer() { cudaEventRecord(stop_, stream_); }

/**
* @brief Get the time in milliseconds between start_timer and stop_timer.
* This will block until the event is synchronized
*/
float get_time_ms() {
float time;
cudaEventSynchronize(stop_);
cudaEventElapsedTime(&time, start_, stop_);
return time;
}

/**
* Execute an operator on a device
Expand Down Expand Up @@ -139,6 +170,8 @@ namespace matx

private:
cudaStream_t stream_;
cudaEvent_t start_;
cudaEvent_t stop_;
};

};
27 changes: 27 additions & 0 deletions include/matx/executors/host.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,31 @@ class HostExecutor {
*/
void sync() {}

/**
* @brief Start a timer for profiling workload
*/
void start_timer() {
MATX_STATIC_ASSERT_STR(MODE == ThreadsMode::SINGLE, matxNotSupported, "Timer not supported in multi-threaded mode");
start_ = std::chrono::high_resolution_clock::now();
}

/**
* @brief Stop a timer for profiling workload
*/
void stop_timer() {
MATX_STATIC_ASSERT_STR(MODE == ThreadsMode::SINGLE, matxNotSupported, "Timer not supported in multi-threaded mode");
stop_ = std::chrono::high_resolution_clock::now();
}

/**
* @brief Get the time in milliseconds between start_timer and stop_timer.
* This will block until the event is synchronized
*/
float get_time_ms() {
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop_ - start_);
return static_cast<float>(static_cast<double>(duration.count()) / 1e3);
}

/**
* @brief Execute an operator
*
Expand Down Expand Up @@ -151,6 +176,8 @@ class HostExecutor {

private:
HostExecParams params_;
std::chrono::time_point<std::chrono::high_resolution_clock> start_;
std::chrono::time_point<std::chrono::high_resolution_clock> stop_;
};

using SingleThreadedHostExecutor = HostExecutor<ThreadsMode::SINGLE>;
Expand Down

0 comments on commit 2de8514

Please sign in to comment.