From 98b73d1ba3edb40013f3b66a9df17ca7f20caf3f Mon Sep 17 00:00:00 2001 From: Adel Johar Date: Wed, 22 Jan 2025 13:53:12 +0100 Subject: [PATCH] Docs: Replace terms.md page with page that provides example of API mapping --- docs/faq.rst | 33 +------ docs/index.md | 2 +- docs/reference/api_syntax.rst | 177 ++++++++++++++++++++++++++++++++++ docs/reference/terms.md | 44 --------- docs/sphinx/_toc.yml.in | 3 +- 5 files changed, 180 insertions(+), 79 deletions(-) create mode 100644 docs/reference/api_syntax.rst delete mode 100644 docs/reference/terms.md diff --git a/docs/faq.rst b/docs/faq.rst index 5e67ec465d..b57568ca79 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -65,38 +65,7 @@ platforms. Additional porting might be required to deal with architecture feature queries or CUDA capabilities that HIP doesn't support. -How does HIP compare with OpenCL? ---------------------------------- - -HIP offers several benefits over OpenCL: - -* Device code can be written in modern C++, including templates, lambdas, - classes and so on. -* Host and device code can be mixed in the source files. -* The HIP API is less verbose than OpenCL and is familiar to CUDA developers. -* Porting from CUDA to HIP is significantly easier than from CUDA to OpenCL. -* HIP uses development tools specialized for each platform: :doc:`amdclang++ ` - for AMD GPUs or `nvcc `_ - for NVIDIA GPUs, and profilers like :doc:`ROCm Compute Profiler ` or - `Nsight Systems `_. -* HIP provides - * pointers and host-side pointer arithmetic. - * device-level control over memory allocation and placement. - * an offline compilation model. - -How does porting CUDA to HIP compare to porting CUDA to OpenCL? ---------------------------------------------------------------- - -OpenCL differs from HIP and CUDA when considering the host runtime, -but even more so when considering the kernel code. -The HIP device code is a C++ dialect, while OpenCL is C99-based. -OpenCL does not support single-source compilation. 
- -As a result, the OpenCL syntax differs significantly from HIP, and porting tools -must perform complex transformations, especially regarding templates or other -C++ features in kernels. - -To better understand the syntax differences, see :doc:`here` or +To better understand the syntax differences, see :doc:`here` or the :doc:`HIP porting guide `. Can I install CUDA and ROCm on the same machine? diff --git a/docs/index.md b/docs/index.md index eb2eb1e6da..72ff80b67f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -45,7 +45,7 @@ The HIP documentation is organized into the following categories: * [HSA runtime API for ROCm](./reference/virtual_rocr) * [HIP math API](./reference/math_api) * [HIP environment variables](./reference/env_variables) -* [Comparing syntax for different APIs](./reference/terms) +* [CUDA to HIP API Syntax: A Quick Comparison](./reference/api_syntax) * [List of deprecated APIs](./reference/deprecated_api_list) * [FP8 numbers in HIP](./reference/fp8_numbers) * {doc}`./reference/hardware_features` diff --git a/docs/reference/api_syntax.rst b/docs/reference/api_syntax.rst new file mode 100644 index 0000000000..89f531ab66 --- /dev/null +++ b/docs/reference/api_syntax.rst @@ -0,0 +1,177 @@ +.. meta:: + :description: Maps CUDA API syntax to HIP API syntax with an example + :keywords: AMD, ROCm, HIP, CUDA, syntax, HIP syntax + +******************************************************************************** +CUDA to HIP API Syntax: A Quick Comparison +******************************************************************************** + +This page introduces key syntax differences between CUDA and HIP APIs with a focused code +example and comparison table. For a complete list of mappings, visit :ref:`HIPIFY `. + +The CUDA code block below illustrates several CUDA API syntaxes. + +.. 
code-block:: cpp

   #include <cuda_runtime.h>
   #include <iostream>
   #include <vector>

   __global__ void block_reduction(const float* input, float* output, int num_elements)
   {
       extern __shared__ float s_data[];

       int tid = threadIdx.x;
       int global_id = blockDim.x * blockIdx.x + tid;

       if (global_id < num_elements)
       {
           s_data[tid] = input[global_id];
       }
       else
       {
           s_data[tid] = 0.0f;
       }
       __syncthreads();

       for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
       {
           if (tid < stride)
           {
               s_data[tid] += s_data[tid + stride];
           }
           __syncthreads();
       }

       if (tid == 0)
       {
           output[blockIdx.x] = s_data[0];
       }
   }

   int main()
   {
       int threads = 256;
       const int num_elements = 50000;

       std::vector<float> h_a(num_elements);
       std::vector<float> h_b((num_elements + threads - 1) / threads);

       for (int i = 0; i < num_elements; ++i)
       {
           h_a[i] = rand() / static_cast<float>(RAND_MAX);
       }

       float *d_a, *d_b;
       cudaMalloc(&d_a, h_a.size() * sizeof(float));
       cudaMalloc(&d_b, h_b.size() * sizeof(float));

       cudaStream_t stream;
       cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

       cudaEvent_t start_event, stop_event;
       cudaEventCreate(&start_event);
       cudaEventCreate(&stop_event);

       cudaMemcpyAsync(d_a, h_a.data(), h_a.size() * sizeof(float), cudaMemcpyHostToDevice, stream);

       cudaEventRecord(start_event, stream);

       int blocks = (num_elements + threads - 1) / threads;
       block_reduction<<<blocks, threads, threads * sizeof(float), stream>>>(d_a, d_b, num_elements);

       cudaMemcpyAsync(h_b.data(), d_b, h_b.size() * sizeof(float), cudaMemcpyDeviceToHost, stream);

       cudaEventRecord(stop_event, stream);
       cudaEventSynchronize(stop_event);

       float milliseconds = 0.0f;
       cudaEventElapsedTime(&milliseconds, start_event, stop_event);
       std::cout << "Kernel execution time: " << milliseconds << " ms\n";

       cudaFree(d_a);
       cudaFree(d_b);

       cudaEventDestroy(start_event);
       cudaEventDestroy(stop_event);
       cudaStreamDestroy(stream);

       return 0;
   }

The table below maps CUDA API syntax to corresponding HIP API syntax, as demonstrated in the
preceding code 
examples.

.. list-table::
   :header-rows: 1
   :name: syntax-mapping-table

   * - CUDA
     - HIP

   * - ``#include <cuda_runtime.h>``
     - ``#include <hip/hip_runtime.h>``

   * - ``cudaError_t``
     - ``hipError_t``

   * - ``cudaEvent_t``
     - ``hipEvent_t``

   * - ``cudaStream_t``
     - ``hipStream_t``

   * - ``cudaMalloc``
     - ``hipMalloc``

   * - ``cudaStreamCreateWithFlags``
     - ``hipStreamCreateWithFlags``

   * - ``cudaStreamNonBlocking``
     - ``hipStreamNonBlocking``

   * - ``cudaEventCreate``
     - ``hipEventCreate``

   * - ``cudaMemcpyAsync``
     - ``hipMemcpyAsync``

   * - ``cudaMemcpyHostToDevice``
     - ``hipMemcpyHostToDevice``

   * - ``cudaEventRecord``
     - ``hipEventRecord``

   * - ``cudaEventSynchronize``
     - ``hipEventSynchronize``

   * - ``cudaEventElapsedTime``
     - ``hipEventElapsedTime``

   * - ``cudaFree``
     - ``hipFree``

   * - ``cudaEventDestroy``
     - ``hipEventDestroy``

   * - ``cudaStreamDestroy``
     - ``hipStreamDestroy``

In summary, this comparison highlights the primary syntax differences between CUDA and HIP APIs.
For a complete list of mappings, visit :ref:`HIPIFY <hipify:index>`.
diff --git a/docs/reference/terms.md b/docs/reference/terms.md
deleted file mode 100644
index 713bf6eb81..0000000000
--- a/docs/reference/terms.md
+++ /dev/null
@@ -1,44 +0,0 @@
-
-
-
-
-
-# Table comparing syntax for different compute APIs
-
-|Term|CUDA|HIP|OpenCL|
-|---|---|---|---|
-|Device|`int deviceId`|`int deviceId`|`cl_device`|
-|Queue|`cudaStream_t`|`hipStream_t`|`cl_command_queue`|
-|Event|`cudaEvent_t`|`hipEvent_t`|`cl_event`|
-|Memory|`void *`|`void *`|`cl_mem`|
-|||||
-| |grid|grid|NDRange|
-| |block|block|work-group|
-| |thread|thread|work-item|
-| |warp|warp|sub-group|
-|||||
-|Thread-
index | `threadIdx.x` | `threadIdx.x` | `get_local_id(0)` | -|Block-
index | `blockIdx.x` | `blockIdx.x` | `get_group_id(0)` | -|Block-
dim | `blockDim.x` | `blockDim.x` | `get_local_size(0)` | -|Grid-dim | `gridDim.x` | `gridDim.x` | `get_num_groups(0)` | -||||| -|Device Kernel|`__global__`|`__global__`|`__kernel`| -|Device Function|`__device__`|`__device__`|Implied in device compilation| -|Host Function|`__host_` (default)|`__host_` (default)|Implied in host compilation| -|Host + Device Function|`__host__` `__device__`|`__host__` `__device__`| No equivalent| -|Kernel Launch|`<<< >>>`|`hipLaunchKernel`/`hipLaunchKernelGGL`/`<<< >>>`|`clEnqueueNDRangeKernel`| -|||||| -|Global Memory|`__global__`|`__global__`|`__global`| -|Group Memory|`__shared__`|`__shared__`|`__local`| -|Constant|`__constant__`|`__constant__`|`__constant`| -|||||| -||`__syncthreads`|`__syncthreads`|`barrier(CLK_LOCAL_MEMFENCE)`| -|Atomic Builtins|`atomicAdd`|`atomicAdd`|`atomic_add`| -|Precise Math|`cos(f)`|`cos(f)`|`cos(f)`| -|Fast Math|`__cos(f)`|`__cos(f)`|`native_cos(f)`| -|Vector|`float4`|`float4`|`float4`| - -## Notes - -The indexing functions (starting with `thread-index`) show the terminology for a 1D grid. Some APIs use reverse order of `xyz` / 012 indexing for 3D grids. diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 04e1ce18a6..ed0d7f914d 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -111,8 +111,7 @@ subtrees: - file: reference/virtual_rocr - file: reference/math_api - file: reference/env_variables - - file: reference/terms - title: Comparing syntax for different APIs + - file: reference/api_syntax - file: reference/deprecated_api_list title: List of deprecated APIs - file: reference/fp8_numbers