From 98b73d1ba3edb40013f3b66a9df17ca7f20caf3f Mon Sep 17 00:00:00 2001 From: Adel Johar Date: Wed, 22 Jan 2025 13:53:12 +0100 Subject: [PATCH] Docs: Replace terms.md page with page that provides example of API mapping --- docs/faq.rst | 33 +------ docs/index.md | 2 +- docs/reference/api_syntax.rst | 177 ++++++++++++++++++++++++++++++++++ docs/reference/terms.md | 44 --------- docs/sphinx/_toc.yml.in | 3 +- 5 files changed, 180 insertions(+), 79 deletions(-) create mode 100644 docs/reference/api_syntax.rst delete mode 100644 docs/reference/terms.md diff --git a/docs/faq.rst b/docs/faq.rst index 5e67ec465d..b57568ca79 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -65,38 +65,7 @@ platforms. Additional porting might be required to deal with architecture feature queries or CUDA capabilities that HIP doesn't support. -How does HIP compare with OpenCL? ---------------------------------- - -HIP offers several benefits over OpenCL: - -* Device code can be written in modern C++, including templates, lambdas, - classes and so on. -* Host and device code can be mixed in the source files. -* The HIP API is less verbose than OpenCL and is familiar to CUDA developers. -* Porting from CUDA to HIP is significantly easier than from CUDA to OpenCL. -* HIP uses development tools specialized for each platform: :doc:`amdclang++ ` - for AMD GPUs or `nvcc `_ - for NVIDIA GPUs, and profilers like :doc:`ROCm Compute Profiler ` or - `Nsight Systems `_. -* HIP provides - * pointers and host-side pointer arithmetic. - * device-level control over memory allocation and placement. - * an offline compilation model. - -How does porting CUDA to HIP compare to porting CUDA to OpenCL? ---------------------------------------------------------------- - -OpenCL differs from HIP and CUDA when considering the host runtime, -but even more so when considering the kernel code. -The HIP device code is a C++ dialect, while OpenCL is C99-based. -OpenCL does not support single-source compilation. 
- -As a result, the OpenCL syntax differs significantly from HIP, and porting tools -must perform complex transformations, especially regarding templates or other -C++ features in kernels. - -To better understand the syntax differences, see :doc:`here` or +To better understand the syntax differences, see :doc:`here` or the :doc:`HIP porting guide `. Can I install CUDA and ROCm on the same machine? diff --git a/docs/index.md b/docs/index.md index eb2eb1e6da..72ff80b67f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -45,7 +45,7 @@ The HIP documentation is organized into the following categories: * [HSA runtime API for ROCm](./reference/virtual_rocr) * [HIP math API](./reference/math_api) * [HIP environment variables](./reference/env_variables) -* [Comparing syntax for different APIs](./reference/terms) +* [CUDA to HIP API Syntax: A Quick Comparison](./reference/api_syntax) * [List of deprecated APIs](./reference/deprecated_api_list) * [FP8 numbers in HIP](./reference/fp8_numbers) * {doc}`./reference/hardware_features` diff --git a/docs/reference/api_syntax.rst b/docs/reference/api_syntax.rst new file mode 100644 index 0000000000..89f531ab66 --- /dev/null +++ b/docs/reference/api_syntax.rst @@ -0,0 +1,177 @@ +.. meta:: + :description: Maps CUDA API syntax to HIP API syntax with an example + :keywords: AMD, ROCm, HIP, CUDA, syntax, HIP syntax + +******************************************************************************** +CUDA to HIP API Syntax: A Quick Comparison +******************************************************************************** + +This page introduces key syntax differences between CUDA and HIP APIs with a focused code +example and comparison table. For a complete list of mappings, visit :ref:`HIPIFY `. + +The CUDA code block below illustrates several CUDA API syntaxes. + +.. 
code-block:: cpp

   #include <cuda_runtime.h>
   #include <iostream>
   #include <vector>

   __global__ void block_reduction(const float* input, float* output, int num_elements)
   {
       extern __shared__ float s_data[];

       int tid = threadIdx.x;
       int global_id = blockDim.x * blockIdx.x + tid;

       if (global_id < num_elements)
       {
           s_data[tid] = input[global_id];
       }
       else
       {
           s_data[tid] = 0.0f;
       }
       __syncthreads();

       for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
       {
           if (tid < stride)
           {
               s_data[tid] += s_data[tid + stride];
           }
           __syncthreads();
       }

       if (tid == 0)
       {
           output[blockIdx.x] = s_data[0];
       }
   }

   int main()
   {
       int threads = 256;
       const int num_elements = 50000;

       std::vector<float> h_a(num_elements);
       std::vector<float> h_b((num_elements + threads - 1) / threads);

       for (int i = 0; i < num_elements; ++i)
       {
           h_a[i] = rand() / static_cast<float>(RAND_MAX);
       }

       float *d_a, *d_b;
       cudaMalloc(&d_a, h_a.size() * sizeof(float));
       cudaMalloc(&d_b, h_b.size() * sizeof(float));

       cudaStream_t stream;
       cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

       cudaEvent_t start_event, stop_event;
       cudaEventCreate(&start_event);
       cudaEventCreate(&stop_event);

       cudaMemcpyAsync(d_a, h_a.data(), h_a.size() * sizeof(float), cudaMemcpyHostToDevice, stream);

       cudaEventRecord(start_event, stream);

       int blocks = (num_elements + threads - 1) / threads;
       block_reduction<<<blocks, threads, threads * sizeof(float), stream>>>(d_a, d_b, num_elements);

       cudaMemcpyAsync(h_b.data(), d_b, h_b.size() * sizeof(float), cudaMemcpyDeviceToHost, stream);

       cudaEventRecord(stop_event, stream);
       cudaEventSynchronize(stop_event);

       float milliseconds = 0.0f;
       cudaEventElapsedTime(&milliseconds, start_event, stop_event);
       std::cout << "Kernel execution time: " << milliseconds << " ms\n";

       cudaFree(d_a);
       cudaFree(d_b);

       cudaEventDestroy(start_event);
       cudaEventDestroy(stop_event);
       cudaStreamDestroy(stream);

       return 0;
   }

The table below maps CUDA API syntax to corresponding HIP API syntax, as demonstrated in the
preceding code 
examples.

.. list-table::
   :header-rows: 1
   :name: syntax-mapping-table

   * - CUDA
     - HIP

   * - ``#include <cuda_runtime.h>``
     - ``#include <hip/hip_runtime.h>``

   * - ``cudaError_t``
     - ``hipError_t``

   * - ``cudaEvent_t``
     - ``hipEvent_t``

   * - ``cudaStream_t``
     - ``hipStream_t``

   * - ``cudaMalloc``
     - ``hipMalloc``

   * - ``cudaStreamCreateWithFlags``
     - ``hipStreamCreateWithFlags``

   * - ``cudaStreamNonBlocking``
     - ``hipStreamNonBlocking``

   * - ``cudaEventCreate``
     - ``hipEventCreate``

   * - ``cudaMemcpyAsync``
     - ``hipMemcpyAsync``

   * - ``cudaMemcpyHostToDevice``
     - ``hipMemcpyHostToDevice``

   * - ``cudaEventRecord``
     - ``hipEventRecord``

   * - ``cudaEventSynchronize``
     - ``hipEventSynchronize``

   * - ``cudaEventElapsedTime``
     - ``hipEventElapsedTime``

   * - ``cudaFree``
     - ``hipFree``

   * - ``cudaEventDestroy``
     - ``hipEventDestroy``

   * - ``cudaStreamDestroy``
     - ``hipStreamDestroy``

In summary, this comparison highlights the primary syntax differences between CUDA and HIP APIs.
For a complete list of mappings, visit :ref:`HIPIFY <hipify:index>`.
diff --git a/docs/reference/terms.md b/docs/reference/terms.md
deleted file mode 100644
index 713bf6eb81..0000000000
--- a/docs/reference/terms.md
+++ /dev/null
@@ -1,44 +0,0 @@
-
-
-
-
-
-# Table comparing syntax for different compute APIs
-
-|Term|CUDA|HIP|OpenCL|
-|---|---|---|---|
-|Device|`int deviceId`|`int deviceId`|`cl_device`|
-|Queue|`cudaStream_t`|`hipStream_t`|`cl_command_queue`|
-|Event|`cudaEvent_t`|`hipEvent_t`|`cl_event`|
-|Memory|`void *`|`void *`|`cl_mem`|
-|||||
-| |grid|grid|NDRange|
-| |block|block|work-group|
-| |thread|thread|work-item|
-| |warp|warp|sub-group|
-|||||
-|Thread-
index | `threadIdx.x` | `threadIdx.x` | `get_local_id(0)` | -|Block-
index | `blockIdx.x` | `blockIdx.x` | `get_group_id(0)` | -|Block-
dim | `blockDim.x` | `blockDim.x` | `get_local_size(0)` | -|Grid-dim | `gridDim.x` | `gridDim.x` | `get_num_groups(0)` | -||||| -|Device Kernel|`__global__`|`__global__`|`__kernel`| -|Device Function|`__device__`|`__device__`|Implied in device compilation| -|Host Function|`__host_` (default)|`__host_` (default)|Implied in host compilation| -|Host + Device Function|`__host__` `__device__`|`__host__` `__device__`| No equivalent| -|Kernel Launch|`<<< >>>`|`hipLaunchKernel`/`hipLaunchKernelGGL`/`<<< >>>`|`clEnqueueNDRangeKernel`| -|||||| -|Global Memory|`__global__`|`__global__`|`__global`| -|Group Memory|`__shared__`|`__shared__`|`__local`| -|Constant|`__constant__`|`__constant__`|`__constant`| -|||||| -||`__syncthreads`|`__syncthreads`|`barrier(CLK_LOCAL_MEMFENCE)`| -|Atomic Builtins|`atomicAdd`|`atomicAdd`|`atomic_add`| -|Precise Math|`cos(f)`|`cos(f)`|`cos(f)`| -|Fast Math|`__cos(f)`|`__cos(f)`|`native_cos(f)`| -|Vector|`float4`|`float4`|`float4`| - -## Notes - -The indexing functions (starting with `thread-index`) show the terminology for a 1D grid. Some APIs use reverse order of `xyz` / 012 indexing for 3D grids. diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 04e1ce18a6..ed0d7f914d 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -111,8 +111,7 @@ subtrees: - file: reference/virtual_rocr - file: reference/math_api - file: reference/env_variables - - file: reference/terms - title: Comparing syntax for different APIs + - file: reference/api_syntax - file: reference/deprecated_api_list title: List of deprecated APIs - file: reference/fp8_numbers