Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/ci_windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ jobs:
- name: Add rustup components
run: rustup component add rustfmt clippy

- name: Update PATH to expose CUDA codegen backend
shell: pwsh
run: |
echo "$env:CUDA_PATH\nvvm\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append

- name: Verify CUDA, Rust installation
run: |
nvcc --version
Expand Down
16 changes: 16 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ members = [
"examples/optix/*",
"tests/compiletests",
"tests/compiletests/deps-helper",

"samples/introduction/async_api",
"samples/introduction/async_api/kernels",
]

exclude = [
Expand Down
5 changes: 5 additions & 0 deletions samples/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Rust-Cuda Samples

These are the Rust-CUDA ports of the samples from NVIDIA's [cuda-samples](https://github.com/NVIDIA/cuda-samples/tree/master/Samples) repository.

1. Chapter 0: [Introduction](https://github.com/Rust-GPU/rust-cuda/tree/main/samples/introduction)
8 changes: 8 additions & 0 deletions samples/introduction/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Chapter 0: Introduction

## [asyncAPI](https://github.com/Rust-GPU/rust-cuda/tree/main/samples/introduction/async_api)
This example demonstrates two key capabilities of CUDA events: measuring GPU execution time and enabling concurrent CPU-GPU operations.

1. Events are recorded at specific points within a CUDA stream to mark the beginning and end of GPU operations.
2. Because CUDA stream operations execute asynchronously, the CPU remains free to perform other work while the GPU processes tasks (including memory transfers between host and device).
3. The CPU can query these events to check whether the GPU has finished its work, allowing for coordination between the two processors without blocking the CPU.
11 changes: 11 additions & 0 deletions samples/introduction/async_api/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[package]
name = "async_api"
version = "0.1.0"
edition = "2024"

[dependencies]
cust = { path = "../../../crates/cust" }
nanorand = "0.7"

[build-dependencies]
cuda_builder = { workspace = true, default-features = false }
17 changes: 17 additions & 0 deletions samples/introduction/async_api/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
use std::env;
use std::path;

use cuda_builder::CudaBuilder;

fn main() {
    // Re-run this build script whenever it changes or any kernel source changes.
    println!("cargo::rerun-if-changed=build.rs");
    println!("cargo::rerun-if-changed=kernels");

    // Cargo-provided locations: where build outputs go and where this crate lives.
    let out_dir = path::PathBuf::from(env::var("OUT_DIR").unwrap());
    let crate_root = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());

    // Compile the `kernels` sub-crate to PTX and drop the artifact into OUT_DIR,
    // where `main.rs` picks it up via `include_str!`.
    let builder = CudaBuilder::new(crate_root.join("kernels"));
    let built = builder.copy_to(out_dir.join("kernels.ptx")).build();
    built.unwrap();
}
10 changes: 10 additions & 0 deletions samples/introduction/async_api/kernels/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[package]
name = "async_api-kernels"
version = "0.1.0"
edition = "2024"

[dependencies]
cuda_std = { path = "../../../../crates/cuda_std" }

[lib]
crate-type = ["cdylib", "rlib"]
17 changes: 17 additions & 0 deletions samples/introduction/async_api/kernels/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
use cuda_std::prelude::*;

/// Adds `inc_value` to the single element of `g_data` owned by this thread.
///
/// # Safety
///
/// The user must ensure that the number of (threads * blocks * grids)
/// must not be greater than the number of elements in `g_data`.
#[kernel]
pub unsafe fn increment(g_data: *mut u32, inc_value: u32) {
    // Flat global thread index; equivalent to `cuda_std::thread::index() as usize`.
    let idx = (cuda_std::thread::block_idx().x * cuda_std::thread::block_dim().x
        + cuda_std::thread::thread_idx().x) as usize;

    // The launch contract in the safety section guarantees `idx` is in bounds.
    unsafe {
        *g_data.add(idx) += inc_value;
    }
}
122 changes: 122 additions & 0 deletions samples/introduction/async_api/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
use cust::device::Device;
use cust::event::{Event, EventFlags};
use cust::function::{BlockSize, GridSize};
use cust::launch;
use cust::memory::{AsyncCopyDestination, DeviceBuffer, LockedBuffer};
use cust::module::Module;
use cust::prelude::EventStatus;
use cust::stream::{Stream, StreamFlags};
use std::time::Instant;

static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));

/// Verifies that every element of `data` equals the reference value `x`.
///
/// Prints a diagnostic for the first mismatching element, or a success
/// message when all elements match; returns `true` on full match.
fn correct_output(data: &[u32], x: u32) -> bool {
    for (index, &elem) in data.iter().enumerate() {
        if elem != x {
            println!("Error! data[{index}] = {elem}, ref = {x}");
            return false;
        }
    }

    println!("All elements of the array match the value!");
    true
}

fn main() -> Result<(), cust::error::CudaError> {
    // Set up the context, load the module, and create a stream to run kernels in.
    // NOTE: `quick_init()` returns a Result; the original bound it without `?`,
    // silently discarding any initialization error. The binding keeps the
    // context alive for the rest of `main`.
    let _ctx = cust::quick_init()?;
    let device = Device::get_device(0).expect("Couldn't find Cuda supported devices!");
    println!("Device Name: {}", device.name().unwrap());

    // Load the PTX produced by build.rs and resolve the kernel entry point.
    let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!");
    let increment = module
        .get_function("increment")
        .expect("Kernel function not found!");
    let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!");

    // Problem size: 16 Mi elements, each to be incremented by `value`.
    const N: usize = 16 * 1024 * 1024;
    let value = 26;

    // One thread per element: 512 threads per block, N / 512 blocks.
    let blocks = BlockSize::xy(512, 1);
    let grids = GridSize::xy((N / (blocks.x as usize)).try_into().unwrap(), 1);

    // Events recorded into the stream below to time the GPU-side work.
    let start_event = Event::new(EventFlags::DEFAULT)?;
    let stop_event = Event::new(EventFlags::DEFAULT)?;

    // Host buffer is page-locked (`LockedBuffer`), which is required for truly
    // asynchronous host<->device transfers and is faster than pageable memory.
    let mut host_a = LockedBuffer::new(&0u32, N).expect("host array couldn't be initialized!");
    let mut device_a =
        DeviceBuffer::from_slice(&[u32::MAX; N]).expect("device array couldn't be initialized!");

    start_event
        .record(&stream)
        .expect("Failed to record start_event in the CUDA stream!");
    let start = Instant::now();

    // # Safety
    //
    // Until the stop_event is triggered:
    // 1. `host_a` is not being modified
    // 2. Both `device_a` and `host_a` are not deallocated
    // 3. Until `stop_query` yields `EventStatus::Ready`, `device_a` is not involved in any other operation
    //    other than those of the operations in the stream.
    unsafe {
        device_a
            .async_copy_from(&host_a, &stream)
            .expect("Could not copy from host to device!");
    }

    // # Safety
    //
    // Number of threads * number of blocks = total number of elements.
    // Hence there will not be any out-of-bounds issues.
    unsafe {
        let result = launch!(increment<<<grids, blocks, 0, stream>>>(
            device_a.as_device_ptr(),
            value
        ));
        result.expect("Result of `increment` kernel did not process!");
    }

    // # Safety
    //
    // Until the stop_event is triggered:
    // 1. `device_a` is not being modified
    // 2. Both `device_a` and `host_a` are not deallocated
    // 3. At this point, until `stop_query` yields `EventStatus::Ready`,
    //    `host_a` is not involved in any other operation.
    unsafe {
        device_a
            .async_copy_to(&mut host_a, &stream)
            .expect("Could not copy from device to host!");
    }

    stop_event
        .record(&stream)
        .expect("Failed to record stop_event in the CUDA stream!");
    // All stream work was only *enqueued*; this measures the CPU time spent
    // issuing CUDA calls, not the GPU execution time.
    let cpu_time: u128 = start.elapsed().as_micros();

    // Busy-poll the stop event: demonstrates that the CPU is free to do other
    // work (here, counting iterations) while the GPU drains the stream.
    let mut counter: u64 = 0;
    while stop_event.query() != Ok(EventStatus::Ready) {
        counter += 1;
    }

    // GPU-side elapsed time between the two recorded events.
    let gpu_time: u128 = stop_event
        .elapsed(&start_event)
        .expect("Failed to calculate duration of GPU operations!")
        .as_micros();

    println!("Time spent executing by the GPU: {gpu_time} microseconds");
    println!("Time spent by CPU in CUDA calls: {cpu_time} microseconds");
    println!("CPU executed {counter} iterations while waiting for GPU to finish.");

    assert!(correct_output(host_a.as_slice(), value));

    // Stream is synchronized as a safety measure
    stream.synchronize().expect("Stream couldn't synchronize!");

    println!("test PASSED");
    Ok(())

    // The events and the memory buffers are automatically dropped here.
}