From c3fffbad255f052fd9a4ceafd918bcf1f8030a92 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Mon, 3 Nov 2025 00:55:40 +0530 Subject: [PATCH 01/11] feat: ported the AsyncAPI sample of CUDA examples --- Cargo.lock | 16 +++ Cargo.toml | 2 + examples/cuda/async_api/Cargo.toml | 11 ++ examples/cuda/async_api/build.rs | 17 +++ examples/cuda/async_api/kernels/Cargo.toml | 10 ++ examples/cuda/async_api/kernels/src/lib.rs | 17 +++ examples/cuda/async_api/src/main.rs | 129 +++++++++++++++++++++ 7 files changed, 202 insertions(+) create mode 100644 examples/cuda/async_api/Cargo.toml create mode 100644 examples/cuda/async_api/build.rs create mode 100644 examples/cuda/async_api/kernels/Cargo.toml create mode 100644 examples/cuda/async_api/kernels/src/lib.rs create mode 100644 examples/cuda/async_api/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 7cd5a8d8..1af88d55 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -156,6 +156,22 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d92bec98840b8f03a5ff5413de5293bfcd8bf96467cf5452609f939ec6f5de16" +[[package]] +name = "async_api" +version = "0.1.0" +dependencies = [ + "cuda_builder", + "cust", + "nanorand", +] + +[[package]] +name = "async_api-kernels" +version = "0.1.0" +dependencies = [ + "cuda_std", +] + [[package]] name = "atty" version = "0.2.14" diff --git a/Cargo.toml b/Cargo.toml index 18931ef0..0fac3b78 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,8 @@ members = [ "xtask", + "examples/cuda/async_api", + "examples/cuda/async_api/kernels", "examples/cuda/vecadd", "examples/cuda/vecadd/kernels", "examples/cuda/gemm", diff --git a/examples/cuda/async_api/Cargo.toml b/examples/cuda/async_api/Cargo.toml new file mode 100644 index 00000000..b5a8cdfa --- /dev/null +++ b/examples/cuda/async_api/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "async_api" +version = "0.1.0" +edition = "2024" + +[dependencies] +cust = { path = "../../../crates/cust" } +nanorand = "0.7" + +[build-dependencies] +cuda_builder = { workspace = true, default-features = false } diff --git a/examples/cuda/async_api/build.rs b/examples/cuda/async_api/build.rs new file mode 100644 index 00000000..7f23bac1 --- /dev/null +++ b/examples/cuda/async_api/build.rs @@ -0,0 +1,17 @@ +use std::env; +use std::path; + +use cuda_builder::CudaBuilder; + +fn main() { + println!("cargo::rerun-if-changed=build.rs"); + println!("cargo::rerun-if-changed=kernels"); + + let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap()); + let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); + + CudaBuilder::new(manifest_dir.join("kernels")) + .copy_to(out_path.join("kernels.ptx")) + .build() + .unwrap(); +} diff --git a/examples/cuda/async_api/kernels/Cargo.toml b/examples/cuda/async_api/kernels/Cargo.toml new file mode 100644 index 00000000..7ad69be6 --- /dev/null +++ b/examples/cuda/async_api/kernels/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "async_api-kernels" +version = "0.1.0" +edition = "2024" + +[dependencies] +cuda_std = { path = "../../../../crates/cuda_std" } + +[lib] +crate-type = ["cdylib", "rlib"] diff --git a/examples/cuda/async_api/kernels/src/lib.rs b/examples/cuda/async_api/kernels/src/lib.rs new file mode 100644 index 00000000..f604fd39 --- /dev/null +++ b/examples/cuda/async_api/kernels/src/lib.rs @@ -0,0 +1,17 @@ +use cuda_std::prelude::*; + +#[kernel] +#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)] +pub unsafe fn increment(g_data: *mut u32, inc_value: u32) { + // This can 
also be obtained directly as + // + // let idx: usize = cuda_std::thread::index() as usize; + let idx: usize = ( + cuda_std::thread::block_dim().x + * cuda_std::thread::block_idx().x + + cuda_std::thread::thread_idx().x + ) as usize; + + let elem: &mut u32 = unsafe { &mut *g_data.add(idx) }; + *elem = *elem + inc_value; +} diff --git a/examples/cuda/async_api/src/main.rs b/examples/cuda/async_api/src/main.rs new file mode 100644 index 00000000..7361b434 --- /dev/null +++ b/examples/cuda/async_api/src/main.rs @@ -0,0 +1,129 @@ + +use cust::memory::{DeviceBuffer, LockedBuffer, AsyncCopyDestination}; +use cust::event::{Event, EventFlags}; +use cust::prelude::EventStatus; +use cust::stream::{Stream, StreamFlags}; +use cust::module::Module; +use cust::context::Context; +use cust::{launch, CudaFlags}; +use cust::device::Device; +use cust::function::{GridSize, BlockSize}; +use std::time::Instant; + +static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx")); + +fn correct_output(data: &[u32], x: u32) -> bool { + let not_matching_element = data + .iter() + .enumerate() + .find(|&(_, &elem)| elem != x); + + match not_matching_element { + Some((index, elem)) => println!("Error! data[{index}] = {elem}, ref = {x}"), + None => println!("All elements of the array match the value!") + } + + not_matching_element.is_none() +} + +fn main() -> Result<(), cust::error::CudaError> { + cust::init(CudaFlags::empty()).expect("Couldn't initialize CUDA environment!"); + + let device = Device::get_device(0) + .expect("Couldn't find Cuda supported devices!"); + + println!("Device Name: {}", device.name().unwrap()); + + // Set up the context, load the module, and create a stream to run kernels in. + let _ctx = Context::new(device); + let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!"); + let increment = module.get_function("increment").expect("Kernel function not found!"); + let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!"); + + const N: usize = 16 * 1024 * 1024; + const N_BYTES: usize = N * (i32::BITS as usize); + let value = 26; + + let blocks = BlockSize::xy(512, 1); + let grids = GridSize::xy((N / (blocks.x as usize)).try_into().unwrap(), 1); + + let start_event = Event::new(EventFlags::DEFAULT)?; + let stop_event = Event::new(EventFlags::DEFAULT)?; + + // Create buffers for data on host-side + // Ideally must be page-locked for efficiency + let mut host_a = LockedBuffer::new(&0u32, N).expect("host array couldn't be initialized!"); + let mut device_a = DeviceBuffer::from_slice(&[u32::MAX; N]).expect("device array couldn't be initialized!"); + + start_event.record(&stream).expect("Failed to record start_event in the CUDA stream!"); + let start = Instant::now(); + + // SAFETY: until the stop_event being triggered: + // 1. `host_a` is not being modified + // 2. Both `device_a` and `host_a` are not deallocated + // 3. Until `stop_query` yields `EventStatus::Ready`, `device_a` is not involved in any other operation + // other than those of the operations in the stream. + unsafe { + device_a.async_copy_from(&host_a, &stream).expect("Could not copy from host to device!"); + } + + // SAFETY: number of threads * number of blocks = total number of elements. + // Hence there will not be any out-of-bounds issues. + unsafe { + let result = launch!(increment<<>>( + device_a.as_device_ptr(), + value + )); + result.expect("Result of `increment` kernel did not process!"); + } + + // SAFETY: until the stop_event being triggered: + // 1. 
`device_a` is not being modified + // 2. Both `device_a` and `host_a` are not deallocated + // 3. At this point, until `stop_query` yields `EventStatus::Ready`, + // `host_a` is not involved in any other operation. + unsafe { + device_a.async_copy_to(&mut host_a, &stream).expect("Could not copy from device to host!"); + } + + stop_event.record(&stream).expect("Failed to record stop_event in the CUDA stream!"); + let cpu_time: u128 = start.elapsed().as_micros(); + + let mut counter: u64 = 0; + while stop_event.query() != Ok(EventStatus::Ready) { counter += 1 } + + let gpu_time: u128 = stop_event + .elapsed(&start_event) + .expect("Failed to calculate duration of GPU operations!") + .as_micros(); + + println!("Time spent executing by the GPU: {gpu_time} microseconds"); + println!("Time spent by CPU in CUDA calls: {cpu_time} microseconds"); + println!("CPU executed {counter} iterations while waiting for GPU to finish."); + + assert!(correct_output(host_a.as_slice(), value)); + + // Stream is synchronized as a safety measure + stream.synchronize().expect("Stream couldn't synchronize!"); + + // Events and buffers can be safely dropped now + match Event::drop(start_event) { + Ok(()) => println!("Successfully destroyed start_event"), + Err((cuda_error, _event)) => { + println!("Failed to destroy start_event: {:?}", cuda_error); + }, + } + + match Event::drop(stop_event) { + Ok(()) => println!("Successfully destroyed stop_event"), + Err((cuda_error, _event)) => { + println!("Failed to destroy stop_event: {:?}", cuda_error); + }, + } + + DeviceBuffer::drop(device_a).expect("Couldn't drop device array!"); + LockedBuffer::drop(host_a).expect("Couldn't drop host array!"); + + println!("test PASSED"); + Ok(()) +} From 4d3b181cf2b5510cd8defd564f74ce5b3d164b45 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Mon, 3 Nov 2025 01:03:26 +0530 Subject: [PATCH 02/11] fix: spelling errors --- examples/cuda/async_api/src/main.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/cuda/async_api/src/main.rs b/examples/cuda/async_api/src/main.rs index 7361b434..1f5b5a11 100644 --- a/examples/cuda/async_api/src/main.rs +++ b/examples/cuda/async_api/src/main.rs @@ -51,14 +51,14 @@ fn main() -> Result<(), cust::error::CudaError> { let stop_event = Event::new(EventFlags::DEFAULT)?; // Create buffers for data on host-side - // Ideally must be page-locked for efficiency + // Ideally should be page-locked for efficiency let mut host_a = LockedBuffer::new(&0u32, N).expect("host array couldn't be initialized!"); let mut device_a = DeviceBuffer::from_slice(&[u32::MAX; N]).expect("device array couldn't be initialized!"); start_event.record(&stream).expect("Failed to record start_event in the CUDA stream!"); let start = Instant::now(); - // SAFETY: until the stop_event being triggered: + // SAFETY: until the stop_event is triggered: // 1. `host_a` is not being modified // 2. Both `device_a` and `host_a` are not deallocated // 3. Until `stop_query` yields `EventStatus::Ready`, `device_a` is not involved in any other operation @@ -77,7 +77,7 @@ fn main() -> Result<(), cust::error::CudaError> { result.expect("Result of `increment` kernel did not process!"); } - // SAFETY: until the stop_event being triggered: + // SAFETY: until the stop_event is triggered: // 1. `device_a` is not being modified // 2. Both `device_a` and `host_a` are not deallocated // 3. 
At this point, until `stop_query` yields `EventStatus::Ready`, From 3ac6cc5a243aa73f3fdf4744a1f688176747e8a7 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Tue, 4 Nov 2025 11:48:48 +0530 Subject: [PATCH 03/11] chore: format code --- examples/cuda/async_api/kernels/src/lib.rs | 7 +-- examples/cuda/async_api/src/main.rs | 60 ++++++++++++---------- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/examples/cuda/async_api/kernels/src/lib.rs b/examples/cuda/async_api/kernels/src/lib.rs index f604fd39..ffa7cbbc 100644 --- a/examples/cuda/async_api/kernels/src/lib.rs +++ b/examples/cuda/async_api/kernels/src/lib.rs @@ -6,11 +6,8 @@ pub unsafe fn increment(g_data: *mut u32, inc_value: u32) { // This can also be obtained directly as // // let idx: usize = cuda_std::thread::index() as usize; - let idx: usize = ( - cuda_std::thread::block_dim().x - * cuda_std::thread::block_idx().x - + cuda_std::thread::thread_idx().x - ) as usize; + let idx: usize = (cuda_std::thread::block_dim().x * cuda_std::thread::block_idx().x + + cuda_std::thread::thread_idx().x) as usize; let elem: &mut u32 = unsafe { &mut *g_data.add(idx) }; *elem = *elem + inc_value; diff --git a/examples/cuda/async_api/src/main.rs b/examples/cuda/async_api/src/main.rs index 1f5b5a11..16cb7a5f 100644 --- a/examples/cuda/async_api/src/main.rs +++ b/examples/cuda/async_api/src/main.rs @@ -1,26 +1,22 @@ - -use cust::memory::{DeviceBuffer, LockedBuffer, AsyncCopyDestination}; +use cust::context::Context; +use cust::device::Device; use cust::event::{Event, EventFlags}; +use cust::function::{BlockSize, GridSize}; +use cust::memory::{AsyncCopyDestination, DeviceBuffer, LockedBuffer}; +use cust::module::Module; use cust::prelude::EventStatus; use cust::stream::{Stream, StreamFlags}; -use cust::module::Module; -use cust::context::Context; -use cust::{launch, CudaFlags}; -use cust::device::Device; -use cust::function::{GridSize, BlockSize}; +use cust::{CudaFlags, launch}; use std::time::Instant; static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx")); fn correct_output(data: &[u32], x: u32) -> bool { - let not_matching_element = data - .iter() - .enumerate() - .find(|&(_, &elem)| elem != x); + let not_matching_element = data.iter().enumerate().find(|&(_, &elem)| elem != x); match not_matching_element { Some((index, elem)) => println!("Error! data[{index}] = {elem}, ref = {x}"), - None => println!("All elements of the array match the value!") + None => println!("All elements of the array match the value!"), } not_matching_element.is_none() @@ -29,15 +25,16 @@ fn correct_output(data: &[u32], x: u32) -> bool { fn main() -> Result<(), cust::error::CudaError> { cust::init(CudaFlags::empty()).expect("Couldn't initialize CUDA environment!"); - let device = Device::get_device(0) - .expect("Couldn't find Cuda supported devices!"); + let device = Device::get_device(0).expect("Couldn't find Cuda supported devices!"); println!("Device Name: {}", device.name().unwrap()); // Set up the context, load the module, and create a stream to run kernels in. 
let _ctx = Context::new(device); let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!"); - let increment = module.get_function("increment").expect("Kernel function not found!"); + let increment = module + .get_function("increment") + .expect("Kernel function not found!"); let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!"); const N: usize = 16 * 1024 * 1024; @@ -53,9 +50,12 @@ fn main() -> Result<(), cust::error::CudaError> { // Create buffers for data on host-side // Ideally should be page-locked for efficiency let mut host_a = LockedBuffer::new(&0u32, N).expect("host array couldn't be initialized!"); - let mut device_a = DeviceBuffer::from_slice(&[u32::MAX; N]).expect("device array couldn't be initialized!"); + let mut device_a = + DeviceBuffer::from_slice(&[u32::MAX; N]).expect("device array couldn't be initialized!"); - start_event.record(&stream).expect("Failed to record start_event in the CUDA stream!"); + start_event + .record(&stream) + .expect("Failed to record start_event in the CUDA stream!"); let start = Instant::now(); // SAFETY: until the stop_event is triggered: @@ -64,7 +64,9 @@ fn main() -> Result<(), cust::error::CudaError> { // 3. Until `stop_query` yields `EventStatus::Ready`, `device_a` is not involved in any other operation // other than those of the operations in the stream. unsafe { - device_a.async_copy_from(&host_a, &stream).expect("Could not copy from host to device!"); + device_a + .async_copy_from(&host_a, &stream) + .expect("Could not copy from host to device!"); } // SAFETY: number of threads * number of blocks = total number of elements. @@ -80,27 +82,33 @@ fn main() -> Result<(), cust::error::CudaError> { // SAFETY: until the stop_event is triggered: // 1. `device_a` is not being modified // 2. Both `device_a` and `host_a` are not deallocated - // 3. At this point, until `stop_query` yields `EventStatus::Ready`, + // 3. At this point, until `stop_query` yields `EventStatus::Ready`, // `host_a` is not involved in any other operation. 
unsafe { - device_a.async_copy_to(&mut host_a, &stream).expect("Could not copy from device to host!"); + device_a + .async_copy_to(&mut host_a, &stream) + .expect("Could not copy from device to host!"); } - stop_event.record(&stream).expect("Failed to record stop_event in the CUDA stream!"); + stop_event + .record(&stream) + .expect("Failed to record stop_event in the CUDA stream!"); let cpu_time: u128 = start.elapsed().as_micros(); let mut counter: u64 = 0; - while stop_event.query() != Ok(EventStatus::Ready) { counter += 1 } + while stop_event.query() != Ok(EventStatus::Ready) { + counter += 1 + } let gpu_time: u128 = stop_event .elapsed(&start_event) .expect("Failed to calculate duration of GPU operations!") .as_micros(); - + println!("Time spent executing by the GPU: {gpu_time} microseconds"); println!("Time spent by CPU in CUDA calls: {cpu_time} microseconds"); println!("CPU executed {counter} iterations while waiting for GPU to finish."); - + assert!(correct_output(host_a.as_slice(), value)); // Stream is synchronized as a safety measure @@ -111,14 +119,14 @@ fn main() -> Result<(), cust::error::CudaError> { Ok(()) => println!("Successfully destroyed start_event"), Err((cuda_error, _event)) => { println!("Failed to destroy start_event: {:?}", cuda_error); - }, + } } match Event::drop(stop_event) { Ok(()) => println!("Successfully destroyed stop_event"), Err((cuda_error, _event)) => { println!("Failed to destroy stop_event: {:?}", cuda_error); - }, + } } DeviceBuffer::drop(device_a).expect("Couldn't drop device array!"); From 071df4911ef5a9520d340af6f2bb6c722cb2bc7c Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Tue, 4 Nov 2025 13:18:33 +0530 Subject: [PATCH 04/11] chore: move async_api example to the samples/introduction/ subdirectory --- Cargo.toml | 5 +++-- {examples/cuda => samples/introduction}/async_api/Cargo.toml | 0 {examples/cuda => samples/introduction}/async_api/build.rs | 0 .../introduction}/async_api/kernels/Cargo.toml | 0 .../introduction}/async_api/kernels/src/lib.rs | 0 .../cuda => samples/introduction}/async_api/src/main.rs | 1 - 6 files changed, 3 insertions(+), 3 deletions(-) rename {examples/cuda => samples/introduction}/async_api/Cargo.toml (100%) rename {examples/cuda => samples/introduction}/async_api/build.rs (100%) rename {examples/cuda => samples/introduction}/async_api/kernels/Cargo.toml (100%) rename {examples/cuda => samples/introduction}/async_api/kernels/src/lib.rs (100%) rename {examples/cuda => samples/introduction}/async_api/src/main.rs (98%) diff --git a/Cargo.toml b/Cargo.toml index 0fac3b78..309132ba 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,8 +8,6 @@ members = [ "xtask", - "examples/cuda/async_api", - "examples/cuda/async_api/kernels", "examples/cuda/vecadd", "examples/cuda/vecadd/kernels", "examples/cuda/gemm", @@ -22,6 +20,9 @@ members = [ "examples/optix/*", "tests/compiletests", "tests/compiletests/deps-helper", + + "samples/introduction/async_api", + "samples/introduction/async_api/kernels", ] exclude = [ diff --git a/examples/cuda/async_api/Cargo.toml b/samples/introduction/async_api/Cargo.toml similarity index 100% rename from examples/cuda/async_api/Cargo.toml rename to samples/introduction/async_api/Cargo.toml diff --git a/examples/cuda/async_api/build.rs b/samples/introduction/async_api/build.rs similarity index 100% rename from examples/cuda/async_api/build.rs rename to samples/introduction/async_api/build.rs diff --git a/examples/cuda/async_api/kernels/Cargo.toml b/samples/introduction/async_api/kernels/Cargo.toml 
similarity index 100% rename from examples/cuda/async_api/kernels/Cargo.toml rename to samples/introduction/async_api/kernels/Cargo.toml diff --git a/examples/cuda/async_api/kernels/src/lib.rs b/samples/introduction/async_api/kernels/src/lib.rs similarity index 100% rename from examples/cuda/async_api/kernels/src/lib.rs rename to samples/introduction/async_api/kernels/src/lib.rs diff --git a/examples/cuda/async_api/src/main.rs b/samples/introduction/async_api/src/main.rs similarity index 98% rename from examples/cuda/async_api/src/main.rs rename to samples/introduction/async_api/src/main.rs index 16cb7a5f..770808dc 100644 --- a/examples/cuda/async_api/src/main.rs +++ b/samples/introduction/async_api/src/main.rs @@ -38,7 +38,6 @@ fn main() -> Result<(), cust::error::CudaError> { let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!"); const N: usize = 16 * 1024 * 1024; - const N_BYTES: usize = N * (i32::BITS as usize); let value = 26; let blocks = BlockSize::xy(512, 1); From e4223695af76f2fb3db9764c004ad924839ed8cb Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Tue, 4 Nov 2025 13:19:15 +0530 Subject: [PATCH 05/11] feat: add README.md for the samples/ subdirectory --- samples/README.md | 5 +++++ samples/introduction/README.md | 8 ++++++++ 2 files changed, 13 insertions(+) create mode 100644 samples/README.md create mode 100644 samples/introduction/README.md diff --git a/samples/README.md b/samples/README.md new file mode 100644 index 00000000..73b39156 --- /dev/null +++ b/samples/README.md @@ -0,0 +1,5 @@ +# Rust-Cuda Samples + +These are the Rust-Cuda port of the samples from Nvidia's [cuda-samples](https://github.com/NVIDIA/cuda-samples/tree/master/Samples) repository. + +1. Chapter 0: [Introduction](https://github.com/Rust-GPU/rust-cuda/samples/introduction) \ No newline at end of file diff --git a/samples/introduction/README.md b/samples/introduction/README.md new file mode 100644 index 00000000..3c980a66 --- /dev/null +++ b/samples/introduction/README.md @@ -0,0 +1,8 @@ +# Chapter 0: Introduction + +## [asyncAPI](https://github.com/Rust-GPU/rust-cuda/samples/introduction/async_api) +This example demonstrates two key capabilities of CUDA events: measuring GPU execution time and enabling concurrent CPU-GPU operations. + +1. Events are recorded at specific points within a CUDA stream to mark the beginning and end of GPU operations. +2. Because CUDA stream operations execute asynchronously, the CPU remains free to perform other work while the GPU processes tasks (including memory transfers between host and device) +3. The CPU can query these events to check whether the GPU has finished its work, allowing for coordination between the two processors without blocking the CPU. 
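The event pattern described in the README above maps onto a handful of `cust` calls. As a rough sketch only — reusing just the APIs this sample already exercises (`Event::record`, `Event::query`, `Event::elapsed`), with the actual async copies and kernel launch elided:

```rust
use cust::event::{Event, EventFlags};
use cust::prelude::EventStatus;
use cust::stream::Stream;
use std::time::Duration;

/// Times work that has been queued on `stream`, polling instead of blocking.
fn time_queued_work(stream: &Stream) -> Result<Duration, cust::error::CudaError> {
    let start = Event::new(EventFlags::DEFAULT)?;
    let stop = Event::new(EventFlags::DEFAULT)?;

    // (1) Record events in the stream to bracket the GPU work.
    start.record(stream)?;
    // ... async copies and the kernel launch would be enqueued on `stream` here ...
    stop.record(stream)?;

    // (2)/(3) The calls above return immediately, so the CPU stays free;
    // poll the stop event and do independent CPU work until the GPU catches up.
    while stop.query()? != EventStatus::Ready {
        // overlap CPU-side work with the GPU here
    }

    // GPU time elapsed between the two recorded events.
    stop.elapsed(&start)
}
```

The `while` loop is where the CPU–GPU overlap happens; the asyncAPI sample counts loop iterations there to show how much CPU work fits in before the GPU finishes.
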
From 5b6d246e0284595683f33f712aa720b766d61221 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Thu, 6 Nov 2025 00:48:44 +0530 Subject: [PATCH 06/11] fix: replace manual increment operation with += --- samples/introduction/async_api/kernels/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/introduction/async_api/kernels/src/lib.rs b/samples/introduction/async_api/kernels/src/lib.rs index ffa7cbbc..e9bb8d0d 100644 --- a/samples/introduction/async_api/kernels/src/lib.rs +++ b/samples/introduction/async_api/kernels/src/lib.rs @@ -10,5 +10,5 @@ pub unsafe fn increment(g_data: *mut u32, inc_value: u32) { + cuda_std::thread::thread_idx().x) as usize; let elem: &mut u32 = unsafe { &mut *g_data.add(idx) }; - *elem = *elem + inc_value; + *elem += inc_value; } From 49fea304fb87d69a101884520124ab92f5aac859 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Thu, 6 Nov 2025 11:07:46 +0530 Subject: [PATCH 07/11] chore: remove the drop-specific code, since Rust automatically drops the events and buffers when they go out of scope. --- samples/introduction/async_api/src/main.rs | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/samples/introduction/async_api/src/main.rs b/samples/introduction/async_api/src/main.rs index 770808dc..6b97a8cc 100644 --- a/samples/introduction/async_api/src/main.rs +++ b/samples/introduction/async_api/src/main.rs @@ -113,24 +113,8 @@ fn main() -> Result<(), cust::error::CudaError> { // Stream is synchronized as a safety measure stream.synchronize().expect("Stream couldn't synchronize!"); - // Events and buffers can be safely dropped now - match Event::drop(start_event) { - Ok(()) => println!("Successfully destroyed start_event"), - Err((cuda_error, _event)) => { - println!("Failed to destroy start_event: {:?}", cuda_error); - } - } - - match Event::drop(stop_event) { - Ok(()) => println!("Successfully destroyed stop_event"), - Err((cuda_error, _event)) => { - println!("Failed to destroy stop_event: {:?}", cuda_error); - } - } - - DeviceBuffer::drop(device_a).expect("Couldn't drop device array!"); - LockedBuffer::drop(host_a).expect("Couldn't drop host array!"); - println!("test PASSED"); Ok(()) + + // The events and the memory buffers are automatically dropped here. 
} From cf47dcda50c9f99267e8f32155247554f10f3453 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Thu, 6 Nov 2025 11:08:05 +0530 Subject: [PATCH 08/11] chore: remove the clippy annotations from the kernel-side code --- samples/introduction/async_api/kernels/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/samples/introduction/async_api/kernels/src/lib.rs b/samples/introduction/async_api/kernels/src/lib.rs index e9bb8d0d..fbd3dfbf 100644 --- a/samples/introduction/async_api/kernels/src/lib.rs +++ b/samples/introduction/async_api/kernels/src/lib.rs @@ -1,7 +1,6 @@ use cuda_std::prelude::*; #[kernel] -#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)] pub unsafe fn increment(g_data: *mut u32, inc_value: u32) { // This can also be obtained directly as // From 00135f50c62cd4a5e4962a5ba86728ea3bd3b71a Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Thu, 6 Nov 2025 11:15:15 +0530 Subject: [PATCH 09/11] chore: make the context creation code more ergonomic by using `quick_init()` --- samples/introduction/async_api/src/main.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/samples/introduction/async_api/src/main.rs b/samples/introduction/async_api/src/main.rs index 6b97a8cc..815ea0c9 100644 --- a/samples/introduction/async_api/src/main.rs +++ b/samples/introduction/async_api/src/main.rs @@ -23,14 +23,11 @@ fn correct_output(data: &[u32], x: u32) -> bool { } fn main() -> Result<(), cust::error::CudaError> { - cust::init(CudaFlags::empty()).expect("Couldn't initialize CUDA environment!"); - + // Set up the context, load the module, and create a stream to run kernels in. + let _ctx = cust::quick_init(); let device = Device::get_device(0).expect("Couldn't find Cuda supported devices!"); - println!("Device Name: {}", device.name().unwrap()); - // Set up the context, load the module, and create a stream to run kernels in. - let _ctx = Context::new(device); let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!"); let increment = module .get_function("increment") From f016d397edb26a2f8fe306449721eef1fdb10c49 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sat, 8 Nov 2025 17:16:35 +0530 Subject: [PATCH 10/11] fix: add SAFETY message to kernel and remove unnecessary imports --- samples/introduction/async_api/kernels/src/lib.rs | 4 ++++ samples/introduction/async_api/src/main.rs | 15 ++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/samples/introduction/async_api/kernels/src/lib.rs b/samples/introduction/async_api/kernels/src/lib.rs index fbd3dfbf..ffcc07d3 100644 --- a/samples/introduction/async_api/kernels/src/lib.rs +++ b/samples/introduction/async_api/kernels/src/lib.rs @@ -1,6 +1,10 @@ use cuda_std::prelude::*; #[kernel] +/// # Safety +/// +/// The user must ensure that the number of (threads * blocks * grids) +/// must not be greater than the number of elements in `g_data`. 
pub unsafe fn increment(g_data: *mut u32, inc_value: u32) { // This can also be obtained directly as // diff --git a/samples/introduction/async_api/src/main.rs b/samples/introduction/async_api/src/main.rs index 815ea0c9..063efbd3 100644 --- a/samples/introduction/async_api/src/main.rs +++ b/samples/introduction/async_api/src/main.rs @@ -1,12 +1,11 @@ -use cust::context::Context; use cust::device::Device; use cust::event::{Event, EventFlags}; use cust::function::{BlockSize, GridSize}; +use cust::launch; use cust::memory::{AsyncCopyDestination, DeviceBuffer, LockedBuffer}; use cust::module::Module; use cust::prelude::EventStatus; use cust::stream::{Stream, StreamFlags}; -use cust::{CudaFlags, launch}; use std::time::Instant; static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx")); @@ -54,7 +53,9 @@ fn main() -> Result<(), cust::error::CudaError> { .expect("Failed to record start_event in the CUDA stream!"); let start = Instant::now(); - // SAFETY: until the stop_event is triggered: + // # Safety + // + // Until the stop_event is triggered: // 1. `host_a` is not being modified // 2. Both `device_a` and `host_a` are not deallocated // 3. Until `stop_query` yields `EventStatus::Ready`, `device_a` is not involved in any other operation @@ -65,7 +66,9 @@ fn main() -> Result<(), cust::error::CudaError> { .expect("Could not copy from host to device!"); } - // SAFETY: number of threads * number of blocks = total number of elements. + // # Safety + // + // Number of threads * number of blocks = total number of elements. // Hence there will not be any out-of-bounds issues. unsafe { let result = launch!(increment<<>>( @@ -75,7 +78,9 @@ fn main() -> Result<(), cust::error::CudaError> { result.expect("Result of `increment` kernel did not process!"); } - // SAFETY: until the stop_event is triggered: + // # Safety + // + // Until the stop_event is triggered: // 1. `device_a` is not being modified // 2. Both `device_a` and `host_a` are not deallocated // 3. At this point, until `stop_query` yields `EventStatus::Ready`, From 98fc439206df54bdefeca6a0adce946ae3e71766 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sun, 9 Nov 2025 10:30:40 +0530 Subject: [PATCH 11/11] feat: update ci_windows.yml to update PATH to expose CUDA codegen backend. --- .github/workflows/ci_windows.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/ci_windows.yml b/.github/workflows/ci_windows.yml index e1e76c9d..a5910cf4 100644 --- a/.github/workflows/ci_windows.yml +++ b/.github/workflows/ci_windows.yml @@ -67,6 +67,11 @@ jobs: - name: Add rustup components run: rustup component add rustfmt clippy + - name: Update PATH to expose CUDA codegen backend + shell: pwsh + run: | + echo "$env:CUDA_PATH\nvvm\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + - name: Verify CUDA, Rust installation run: | nvcc --version