Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/ci_windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ jobs:
- name: Add rustup components
run: rustup component add rustfmt clippy

- name: Update PATH to expose CUDA codegen backend
shell: pwsh
run: |
echo "$env:CUDA_PATH\nvvm\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append

- name: Verify CUDA, Rust installation
run: |
nvcc --version
Expand Down
16 changes: 16 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ members = [
"examples/optix/*",
"tests/compiletests",
"tests/compiletests/deps-helper",

"samples/introduction/async_api",
"samples/introduction/async_api/kernels",
]

exclude = [
Expand Down
5 changes: 5 additions & 0 deletions samples/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Rust-Cuda Samples

These are the Rust-CUDA ports of the samples from NVIDIA's [cuda-samples](https://github.com/NVIDIA/cuda-samples/tree/master/Samples) repository.

1. Chapter 0: [Introduction](https://github.com/Rust-GPU/rust-cuda/tree/main/samples/introduction)
8 changes: 8 additions & 0 deletions samples/introduction/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Chapter 0: Introduction

## [asyncAPI](https://github.com/Rust-GPU/rust-cuda/tree/main/samples/introduction/async_api)
This example demonstrates two key capabilities of CUDA events: measuring GPU execution time and enabling concurrent CPU-GPU operations.

1. Events are recorded at specific points within a CUDA stream to mark the beginning and end of GPU operations.
2. Because CUDA stream operations execute asynchronously, the CPU remains free to perform other work while the GPU processes tasks (including memory transfers between host and device).
3. The CPU can query these events to check whether the GPU has finished its work, allowing for coordination between the two processors without blocking the CPU.
11 changes: 11 additions & 0 deletions samples/introduction/async_api/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[package]
name = "async_api"
version = "0.1.0"
edition = "2024"

[dependencies]
cust = { path = "../../../crates/cust" }
nanorand = "0.7"

[build-dependencies]
cuda_builder = { workspace = true, default-features = false }
17 changes: 17 additions & 0 deletions samples/introduction/async_api/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
use std::env;
use std::path;

use cuda_builder::CudaBuilder;

fn main() {
    // Re-run this build script whenever it changes or any kernel source changes.
    println!("cargo::rerun-if-changed=build.rs");
    println!("cargo::rerun-if-changed=kernels");

    // Cargo-provided locations: where build outputs go and where this crate lives.
    let out_dir = path::PathBuf::from(env::var("OUT_DIR").unwrap());
    let crate_root = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());

    // Compile the `kernels` sub-crate to PTX and drop the artifact into OUT_DIR,
    // where `main.rs` picks it up via `include_str!`.
    let builder = CudaBuilder::new(crate_root.join("kernels"));
    let built = builder.copy_to(out_dir.join("kernels.ptx")).build();
    built.unwrap();
}
10 changes: 10 additions & 0 deletions samples/introduction/async_api/kernels/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[package]
name = "async_api-kernels"
version = "0.1.0"
edition = "2024"

[dependencies]
cuda_std = { path = "../../../../crates/cuda_std" }

[lib]
crate-type = ["cdylib", "rlib"]
17 changes: 17 additions & 0 deletions samples/introduction/async_api/kernels/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
use cuda_std::prelude::*;

/// Adds `inc_value` to the single element of `g_data` owned by this thread.
///
/// # Safety
///
/// The user must ensure that the number of (threads * blocks * grids)
/// must not be greater than the number of elements in `g_data`.
#[kernel]
pub unsafe fn increment(g_data: *mut u32, inc_value: u32) {
    // Flat global thread index; equivalent to `cuda_std::thread::index() as usize`.
    let idx = (cuda_std::thread::block_idx().x * cuda_std::thread::block_dim().x
        + cuda_std::thread::thread_idx().x) as usize;

    // The launch contract in the safety section guarantees `idx` is in bounds.
    unsafe {
        *g_data.add(idx) += inc_value;
    }
}
122 changes: 122 additions & 0 deletions samples/introduction/async_api/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
use cust::device::Device;
use cust::event::{Event, EventFlags};
use cust::function::{BlockSize, GridSize};
use cust::launch;
use cust::memory::{AsyncCopyDestination, DeviceBuffer, LockedBuffer};
use cust::module::Module;
use cust::prelude::EventStatus;
use cust::stream::{Stream, StreamFlags};
use std::time::Instant;

static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));

/// Verifies that every element of `data` equals the reference value `x`.
///
/// Prints a diagnostic for the first mismatching element, or a success
/// message when all elements match; returns `true` on full match.
fn correct_output(data: &[u32], x: u32) -> bool {
    for (index, &elem) in data.iter().enumerate() {
        if elem != x {
            println!("Error! data[{index}] = {elem}, ref = {x}");
            return false;
        }
    }

    println!("All elements of the array match the value!");
    true
}

fn main() -> Result<(), cust::error::CudaError> {
    // Set up the context, load the module, and create a stream to run kernels in.
    // NOTE: `quick_init()` returns a Result; the original bound it without `?`,
    // silently discarding any initialization error. The binding keeps the
    // context alive for the rest of `main`.
    let _ctx = cust::quick_init()?;
    let device = Device::get_device(0).expect("Couldn't find Cuda supported devices!");
    println!("Device Name: {}", device.name().unwrap());

    // Load the PTX produced by build.rs and resolve the kernel entry point.
    let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!");
    let increment = module
        .get_function("increment")
        .expect("Kernel function not found!");
    let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!");

    // Problem size: 16 Mi elements, each to be incremented by `value`.
    const N: usize = 16 * 1024 * 1024;
    let value = 26;

    // One thread per element: 512 threads per block, N / 512 blocks.
    let blocks = BlockSize::xy(512, 1);
    let grids = GridSize::xy((N / (blocks.x as usize)).try_into().unwrap(), 1);

    // Events recorded into the stream below to time the GPU-side work.
    let start_event = Event::new(EventFlags::DEFAULT)?;
    let stop_event = Event::new(EventFlags::DEFAULT)?;

    // Host buffer is page-locked (`LockedBuffer`), which is required for truly
    // asynchronous host<->device transfers and is faster than pageable memory.
    let mut host_a = LockedBuffer::new(&0u32, N).expect("host array couldn't be initialized!");
    let mut device_a =
        DeviceBuffer::from_slice(&[u32::MAX; N]).expect("device array couldn't be initialized!");

    start_event
        .record(&stream)
        .expect("Failed to record start_event in the CUDA stream!");
    let start = Instant::now();

    // # Safety
    //
    // Until the stop_event is triggered:
    // 1. `host_a` is not being modified
    // 2. Both `device_a` and `host_a` are not deallocated
    // 3. Until `stop_query` yields `EventStatus::Ready`, `device_a` is not involved in any other operation
    //    other than those of the operations in the stream.
    unsafe {
        device_a
            .async_copy_from(&host_a, &stream)
            .expect("Could not copy from host to device!");
    }

    // # Safety
    //
    // Number of threads * number of blocks = total number of elements.
    // Hence there will not be any out-of-bounds issues.
    unsafe {
        let result = launch!(increment<<<grids, blocks, 0, stream>>>(
            device_a.as_device_ptr(),
            value
        ));
        result.expect("Result of `increment` kernel did not process!");
    }

    // # Safety
    //
    // Until the stop_event is triggered:
    // 1. `device_a` is not being modified
    // 2. Both `device_a` and `host_a` are not deallocated
    // 3. At this point, until `stop_query` yields `EventStatus::Ready`,
    //    `host_a` is not involved in any other operation.
    unsafe {
        device_a
            .async_copy_to(&mut host_a, &stream)
            .expect("Could not copy from device to host!");
    }

    stop_event
        .record(&stream)
        .expect("Failed to record stop_event in the CUDA stream!");
    // All stream work was only *enqueued*; this measures the CPU time spent
    // issuing CUDA calls, not the GPU execution time.
    let cpu_time: u128 = start.elapsed().as_micros();

    // Busy-poll the stop event: demonstrates that the CPU is free to do other
    // work (here, counting iterations) while the GPU drains the stream.
    let mut counter: u64 = 0;
    while stop_event.query() != Ok(EventStatus::Ready) {
        counter += 1;
    }

    // GPU-side elapsed time between the two recorded events.
    let gpu_time: u128 = stop_event
        .elapsed(&start_event)
        .expect("Failed to calculate duration of GPU operations!")
        .as_micros();

    println!("Time spent executing by the GPU: {gpu_time} microseconds");
    println!("Time spent by CPU in CUDA calls: {cpu_time} microseconds");
    println!("CPU executed {counter} iterations while waiting for GPU to finish.");

    assert!(correct_output(host_a.as_slice(), value));

    // Stream is synchronized as a safety measure
    stream.synchronize().expect("Stream couldn't synchronize!");

    println!("test PASSED");
    Ok(())

    // The events and the memory buffers are automatically dropped here.
}