Device management, memory allocation, kernel launch, streams, events, and graphs.
Import: const driver = @import("zcuda").driver;
Entry point for all CUDA operations. Manages the device, streams, memory, and modules.
fn new(ordinal: usize) !*CudaContext; // Create context on device N
fn deinit(self: *const Self) void; // Release context
fn bindToThread(self) !void; // Bind context to current thread
fn deviceCount() !i32; // Number of CUDA devices
fn name(self) []const u8; // Device name
fn uuid(self) !CUuuid; // Device UUID
fn computeCapability(self) !struct{major, minor}; // SM version
fn totalMem(self) !usize; // Total memory (bytes)
fn attribute(self, attr) !i32; // Query device attribute
fn getOrdinal(self) usize; // Device ordinal index
fn freeMem(self) !usize; // Free memory (bytes)
fn memInfo(self) !struct{free, total}; // Free/total memory
fn getLimit(self, limit) !usize; // Query context limit
fn setLimit(self, limit, value) !void; // Set context limit
fn getCacheConfig(self) !CUfunc_cache; // L1/shared preference
fn setCacheConfig(self, config) !void; // Set L1/shared preference
fn setBlockingSynchronize(self) !void; // Enable blocking sync
fn synchronize(self) !void; // Synchronize context
fn defaultStream(self) *const CudaStream; // Default stream
fn newStream(self) !CudaStream; // Create non-blocking stream
fn loadModule(self, ptx) !CudaModule; // Load PTX module
fn createEvent(self, flags) !CudaEvent; // Create event
fn allocManaged(self, T, len) !CudaSlice(T); // Unified memory

Asynchronous execution stream for memory operations and kernel launches.
fn alloc(T, allocator, n) !CudaSlice(T); // Allocate device memory
fn allocZeros(T, allocator, n) !CudaSlice(T); // Allocate + zero-fill
fn cloneHtoD(T, host_slice) !CudaSlice(T); // Host → Device copy
fn memcpyHtoD(T, dst, src) !void; // Copy host → device
fn memcpyDtoH(T, dst, src) !void; // Copy device → host
fn cloneDtoH(T, allocator, src) ![]T; // Clone device → new host buf
fn memcpyDtoD(T, dst, src) !void; // Copy device → device
fn memcpyHtoDAsync(T, dst, src) !void; // Async host → device
fn memcpyDtoHAsync(T, dst, src) !void; // Async device → host
fn memcpyDtoDAsync(T, dst, src) !void; // Async device → device
fn launch(func, config, args) !void; // Launch kernel
fn synchronize(self) !void; // Wait for all operations
fn waitEvent(self, event) !void; // Wait for event
fn query(self) !bool; // Non-blocking completion check
fn createEvent(self, flags) !CudaEvent; // Create event
fn recordEvent(self, event) !void; // Record event
fn prefetchAsync(T, slice) !void; // Prefetch to device
fn beginCapture(self) !void; // Begin graph capture
fn endCapture(self) !?CudaGraph; // End capture → executable graph
fn captureStatus(self) !CUstreamCaptureStatus; // Query capture status

Typed, owning device memory (analogous to Vec<T> on GPU).
fn deinit(self) void; // Free device memory
fn slice(self, start, end) CudaView(T); // Immutable sub-view
fn sliceMut(self, start, end) CudaViewMut(T); // Mutable sub-view
fn devicePtr(self) DevicePtr(T); // Get typed device pointer

Non-owning views into device memory (analogous to []const T / []T).
fn devicePtr(self) DevicePtr(T); // Get typed device pointer
fn subView(self, start, end) Self; // Create sub-view

// CudaModule
fn deinit(self) void; // Unload module
fn getFunction(self, name) !CudaFunction; // Get kernel by name
// CudaFunction
fn getAttribute(self, attrib) !i32; // Query function attribute

fn deinit(self) void; // Destroy event
fn record(self, stream) !void; // Record on stream
fn synchronize(self) !void; // Wait for event
fn elapsedTime(start, end) !f32; // Milliseconds between events
fn query(self) !bool; // Non-blocking completion check

fn launch(self) !void; // Replay recorded graph
fn deinit(self) void; // Destroy graph

const Dim3 = struct { x: u32 = 1, y: u32 = 1, z: u32 = 1 };
const LaunchConfig = struct {
grid_dim: Dim3,
block_dim: Dim3,
shared_mem_bytes: u32,
fn forNumElems(n: u32) LaunchConfig; // Auto-configure for N elements
fn forNumElemsCustom(n: u32, tpb: u32) LaunchConfig;
};
const DevicePtr = fn(T: type) struct { ptr: usize };

const cuda = @import("zcuda");
const ctx = try cuda.driver.CudaContext.new(0);
defer ctx.deinit();
const stream = ctx.defaultStream();
const data = try stream.cloneHtoD(f32, &[_]f32{ 1.0, 2.0, 3.0 });
defer data.deinit();
var result: [3]f32 = undefined;
try stream.memcpyDtoH(f32, &result, data);