Skip to content

Latest commit

Β 

History

History
177 lines (132 loc) · 5.9 KB

File metadata and controls

177 lines (132 loc) · 5.9 KB

Driver Module

Device management, memory allocation, kernel launch, streams, events, and graphs.

Import: const driver = @import("zcuda").driver;

CudaContext

Entry point for all CUDA operations. Manages the device, streams, memory, and modules.

Creation & Lifecycle

fn new(ordinal: usize) !*CudaContext;    // Create context on device N
fn deinit(self: *const Self) void;       // Release context
fn bindToThread(self) !void;             // Bind context to current thread

Device Info

fn deviceCount() !i32;                           // Number of CUDA devices
fn name(self) []const u8;                         // Device name
fn uuid(self) !CUuuid;                            // Device UUID
fn computeCapability(self) !struct{major, minor}; // SM version
fn totalMem(self) !usize;                         // Total memory (bytes)
fn attribute(self, attr) !i32;                     // Query device attribute
fn getOrdinal(self) usize;                         // Device ordinal index

Memory Info & Limits

fn freeMem(self) !usize;                          // Free memory (bytes)
fn memInfo(self) !struct{free, total};             // Free/total memory
fn getLimit(self, limit) !usize;                   // Query context limit
fn setLimit(self, limit, value) !void;             // Set context limit
fn getCacheConfig(self) !CUfunc_cache;             // L1/shared preference
fn setCacheConfig(self, config) !void;             // Set L1/shared preference
fn setBlockingSynchronize(self) !void;             // Enable blocking sync
fn synchronize(self) !void;                        // Synchronize context

Stream & Module Management

fn defaultStream(self) *const CudaStream;          // Default stream
fn newStream(self) !CudaStream;                    // Create non-blocking stream
fn loadModule(self, ptx) !CudaModule;              // Load PTX module
fn createEvent(self, flags) !CudaEvent;            // Create event
fn allocManaged(self, T, len) !CudaSlice(T);       // Unified memory

CudaStream

Asynchronous execution stream for memory operations and kernel launches.

Memory Operations

fn alloc(T, allocator, n) !CudaSlice(T);          // Allocate device memory
fn allocZeros(T, allocator, n) !CudaSlice(T);     // Allocate + zero-fill
fn cloneHtoD(T, host_slice) !CudaSlice(T);        // Host → Device copy
fn memcpyHtoD(T, dst, src) !void;                  // Copy host → device
fn memcpyDtoH(T, dst, src) !void;                  // Copy device → host
fn cloneDtoH(T, allocator, src) ![]T;              // Clone device → new host buffer
fn memcpyDtoD(T, dst, src) !void;                  // Copy device → device
fn memcpyHtoDAsync(T, dst, src) !void;             // Async host → device
fn memcpyDtoHAsync(T, dst, src) !void;             // Async device → host
fn memcpyDtoDAsync(T, dst, src) !void;             // Async device → device

Kernel Launch

fn launch(func, config, args) !void;               // Launch kernel

Synchronization & Events

fn synchronize(self) !void;                        // Wait for all operations
fn waitEvent(self, event) !void;                    // Wait for event
fn query(self) !bool;                              // Non-blocking completion check
fn createEvent(self, flags) !CudaEvent;            // Create event
fn recordEvent(self, event) !void;                 // Record event

Unified Memory & Graph Capture

fn prefetchAsync(T, slice) !void;                  // Prefetch to device
fn beginCapture(self) !void;                       // Begin graph capture
fn endCapture(self) !?CudaGraph;                   // End capture → executable graph
fn captureStatus(self) !CUstreamCaptureStatus;     // Query capture status

CudaSlice(T)

Typed, owning device memory (analogous to Rust's Vec<T>, but on the GPU).

fn deinit(self) void;                              // Free device memory
fn slice(self, start, end) CudaView(T);            // Immutable sub-view
fn sliceMut(self, start, end) CudaViewMut(T);      // Mutable sub-view
fn devicePtr(self) DevicePtr(T);                    // Get typed device pointer

CudaView(T) / CudaViewMut(T)

Non-owning views into device memory (analogous to []const T / []T).

fn devicePtr(self) DevicePtr(T);                    // Get typed device pointer
fn subView(self, start, end) Self;                  // Create sub-view

CudaModule / CudaFunction

// CudaModule
fn deinit(self) void;                              // Unload module
fn getFunction(self, name) !CudaFunction;          // Get kernel by name

// CudaFunction
fn getAttribute(self, attrib) !i32;                // Query function attribute

CudaEvent

fn deinit(self) void;                              // Destroy event
fn record(self, stream) !void;                     // Record on stream
fn synchronize(self) !void;                        // Wait for event
fn elapsedTime(start, end) !f32;                   // Milliseconds between events
fn query(self) !bool;                              // Non-blocking completion check

CudaGraph

fn launch(self) !void;                             // Replay recorded graph
fn deinit(self) void;                              // Destroy graph

Shared Types

const Dim3 = struct { x: u32 = 1, y: u32 = 1, z: u32 = 1 };

const LaunchConfig = struct {
    grid_dim: Dim3,
    block_dim: Dim3,
    shared_mem_bytes: u32,

    fn forNumElems(n: u32) LaunchConfig;            // Auto-configure for N elements
    fn forNumElemsCustom(n: u32, tpb: u32) LaunchConfig;
};

const DevicePtr = fn(T: type) struct { ptr: usize };

Example

const cuda = @import("zcuda");

const ctx = try cuda.driver.CudaContext.new(0);
defer ctx.deinit();

const stream = ctx.defaultStream();
const data = try stream.cloneHtoD(f32, &[_]f32{ 1.0, 2.0, 3.0 });
defer data.deinit();

var result: [3]f32 = undefined;
try stream.memcpyDtoH(f32, &result, data);