-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvector_add.zig
More file actions
110 lines (87 loc) · 3.59 KB
/
vector_add.zig
File metadata and controls
110 lines (87 loc) · 3.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
/// Vector Addition Example
///
/// The "Hello World" of CUDA programming. Demonstrates:
/// 1. NVRTC runtime compilation of CUDA C++ to PTX
/// 2. Device memory allocation (alloc, allocZeros, cloneHtoD)
/// 3. Kernel launch with grid/block configuration
/// 4. Device-to-host data transfer and result verification
///
/// Reference: cuda-samples/vectorAdd, cudarc/01-allocate + 02-copy + 03-launch-kernel
const std = @import("std");
const cuda = @import("zcuda");
const kernel_src =
\\extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, int n) {
\\ int i = blockIdx.x * blockDim.x + threadIdx.x;
\\ if (i < n) {
\\ C[i] = A[i] + B[i];
\\ }
\\}
;
pub fn main() !void {
const allocator = std.heap.page_allocator;
std.debug.print("=== Vector Addition Example ===\n\n", .{});
// --- Device setup ---
const ctx = try cuda.driver.CudaContext.new(0);
defer ctx.deinit();
std.debug.print("Device: {s}\n", .{ctx.name()});
const stream = ctx.defaultStream();
// --- Compile CUDA kernel at runtime ---
std.debug.print("Compiling CUDA kernel via NVRTC...\n", .{});
const ptx = try cuda.nvrtc.compilePtx(allocator, kernel_src);
defer allocator.free(ptx);
const module = try ctx.loadModule(ptx);
defer module.deinit();
const kernel = try module.getFunction("vectorAdd");
std.debug.print("✓ Kernel compiled and loaded\n\n", .{});
// --- Prepare host data ---
const n: usize = 50000;
const n_i32: i32 = @intCast(n);
var h_A: [50000]f32 = undefined;
var h_B: [50000]f32 = undefined;
for (&h_A, &h_B, 0..) |*a, *b, i| {
a.* = @as(f32, @floatFromInt(i)) * 0.001;
b.* = @as(f32, @floatFromInt(n - i)) * 0.001;
}
std.debug.print("Host data prepared: {} elements\n", .{n});
std.debug.print(" A[0..5] = ", .{});
for (h_A[0..5]) |v| std.debug.print("{d:.3} ", .{v});
std.debug.print("\n B[0..5] = ", .{});
for (h_B[0..5]) |v| std.debug.print("{d:.3} ", .{v});
std.debug.print("\n\n", .{});
// --- Allocate device memory and copy data ---
const d_A = try stream.cloneHtoD(f32, &h_A);
defer d_A.deinit();
const d_B = try stream.cloneHtoD(f32, &h_B);
defer d_B.deinit();
const d_C = try stream.allocZeros(f32, allocator, n);
defer d_C.deinit();
std.debug.print("Device memory allocated and copied\n", .{});
// --- Launch kernel ---
const config = cuda.LaunchConfig.forNumElems(@intCast(n));
std.debug.print("Launch config: grid({},{},{}), block({},{},{})\n", .{
config.grid_dim.x, config.grid_dim.y, config.grid_dim.z,
config.block_dim.x, config.block_dim.y, config.block_dim.z,
});
try stream.launch(kernel, config, .{ &d_A, &d_B, &d_C, n_i32 });
try stream.synchronize();
std.debug.print("✓ Kernel executed\n\n", .{});
// --- Copy results back and verify ---
var h_C: [50000]f32 = undefined;
try stream.memcpyDtoH(f32, &h_C, d_C);
std.debug.print("Verifying results...\n", .{});
var max_error: f32 = 0.0;
for (&h_A, &h_B, &h_C) |a, b, c| {
const expected = a + b;
const err = @abs(c - expected);
max_error = @max(max_error, err);
}
std.debug.print(" Max error: {e}\n", .{max_error});
std.debug.print(" C[0..5] = ", .{});
for (h_C[0..5]) |v| std.debug.print("{d:.3} ", .{v});
std.debug.print("\n", .{});
if (max_error > 1e-5) {
std.debug.print("\n✗ FAILED: error too large\n", .{});
return error.ValidationFailed;
}
std.debug.print("\n✓ Vector addition successful! ({} elements verified)\n", .{n});
}