-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkernel_matvec.zig
More file actions
49 lines (46 loc) · 1.39 KB
/
kernel_matvec.zig
File metadata and controls
49 lines (46 loc) · 1.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
// examples/kernel/2_Matrix/kernel_matvec.zig — Matrix-vector multiplication y = A*x
//
// Reference: cuda-samples/0_Introduction/matrixMul (adapted for Mx1 output)
// API exercised: globalThreadIdx, __fmaf_rn, gridStrideLoop
const cuda = @import("zcuda_kernel");
/// Dense matrix-vector product: y = A × x.
/// A is M×N stored row-major, x has N entries, y has M entries.
/// Each grid-stride iteration produces one output row, accumulating
/// the dot product with fused multiply-add (`__fmaf_rn`).
export fn matvec(
    A: [*]const f32,
    x: [*]const f32,
    y: [*]f32,
    M: u32,
    N: u32,
) callconv(.kernel) void {
    var rows = cuda.types.gridStrideLoop(M);
    while (rows.next()) |r| {
        // Hoist the row's base offset out of the inner loop.
        const base = r * N;
        var acc: f32 = 0.0;
        var col: u32 = 0;
        while (col < N) : (col += 1) {
            acc = cuda.__fmaf_rn(A[base + col], x[col], acc);
        }
        y[r] = acc;
    }
}
/// Banded (sparse-like) matrix-vector product: y = A × x with explicit
/// column indices. Row r of A_vals holds `bandwidth` stored values;
/// col_idx gives, for each stored value, the column of x it multiplies.
/// One grid-stride iteration handles one output row.
export fn matvecBanded(
    A_vals: [*]const f32,
    col_idx: [*]const u32,
    x: [*]const f32,
    y: [*]f32,
    M: u32,
    bandwidth: u32,
) callconv(.kernel) void {
    var rows = cuda.types.gridStrideLoop(M);
    while (rows.next()) |r| {
        // Values and indices for this row start at the same base offset.
        const base = r * bandwidth;
        var acc: f32 = 0.0;
        var k: u32 = 0;
        while (k < bandwidth) : (k += 1) {
            // Gather from x through the stored column index, then FMA.
            acc = cuda.__fmaf_rn(A_vals[base + k], x[col_idx[base + k]], acc);
        }
        y[r] = acc;
    }
}