-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkernel_softmax.zig
More file actions
100 lines (85 loc) Β· 3.87 KB
/
kernel_softmax.zig
File metadata and controls
100 lines (85 loc) Β· 3.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
// kernels/softmax.zig — Numerically stable softmax using shared memory + warp shuffle
//
// Features: SharedArray, warp shuffle, two-pass (warp + block) reduction, block-stride row loop
//
// Algorithm (per row):
// 1. Find max value (for numerical stability)
// 2. Compute sum of exp(x - max)
// 3. Normalize: output[i] = exp(x[i] - max) / sum
//
// Each block processes one row of the input matrix [rows × cols].
const cuda = @import("zcuda_kernel");
const smem = cuda.shared_mem;
const BLOCK_SIZE = 256;
/// Row-wise numerically stable softmax:
///   output[r*cols + c] = exp(input[r*cols + c] - max_r) / sum_c exp(input[r*cols + c] - max_r)
/// One block handles one row; each thread strides over the row's columns in
/// steps of BLOCK_SIZE. Launch with gridDim.x >= rows and blockDim.x == BLOCK_SIZE.
export fn softmax(
    input: [*]const f32,
    output: [*]f32,
    rows: u32,
    cols: u32,
) callconv(.kernel) void {
    const row = cuda.blockIdx().x;
    if (row >= rows) return; // over-provisioned grid: surplus blocks do nothing
    const tid = cuda.threadIdx().x;
    const row_offset = row * cols;

    // Shared scratch: one partial per warp during the block reduction; slot 0
    // doubles as the broadcast slot for the final max / sum.
    const sdata = smem.SharedArray(f32, BLOCK_SIZE);
    const s = sdata.ptr();

    // -- Pass 1: row maximum (subtracted below so __expf cannot overflow) --
    var max_val: f32 = -3.40282347e+38; // -FLT_MAX: identity element for fmaxf
    var i = tid;
    while (i < cols) : (i += BLOCK_SIZE) {
        max_val = cuda.fmaxf(max_val, input[row_offset + i]);
    }
    // Intra-warp shuffle tree: lane 0 of each warp ends up with the warp max.
    max_val = cuda.fmaxf(max_val, @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(max_val), 16, 32)));
    max_val = cuda.fmaxf(max_val, @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(max_val), 8, 32)));
    max_val = cuda.fmaxf(max_val, @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(max_val), 4, 32)));
    max_val = cuda.fmaxf(max_val, @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(max_val), 2, 32)));
    max_val = cuda.fmaxf(max_val, @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(max_val), 1, 32)));
    // Inter-warp step: each warp leader parks its partial in shared memory.
    if (tid % cuda.warpSize == 0) {
        s[tid / cuda.warpSize] = max_val;
    }
    cuda.__syncthreads();
    if (tid < cuda.warpSize) {
        // Only BLOCK_SIZE / warpSize (= 8) partials exist, so the shuffle
        // tree starts at offset 4.
        max_val = if (tid < (BLOCK_SIZE / cuda.warpSize)) s[tid] else -3.40282347e+38;
        max_val = cuda.fmaxf(max_val, @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(max_val), 4, 32)));
        max_val = cuda.fmaxf(max_val, @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(max_val), 2, 32)));
        max_val = cuda.fmaxf(max_val, @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(max_val), 1, 32)));
    }
    if (tid == 0) s[0] = max_val;
    cuda.__syncthreads();
    max_val = s[0]; // broadcast the row max to every thread
    // FIX: barrier before s[] is reused for the sum reduction. Without it, a
    // fast warp leader could overwrite s[0..7] in pass 2 while slower threads
    // are still reading the max above (write-after-read hazard).
    cuda.__syncthreads();

    // -- Pass 2: sum of exp(x - max) --
    var exp_sum: f32 = 0.0;
    i = tid;
    while (i < cols) : (i += BLOCK_SIZE) {
        exp_sum += cuda.__expf(input[row_offset + i] - max_val);
    }
    // Same warp + block reduction shape as pass 1, with + in place of fmaxf.
    exp_sum += @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(exp_sum), 16, 32));
    exp_sum += @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(exp_sum), 8, 32));
    exp_sum += @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(exp_sum), 4, 32));
    exp_sum += @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(exp_sum), 2, 32));
    exp_sum += @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(exp_sum), 1, 32));
    if (tid % cuda.warpSize == 0) {
        s[tid / cuda.warpSize] = exp_sum;
    }
    cuda.__syncthreads();
    if (tid < cuda.warpSize) {
        exp_sum = if (tid < (BLOCK_SIZE / cuda.warpSize)) s[tid] else 0.0; // 0 is the identity for +
        exp_sum += @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(exp_sum), 4, 32));
        exp_sum += @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(exp_sum), 2, 32));
        exp_sum += @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(exp_sum), 1, 32));
    }
    if (tid == 0) s[0] = exp_sum;
    cuda.__syncthreads();
    // FIX: the old code computed rsqrtf(s[0]*s[0]) as a roundabout reciprocal
    // and then discarded it, dividing per element instead. Take the reciprocal
    // directly and multiply: one divide per row instead of one per element.
    const inv_sum = 1.0 / s[0];

    // -- Pass 3: normalize --
    i = tid;
    while (i < cols) : (i += BLOCK_SIZE) {
        output[row_offset + i] = cuda.__expf(input[row_offset + i] - max_val) * inv_sum;
    }
}