// NOTE(review): removed GitHub web-UI scrape artifacts (page chrome and the
// copied line-number gutter) that preceded the actual source below.
// examples/kernel/9_Advanced/kernel_async_copy_pipeline.zig — Multi-stage async pipeline
//
// Reference: cuda-samples/3_CUDA_Features/globalToShmemAsyncCopy
// API exercised: SharedArray, __syncthreads, gridStrideLoop
//
// Note: True cp.async requires sm_80+ PTX. This demonstrates the
// multi-stage pipeline pattern using synchronous copies as a baseline,
// which will be upgraded to cp.async when building for sm_80+.
const cuda = @import("zcuda_kernel");
const smem = cuda.shared_mem;
// Threads per block; each iteration processes one BLOCK_SIZE-wide tile.
const BLOCK_SIZE = 128;
// Pipeline depth. 2 = double-buffered: one stage loads while the other computes.
// Wired into the shared-memory layout below; the XOR buffer flip requires it to be 2.
const STAGES = 2;
/// Double-buffered pipeline: overlap global memory loads with computation.
/// Stage S loads data while stage (S^1) computes, maximizing memory/compute overlap.
///
/// Applies y = scale * x + bias element-wise over `input[0..n]` into `output`.
/// `input` and `output` must each hold at least `n` f32 elements.
export fn pipelinedTransform(
    input: [*]const f32,
    output: [*]f32,
    scale: f32,
    bias: f32,
    n: u32,
) callconv(.kernel) void {
    // The `stage & 1` / `cur ^ 1` flip below is only valid for a 2-deep pipeline;
    // deeper pipelines would also need a deeper prologue. Fail loudly if STAGES drifts.
    comptime {
        if (STAGES != 2) @compileError("pipelinedTransform requires STAGES == 2");
    }
    // Combined multi-stage buffer in shared memory — one allocation avoids Zig
    // comptime type aliasing between the per-stage views.
    const combined = smem.SharedArray(f32, BLOCK_SIZE * STAGES);
    const buffers: [STAGES][*]f32 = .{ combined.ptr(), combined.ptr() + BLOCK_SIZE };
    const tid = cuda.threadIdx().x;
    // Overflow-safe ceiling division: the usual (n + BLOCK_SIZE - 1) / BLOCK_SIZE
    // wraps (and traps in safe builds) when n is within BLOCK_SIZE - 1 of maxInt(u32).
    const blocks_total = n / BLOCK_SIZE + @intFromBool(n % BLOCK_SIZE != 0);
    var block_iter: u32 = cuda.blockIdx().x;
    // Pipeline prologue: fill stage 0 so the first loop iteration has data to consume.
    // Out-of-range lanes load 0.0 so the whole block can barrier uniformly.
    if (block_iter < blocks_total) {
        const gid = block_iter * BLOCK_SIZE + tid;
        buffers[0][tid] = if (gid < n) input[gid] else 0.0;
    }
    cuda.__syncthreads();
    var stage: u32 = 0;
    while (block_iter < blocks_total) : (block_iter += cuda.gridDim().x) {
        const cur = stage & 1;
        const nxt = cur ^ 1;
        // Prefetch the next tile into the alternate buffer while this tile computes.
        const next_block = block_iter + cuda.gridDim().x;
        if (next_block < blocks_total) {
            const next_gid = next_block * BLOCK_SIZE + tid;
            buffers[nxt][tid] = if (next_gid < n) input[next_gid] else 0.0;
        }
        // Compute on the current buffer: y = scale * x + bias (fused multiply-add).
        const gid = block_iter * BLOCK_SIZE + tid;
        if (gid < n) {
            output[gid] = cuda.__fmaf_rn(scale, buffers[cur][tid], bias);
        }
        // Barrier: prefetched writes to buffers[nxt] must be visible before the next
        // iteration reads that buffer as buffers[cur], and all reads of buffers[cur]
        // must finish before it is overwritten as the following iteration's buffers[nxt].
        cuda.__syncthreads();
        stage += 1;
    }
}