-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprofiling.zig
More file actions
113 lines (92 loc) · 3.71 KB
/
profiling.zig
File metadata and controls
113 lines (92 loc) · 3.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
/// NVTX Profiling Annotations Example
///
/// Demonstrates NVIDIA Tools Extension markers for profiling:
/// 1. Range push/pop for named code sections
/// 2. ScopedRange for RAII-style automatic pop
/// 3. Domain-based isolation
/// 4. Instant markers for events
///
/// Visible in NVIDIA Nsight Systems (nsys profile ./profiling)
///
/// Reference: NVTX documentation
const std = @import("std");
const cuda = @import("zcuda");
// CUDA C source for the demo kernel, kept as a Zig multi-line string so that
// main() can hand it to cuda.nvrtc.compilePtx at runtime.
//
// `extern "C"` keeps the symbol name unmangled so it can be looked up as
// "compute". Each thread derives a flat index from blockIdx/blockDim/threadIdx,
// skips out-of-range indices (i >= n), and otherwise replaces data[i] with the
// result of applying val = sinf(val) + cosf(val) 100 times.
const kernel_src =
\\extern "C" __global__ void compute(float *data, int n) {
\\ int i = blockIdx.x * blockDim.x + threadIdx.x;
\\ if (i < n) {
\\ float val = data[i];
\\ for (int j = 0; j < 100; j++) {
\\ val = sinf(val) + cosf(val);
\\ }
\\ data[i] = val;
\\ }
\\}
;
/// Entry point of the NVTX profiling example.
///
/// Creates a CUDA context on device 0, compiles `kernel_src` with NVRTC,
/// uploads a host buffer, launches the `compute` kernel five times, copies the
/// results back, and finally creates an NVTX domain. Each phase is wrapped in
/// an NVTX range or marker.
///
/// Returns any error propagated by the `try` calls below (context creation,
/// compilation, module loading, transfers, launches, synchronization).
pub fn main() !void {
    const allocator = std.heap.page_allocator;

    std.debug.print("=== NVTX Profiling Example ===\n\n", .{});
    std.debug.print("Run with: nsys profile ./zig-out/bin/nvtx-profiling\n\n", .{});

    // NOTE(review): the "β" glyphs in the status strings below look like
    // mis-decoded UTF-8 symbols. They are kept byte-for-byte here; confirm the
    // intended characters against the original file before changing them.

    // --- Setup ---
    // This range is popped manually instead of via `defer`: the context, PTX
    // and module created inside it are released by function-scope defers and
    // must outlive the range, so the range cannot own a nested scope.
    cuda.nvtx.rangePush("CUDA Setup");
    const ctx = try cuda.driver.CudaContext.new(0);
    defer ctx.deinit();
    std.debug.print("Device: {s}\n", .{ctx.name()});
    const stream = ctx.defaultStream();

    // Compile kernel
    const ptx = try cuda.nvrtc.compilePtx(allocator, kernel_src);
    defer allocator.free(ptx);
    const module = try ctx.loadModule(ptx);
    defer module.deinit();
    const kernel = try module.getFunction("compute");
    cuda.nvtx.rangePop();
    std.debug.print("β Setup complete (NVTX range: CUDA Setup)\n", .{});

    // --- Using ScopedRange for automatic cleanup ---
    {
        // The range stays open until the end of this block, so the markers and
        // ranges created below are emitted while it is still active.
        const range = cuda.nvtx.ScopedRange.init("Data Preparation");
        defer range.deinit();

        // Single source of truth for the element count: the host buffer, the
        // launch configuration and the kernel argument are all derived from it.
        const n: usize = 100_000;
        var h_data: [n]f32 = undefined;
        for (&h_data, 0..) |*v, i| {
            v.* = @as(f32, @floatFromInt(i)) * 0.001;
        }
        std.debug.print("β Data prepared (NVTX: Data Preparation)\n", .{});

        // --- Mark important events ---
        cuda.nvtx.mark("HtoD Transfer Start");
        const d_data = try stream.cloneHtoD(f32, &h_data);
        defer d_data.deinit();
        cuda.nvtx.mark("HtoD Transfer End");
        std.debug.print("β Data transferred to GPU\n", .{});

        // --- Annotate kernel execution ---
        {
            const kernel_range = cuda.nvtx.ScopedRange.init("Kernel Execution");
            defer kernel_range.deinit();

            const config = cuda.LaunchConfig.forNumElems(@intCast(n));
            const n_i32: i32 = @intCast(n);

            // Multiple iterations
            for (0..5) |_| {
                cuda.nvtx.rangePush("Iteration");
                // Pop via defer so the range stack stays balanced even when
                // the launch or the synchronize below returns an error.
                defer cuda.nvtx.rangePop();

                try stream.launch(kernel, config, .{ &d_data, n_i32 });
                // Synchronize inside the range so each "Iteration" range is
                // not closed before the stream has been drained.
                try stream.synchronize();
            }
            std.debug.print("β Kernel executed 5 iterations (NVTX: Kernel Execution)\n", .{});
        }

        // --- DtoH transfer ---
        {
            cuda.nvtx.rangePush("DtoH Transfer");
            defer cuda.nvtx.rangePop();

            try stream.memcpyDtoH(f32, &h_data, d_data);
            // Whether memcpyDtoH blocks is not visible from this file, so
            // drain the stream before the host reads h_data. If the copy is
            // already blocking this is a cheap no-op.
            try stream.synchronize();
        }
        std.debug.print("β Results transferred back\n", .{});
        std.debug.print(" First 3 results: {d:.4} {d:.4} {d:.4}\n", .{
            h_data[0], h_data[1], h_data[2],
        });
    }

    // --- Domain-based profiling ---
    std.debug.print("\nβββ Domain Isolation βββ\n", .{});
    // The domain is only created and destroyed here; no ranges or markers are
    // emitted into it in this example.
    const domain = cuda.nvtx.Domain.create("zcuda_example");
    defer domain.destroy();
    std.debug.print("β Created domain: 'zcuda_example'\n", .{});

    std.debug.print("\nβ NVTX profiling example complete\n", .{});
    std.debug.print(" Tip: Use 'nsys profile' to visualize the annotations\n", .{});
}