-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathallreduce.d2
146 lines (125 loc) · 5.16 KB
/
allreduce.d2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# Plugin layer. The container is pinned to the top-center of the diagram.
plugins: {
  near: top-center

  # ncclPlugin/allreducePlugin.cpp, laid out as a single-row grid.
  ncclPlugin.allreducePlugin\.cpp: {
    grid-rows: 1

    AllreducePlugin\:\:enqueue -> selectImplementation
  }
}
# Connections from AllreducePlugin::enqueue to the two all-reduce back ends.
# The "either" / "or" labels mark them as alternatives.
plugins.ncclPlugin.allreducePlugin\.cpp.AllreducePlugin\:\:enqueue -> ncclAllReduce: either {
  # Open question: how is NCCL still fused with LayerNorm?
  style.stroke: red
}
plugins.ncclPlugin.allreducePlugin\.cpp.AllreducePlugin\:\:enqueue -> kernels.customAllReduceKernels\.cu.customAllReduce: or

# Connections from selectImplementation to the helpers it points at.
plugins.ncclPlugin.allreducePlugin\.cpp.selectImplementation -> common.customAllReduceUtils\.h.getMaxRequiredWorkspaceSize
plugins.ncclPlugin.allreducePlugin\.cpp.selectImplementation -> kernels.customAllReduceKernels\.cu.configurationSupported
# Python side: ../../tensorrt_llm/plugin/plugin.py.
\.\./\.\./tensorrt_llm: {
  plugin.plugin\.py: {
    # Drawn as a class box with one method row.
    PluginConfig: {
      shape: class

      set_nccl_plugin(dtype): self
    }

    PluginConfig -> init_all_reduce_helper: set_nccl_plugin

    # Dashed connection from the init function to the helper class.
    init_all_reduce_helper -> CustomAllReduceHelper: {
      style.stroke-dash: 6
    }

    # Drawn as a class box with two method rows.
    CustomAllReduceHelper: {
      shape: class

      max_workspace_size_auto(tp_size): <size>
      allocate_workspace(mapping, size): (buffers, …)
    }
  }
}
# Dashed black connection from the Python helper class to the C++ workspace-size
# function, labelled with the Python method name max_workspace_size_auto.
\.\./\.\./tensorrt_llm.plugin.plugin\.py.CustomAllReduceHelper -> common.customAllReduceUtils\.h.getMaxRequiredWorkspaceSize: max_workspace_size_auto {
  style.stroke: black
  style.stroke-dash: 6
}
# common/customAllReduceUtils.h and the single function shown from it.
common.customAllReduceUtils\.h.getMaxRequiredWorkspaceSize
# kernels/customAllReduceKernels.cu: connections starting at customAllReduce.
# Statement order is kept as-is because it influences the rendered layout.
kernels.customAllReduceKernels\.cu: {
  # --- Entry point and dispatch chain ---------------------------------------
  customAllReduce -> AllReduceDispatchType
  customAllReduce -> configurationSupported: {
    style.stroke: gray
  }
  AllReduceDispatchType -> AllReduceDispatchRanksPerNode -> AllReduceDispatchPushMode -> AllReduceDispatchMemcpy
  AllReduceDispatchMemcpy -> AllReduceDispatch: standalone
  AllReduceDispatchMemcpy -> AllReduceNormDispatch: fused

  # --- Standalone all-reduce path --------------------------------------------
  # Nodes with an orange stroke and lightgoldenrodyellow fill share one highlight style.
  AllReduceDispatch -> oneShotAllReduceKernel: either
  oneShotAllReduceKernel.style: {
    stroke: orange
    fill: lightgoldenrodyellow
  }
  AllReduceDispatch -> kernelLaunchConfig
  AllReduceDispatch -> twoShotAllReduceKernel: or
  twoShotAllReduceKernel.style: {
    stroke: orange
    fill: lightgoldenrodyellow
  }
  AllReduceNormDispatch -> AllReduceNormKernelLaunch

  # --- Barriers pointed at by the one-shot / two-shot kernels -----------------
  oneShotAllReduceKernel -> block_barrier: either
  block_barrier.style: {
    stroke: orange
    fill: lightgoldenrodyellow
  }
  oneShotAllReduceKernel -> multi_gpu_barrier: or
  multi_gpu_barrier.style: {
    stroke: orange
    fill: lightgoldenrodyellow
  }
  twoShotAllReduceKernel -> multi_gpu_barrier: or
  twoShotAllReduceKernel -> block_barrier: either

  # --- Fused all-reduce + norm path -------------------------------------------
  AllReduceNormKernelLaunch -> kernelLaunchConfig: or
  AllReduceNormKernelLaunch -> twoShotAllReduceKernel: or
  AllReduceNormKernelLaunch -> rms_norm_kernel_launcher: or
  AllReduceNormKernelLaunch -> one_shot_all_reduce_norm_kernel_launcher: either
  rms_norm_kernel_launcher -> rms_norm_kernel
  rms_norm_kernel.style: {
    stroke: orange
    fill: lightgoldenrodyellow
  }
  one_shot_all_reduce_norm_kernel_launcher -> one_shot_all_reduce_norm_kernel: or
  one_shot_all_reduce_norm_kernel.style: {
    stroke: orange
    fill: lightgoldenrodyellow
  }

  # --- Lamport-style branch: nodes and connections drawn in blueviolet --------
  one_shot_all_reduce_norm_kernel_launcher -> is_lamport_supported: either {
    style.stroke: blueviolet
  }
  one_shot_all_reduce_norm_kernel_launcher -> lamport_style_one_shot_all_reduce_norm_kernel_launcher: either {
    style.stroke: blueviolet
  }
  is_lamport_supported: {
    style.stroke: blueviolet
  }
  lamport_style_one_shot_all_reduce_norm_kernel_launcher: {
    style.stroke: blueviolet
  }

  one_shot_all_reduce_norm_kernel -> block_barrier
  one_shot_all_reduce_norm_kernel -> rms_norm
  rms_norm.style: {
    stroke: orange
    fill: lightgoldenrodyellow
  }

  lamport_style_one_shot_all_reduce_norm_kernel_launcher -> heuristic_min_warp_number: {
    style.stroke: blueviolet
  }
  lamport_style_one_shot_all_reduce_norm_kernel_launcher -> lamport_style_one_shot_all_reduce_norm_kernel: {
    style.stroke: blueviolet
  }
  heuristic_min_warp_number: {
    style.stroke: blueviolet
  }
  lamport_style_one_shot_all_reduce_norm_kernel.style: {
    stroke: blueviolet
    fill: violet
  }

  # Layout helper only: this fully transparent connection carries no meaning
  # and exists purely to influence node placement.
  heuristic_min_warp_number -> is_lamport_supported: {
    style.opacity: 0
  }
}