Changes from all commits (33 commits)
f519b26
Extend cases supported by the explicit copy transformations
ThrudPrimrose Jan 6, 2026
e483a25
Refactor, use views+
ThrudPrimrose Jan 6, 2026
4d7156f
Refactor
ThrudPrimrose Jan 6, 2026
5b3adae
Prep
ThrudPrimrose Jan 6, 2026
09e29e6
Refactor
ThrudPrimrose Jan 6, 2026
14de39e
Merge branch 'explicit-gpu-global-copies' into explicit-streams
ThrudPrimrose Jan 6, 2026
95fc7ec
Add things
ThrudPrimrose Jan 6, 2026
d31158e
Add
ThrudPrimrose Jan 7, 2026
b94dcb0
Add tests
ThrudPrimrose Jan 7, 2026
8a754c3
Extensions
ThrudPrimrose Jan 7, 2026
dad01d3
Fix bug
ThrudPrimrose Jan 7, 2026
551ffaa
Merge branch 'explicit-gpu-global-copies' into explicit-streams
ThrudPrimrose Jan 7, 2026
db71ff2
Fix
ThrudPrimrose Jan 7, 2026
e38016c
Fix
ThrudPrimrose Jan 7, 2026
831724d
Merge branch 'explicit-gpu-global-copies' into explicit-streams
ThrudPrimrose Jan 7, 2026
944db27
Check for GPU outputs in current stream generation
ThrudPrimrose Jan 7, 2026
c77ea55
Fix cpp codegen
ThrudPrimrose Jan 7, 2026
4b52240
Merge branch 'explicit-gpu-global-copies' into explicit-streams
ThrudPrimrose Jan 7, 2026
e29ca86
Refactor
ThrudPrimrose Jan 7, 2026
425d652
refactor
ThrudPrimrose Jan 9, 2026
0ed6406
Precommit
ThrudPrimrose Jan 9, 2026
70141b9
Merge branch 'explicit-gpu-global-copies' into explicit-streams
ThrudPrimrose Jan 9, 2026
bdf51a2
Fix dace current stream name conflict for old codegen compat
ThrudPrimrose Jan 9, 2026
1dac422
Merge branch 'main' into explicit-gpu-global-copies
ThrudPrimrose Jan 11, 2026
faf8ed0
Merge branch 'explicit-gpu-global-copies' into explicit-streams
ThrudPrimrose Jan 11, 2026
f762d5f
Merge branch 'main' into explicit-gpu-global-copies
ThrudPrimrose Jan 23, 2026
925a3e5
Merge
ThrudPrimrose Jan 23, 2026
02b4182
Merge branch 'main' into explicit-gpu-global-copies
ThrudPrimrose Jan 23, 2026
5ae9ca0
Merge branch 'explicit-gpu-global-copies' into explicit-streams
ThrudPrimrose Jan 23, 2026
c88c993
Fix to gpu stream dtype
ThrudPrimrose Jan 24, 2026
bca117c
Add dtype
ThrudPrimrose Jan 24, 2026
9dd1605
Rm gpu def
ThrudPrimrose Jan 24, 2026
a3ddb2d
Rm gpu def
ThrudPrimrose Jan 24, 2026
3 changes: 2 additions & 1 deletion dace/codegen/dispatcher.py
@@ -98,7 +98,8 @@ def add(self, name: str, dtype: DefinedType, ctype: str, ancestor: int = 0, allo
for _, scope, can_access_parent in reversed(self._scopes):
if name in scope:
err_str = "Shadowing variable {} from type {} to {}".format(name, scope[name], dtype)
if (allow_shadowing or config.Config.get_bool("compiler", "allow_shadowing")):
if (allow_shadowing or config.Config.get_bool("compiler", "allow_shadowing")
or dtype == DefinedType.GPUStream):
if not allow_shadowing:
print("WARNING: " + err_str)
else:
16 changes: 10 additions & 6 deletions dace/codegen/targets/cpp.py
Original file line number Diff line number Diff line change
@@ -218,14 +218,17 @@ def memlet_copy_to_absolute_strides(dispatcher: 'TargetDispatcher',

def is_cuda_codegen_in_device(framecode) -> bool:
"""
Check the state of the CUDA code generator, whether it is inside device code.
Check whether the (Experimental) CUDA code generator is currently inside device code.
"""
from dace.codegen.targets.cuda import CUDACodeGen

cudaClass = CUDACodeGen

if framecode is None:
cuda_codegen_in_device = False
else:
for codegen in framecode.targets:
if isinstance(codegen, CUDACodeGen):
if isinstance(codegen, cudaClass):
cuda_codegen_in_device = codegen._in_device_code
break
else:
@@ -248,11 +251,9 @@ def ptr(name: str, desc: data.Data, sdfg: SDFG = None, framecode: 'DaCeCodeGener
root = name.split('.')[0]
if root in sdfg.arrays and isinstance(sdfg.arrays[root], data.Structure):
name = name.replace('.', '->')

# Special case: If memory is persistent and defined in this SDFG, add state
# struct to name
if (desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External)):

if desc.storage == dtypes.StorageType.CPU_ThreadLocal: # Use unambiguous name for thread-local arrays
return f'__{sdfg.cfg_id}_{name}'
elif not is_cuda_codegen_in_device(framecode): # GPU kernels cannot access state
@@ -807,9 +808,12 @@ def unparse_cr(sdfg, wcr_ast, dtype)
def connected_to_gpu_memory(node: nodes.Node, state: SDFGState, sdfg: SDFG):
for e in state.all_edges(node):
path = state.memlet_path(e)
if ((isinstance(path[0].src, nodes.AccessNode)
and path[0].src.desc(sdfg).storage is dtypes.StorageType.GPU_Global)):
if (((isinstance(path[0].src, nodes.AccessNode)
and path[0].src.desc(sdfg).storage is dtypes.StorageType.GPU_Global))
or ((isinstance(path[-1].dst, nodes.AccessNode)
and path[-1].dst.desc(sdfg).storage is dtypes.StorageType.GPU_Global))):
return True

return False


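The extended connected_to_gpu_memory check now also fires when the destination of a memlet path is in GPU global memory, not only the source. A minimal sketch of the new behavior, assuming the function location shown above (array names are illustrative):

```python
import dace
from dace.codegen.targets.cpp import connected_to_gpu_memory

sdfg = dace.SDFG('h2d_copy')
sdfg.add_array('hA', [32], dace.float64)  # default (host) storage
sdfg.add_array('gA', [32], dace.float64, storage=dace.StorageType.GPU_Global)

state = sdfg.add_state()
host = state.add_access('hA')
dev = state.add_access('gA')
state.add_nedge(host, dev, dace.Memlet('hA[0:32]'))  # host-to-device copy

# Previously only the path source was checked; the host-side node is now also
# recognized as connected to GPU global memory through its destination.
assert connected_to_gpu_memory(host, state, sdfg)
```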
28 changes: 23 additions & 5 deletions dace/codegen/targets/cpu.py
@@ -502,6 +502,14 @@ def allocate_array(self,

return
elif (nodedesc.storage == dtypes.StorageType.Register):
# This assignment is necessary to unify the explicitly inserted GPU streams with the
# streams declared through the state struct of the SDFG.
if nodedesc.dtype == dtypes.gpuStream_t:
ctype = dtypes.gpuStream_t.ctype
allocation_stream.write(f"{ctype}* {name} = __state->gpu_context->streams;")
define_var(name, DefinedType.Pointer, ctype)
return

ctypedef = dtypes.pointer(nodedesc.dtype).ctype
if nodedesc.start_offset != 0:
raise NotImplementedError('Start offset unsupported for registers')
@@ -577,6 +585,9 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap

if isinstance(nodedesc, (data.Scalar, data.View, data.Stream, data.Reference)):
return
elif nodedesc.dtype == dtypes.gpuStream_t:
callsite_stream.write(f"{alloc_name} = nullptr;")
return
elif (nodedesc.storage == dtypes.StorageType.CPU_Heap
or (nodedesc.storage == dtypes.StorageType.Register and
(symbolic.issymbolic(arrsize, sdfg.constants) or
@@ -994,6 +1005,11 @@ def process_out_memlets(self,
dst_edge = dfg.memlet_path(edge)[-1]
dst_node = dst_edge.dst

if isinstance(dst_node, nodes.AccessNode) and dst_node.desc(state).dtype == dtypes.gpuStream_t:
# Special case: GPU streams do not represent data flow - they assign GPU streams to kernels/tasks.
# Thus, nothing needs to be written, and out memlets of this kind should be ignored.
continue

# Target is neither a data nor a tasklet node
if isinstance(node, nodes.AccessNode) and (not isinstance(dst_node, nodes.AccessNode)
and not isinstance(dst_node, nodes.CodeNode)):
@@ -1035,8 +1051,7 @@ def process_out_memlets(self,
# Tasklet -> array with a memlet. Writing to array is emitted only if the memlet is not empty
if isinstance(node, nodes.CodeNode) and not edge.data.is_empty():
if not uconn:
raise SyntaxError("Cannot copy memlet without a local connector: {} to {}".format(
str(edge.src), str(edge.dst)))
return

conntype = node.out_connectors[uconn]
is_scalar = not isinstance(conntype, dtypes.pointer)
@@ -1254,7 +1269,6 @@ def memlet_definition(self,
# Dynamic WCR memlets start uninitialized
result += "{} {};".format(memlet_type, local_name)
defined = DefinedType.Scalar

else:
if not memlet.dynamic:
if is_scalar:
@@ -1289,8 +1303,12 @@ def memlet_definition(self,
memlet_type = ctypedef
result += "{} &{} = {};".format(memlet_type, local_name, expr)
defined = DefinedType.Stream
else:
raise TypeError("Unknown variable type: {}".format(var_type))

# Set the defined type for GPU stream connectors.
# Shadowing of the stream variable needs to be allowed.
if memlet_type == 'gpuStream_t':
var_type = DefinedType.GPUStream
defined = DefinedType.GPUStream

if defined is not None:
self._dispatcher.defined_vars.add(local_name, defined, memlet_type, allow_shadowing=allow_shadowing)
4 changes: 2 additions & 2 deletions dace/config_schema.yml
@@ -268,7 +268,7 @@ required:
type: str
title: Arguments
description: Compiler argument flags
default: '-fPIC -Wall -Wextra -O3 -march=native -ffast-math -Wno-unused-parameter -Wno-unused-label'
default: '-fopenmp -fPIC -Wall -Wextra -O3 -march=native -ffast-math -Wno-unused-parameter -Wno-unused-label'
default_Windows: '/O2 /fp:fast /arch:AVX2 /D_USRDLL /D_WINDLL /D__restrict__=__restrict'

libs:
@@ -326,7 +326,7 @@ required:
Additional CUDA architectures (separated by commas)
to compile GPU code for, excluding the current
architecture on the compiling machine.
default: '60'
default: '86'

hip_arch:
type: str
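Both defaults remain overridable through the configuration API; for example, a user targeting an older card can reset cuda_arch at runtime (a minimal sketch; the key path follows the schema above):

```python
from dace.config import Config

# Override the new sm_86 default for additional CUDA architectures,
# e.g. when compiling for an sm_70 (V100) machine.
Config.set('compiler', 'cuda', 'cuda_arch', value='70')
print(Config.get('compiler', 'cuda', 'cuda_arch'))
```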
18 changes: 16 additions & 2 deletions dace/dtypes.py
@@ -71,6 +71,7 @@ class ScheduleType(AutoNumberEnum):
GPU_ThreadBlock = () #: Thread-block code
GPU_ThreadBlock_Dynamic = () #: Allows rescheduling work within a block
GPU_Persistent = ()
GPU_Warp = ()

Snitch = ()
Snitch_Multicore = ()
@@ -84,6 +85,11 @@ class ScheduleType(AutoNumberEnum):
ScheduleType.GPU_Persistent,
]

# A subset of GPU schedule types for ExperimentalCUDACodeGen
EXPERIMENTAL_GPU_SCHEDULES = [
ScheduleType.GPU_Warp,
]

# A subset of CPU schedule types
CPU_SCHEDULES = [
ScheduleType.CPU_Multicore,
@@ -95,6 +101,8 @@ class ScheduleType(AutoNumberEnum):
StorageType.GPU_Shared,
]

GPU_KERNEL_ACCESSIBLE_STORAGES = [StorageType.GPU_Global, StorageType.GPU_Shared, StorageType.Register]


@undefined_safe_enum
class ReductionType(AutoNumberEnum):
@@ -192,7 +200,8 @@ class TilingType(AutoNumberEnum):
ScheduleType.GPU_ThreadBlock: StorageType.Register,
ScheduleType.GPU_ThreadBlock_Dynamic: StorageType.Register,
ScheduleType.SVE_Map: StorageType.CPU_Heap,
ScheduleType.Snitch: StorageType.Snitch_TCDM
ScheduleType.Snitch: StorageType.Snitch_TCDM,
ScheduleType.GPU_Warp: StorageType.Register,
}

# Maps from ScheduleType to default ScheduleType for sub-scopes
@@ -207,9 +216,10 @@ class TilingType(AutoNumberEnum):
ScheduleType.GPU_Device: ScheduleType.GPU_ThreadBlock,
ScheduleType.GPU_ThreadBlock: ScheduleType.Sequential,
ScheduleType.GPU_ThreadBlock_Dynamic: ScheduleType.Sequential,
ScheduleType.GPU_Warp: ScheduleType.Sequential,
ScheduleType.SVE_Map: ScheduleType.Sequential,
ScheduleType.Snitch: ScheduleType.Snitch,
ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore
ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore,
}

# Maps from StorageType to a preferred ScheduleType for helping determine schedules.
@@ -1240,6 +1250,7 @@ class string(_DaCeArray, npt.NDArray[numpy.str_]): ...
class vector(_DaCeArray, npt.NDArray[numpy.void]): ...
class MPI_Request(_DaCeArray, npt.NDArray[numpy.void]): ...
class float32sr(_DaCeArray, npt.NDArray[numpy.float32]): ...
class gpuStream_t(_DaCeArray, npt.NDArray[numpy.void]): ...
# yapf: enable
else:
# Runtime definitions
@@ -1260,6 +1271,7 @@ class float32sr(_DaCeArray, npt.NDArray[numpy.float32]): ...
complex128 = typeclass(numpy.complex128)
string = stringtype()
MPI_Request = opaque('MPI_Request')
gpuStream_t = opaque('gpuStream_t')
float32sr = Float32sr()


@@ -1281,6 +1293,7 @@ class Typeclasses(AutoNumberEnum):
float64 = float64
complex64 = complex64
complex128 = complex128
gpuStream_t = gpuStream_t


_bool = bool
@@ -1508,6 +1521,7 @@ def can_access(schedule: ScheduleType, storage: StorageType):
ScheduleType.GPU_Persistent,
ScheduleType.GPU_ThreadBlock,
ScheduleType.GPU_ThreadBlock_Dynamic,
ScheduleType.GPU_Warp,
]:
return storage in [StorageType.GPU_Global, StorageType.GPU_Shared, StorageType.CPU_Pinned]
elif schedule in [ScheduleType.Default, ScheduleType.CPU_Multicore, ScheduleType.CPU_Persistent]:
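A small sketch of how the new schedule type and opaque stream dtype behave (the array name is illustrative):

```python
import dace
from dace import dtypes

# GPU_Warp is treated like the other GPU schedules for storage accessibility.
assert dtypes.can_access(dtypes.ScheduleType.GPU_Warp, dtypes.StorageType.GPU_Shared)
assert not dtypes.can_access(dtypes.ScheduleType.GPU_Warp, dtypes.StorageType.CPU_Heap)

# gpuStream_t is an opaque typeclass and can be used as the dtype of a
# transient stream array, as the codegen changes in this PR expect.
sdfg = dace.SDFG('stream_dtype_example')
sdfg.add_array('gpu_streams', [4], dtypes.gpuStream_t,
               storage=dace.StorageType.Register, transient=True)
print(sdfg.arrays['gpu_streams'].dtype)
```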
7 changes: 7 additions & 0 deletions dace/sdfg/state.py
@@ -406,6 +406,13 @@ def memlet_path(self, edge: MultiConnectorEdge[mm.Memlet]) -> List[MultiConnecto
if (edge.src_conn is None and edge.dst_conn is None and edge.data.is_empty()):
return result

# With the explicit (new) GPU stream handling we can have dynamic out connectors, e.g.,
# KernelExit: stream -> None: AccessNode, where the AccessNode accesses a stream array.
# Memlets are used here, but they do not describe how data flows.
if (isinstance(edge.src, nd.MapExit) and edge.src.map.schedule == dtypes.ScheduleType.GPU_Device
and isinstance(edge.dst, nd.AccessNode) and edge.dst.desc(state).dtype == dtypes.gpuStream_t):
return result

# Prepend incoming edges until reaching the source node
curedge = edge
visited = set()
11 changes: 8 additions & 3 deletions dace/sdfg/validation.py
@@ -858,9 +858,14 @@ def validate_state(state: 'dace.sdfg.SDFGState',
for oe in state.out_edges(dst_node)}):
pass
else:
raise InvalidSDFGEdgeError(
f"Memlet creates an invalid path (sink node {dst_node}"
" should be a data node)", sdfg, state_id, eid)
if isinstance(dst_node, nd.Tasklet) and len(dst_node.in_connectors) == 0 and len(
dst_node.out_connectors) == 0:
# Tasklets with no input or output connector -> sync tasklet -> OK
pass
else:
raise InvalidSDFGEdgeError(
f"Memlet creates an invalid path (sink node {dst_node}"
" should be a data node)", sdfg, state_id, eid)
# If scope(dst) is disjoint from scope(src), it's an illegal memlet
else:
raise InvalidSDFGEdgeError("Illegal memlet between disjoint scopes", sdfg, state_id, eid)
30 changes: 30 additions & 0 deletions dace/transformation/helpers.py
@@ -1959,3 +1959,33 @@ def _is_pointer(obj) -> bool:
def _is_structure_view(obj) -> bool:
"""Check if object is a StructureView."""
return isinstance(obj, data.StructureView)


def is_within_schedule_types(state: SDFGState, node: nodes.Node, schedules: Set[dtypes.ScheduleType]) -> bool:
"""
Checks if the given node is enclosed within a Map whose schedule type
matches any in the `schedules` set.
Parameters
----------
state : SDFGState
The State where the node resides
node : nodes.Node
The node to check.
schedules : set[dtypes.ScheduleType]
A set of schedule types to match (e.g., {dtypes.ScheduleType.GPU_Device}).
Returns
----------
bool
True if the node is enclosed by a Map with a schedule type in `schedules`, False otherwise.
"""
current = node

while current is not None:
if isinstance(current, nodes.MapEntry):
if current.map.schedule in schedules:
return True

parent = get_parent_map(state, current)
if parent is None:
return False
current, state = parent
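A minimal usage sketch of the new helper (the SDFG construction is illustrative):

```python
import dace
from dace import dtypes
from dace.transformation.helpers import is_within_schedule_types

sdfg = dace.SDFG('enclosure_check')
state = sdfg.add_state()

# A GPU kernel map enclosing an (empty) tasklet
kernel_entry, kernel_exit = state.add_map('kernel', dict(i='0:32'),
                                          schedule=dtypes.ScheduleType.GPU_Device)
tasklet = state.add_tasklet('work', {}, {}, '')
state.add_nedge(kernel_entry, tasklet, dace.Memlet())
state.add_nedge(tasklet, kernel_exit, dace.Memlet())

assert is_within_schedule_types(state, tasklet, {dtypes.ScheduleType.GPU_Device})
assert not is_within_schedule_types(state, tasklet, {dtypes.ScheduleType.CPU_Multicore})
```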
1 change: 1 addition & 0 deletions dace/transformation/passes/gpu_specialization/__init__.py
@@ -0,0 +1 @@
# Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved.
@@ -0,0 +1,69 @@
# Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved.
from typing import Any, Dict, Set, Type, Union

import dace
from dace import dtypes, properties, SDFG
from dace.codegen import common
from dace.config import Config
from dace.sdfg import nodes
from dace.transformation import pass_pipeline as ppl, transformation
from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler
from dace.transformation.passes.gpu_specialization.insert_gpu_streams import InsertGPUStreams, get_gpu_stream_array_name, get_gpu_stream_connector_name


@properties.make_properties
@transformation.explicit_cf_compatible
class ConnectGPUStreamsToKernels(ppl.Pass):
"""
This pass attaches GPU streams to kernels (i.e., maps scheduled with dtypes.ScheduleType.GPU_Device).

It adds GPU stream AccessNodes and connects them to kernel entry and exit nodes,
indicating which GPU stream each kernel is assigned to. These assignments are used,
for example, when launching the kernels.
"""

def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]:
return {NaiveGPUStreamScheduler, InsertGPUStreams}

def modifies(self) -> ppl.Modifies:
return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets

def should_reapply(self, modified: ppl.Modifies) -> bool:
return False

def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]):
# Retrieve the GPU stream array name and the prefix for individual stream variables
stream_array_name = get_gpu_stream_array_name()
stream_var_name_prefix = get_gpu_stream_connector_name()

# Retrieve GPU stream assignments for nodes
stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler']

# Link kernels to their assigned GPU streams
for sub_sdfg in sdfg.all_sdfgs_recursive():

for state in sub_sdfg.states():
for node in state.nodes():

# Not a kernel entry - continue
if not (isinstance(node, nodes.MapEntry) and node.map.schedule == dtypes.ScheduleType.GPU_Device):
continue

# Stream connector name and the GPU stream assigned to the kernel
assigned_gpustream = stream_assignments[node]
gpu_stream_var_name = f"{stream_var_name_prefix}{assigned_gpustream}"
accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]"

# Assign the GPU stream to the kernel entry
kernel_entry = node
kernel_entry.add_in_connector(gpu_stream_var_name, dtypes.gpuStream_t)
stream_array_in = state.add_access(stream_array_name)
state.add_edge(stream_array_in, None, kernel_entry, gpu_stream_var_name,
dace.Memlet(accessed_gpu_stream))

# Assign the GPU stream to the kernel exit
kernel_exit = state.exit_node(kernel_entry)
stream_array_out = state.add_access(stream_array_name)
state.add_edge(kernel_exit, None, stream_array_out, None, dace.Memlet(None))

return {}
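A sketch of how this pass composes with its declared dependencies in a pipeline. The module path of ConnectGPUStreamsToKernels is inferred from this diff (the new file name is not shown in this view), and the SDFG is a placeholder:

```python
import dace
from dace.transformation import pass_pipeline as ppl
from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler
from dace.transformation.passes.gpu_specialization.insert_gpu_streams import InsertGPUStreams
# Assumed module name for the pass defined above:
from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels

# A minimal SDFG with one GPU kernel (illustrative)
sdfg = dace.SDFG('kernel_streams')
sdfg.add_array('A', [64], dace.float64, storage=dace.StorageType.GPU_Global)
state = sdfg.add_state()
state.add_mapped_tasklet('kernel', dict(i='0:64'),
                         dict(inp=dace.Memlet('A[i]')), 'out = inp + 1',
                         dict(out=dace.Memlet('A[i]')),
                         schedule=dace.ScheduleType.GPU_Device, external_edges=True)

# Run stream scheduling, stream insertion, and kernel attachment in dependency order
pipeline = ppl.Pipeline([NaiveGPUStreamScheduler(), InsertGPUStreams(), ConnectGPUStreamsToKernels()])
pipeline.apply_pass(sdfg, {})
```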