553 changes: 553 additions & 0 deletions dace/codegen/targets/gpu_helpers/copy_strategies.py

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions dace/codegen/targets/gpu_helpers/gpu_utils.py
@@ -0,0 +1,27 @@
# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
from dace import Config
from dace.codegen import common


def generate_sync_debug_call() -> str:
"""
Generate backend synchronization and error-check calls as a string if
synchronous debugging is enabled. The backend API prefix (e.g., 'cuda')
is determined via ``common.get_gpu_backend()``.

Returns
-------
str
The generated debug call code, or an empty string if debugging is disabled.
"""
backend: str = common.get_gpu_backend()
sync_call: str = ""
if Config.get_bool('compiler', 'cuda', 'syncdebug'):
sync_call = (f"DACE_GPU_CHECK({backend}GetLastError());\n"
f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n")

return sync_call
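For illustration, with the CUDA backend and the `compiler.cuda.syncdebug` option enabled, the returned string consists of the following two lines (a sketch of the expected output):

    DACE_GPU_CHECK(cudaGetLastError());
    DACE_GPU_CHECK(cudaDeviceSynchronize());

With `syncdebug` disabled, an empty string is returned, so call sites can append the result unconditionally.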
11 changes: 11 additions & 0 deletions dace/config_schema.yml
@@ -465,6 +465,17 @@ required:
will raise an exception if such a Memlet is encountered. This allows the user
to have full control over all Maps in the SDFG.

# New configs, needed for new CUDACodeGen
gpu_stream_name:
type: str
title: Name for the GPU stream object
description: >
GPU streams allow GPU operations, such as kernel execution or memory transfers, to run asynchronously
and in parallel. This field specifies the naming convention for the GPU stream array and its connectors
in the SDFG. For example: 'gpu_streams,gpu_stream' means 'gpu_streams' is the array containing the
stream objects, and 'gpu_stream0' (prefix derived from the second name + stream id) is used as a
connector for gpu_streams[0].
default: gpu_streams,gpu_stream
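As an illustrative sketch (not part of the schema itself), the stream-management passes below split this value into the stream-array name and the connector-name prefix:

    from dace.config import Config

    # 'gpu_streams,gpu_stream' -> array name and connector-name prefix
    stream_array_name, stream_var_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')
    connector_name = f"{stream_var_prefix}0"     # 'gpu_stream0'
    accessed_stream = f"{stream_array_name}[0]"  # 'gpu_streams[0]'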

#############################################
# General FPGA flags
14 changes: 14 additions & 0 deletions dace/dtypes.py
@@ -87,6 +87,18 @@ class ScheduleType(aenum.AutoNumberEnum):
ScheduleType.GPU_Persistent,
]

# A subset of GPU schedule types for the new GPU backend
GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN = [
ScheduleType.GPU_Device,
ScheduleType.GPU_ThreadBlock,
]

# A subset of on-GPU storage types for the new GPU backend
GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN = [
StorageType.GPU_Global,
StorageType.GPU_Shared,
]

# A subset of CPU schedule types
CPU_SCHEDULES = [
ScheduleType.CPU_Multicore,
@@ -1266,6 +1278,7 @@ def isconstant(var):
complex128 = typeclass(numpy.complex128)
string = stringtype()
MPI_Request = opaque('MPI_Request')
gpuStream_t = opaque('gpuStream_t')


@undefined_safe_enum
@@ -1286,6 +1299,7 @@ class Typeclasses(aenum.AutoNumberEnum):
float64 = float64
complex64 = complex64
complex128 = complex128
gpuStream_t = gpuStream_t


_bool = bool
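The opaque `gpuStream_t` type added above is what the stream-management passes in this PR use for the GPU stream array and its connectors. A minimal sketch of declaring such a transient array on an SDFG (assuming the standard `add_array` API; the actual InsertGPUStreamsToSDFGs pass may set this up differently):

    import dace
    from dace import dtypes

    sdfg = dace.SDFG('example')
    # A transient array of opaque GPU stream handles, one entry per concurrent stream
    sdfg.add_array('gpu_streams', [4], dtypes.gpuStream_t, transient=True)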
7 changes: 7 additions & 0 deletions dace/sdfg/state.py
@@ -405,6 +405,13 @@ def memlet_path(self, edge: MultiConnectorEdge[mm.Memlet]) -> List[MultiConnecto
if (edge.src_conn is None and edge.dst_conn is None and edge.data.is_empty()):
return result

# For GPU stream (i.e., cudaStream/hipStream) management, dynamic out connectors may occur, e.g.
# a (GPU_Device-scheduled) MapExit: stream -> None: AccessNode, where the AccessNode accesses a stream array.
# Such memlets express stream assignment rather than data flow, so no data path is traced for them.
if (isinstance(edge.src, nd.MapExit) and edge.src.map.schedule == dtypes.ScheduleType.GPU_Device
        and isinstance(edge.dst, nd.AccessNode) and edge.dst.desc(self).dtype == dtypes.gpuStream_t):
return result

# Prepend incoming edges until reaching the source node
curedge = edge
visited = set()
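For reference, the edge shape that this early return targets is the one produced by the stream-connection passes later in this PR, roughly as follows (a sketch using the default names from the configuration above):

    # kernel_exit: MapExit of a GPU_Device-scheduled map with out-connector 'gpu_stream0'
    # streams:     AccessNode over the 'gpu_streams' array, whose dtype is gpuStream_t
    state.add_edge(kernel_exit, 'gpu_stream0', streams, None, dace.Memlet('gpu_streams[0]'))
    # memlet_path() stops at such an edge instead of tracing it as a data path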
32 changes: 32 additions & 0 deletions dace/transformation/helpers.py
@@ -1552,6 +1552,38 @@ def get_parent_map(state: SDFGState, node: Optional[nodes.Node] = None) -> Optio
return None


def is_within_schedule_types(state: SDFGState, node: nodes.Node, schedules: Set[dtypes.ScheduleType]) -> bool:
"""
Checks if the given node is enclosed within a Map whose schedule type
matches any in the `schedules` set.

Parameters
----------
state : SDFGState
The state in which the node resides.
node : nodes.Node
The node to check.
schedules : set[dtypes.ScheduleType]
A set of schedule types to match (e.g., {dtypes.ScheduleType.GPU_Device}).

Returns
-------
bool
True if the node is enclosed by a Map with a schedule type in `schedules`, False otherwise.
"""
current = node

while current is not None:
if isinstance(current, nodes.MapEntry):
if current.map.schedule in schedules:
return True

parent = get_parent_map(state, current)
if parent is None:
return False
current, state = parent

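A brief usage sketch (the `state` and `node` variables are hypothetical), checking whether a node is nested inside a GPU kernel map:

    if is_within_schedule_types(state, node, {dtypes.ScheduleType.GPU_Device}):
        # The node is enclosed (directly or transitively) by a GPU_Device-scheduled map
        ...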

def redirect_edge(state: SDFGState,
edge: graph.MultiConnectorEdge[Memlet],
new_src: Optional[nodes.Node] = None,
@@ -0,0 +1,70 @@
# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
from typing import Any, Dict, Set, Type, Union

import dace
from dace import dtypes, properties, SDFG
from dace.codegen import common
from dace.config import Config
from dace.sdfg import nodes
from dace.transformation import pass_pipeline as ppl, transformation
from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler
from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs


@properties.make_properties
@transformation.explicit_cf_compatible
class ConnectGPUStreamsToKernels(ppl.Pass):
"""
This Pass attaches GPU streams to kernels (i.e., dtypes.ScheduleType.GPU_Device scheduled maps).

Adds GPU stream AccessNodes and connects them to kernel entry and exit nodes,
indicating which GPU stream each kernel is assigned to. These assignments are used,
for example, when launching the kernels.
"""

def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]:
return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs}

def modifies(self) -> ppl.Modifies:
return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets

def should_reapply(self, modified: ppl.Modifies) -> bool:
return False

def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]):
# Retrieve the GPU stream array name and the prefix for individual stream variables
stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')

# Retrieve GPU stream assignments for nodes
stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler']

# Link kernels to their assigned GPU streams
for sub_sdfg in sdfg.all_sdfgs_recursive():

for state in sub_sdfg.states():
for node in state.nodes():

# Not a kernel entry - continue
if not (isinstance(node, nodes.MapEntry) and node.map.schedule == dtypes.ScheduleType.GPU_Device):
continue

# Connector name and the GPU stream assigned to the kernel
assigned_gpustream = stream_assignments[node]
gpu_stream_var_name = f"{stream_var_name_prefix}{assigned_gpustream}"
accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]"

# Assign the GPU stream to the kernel entry
kernel_entry = node
kernel_entry.add_in_connector(gpu_stream_var_name, dtypes.gpuStream_t)
stream_array_in = state.add_access(stream_array_name)
state.add_edge(stream_array_in, None, kernel_entry, gpu_stream_var_name,
dace.Memlet(accessed_gpu_stream))

# Assign the GPU stream to the kernel exit
kernel_exit = state.exit_node(kernel_entry)
kernel_exit.add_out_connector(gpu_stream_var_name, dtypes.gpuStream_t)
stream_array_out = state.add_access(stream_array_name)
state.add_edge(kernel_exit, gpu_stream_var_name, stream_array_out, None,
dace.Memlet(accessed_gpu_stream))

return {}
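A sketch of applying the pass together with its dependencies through the standard pass pipeline, given an SDFG `sdfg` that has already been prepared for the GPU (assuming the usual `Pipeline` API and that the listed passes need no constructor arguments):

    from dace.transformation.pass_pipeline import Pipeline

    pipeline = Pipeline([NaiveGPUStreamScheduler(), InsertGPUStreamsToSDFGs(), ConnectGPUStreamsToKernels()])
    results = {}
    pipeline.apply_pass(sdfg, results)
    # results['NaiveGPUStreamScheduler'] holds the node -> GPU stream assignments read above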
@@ -0,0 +1,80 @@
# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
from typing import Any, Dict, Set, Type, Union

import dace
from dace import dtypes, properties, SDFG
from dace.config import Config
from dace.sdfg import nodes
from dace.transformation import pass_pipeline as ppl, transformation
from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler
from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs
from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels

# Placeholder for the GPU stream variable used in tasklet code
STREAM_PLACEHOLDER = "__dace_current_stream"


@properties.make_properties
@transformation.explicit_cf_compatible
class ConnectGPUStreamsToTasklets(ppl.Pass):
"""
This pass ensures that tasklets which require access to their assigned GPU stream
are provided with it explicitly.

Such tasklets typically originate from expanded LibraryNodes targeting GPUs.
These nodes may reference the special placeholder variable `__dace_current_stream`,
which is expected to be defined during unparsing in `cpp.py`.

To avoid relying on this "hidden" mechanism, the pass rewrites tasklets to use
the GPU stream AccessNode directly.

Note that this pass is similar to `ConnectGPUStreamsToKernels`.
"""

def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]:
return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels}

def modifies(self) -> ppl.Modifies:
return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets

def should_reapply(self, modified: ppl.Modifies) -> bool:
return False

def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]):
# Retrieve the GPU stream's array name
stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0]

# Retrieve GPU stream assignments for nodes
stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler']

# Find all tasklets which use the GPU stream variable (STREAM_PLACEHOLDER) in the code
# and provide them with the needed GPU stream explicitly
for sub_sdfg in sdfg.all_sdfgs_recursive():

for state in sub_sdfg.states():
for node in state.nodes():

# Not a tasklet - continue
if not isinstance(node, nodes.Tasklet):
continue

# Tasklet does not use its assigned GPU stream - continue
if STREAM_PLACEHOLDER not in node.code.as_string:
continue

# Connector name and the GPU stream assigned to the tasklet
assigned_gpustream = stream_assignments[node]
gpu_stream_conn = STREAM_PLACEHOLDER
accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]"

# Provide the GPU stream explicitly to the tasklet
stream_array_in = state.add_access(stream_array_name)
stream_array_out = state.add_access(stream_array_name)

node.add_in_connector(gpu_stream_conn, dtypes.gpuStream_t)
node.add_out_connector(gpu_stream_conn, dtypes.gpuStream_t, force=True)

state.add_edge(stream_array_in, None, node, gpu_stream_conn, dace.Memlet(accessed_gpu_stream))
state.add_edge(node, gpu_stream_conn, stream_array_out, None, dace.Memlet(accessed_gpu_stream))

return {}
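For illustration, a tasklet produced by a GPU library-node expansion might contain code such as the following (hypothetical snippet, shown only to illustrate the placeholder):

    cublasSetStream(handle, __dace_current_stream);

After this pass runs, the tasklet carries a `__dace_current_stream` in/out connector of type `gpuStream_t`, fed from `gpu_streams[<assigned stream>]` by the added edges, so the placeholder resolves to an explicit value in the SDFG rather than a name injected during unparsing in `cpp.py`.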