From e327b1ecf73fdb130467cac11c6e8baa668c1db8 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Thu, 8 May 2025 14:04:51 +0200 Subject: [PATCH 01/94] Start of ExperimentalCUDACodeGen impementation --- .gitignore | 4 + berkay_workpace/berkay_testbed.ipynb | 591 ++++++ berkay_workpace/berkay_testbed.py | 33 + berkay_workpace/berkay_testbed2.ipynb | 99 + berkay_workpace/playfield.py | 45 + berkay_workpace/report.py | 27 + berkay_workpace/test.py | 335 ++++ dace/codegen/CMakeLists.txt | 3 +- dace/codegen/targets/__init__.py | 1 + dace/codegen/targets/experimental_cuda.py | 2230 +++++++++++++++++++++ dace/config_schema.yml | 9 + dace/dtypes.py | 16 +- dace/registry.py | 9 + 13 files changed, 3399 insertions(+), 3 deletions(-) create mode 100644 berkay_workpace/berkay_testbed.ipynb create mode 100644 berkay_workpace/berkay_testbed.py create mode 100644 berkay_workpace/berkay_testbed2.ipynb create mode 100644 berkay_workpace/playfield.py create mode 100644 berkay_workpace/report.py create mode 100644 berkay_workpace/test.py create mode 100644 dace/codegen/targets/experimental_cuda.py diff --git a/.gitignore b/.gitignore index 5a54e2df44..8ad4ff5649 100644 --- a/.gitignore +++ b/.gitignore @@ -195,3 +195,7 @@ _build/ # Ignoring the test junk _all_tests/ + + +# Ignore my python environment +dace_env/ diff --git a/berkay_workpace/berkay_testbed.ipynb b/berkay_workpace/berkay_testbed.ipynb new file mode 100644 index 0000000000..00e2a7fb5a --- /dev/null +++ b/berkay_workpace/berkay_testbed.ipynb @@ -0,0 +1,591 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "experimental\n" + ] + } + ], + "source": [ + "import dace\n", + "import random\n", + "import cupy as cp\n", + "\n", + "from dace import registry\n", + "from dace.sdfg.scope import ScopeSubgraphView\n", + "from dace.codegen.prettycode import CodeIOStream\n", + "from dace.codegen.targets.target import TargetCodeGenerator\n", + "from dace.codegen.targets.framecode import DaCeCodeGenerator\n", + "from dace.codegen.targets.cpp import sym2cpp\n", + "from IPython.display import Code\n", + "from dace.config import Config\n", + "\n", + "\n", + "print(Config.get('compiler', 'cuda', 'implementation'))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (warpLevel)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@dace.program\n", + "def warpLevel(A: dace.float64[512] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[512] @ dace.dtypes.StorageType.GPU_Global):\n", + " for i in dace.map[0:512:512] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " for j in dace.map[0:512] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", + " for wi in dace.map[0:16] @ dace.dtypes.ScheduleType.GPU_Warp:\n", + " A[wi] = A[wi] + 1\n", + "\n", + "sdfg = warpLevel.to_sdfg()\n", + "sdfg" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
#include <cuda_runtime.h>\n",
+       "#include <dace/dace.h>\n",
+       "\n",
+       "\n",
+       "struct warpLevel_state_t {\n",
+       "    dace::cuda::Context *gpu_context;\n",
+       "};\n",
+       "\n",
+       "\n",
+       "\n",
+       "DACE_EXPORTED int __dace_init_experimental_cuda(warpLevel_state_t *__state);\n",
+       "DACE_EXPORTED int __dace_exit_experimental_cuda(warpLevel_state_t *__state);\n",
+       "\n",
+       "\n",
+       "\n",
+       "int __dace_init_experimental_cuda(warpLevel_state_t *__state) {\n",
+       "    int count;\n",
+       "\n",
+       "    // Check that we are able to run cuda code\n",
+       "    if (cudaGetDeviceCount(&count) != cudaSuccess)\n",
+       "    {\n",
+       "        printf("ERROR: GPU drivers are not configured or cuda-capable device "\n",
+       "               "not found\\n");\n",
+       "        return 1;\n",
+       "    }\n",
+       "    if (count == 0)\n",
+       "    {\n",
+       "        printf("ERROR: No cuda-capable devices found\\n");\n",
+       "        return 2;\n",
+       "    }\n",
+       "\n",
+       "    // Initialize cuda before we run the application\n",
+       "    float *dev_X;\n",
+       "    DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n",
+       "    DACE_GPU_CHECK(cudaFree(dev_X));\n",
+       "\n",
+       "    \n",
+       "\n",
+       "    __state->gpu_context = new dace::cuda::Context(1, 1);\n",
+       "\n",
+       "    // Create cuda streams and events\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n",
+       "        __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n",
+       "    }\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n",
+       "    }\n",
+       "\n",
+       "    \n",
+       "\n",
+       "    return 0;\n",
+       "}\n",
+       "\n",
+       "int __dace_exit_experimental_cuda(warpLevel_state_t *__state) {\n",
+       "    \n",
+       "\n",
+       "    // Synchronize and check for CUDA errors\n",
+       "    int __err = static_cast<int>(__state->gpu_context->lasterror);\n",
+       "    if (__err == 0)\n",
+       "        __err = static_cast<int>(cudaDeviceSynchronize());\n",
+       "\n",
+       "    // Destroy cuda streams and events\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n",
+       "    }\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n",
+       "    }\n",
+       "\n",
+       "    delete __state->gpu_context;\n",
+       "    return __err;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED bool __dace_gpu_set_stream(warpLevel_state_t *__state, int streamid, gpuStream_t stream)\n",
+       "{\n",
+       "    if (streamid < 0 || streamid >= 1)\n",
+       "        return false;\n",
+       "\n",
+       "    __state->gpu_context->streams[streamid] = stream;\n",
+       "\n",
+       "    return true;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED void __dace_gpu_set_all_streams(warpLevel_state_t *__state, gpuStream_t stream)\n",
+       "{\n",
+       "    for (int i = 0; i < 1; ++i)\n",
+       "        __state->gpu_context->streams[i] = stream;\n",
+       "}\n",
+       "\n",
+       "__global__ void warpLevel_3_0_0_0(double * __restrict__ A)\n",
+       "{ // Kernel scope (open 1)\n",
+       "    int i = (512 * blockIdx.x);\n",
+       "    { // ThreadBlock Scope (open 1)\n",
+       "        int j = threadIdx.x;\n",
+       "        { // WarpLevel Scope (open 1)\n",
+       "            double __tmp3;\n",
+       "\n",
+       "            int warpId = (threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z) / 32;\n",
+       "            if ( 0 < warpId && warpId < 15) { // WarpLevel Scope (open 2)\n",
+       "\n",
+       "\n",
+       "\n",
+       "                ----------------------------------\n",
+       "                // WarpLevel operations here\n",
+       "                ----------------------------------\n",
+       "\n",
+       "\n",
+       "\n",
+       "            } // WarpLevel Scope (close 1)\n",
+       "        } // WarpLevel Scope (close 2)\n",
+       "    } // ThreadBlock Scope (close 1)\n",
+       "} // Kernel scope (close 1)\n",
+       "\n",
+       "\n",
+       "DACE_EXPORTED void __dace_runkernel_warpLevel_3_0_0_0(warpLevel_state_t *__state, double * __restrict__ A);\n",
+       "void __dace_runkernel_warpLevel_3_0_0_0(warpLevel_state_t *__state, double * __restrict__ A)\n",
+       "{\n",
+       "\n",
+       "\n",
+       "    void  *warpLevel_3_0_0_0_args[] = { (void *)&A };\n",
+       "    gpuError_t __err = cudaLaunchKernel( (void*)warpLevel_3_0_0_0, dim3(1, 1, 1), dim3(512, 1, 1), warpLevel_3_0_0_0_args, 0, __state->gpu_context->streams[0]\n",
+       "    );\n",
+       "\n",
+       "    DACE_KERNEL_LAUNCH_CHECK(__err, "warpLevel_3_0_0_0", 1, 1, 1, 512, 1, 1);\n",
+       "}\n",
+       "
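
For context, the listing above shows how ExperimentalCUDACodeGen currently lowers the three nested GPU schedules: the GPU_Device map becomes the kernel and its grid index (int i = 512 * blockIdx.x), the GPU_ThreadBlock map becomes threadIdx.x, and the GPU_Warp map is emitted as a guard on a computed warpId, with the warp-level body still a placeholder at this stage. A minimal sketch of driving this path from Python follows. It mirrors the notebook cell above; the program name warp_copy and the Config.set call are illustrative assumptions and not part of this patch (the 'implementation' entry added to config_schema.yml in this commit can instead be set to 'experimental' in .dace.conf).

# Sketch only, not part of this commit. Mirrors the notebook cell above.
import dace
from dace.config import Config
from IPython.display import Code

# Assumption: the new compiler.cuda.implementation option can be set programmatically.
Config.set('compiler', 'cuda', 'implementation', value='experimental')

@dace.program
def warp_copy(A: dace.float64[512] @ dace.dtypes.StorageType.GPU_Global):
    for i in dace.map[0:512:512] @ dace.dtypes.ScheduleType.GPU_Device:       # single thread block (grid dim 1)
        for j in dace.map[0:512] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:  # 512 threads per block
            for w in dace.map[0:16] @ dace.dtypes.ScheduleType.GPU_Warp:      # 16 warps of 32 threads
                A[w] = A[w] + 1

sdfg = warp_copy.to_sdfg()
# Index 1 of generate_code() holds the CUDA code object, as in the notebook cell above.
Code(sdfg.generate_code()[1].clean_code, language='cpp')
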
\n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}cuda\\PYZus{}runtime.h\\PYZgt{}}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}dace/dace.h\\PYZgt{}}\n", + "\n", + "\n", + "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", + "\n", + "\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\n", + "\n", + "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{count}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Check that we are able to run cuda code}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{cudaGetDeviceCount}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{count}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{!}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaSuccess}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{ERROR: GPU drivers are not configured or cuda\\PYZhy{}capable device }\\PY{l+s}{\\PYZdq{}}\n", + "\\PY{+w}{ }\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{not found}\\PY{l+s+se}{\\PYZbs{}n}\\PY{l+s}{\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{count}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{ERROR: No cuda\\PYZhy{}capable devices found}\\PY{l+s+se}{\\PYZbs{}n}\\PY{l+s}{\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Initialize cuda before we run the application}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{dev\\PYZus{}X}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMalloc}\\PY{p}{(}\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{dev\\PYZus{}X}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaFree}\\PY{p}{(}\\PY{n}{dev\\PYZus{}X}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ 
}\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Create cuda streams and events}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaStreamNonBlocking}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{;}\\PY{+w}{ }\\PY{c+c1}{// Allow for externals to modify streams}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaEventDisableTiming}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Synchronize and check for CUDA errors}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{k+kt}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{lasterror}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{k+kt}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{cudaDeviceSynchronize}\\PY{p}{(}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Destroy cuda streams and events}\n", + "\\PY{+w}{ 
}\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{bool}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}stream}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{streamid}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{|}\\PY{+w}{ }\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZgt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{false}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{streamid}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{true}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}all\\PYZus{}streams}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\n", + "\\PY{+w}{ 
}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{\\PYZus{}\\PYZus{}global\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{warpLevel\\PYZus{}3\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// Kernel scope (open 1)}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{512}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{blockIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// ThreadBlock Scope (open 1)}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{j}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (open 1)}\n", + "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp3}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{warpId}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{blockDim}\\PY{p}{.}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{blockDim}\\PY{p}{.}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{blockDim}\\PY{p}{.}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{z}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{/}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{n}{warpId}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{warpId}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{15}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (open 2)}\n", + "\n", + "\n", + "\n", + "\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\n", + "\\PY{+w}{ }\\PY{c+c1}{// WarpLevel operations here}\n", + "\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\n", + "\n", + "\n", + "\n", + "\\PY{+w}{ 
}\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (close 1)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (close 2)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// ThreadBlock Scope (close 1)}\n", + "\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// Kernel scope (close 1)}\n", + "\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}warpLevel\\PYZus{}3\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}warpLevel\\PYZus{}3\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\n", + "\n", + "\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{warpLevel\\PYZus{}3\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZus{}args}\\PY{p}{[}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{gpuError\\PYZus{}t}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaLaunchKernel}\\PY{p}{(}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{o}{*}\\PY{p}{)}\\PY{n}{warpLevel\\PYZus{}3\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{512}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{warpLevel\\PYZus{}3\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZus{}args}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\n", + "\\PY{+w}{ }\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}KERNEL\\PYZus{}LAUNCH\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{,}\\PY{+w}{ }\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{warpLevel\\PYZus{}3\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{l+s}{\\PYZdq{}}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{512}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\\end{Verbatim}\n" + ], + "text/plain": [ + "\n", + "#include \n", + "#include \n", + "\n", + "\n", + "struct warpLevel_state_t {\n", + " dace::cuda::Context *gpu_context;\n", + "};\n", + "\n", + "\n", + "\n", + "DACE_EXPORTED int __dace_init_experimental_cuda(warpLevel_state_t *__state);\n", + "DACE_EXPORTED int __dace_exit_experimental_cuda(warpLevel_state_t *__state);\n", + "\n", + "\n", + "\n", + "int __dace_init_experimental_cuda(warpLevel_state_t *__state) {\n", + " int count;\n", + "\n", + " // Check that we are able to run cuda 
code\n", + " if (cudaGetDeviceCount(&count) != cudaSuccess)\n", + " {\n", + " printf(\"ERROR: GPU drivers are not configured or cuda-capable device \"\n", + " \"not found\\n\");\n", + " return 1;\n", + " }\n", + " if (count == 0)\n", + " {\n", + " printf(\"ERROR: No cuda-capable devices found\\n\");\n", + " return 2;\n", + " }\n", + "\n", + " // Initialize cuda before we run the application\n", + " float *dev_X;\n", + " DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n", + " DACE_GPU_CHECK(cudaFree(dev_X));\n", + "\n", + " \n", + "\n", + " __state->gpu_context = new dace::cuda::Context(1, 1);\n", + "\n", + " // Create cuda streams and events\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n", + " __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n", + " }\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n", + " }\n", + "\n", + " \n", + "\n", + " return 0;\n", + "}\n", + "\n", + "int __dace_exit_experimental_cuda(warpLevel_state_t *__state) {\n", + " \n", + "\n", + " // Synchronize and check for CUDA errors\n", + " int __err = static_cast(__state->gpu_context->lasterror);\n", + " if (__err == 0)\n", + " __err = static_cast(cudaDeviceSynchronize());\n", + "\n", + " // Destroy cuda streams and events\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n", + " }\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n", + " }\n", + "\n", + " delete __state->gpu_context;\n", + " return __err;\n", + "}\n", + "\n", + "DACE_EXPORTED bool __dace_gpu_set_stream(warpLevel_state_t *__state, int streamid, gpuStream_t stream)\n", + "{\n", + " if (streamid < 0 || streamid >= 1)\n", + " return false;\n", + "\n", + " __state->gpu_context->streams[streamid] = stream;\n", + "\n", + " return true;\n", + "}\n", + "\n", + "DACE_EXPORTED void __dace_gpu_set_all_streams(warpLevel_state_t *__state, gpuStream_t stream)\n", + "{\n", + " for (int i = 0; i < 1; ++i)\n", + " __state->gpu_context->streams[i] = stream;\n", + "}\n", + "\n", + "__global__ void warpLevel_3_0_0_0(double * __restrict__ A)\n", + "{ // Kernel scope (open 1)\n", + " int i = (512 * blockIdx.x);\n", + " { // ThreadBlock Scope (open 1)\n", + " int j = threadIdx.x;\n", + " { // WarpLevel Scope (open 1)\n", + " double __tmp3;\n", + "\n", + " int warpId = (threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z) / 32;\n", + " if ( 0 < warpId && warpId < 15) { // WarpLevel Scope (open 2)\n", + "\n", + "\n", + "\n", + " ----------------------------------\n", + " // WarpLevel operations here\n", + " ----------------------------------\n", + "\n", + "\n", + "\n", + " } // WarpLevel Scope (close 1)\n", + " } // WarpLevel Scope (close 2)\n", + " } // ThreadBlock Scope (close 1)\n", + "} // Kernel scope (close 1)\n", + "\n", + "\n", + "DACE_EXPORTED void __dace_runkernel_warpLevel_3_0_0_0(warpLevel_state_t *__state, double * __restrict__ A);\n", + "void __dace_runkernel_warpLevel_3_0_0_0(warpLevel_state_t *__state, double * __restrict__ A)\n", + "{\n", + "\n", + "\n", + " void *warpLevel_3_0_0_0_args[] = { (void *)&A };\n", + " gpuError_t __err = cudaLaunchKernel( (void*)warpLevel_3_0_0_0, dim3(1, 1, 1), dim3(512, 1, 1), 
warpLevel_3_0_0_0_args, 0, __state->gpu_context->streams[0]\n", + " );\n", + "\n", + " DACE_KERNEL_LAUNCH_CHECK(__err, \"warpLevel_3_0_0_0\", 1, 1, 1, 512, 1, 1);\n", + "}\n" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Code(sdfg.generate_code()[1].clean_code, language='cpp')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/berkay_workpace/berkay_testbed.py b/berkay_workpace/berkay_testbed.py new file mode 100644 index 0000000000..6923998953 --- /dev/null +++ b/berkay_workpace/berkay_testbed.py @@ -0,0 +1,33 @@ +import dace +import cupy as cp +import random + +from dace import registry +from dace.sdfg.scope import ScopeSubgraphView +from dace.codegen.prettycode import CodeIOStream +from dace.codegen.targets.target import TargetCodeGenerator +from dace.codegen.targets.framecode import DaCeCodeGenerator +from dace.codegen.targets.cpp import sym2cpp +from IPython.display import Code + + + + +N = dace.symbol('N') + +@dace.program +def vector_copy4(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global): + for i in dace.map[0:N:32] @ dace.dtypes.ScheduleType.GPU_Device: + for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: + if i + j < N: + A[i + j] = B[i + j] + +n = random.randint(40, 150) +# Initialize random CUDA arrays +A = cp.zeros(n, dtype=cp.float64) # Output array +B = cp.random.rand(n).astype(cp.float64) # Random input array + +sdfg = vector_copy4.to_sdfg() +sdfg(A=A, B=B, N=n) +equal_at_end = cp.all(A == B) + diff --git a/berkay_workpace/berkay_testbed2.ipynb b/berkay_workpace/berkay_testbed2.ipynb new file mode 100644 index 0000000000..508b6b94aa --- /dev/null +++ b/berkay_workpace/berkay_testbed2.ipynb @@ -0,0 +1,99 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "323168ff", + "metadata": {}, + "outputs": [], + "source": [ + "import dace\n", + "from dace import registry\n", + "from dace.sdfg.scope import ScopeSubgraphView\n", + "from dace.codegen.prettycode import CodeIOStream\n", + "from dace.codegen.targets.target import TargetCodeGenerator\n", + "from dace.codegen.targets.framecode import DaCeCodeGenerator\n", + "from dace.codegen.targets.cpp import sym2cpp\n", + "from IPython.display import Code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "851a8f17", + "metadata": {}, + "outputs": [], + "source": [ + "@dace.program\n", + "def vector_copy(A: dace.float32[10] @ dace.dtypes.StorageType.GPU_Global, B: dace.float32[10] @ dace.dtypes.StorageType.GPU_Global):\n", + " for i in dace.map[0:10] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " A[i] = B[i]\n", + "\n", + "sdfg = vector_copy.to_sdfg()\n", + "sdfg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69427604", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "Code(sdfg.generate_code()[0].clean_code, language='cpp')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddd49236", + "metadata": {}, + "outputs": [], + "source": [ + "Code(sdfg.generate_code()[1].clean_code, language='cpp')" + ] + }, + { 
+ "cell_type": "code", + "execution_count": null, + "id": "567ceeff", + "metadata": {}, + "outputs": [], + "source": [ + "Code(sdfg.generate_code()[2].clean_code, language='cpp')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80e4eea7", + "metadata": {}, + "outputs": [], + "source": [ + "Code(sdfg.generate_code()[3].clean_code, language='cpp')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (dace_env)", + "language": "python", + "name": "dace_emv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/berkay_workpace/playfield.py b/berkay_workpace/playfield.py new file mode 100644 index 0000000000..40181faa74 --- /dev/null +++ b/berkay_workpace/playfield.py @@ -0,0 +1,45 @@ +import dace +import random +import cupy as cp + +from dace import registry +from dace.sdfg.scope import ScopeSubgraphView +from dace.codegen.prettycode import CodeIOStream +from dace.codegen.targets.target import TargetCodeGenerator +from dace.codegen.targets.framecode import DaCeCodeGenerator +from dace.codegen.targets.cpp import sym2cpp +from IPython.display import Code +from dace.config import Config + +print(Config.get('compiler', 'cuda', 'implementation')) + + +@dace.program +def warpLevel(A: dace.float64[512] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[512] @ dace.dtypes.StorageType.GPU_Global): + for i in dace.map[0:512:512] @ dace.dtypes.ScheduleType.GPU_Device: + for j in dace.map[0:512] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: + for k in dace.map[0:16] @ dace.dtypes.ScheduleType.GPU_Warp: + A[k] = A[k] + 1 + + +sdfg = warpLevel.to_sdfg() +Code(sdfg.generate_code()[0].clean_code, language='cpp') + + +""" +""" + + +""" +@dace.program +def vector_copy3(A: dace.float64[64] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[64] @ dace.dtypes.StorageType.GPU_Global): + for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: + A[j] = B[j] + +sdfg = vector_copy3.to_sdfg() +Code(sdfg.generate_code()[0].clean_code, language='cpp') + + + + +""" \ No newline at end of file diff --git a/berkay_workpace/report.py b/berkay_workpace/report.py new file mode 100644 index 0000000000..1855e2fd40 --- /dev/null +++ b/berkay_workpace/report.py @@ -0,0 +1,27 @@ +# TODO: GENERAL, discuss with Yakup +# 1. Modularity for Deallocate? +# 2. KernelScopeManager: What I like: +# - simple, easy to understand, modular and clean +# what I dont like: +# - Kind of messes with _generate_exit and how dace generates code +# Your opinion? do or dont? +# 3. __syncthread example ? Or better: General examples? +# 3.5 See below +# 4. GPU streams- now or wait? +# 5. Config for thread_id - why is this even a config? +# 6. Used no instrumentation because I have no clue what it is + + +# I think the rest can wait before getting refactored (I don't need to reinvent the wheel) +# New tasks for now? + + + + +# My personal TODO's +# TODO: when tired +# include constant expressions +# 4 dimensional example + +# TODO: depending on what happens next +# change in_device_code to maybe in_kernel_code? 
\ No newline at end of file diff --git a/berkay_workpace/test.py b/berkay_workpace/test.py new file mode 100644 index 0000000000..9095787b8b --- /dev/null +++ b/berkay_workpace/test.py @@ -0,0 +1,335 @@ +import dace +import random +import cupy as cp + +from dace import registry +from dace.sdfg.scope import ScopeSubgraphView +from dace.codegen.prettycode import CodeIOStream +from dace.codegen.targets.target import TargetCodeGenerator +from dace.codegen.targets.framecode import DaCeCodeGenerator +from dace.codegen.targets.cpp import sym2cpp +from IPython.display import Code +from dace.config import Config + + +def test_1(): + + vec_size = 66 + @dace.program + def vector_copy1(A: dace.float64[vec_size] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[vec_size] @ dace.dtypes.StorageType.GPU_Global): + for i in dace.map[0:vec_size] @ dace.dtypes.ScheduleType.GPU_Device: + A[i] = B[i] + + sdfg = vector_copy1.to_sdfg() + + # Initialize random CUDA arrays + A = cp.zeros(vec_size, dtype=cp.float64) # Output array + B = cp.random.rand(vec_size).astype(cp.float64) # Random input array + + equal_at_start = cp.all(A == B) + if equal_at_start: + print(f"{'Test 1: Vectors are equal at start. Test is skipped.':<70}\033[93m[WARNING]\033[0m") + return + + sdfg = vector_copy1.to_sdfg() + sdfg(A=A, B=B) + equal_at_end = cp.all(A == B) + + if equal_at_end: + print(f"{'Test 1: 1D vector copy simple':<70}\033[92m[PASSED]\033[0m") + else: + print(f"{'Test 1: 1D vector copy simple':<70}\033[91m[FAILED]\033[0m") + + +def test_2(): + + N = dace.symbol('N') + @dace.program + def vector_copy2(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global): + for i in dace.map[0:N] @ dace.dtypes.ScheduleType.GPU_Device: + A[i] = B[i] + + sdfg = vector_copy2.to_sdfg() + + n = random.randint(3, 100) + # Initialize random CUDA arrays + A = cp.zeros(n, dtype=cp.float64) # Output array + B = cp.random.rand(n).astype(cp.float64) # Random input array + + equal_at_start = cp.all(A == B) + if equal_at_start: + print(f"{'Test 2: Vectors are equal at start. Test is skipped.':<70}\033[93m[WARNING]\033[0m") + return + + sdfg = vector_copy2.to_sdfg() + sdfg(A=A, B=B, N=n) + equal_at_end = cp.all(A == B) + + if equal_at_end: + print(f"{'Test 2: 1D vector copy with symbolic size':<70}\033[92m[PASSED]\033[0m") + else: + print(f"{'Test 2: 1D vector copy with symbolic size':<70}\033[91m[FAILED]\033[0m") + + +def test_3(): + @dace.program + def vector_copy3(A: dace.float64[64] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[64] @ dace.dtypes.StorageType.GPU_Global): + for i in dace.map[0:64:32] @ dace.dtypes.ScheduleType.GPU_Device: + for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: + A[i + j] = B[i + j] + + sdfg = vector_copy3.to_sdfg() + + # Initialize random CUDA arrays + A = cp.zeros(64, dtype=cp.float64) # Output array + B = cp.random.rand(64).astype(cp.float64) # Random input array + + equal_at_start = cp.all(A == B) + if equal_at_start: + print(f"{'Test 3: Vectors are equal at start. 
Test is skipped.':<70}\033[93m[WARNING]\033[0m") + return + + sdfg = vector_copy3.to_sdfg() + sdfg(A=A, B=B) + equal_at_end = cp.all(A == B) + + if equal_at_end: + print(f"{'Test 3: 1D vector copy with threadblocking':<70}\033[92m[PASSED]\033[0m") + else: + print(f"{'Test 3: 1D vector copy with threadblocking':<70}\033[91m[FAILED]\033[0m") + + +def test_4(): + + N = dace.symbol('N') + + @dace.program + def vector_copy4(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global): + for i in dace.map[0:N:32] @ dace.dtypes.ScheduleType.GPU_Device: + for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: + if i + j < N: + A[i + j] = B[i + j] + + n = random.randint(40, 150) + # Initialize random CUDA arrays + A = cp.zeros(n, dtype=cp.float64) # Output array + B = cp.random.rand(n).astype(cp.float64) # Random input array + + + equal_at_start = cp.all(A == B) + if equal_at_start: + print(f"{'Test 4: Vectors are equal at start. Test is skipped.':<70}\033[93m[WARNING]\033[0m") + return + + sdfg = vector_copy4.to_sdfg() + sdfg(A=A, B=B, N=n) + equal_at_end = cp.all(A == B) + + if equal_at_end: + print(f"{'Test 4: 1D vector copy with threadblocking & smybolic size':<70}\033[92m[PASSED]\033[0m") + else: + print(f"{'Test 4: 1D vector copy with threadblocking & smybolic size':<70}\033[91m[FAILED]\033[0m") + + +def test_5(): + @dace.program + def matrix_copy1(A: dace.float64[64,64] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[64,64] @ dace.dtypes.StorageType.GPU_Global): + for i, j in dace.map[0:64, 0:64] @ dace.dtypes.ScheduleType.GPU_Device: + A[i][j] = B[i][j] + # Preview SDFG + sdfg = matrix_copy1.to_sdfg() + + + # Initialize random CUDA arrays + A = cp.zeros((64,64), dtype=cp.float64) # Output array + B = cp.random.rand(64,64).astype(cp.float64) # Random input array + + + equal_at_start = cp.all(A == B) + if equal_at_start: + print(f"{'Test 5: Matrices are equal at start. Test is skipped.':<70}\033[93m[WARNING]\033[0m") + return + + sdfg = matrix_copy1.to_sdfg() + sdfg(A=A, B=B) + equal_at_end = cp.all(A == B) + + if equal_at_end: + print(f"{'Test 5: Simple Matrix Copy':<70}\033[92m[PASSED]\033[0m") + else: + print(f"{'Test 5: Simple Matrix Copy':<70}\033[91m[FAILED]\033[0m") + + +def test_6(): + + N = dace.symbol('N') + M = dace.symbol('M') + + @dace.program + def matrix_copy2(A: dace.float64[M,N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[M,N] @ dace.dtypes.StorageType.GPU_Global): + for i, j in dace.map[0:M, 0:N] @ dace.dtypes.ScheduleType.GPU_Device: + A[i][j] = B[i][j] + # Preview SDFG + sdfg = matrix_copy2.to_sdfg() + + n = random.randint(40, 150) + m = random.randint(40, 150) + # Initialize random CUDA arrays + A = cp.zeros((m,n), dtype=cp.float64) # Output array + B = cp.random.rand(m,n).astype(cp.float64) # Random input array + + + equal_at_start = cp.all(A == B) + if equal_at_start: + print(f"{'Test 6: Matrices are equal at start. 
Test is skipped.':<70}\033[93m[WARNING]\033[0m") + return + + sdfg = matrix_copy2.to_sdfg() + sdfg(A=A, B=B, M=m, N=n) + equal_at_end = cp.all(A == B) + + if equal_at_end: + print(f"{'Test 6: Matrix Copy with symbolic sizes':<70}\033[92m[PASSED]\033[0m") + else: + print(f"{'Test 6: Matrix Copy with symbolic sizes':<70}\033[91m[FAILED]\033[0m") + + +def test_7(): + + N = dace.symbol('N') + M = dace.symbol('M') + + @dace.program + def matrix_copy3(A: dace.float64[M,N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[M,N] @ dace.dtypes.StorageType.GPU_Global): + for i, j in dace.map[0:M:32, 0:N:32] @ dace.dtypes.ScheduleType.GPU_Device: + for ii, jj in dace.map[0:32, 0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: + if i + ii < M and j + jj < N: + A[i + ii, j + jj] = B[i + ii, j + jj] + # Preview SDFG + sdfg = matrix_copy3.to_sdfg() + + n = random.randint(40, 150) + m = random.randint(40, 150) + # Initialize random CUDA arrays + A = cp.zeros((m,n), dtype=cp.float64) # Output array + B = cp.random.rand(m,n).astype(cp.float64) # Random input array + + + equal_at_start = cp.all(A == B) + if equal_at_start: + print(f"{'Test 6: Matrices are equal at start. Test is skipped.':<70}\033[93m[WARNING]\033[0m") + return + + sdfg = matrix_copy3.to_sdfg() + sdfg(A=A, B=B, M=m, N=n) + equal_at_end = cp.all(A == B) + + if equal_at_end: + print(f"{'Test 7: Matrix Copy with threadblocking & symbolic sizes':<70}\033[92m[PASSED]\033[0m") + else: + print(f"{'Test 7: Matrix Copy with threadblocking & symbolic sizes':<70}\033[91m[FAILED]\033[0m") + + +def test_8(): + + N = dace.symbol('N') + M = dace.symbol('M') + + @dace.program + def matrix_copy3(A: dace.float64[M,N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[M,N] @ dace.dtypes.StorageType.GPU_Global): + for i, j in dace.map[0:M:32, 0:N:32] @ dace.dtypes.ScheduleType.GPU_Device: + for ii, jj in dace.map[0:32, 0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: + sB = dace.define_local([32,32], dace.float64, storage=dace.StorageType.GPU_Shared) + sB[ii, jj] = B[i + ii, j + jj] + A[i + ii, j + jj] = sB[ii, jj] + + + # Preview SDFG + sdfg = matrix_copy3.to_sdfg() + + n = random.randint(40, 150) + m = random.randint(40, 150) + # Initialize random CUDA arrays + A = cp.zeros((m,n), dtype=cp.float64) # Output array + B = cp.random.rand(m,n).astype(cp.float64) # Random input array + + + equal_at_start = cp.all(A == B) + if equal_at_start: + print(f"{'Test 8: Matrices are equal at start. 
Test is skipped.':<70}\033[93m[WARNING]\033[0m") + return + + sdfg = matrix_copy3.to_sdfg() + sdfg(A=A, B=B, M=m, N=n) + equal_at_end = cp.all(A == B) + + if equal_at_end: + print(f"{'Test 8: Matrix Copy with shared memory':<70}\033[92m[PASSED]\033[0m") + else: + print(f"{'Test 8: Matrix Copy with shared memory':<70}\033[91m[FAILED]\033[0m") + + +def test_9(): + + N = dace.symbol('N') + + @dace.program + def notskewed(A: dace.float32[N] @ dace.dtypes.StorageType.GPU_Global, + B: dace.float32[N] @ dace.dtypes.StorageType.GPU_Global, + C: dace.float32[N] @ dace.dtypes.StorageType.GPU_Global): + for i in dace.map[0:N:32] @ dace.ScheduleType.GPU_Device: + for j in dace.map[i:(i+32)] @ dace.ScheduleType.GPU_ThreadBlock: + C[j] = A[j] + B[j] + + # Preview SDFG + sdfg = notskewed.to_sdfg() + + n = random.randint(40, 150) + # Initialize random CUDA arrays + A = cp.random.rand(n).astype(cp.float32) # Output array + B = cp.random.rand(n).astype(cp.float32) # Random input array + C = cp.zeros((n), dtype=cp.float32) + C_ref = cp.zeros((n), dtype=cp.float32) + + + C_ref = A + B + sdfg(A=A, B=B, C=C, N=n) + + + if cp.all(C == C_ref): + print(f"{'Test 9: Not skewed vadd3':<70}\033[92m[PASSED]\033[0m") + else: + print(f"{'Test 9: Not skewed vadd3':<70}\033[91m[FAILED]\033[0m") + + + + + +def selected(): + test_1() + test_4() + test_5() + +def all(): + test_1() + test_2() + test_3() + test_4() + test_5() + test_6() + test_7() + test_8() + test_9() + +if __name__ == '__main__': + + + print("\n" + "="*80) + print(f"Tests started: You are using the {Config.get('compiler', 'cuda', 'implementation')} CUDA implementation.") + print("="*80 + "\n") + + all() + + print("\n" + "="*80) + print(f"Tests ended.") + print("="*80 + "\n") \ No newline at end of file diff --git a/dace/codegen/CMakeLists.txt b/dace/codegen/CMakeLists.txt index 5482d4d30d..1187ce9be1 100644 --- a/dace/codegen/CMakeLists.txt +++ b/dace/codegen/CMakeLists.txt @@ -57,7 +57,8 @@ foreach(DACE_FILE ${DACE_FILES}) # Make the path absolute set(DACE_FILE ${DACE_SRC_DIR}/${DACE_FILE}) # Now treat the file according to the deduced target - if(${DACE_FILE_TARGET} STREQUAL "cuda") + # previous: if(${DACE_FILE_TARGET} STREQUAL "cuda"). 
Needed to work with experimental + if(${DACE_FILE_TARGET} STREQUAL "experimental_cuda" OR ${DACE_FILE_TARGET} STREQUAL "cuda") if(${DACE_FILE_TARGET_TYPE} MATCHES "hip") set(DACE_ENABLE_HIP ON) set(DACE_HIP_FILES ${DACE_HIP_FILES} ${DACE_FILE}) diff --git a/dace/codegen/targets/__init__.py b/dace/codegen/targets/__init__.py index cd4d5f957f..a0c2065524 100644 --- a/dace/codegen/targets/__init__.py +++ b/dace/codegen/targets/__init__.py @@ -9,3 +9,4 @@ from .mlir.mlir import MLIRCodeGen from .sve.codegen import SVECodeGen from .snitch import SnitchCodeGen +from .experimental_cuda import ExperimentalCUDACodeGen \ No newline at end of file diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py new file mode 100644 index 0000000000..3d298d659a --- /dev/null +++ b/dace/codegen/targets/experimental_cuda.py @@ -0,0 +1,2230 @@ +import ctypes +import functools +import warnings +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union + +import networkx as nx +import sympy +from six import StringIO + +import dace +from dace import data as dt, Memlet +from dace import dtypes, registry +from dace import subsets, symbolic +from dace.codegen import common, cppunparse +from dace.codegen.codeobject import CodeObject +from dace.codegen.dispatcher import DefinedType, TargetDispatcher +from dace.codegen.prettycode import CodeIOStream +from dace.codegen.targets import cpp +from dace.codegen.common import update_persistent_desc +from dace.codegen.targets.cpp import (codeblock_to_cpp, cpp_array_expr, memlet_copy_to_absolute_strides, sym2cpp, + synchronize_streams, unparse_cr, mangle_dace_state_struct_name) +from dace.codegen.targets.target import IllegalCopy, TargetCodeGenerator, make_absolute +from dace.config import Config +from dace.frontend import operations +from dace.sdfg import (SDFG, ScopeSubgraphView, SDFGState, has_dynamic_map_inputs, is_array_stream_view, + is_devicelevel_gpu, nodes, scope_contains_scope) +from dace.sdfg import utils as sdutil +from dace.sdfg.graph import MultiConnectorEdge +from dace.sdfg.state import ControlFlowRegion, StateSubgraphView +from dace.transformation import helpers as xfh +from dace.transformation.passes import analysis as ap + +if TYPE_CHECKING: + from dace.codegen.targets.framecode import DaCeCodeGenerator + from dace.codegen.targets.cpu import CPUCodeGen + + + +def prod(iterable): + return functools.reduce(sympy.Mul, iterable, 1) + + + + +# TODO: GENERAL, discuss with Yakup +# have a look at dtypes maybe + + + +# My personal TODO's +# TODO: when tired +# include constant expressions + launch bounds logic +# insert warnings that gpu device must be first +# 4 dimensional example + +# TODO: depending on what happens next +# change in_device_code to maybe in_kernel_code? + + + + + + + + + +# TODO : I got rid of ScheduleType.GPU_Persistent (not supported anymore). If this codeBase +# actually replaces the old one, this should be defined in dtypes.py and also accessed from +# there. 
Also change GPU_SCHEDULES accesses to dtypes.GPU_SCHEDULES +GPU_SCHEDULES = [ + dace.ScheduleType.GPU_Device, + dace.ScheduleType.GPU_ThreadBlock, + dace.ScheduleType.GPU_Warp +] + + +@registry.autoregister_params(name='experimental_cuda') +class ExperimentalCUDACodeGen(TargetCodeGenerator): + """ Experimental CUDA code generator.""" + target_name = 'experimental_cuda' + title = 'CUDA' + + _in_device_code = False + + ######################## Initilization and Preprocessing related start ######################################################### + + def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): + + self._frame: DaCeCodeGenerator = frame_codegen # creates the frame code, orchestrates the code generation for targets + self._dispatcher: TargetDispatcher= frame_codegen.dispatcher # responsible for dispatching code generation to the appropriate target + + # dispatcher = self._dispatcher + + self.create_grid_barrier: bool = False # Used for grid level synchronization + + self.dynamic_tbmap_type = None + self.extra_nsdfg_args = [] + ExperimentalCUDACodeGen._in_device_code = False # TODO: Isn't this double? + self._cpu_codegen: Optional['CPUCodeGen'] = None + self._block_dims = None + self._grid_dims = None + + # NOTE: Type may be wrong! + self._kernel_map: Optional[nodes.MapEntry] = None # Indicates whether the code generation is currently within a "kernel" map. + + # NOTE: Moved from preprossessing to here + self.backend: str = common.get_gpu_backend() + self.language = 'cu' if self.backend == 'cuda' else 'cpp' + target_type = '' if self.backend == 'cuda' else self.backend + self._codeobject = CodeObject(sdfg.name + '_' + 'cuda', + '', + self.language, + ExperimentalCUDACodeGen, + 'CUDA', + target_type=target_type) + + self._kernel_state = None + self._kernel_grid_conditions: List[str] = [] + self._scope_has_collaborative_copy = False + + self._localcode = CodeIOStream() + self._globalcode = CodeIOStream() + + # TODO: init and exitcode seem to serve no purpose actually. + self._initcode = CodeIOStream() + self._exitcode = CodeIOStream() + + self._global_sdfg: SDFG = sdfg + self._toplevel_schedule = None + + + self._arglists: Dict[nodes.MapEntry, Dict[str, dt.Data]] = {} + + # Keep track of current "scope entry/exit" code streams for extra + # code generation + self.scope_entry_stream = self._initcode + self.scope_exit_stream = self._exitcode + + self._cuda_streams, self._cuda_events = 0, 0 + + # Positions at which to deallocate memory pool arrays + self.pool_release: Dict[Tuple[SDFG, str], Tuple[SDFGState, Set[nodes.Node]]] = {} + self.has_pool = False + + + self._ignore_warnings = True + + # INFO: + # Register GPU schedules and storage types for ExperimentalCUDACodeGen. + # The dispatcher maps GPU-related schedules and storage types to the + # appropriate code generation functions in this code generator. 
+ + # Register dispatchers + self._cpu_codegen = self._dispatcher.get_generic_node_dispatcher() + + self._dispatcher = frame_codegen.dispatcher + self._dispatcher.register_map_dispatcher(GPU_SCHEDULES, self) + self._dispatcher.register_node_dispatcher(self, self.node_dispatch_predicate) + self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) + + gpu_storage = [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared, dtypes.StorageType.CPU_Pinned] + + self._dispatcher.register_array_dispatcher(gpu_storage, self) + self._dispatcher.register_array_dispatcher(dtypes.StorageType.CPU_Pinned, self) + for storage in gpu_storage: + for other_storage in dtypes.StorageType: + self._dispatcher.register_copy_dispatcher(storage, other_storage, None, self) + self._dispatcher.register_copy_dispatcher(other_storage, storage, None, self) + + + # NOTE: Moved it here from preprocessing, I think it fits better + self._backend = common.get_gpu_backend() + self._language = 'cu' if self.backend == 'cuda' else 'cpp' + target_type = "" if self.backend == 'cuda' else self.backend + self._codeobject= CodeObject(sdfg.name + '_' + 'cuda', + '', + self._language, + ExperimentalCUDACodeGen, + 'CUDA', + target_type=target_type) + + + # NOTE: + # "Register illegal copies" code NOT copied from cuda.py + # Behavior unclear for me yet. + + + ################## New variables ########################## + self._current_kernel_spec: Optional[KernelSpec] = None + + + # NOTE: I think this is good as is + def preprocess(self, sdfg: SDFG) -> None: + + # Find GPU<->GPU strided copies that cannot be represented by a single copy command + from dace.transformation.dataflow import CopyToMap + for e, state in list(sdfg.all_edges_recursive()): + if isinstance(e.src, nodes.AccessNode) and isinstance(e.dst, nodes.AccessNode): + nsdfg = state.parent + if (e.src.desc(nsdfg).storage == dtypes.StorageType.GPU_Global + and e.dst.desc(nsdfg).storage == dtypes.StorageType.GPU_Global): + copy_shape, src_strides, dst_strides, _, _ = memlet_copy_to_absolute_strides( + None, nsdfg, state, e, e.src, e.dst) + dims = len(copy_shape) + + # Skip supported copy types + if dims == 1: + continue + elif dims == 2: + if src_strides[-1] != 1 or dst_strides[-1] != 1: + # NOTE: Special case of continuous copy + # Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J] + # with copy shape [I, J] and strides [J*K, K], [J, 1] + try: + is_src_cont = src_strides[0] / src_strides[1] == copy_shape[1] + is_dst_cont = dst_strides[0] / dst_strides[1] == copy_shape[1] + except (TypeError, ValueError): + is_src_cont = False + is_dst_cont = False + if is_src_cont and is_dst_cont: + continue + else: + continue + elif dims > 2: + if not (src_strides[-1] != 1 or dst_strides[-1] != 1): + continue + + # Turn unsupported copy to a map + try: + CopyToMap.apply_to(nsdfg, save=False, annotate=False, a=e.src, b=e.dst) + except ValueError: # If transformation doesn't match, continue normally + continue + + # Annotate CUDA streams and events + self._cuda_streams, self._cuda_events = self._compute_cudastreams(sdfg) + + # Find points where memory should be released to the memory pool + self._compute_pool_release(sdfg) + + # Write GPU context to state structure + self._frame.statestruct.append('dace::cuda::Context *gpu_context;') + + + # Collect all defined symbols and argument lists with one traversal + shared_transients = {} + for state, node, defined_syms in sdutil.traverse_sdfg_with_defined_symbols(sdfg, recursive=True): + if (isinstance(node, nodes.MapEntry) + and 
node.map.schedule == dtypes.ScheduleType.GPU_Device): # NOTE: Removed dtypes.ScheduleType.GPU_Persistent comparision + if state.parent not in shared_transients: + shared_transients[state.parent] = state.parent.shared_transients() + self._arglists[node] = state.scope_subgraph(node).arglist(defined_syms, shared_transients[state.parent]) + + + # NOTE: Used during preprocess. Seems good as is + def _compute_pool_release(self, top_sdfg: SDFG): + """ + Computes positions in the code generator where a memory pool array is no longer used and + ``backendFreeAsync`` should be called to release it. + + :param top_sdfg: The top-level SDFG to traverse. + :raises ValueError: If the backend does not support memory pools. + """ + # Find release points for every array in every SDFG + reachability = access_nodes = None + for sdfg in top_sdfg.all_sdfgs_recursive(): + # Skip SDFGs without memory pool hints + pooled = set(aname for aname, arr in sdfg.arrays.items() + if getattr(arr, 'pool', False) is True and arr.transient) + if not pooled: + continue + self.has_pool = True + if self.backend != 'cuda': + raise ValueError(f'Backend "{self.backend}" does not support the memory pool allocation hint') + + # Lazily compute reachability and access nodes + if reachability is None: + reachability = ap.StateReachability().apply_pass(top_sdfg, {}) + access_nodes = ap.FindAccessStates().apply_pass(top_sdfg, {}) + + reachable = reachability[sdfg.cfg_id] + access_sets = access_nodes[sdfg.cfg_id] + for state in sdfg.nodes(): + # Find all data descriptors that will no longer be used after this state + last_state_arrays: Set[str] = set( + s for s in access_sets + if s in pooled and state in access_sets[s] and not (access_sets[s] & reachable[state]) - {state}) + + anodes = list(state.data_nodes()) + for aname in last_state_arrays: + # Find out if there is a common descendant access node. 
+ # If not, release at end of state + ans = [an for an in anodes if an.data == aname] + terminator = None + for an1 in ans: + if all(nx.has_path(state.nx, an2, an1) for an2 in ans if an2 is not an1): + terminator = an1 + break + + # Enforce a cuda_stream field so that the state-wide deallocation would work + if not hasattr(an1, '_cuda_stream'): + an1._cuda_stream = 'nullptr' + + # If access node was found, find the point where all its reads are complete + terminators = set() + if terminator is not None: + parent = state.entry_node(terminator) + # If within a scope, once all memlet paths going out of that scope are complete, + # it is time to release the memory + if parent is not None: + # Just to be safe, release at end of state (e.g., if misused in Sequential map) + terminators = set() + else: + # Otherwise, find common descendant (or end of state) following the ends of + # all memlet paths (e.g., (a)->...->[tasklet]-->...->(b)) + for e in state.out_edges(terminator): + if isinstance(e.dst, nodes.EntryNode): + terminators.add(state.exit_node(e.dst)) + else: + terminators.add(e.dst) + # After all outgoing memlets of all the terminators have been processed, memory + # will be released + + self.pool_release[(sdfg, aname)] = (state, terminators) + + # If there is unfreed pooled memory, free at the end of the SDFG + unfreed = set(arr for arr in pooled if (sdfg, arr) not in self.pool_release) + if unfreed: + # Find or make single sink node + sinks = sdfg.sink_nodes() + if len(sinks) == 1: + sink = sinks[0] + elif len(sinks) > 1: + sink = sdfg.add_state() + for s in sinks: + sdfg.add_edge(s, sink) + else: # len(sinks) == 0: + raise ValueError('End state not found when trying to free pooled memory') + + # Add sink as terminator state + for arr in unfreed: + self.pool_release[(sdfg, arr)] = (sink, set()) + + + # NOTE: Used during preprocess. Seems good as is + def _compute_cudastreams(self, sdfg: SDFG, default_stream=0, default_event=0): + """ Annotates an SDFG (and all nested ones) to include a `_cuda_stream` + field. This field is applied to all GPU maps, tasklets, and copies + that can be executed in parallel. + + :param sdfg: The sdfg to modify. + :param default_stream: The stream ID to start counting from (used + in recursion to nested SDFGs). + :param default_event: The event ID to start counting from (used + in recursion to nested SDFGs). + :return: 2-tuple of the number of streams, events to create. + """ + concurrent_streams = int(Config.get('compiler', 'cuda', 'max_concurrent_streams')) + if concurrent_streams < 0: + return 0, 0 + + def increment(streams): + if concurrent_streams > 0: + return (streams + 1) % concurrent_streams + return streams + 1 + + state_streams = [] + state_subsdfg_events = [] + + for state in sdfg.states(): + # Start by annotating source nodes + source_nodes = state.source_nodes() + + # Concurrency can only be found in each state + max_streams = default_stream + max_events = default_event + + for i, node in enumerate(source_nodes): + if isinstance(node, nodes.AccessNode): + continue + if isinstance(node, nodes.NestedSDFG): + if node.schedule == dtypes.ScheduleType.GPU_Device: + continue + if node.schedule not in dtypes.GPU_SCHEDULES: + max_streams, max_events = self._compute_cudastreams(node.sdfg, max_streams, max_events + 1) + node._cuda_stream = max_streams + node._cs_childpath = False + max_streams = increment(max_streams) + + # Maintain the same CUDA stream in DFS order, add more when + # possible. 
+ for e in state.dfs_edges(source_nodes): + if hasattr(e.dst, '_cuda_stream'): + continue + if hasattr(e.src, '_cuda_stream'): + c = e.src._cuda_stream + + if (isinstance(e.dst, nodes.AccessNode) and isinstance(sdfg.arrays[e.dst.data], dt.View)): + # Skip views + e.dst._cuda_stream = c + e.dst._cs_childpath = False + continue + + if e.src._cs_childpath == True: + c = max_streams + max_streams = increment(max_streams) + e.src._cs_childpath = True + + # Do not create multiple streams within GPU scopes + if (isinstance(e.src, nodes.EntryNode) and e.src.schedule in dtypes.GPU_SCHEDULES): + e.src._cs_childpath = False + elif state.entry_node(e.src) is not None: + parent = state.entry_node(e.src) + if parent.schedule in dtypes.GPU_SCHEDULES: + e.src._cs_childpath = False + else: + c = max_streams + if (isinstance(e.dst, nodes.AccessNode) and isinstance(sdfg.arrays[e.dst.data], dt.View)): + # Skip views + pass + else: + max_streams = increment(max_streams) + e.dst._cuda_stream = c + if not hasattr(e.dst, '_cs_childpath'): + e.dst._cs_childpath = False + if isinstance(e.dst, nodes.NestedSDFG): + if e.dst.schedule not in dtypes.GPU_SCHEDULES: + max_streams, max_events = self._compute_cudastreams(e.dst.sdfg, e.dst._cuda_stream, + max_events + 1) + + state_streams.append(max_streams if concurrent_streams == 0 else concurrent_streams) + state_subsdfg_events.append(max_events) + + # Remove CUDA streams from paths of non-gpu copies and CPU tasklets + for node, graph in sdfg.all_nodes_recursive(): + if isinstance(graph, SDFGState): + cur_sdfg = graph.parent + + if (isinstance(node, (nodes.EntryNode, nodes.ExitNode)) and node.schedule in dtypes.GPU_SCHEDULES): + # Node must have GPU stream, remove childpath and continue + if hasattr(node, '_cs_childpath'): + delattr(node, '_cs_childpath') + continue + + for e in graph.all_edges(node): + path = graph.memlet_path(e) + # If leading from/to a GPU memory node, keep stream + if ((isinstance(path[0].src, nodes.AccessNode) + and path[0].src.desc(cur_sdfg).storage == dtypes.StorageType.GPU_Global) + or (isinstance(path[-1].dst, nodes.AccessNode) + and path[-1].dst.desc(cur_sdfg).storage == dtypes.StorageType.GPU_Global)): + break + # If leading from/to a GPU tasklet, keep stream + if ((isinstance(path[0].src, nodes.CodeNode) and is_devicelevel_gpu(cur_sdfg, graph, path[0].src)) + or (isinstance(path[-1].dst, nodes.CodeNode) + and is_devicelevel_gpu(cur_sdfg, graph, path[-1].dst))): + break + else: # If we did not break, we do not need a CUDA stream + if hasattr(node, '_cuda_stream'): + delattr(node, '_cuda_stream') + # In any case, remove childpath + if hasattr(node, '_cs_childpath'): + delattr(node, '_cs_childpath') + + # Compute maximal number of events by counting edges (within the same + # state) that point from one stream to another + state_events = [] + for i, state in enumerate(sdfg.states()): + events = state_subsdfg_events[i] + + for e in state.edges(): + if hasattr(e.src, '_cuda_stream'): + # If there are two or more CUDA streams involved in this + # edge, or the destination is unrelated to CUDA + if (not hasattr(e.dst, '_cuda_stream') or e.src._cuda_stream != e.dst._cuda_stream): + for mpe in state.memlet_path(e): + mpe._cuda_event = events + events += 1 + + state_events.append(events) + + # Maximum over all states + max_streams = max(state_streams) + max_events = max(state_events) + + return max_streams, max_events + + ######################## Initilization and Preprocessing related end ######################################################### + + 
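+    # Illustrative sketch (comments only, not executed) of what the preprocessing above
+    # leaves behind on the SDFG. The `codegen` and `sdfg` names below are hypothetical and
+    # only serve as an example; the config key is the one read by _compute_cudastreams.
+    #
+    #   Config.set('compiler', 'cuda', 'max_concurrent_streams', value=2)  # round-robin over 2 streams
+    #   codegen.preprocess(sdfg)
+    #   for node, _ in sdfg.all_nodes_recursive():
+    #       if hasattr(node, '_cuda_stream'):
+    #           print(node, '-> stream', node._cuda_stream)
+    #
+    # A negative `max_concurrent_streams` disables concurrency (_compute_cudastreams returns
+    # (0, 0)), a value of 0 opens a new stream for every independent path, and any positive
+    # value reuses stream IDs modulo that value. Edges that cross two different streams are
+    # additionally annotated with a `_cuda_event` attribute for event-based synchronization.
+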
@property + def has_initializer(self) -> bool: + return True + @property + def has_finalizer(self) -> bool: + return True + + + + + + + def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + + # Are we generating host (launch) code or device (kernel) code? + if not ExperimentalCUDACodeGen._in_device_code: + + # Prepare and cache kernel metadata (name, grid dims, arguments, etc.) + self._current_kernel_spec = KernelSpec( + cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id + ) + + # + self._generate_gpu_bridge(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + + #--------------- Generate Kernel Function ---------------- + ExperimentalCUDACodeGen._in_device_code = True + kernel_stream = CodeIOStream() + + kernel_name = self._current_kernel_spec.kernel_name + kernel_args = self._current_kernel_spec.args_typed + scope_entry = dfg_scope.source_nodes()[0] + + # Emit kernel function signature + kernel_stream.write( + f'__global__ void {kernel_name}({", ".join(kernel_args)}) ', + cfg, state_id, scope_entry + ) + self._generate_kernel_scope( + sdfg, cfg, dfg_scope, state_id, self._globalcode, kernel_stream + ) + self._localcode.write(kernel_stream.getvalue() + '\n') + ExperimentalCUDACodeGen._in_device_code = False + # -------------------------------------------------------------- + + # Generate the actual launch call (host-side) + self._generate_kernel_launch(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + + else: + # We are already inside a kernel — this will be nested scope + node = dfg_scope.source_nodes()[0] + schedule_type = node.map.schedule.name + gen = getattr(self, f'_generate_{schedule_type}_scope', False) + if gen: + gen(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + else: + raise NotImplementedError( + f"Scope generation for schedule type '{schedule_type}' is not implemented in ExperimentalCUDACodeGen. " + "Please ensure that the schedule type is supported or implement the required functionality." + ) + + + + + +####################### helper functions to generate_scope ###################################### + + + def _generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, kernel_stream: CodeIOStream) -> None: + + + # Get the Map Node (sould be a Map Node?) + node = dfg_scope.source_nodes()[0] + + # Get kernel specifications + kernel_spec = self._current_kernel_spec + kernel_map = kernel_spec.kernel_map + has_tbmap = kernel_spec.has_tbmap + block_dims = kernel_spec.block_dims + + + + with KernelScopeManager(cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, + function_stream=function_stream, callsite_stream=kernel_stream, comment="Kernel scope",) as scopeManager: + + # Get the thread/block index type + ttype = Config.get('compiler', 'cuda', 'thread_id_type') + tidtype = getattr(dtypes, ttype, False) + if not isinstance(tidtype, dtypes.typeclass): + raise ValueError(f'Configured type "{ttype}" for ``thread_id_type`` does not match any DaCe data type. 
' + 'See ``dace.dtypes`` for available types (for example ``int32``).') + + + # Generate all index arguments for kernel grid + krange = subsets.Range(kernel_map.range[::-1]) + kdims = krange.size() + dsym = [symbolic.symbol(f'__DAPB{i}', nonnegative=True, integer=True) for i in range(len(krange))] + bidx = krange.coord_at(dsym) + + + # First three dimensions are evaluated directly + for i in range(min(len(krange), 3)): + varname = kernel_map.params[-i - 1] + + # If we defaulted to a fixed number of threads per block, offset by thread ID + block_expr = f'blockIdx.{_get_cuda_dim(min(i, 2))}' + if not has_tbmap: + block_expr = f'({block_expr} * {symbolic_to_cpp(block_dims[i])} + threadIdx.{_get_cuda_dim(i)})' + + # Delinearize third dimension if necessary + if i == 2 and len(krange) > 3: + block_expr = f'({block_expr} / ({symbolic_to_cpp(functools.reduce(sympy.Mul, kdims[3:], 1))}))' + + expr = symbolic_to_cpp(bidx[i]).replace(f'__DAPB{i}', block_expr) + + kernel_stream.write(f'{tidtype.ctype} {varname} = {expr};', cfg, state_id, node) + self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, tidtype.ctype) + + + # Delinearize beyond the third dimension + if len(krange) > 3: + for i in range(3, len(krange)): + varname = kernel_map.params[-i - 1] + + block_expr = 'blockIdx.z' + if not has_tbmap: + block_expr = f'({block_expr} * {symbolic_to_cpp(block_dims[2])} + threadIdx.z)' + + block_expr = '((%s / (%s)) %% (%s))' % ( + block_expr, + symbolic_to_cpp(functools.reduce(sympy.Mul, kdims[i + 1:], 1)), + symbolic_to_cpp(kdims[i]), + ) + + expr = symbolic_to_cpp(bidx[i]).replace(f'__DAPB{i}', block_expr) + kernel_stream.write(f'{tidtype.ctype} {varname} = {expr};', cfg, state_id, node) + self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, tidtype.ctype) + + + + # handle conditions + if not has_tbmap: + dsym_end = [d + bs - 1 for d, bs in zip(dsym, block_dims)] + minels = krange.min_element() + maxels = krange.max_element() + for i, (v, minel, maxel) in enumerate(zip(kernel_map.params[::-1], minels, maxels)): + condition = '' + + # Optimize conditions if they are always true + if i >= 3 or (dsym[i] >= minel) != True: + condition += f'{v} >= {symbolic_to_cpp(minel)}' + + if (i >= 3 or ((dsym_end[i] < maxel) != False and ((dsym_end[i] % block_dims[i]) != 0) == True) + or (block_dims[i] > maxel) == True): + + if len(condition) > 0: + condition += ' && ' + condition += f'{v} < {symbolic_to_cpp(maxel + 1)}' + + if len(condition) > 0: + scopeManager.open(condition= condition) + + + + + self._dispatcher.dispatch_subgraph(sdfg, cfg, dfg_scope, state_id, function_stream, + kernel_stream, skip_entry_node=True) + + + + + + def _generate_GPU_ThreadBlock_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, kernel_stream: CodeIOStream) -> None: + + node = dfg_scope.source_nodes()[0] + scope_map = node.map + + + with KernelScopeManager(cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, + function_stream=function_stream, callsite_stream=kernel_stream, comment="ThreadBlock Scope",) as scopeManager: + + + brange = subsets.Range(scope_map.range[::-1]) + + dsym = [symbolic.symbol(f'__DAPT{i}', nonnegative=True, integer=True) for i in range(len(brange))] + bdims = self._current_kernel_spec.block_dims + dsym_end = [d + (bs * rng[2]) - 1 for d, bs, rng in zip(dsym, bdims, brange)] + tidx = brange.coord_at(dsym) + + # First three dimensions are evaluated directly + for i in range(min(len(brange), 3)): + + 
varname = scope_map.params[-i - 1] + block_expr = 'threadIdx.%s' % _get_cuda_dim(i) + + expr = symbolic_to_cpp(tidx[i]).replace(f'__DAPT{i}', block_expr) + kernel_stream.write(f'int {varname} = {expr};', cfg, state_id, node) + self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int') + + + minels = brange.min_element() + maxels = brange.max_element() + for i, (v, minel, maxel) in enumerate(zip(scope_map.params[::-1], minels, maxels)): + condition = '' + + # Optimize conditions if they are always true + ############################################# + + # Block range start + if i >= 3 or (dsym[i] >= minel) != True: + condition += '%s >= %s' % (v, symbolic_to_cpp(minel)) + + # Special case: block size is exactly the range of the map (0:b) + if i >= 3: + skipcond = False + else: + skipcond = dsym_end[i].subs({dsym[i]: minel}) == maxel + + # Block range end + if i >= 3 or (not skipcond and (dsym_end[i] < maxel) != True): + if len(condition) > 0: + condition += ' && ' + condition += '%s < %s' % (v, symbolic_to_cpp(maxel + 1)) + + # Emit condition in code + if len(condition) > 0: + scopeManager.open(condition=condition) + + + self._dispatcher.dispatch_subgraph(sdfg, cfg, dfg_scope, state_id, function_stream, + kernel_stream, skip_entry_node=True) + + + + def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, kernel_stream: CodeIOStream) -> None: + + + with KernelScopeManager(cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, + function_stream=function_stream, callsite_stream=kernel_stream, comment="WarpLevel Scope",) as scopeManager: + + node = dfg_scope.source_nodes()[0] + scope_map = node.map + map_range = subsets.Range(scope_map.range) + current_kernel_spec = self._current_kernel_spec + grid_dims = current_kernel_spec.grid_dims + block_dims = current_kernel_spec.block_dims + + # TODO: Does it? 
+            if len(map_range) > 1:
+                raise ValueError("The range for GPU_Warp maps must be one-dimensional.")
+
+            # Compute the linear warp index from the flattened thread index
+            warpId = ("int warpId = (threadIdx.x + blockDim.x * threadIdx.y "
+                      "+ blockDim.x * blockDim.y * threadIdx.z) / 32;")
+
+            map_start = symbolic_to_cpp(map_range.min_element()[0])
+            map_end = symbolic_to_cpp(map_range.max_element()[0])
+
+            kernel_stream.write("\n", cfg, state_id, node)
+            kernel_stream.write(warpId, cfg, state_id, node)
+
+            # The map range is inclusive on both ends, so the guard must be inclusive as well
+            condition = f"{map_start} <= warpId && warpId <= {map_end}"
+            scopeManager.open(condition=condition)
+
+            kernel_stream.write("\n\n\n", cfg, state_id, node)
+            kernel_stream.write("// ----------------------------------", cfg, state_id, node)
+            kernel_stream.write("// Warp-level operations here", cfg, state_id, node)
+            kernel_stream.write("// ----------------------------------", cfg, state_id, node)
+            kernel_stream.write("\n\n\n", cfg, state_id, node)
+
+
+    def _generate_gpu_bridge(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView,
+                             state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
+
+        scope_entry = dfg_scope.source_nodes()[0]
+
+        kernel_spec: KernelSpec = self._current_kernel_spec
+        kernel_name = kernel_spec.kernel_name
+        kernel_bridge_args = kernel_spec.bridge_args
+        kernel_bridge_args_typed = kernel_spec.bridge_args_typed
+
+        # Declaration of the function which launches the kernel (C++ code)
+        function_stream.write('DACE_EXPORTED void __dace_runkernel_%s(%s);\n' %
+                              (kernel_name, ', '.join(kernel_bridge_args_typed)), cfg, state_id, scope_entry)
+
+        # Call of the function which launches the kernel (C++ code)
+        callsite_stream.write('__dace_runkernel_%s(%s);\n' %
+                              (kernel_name, ', '.join(kernel_bridge_args)), cfg, state_id, scope_entry)
+
+    def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView,
+                                state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None:
+
+        scope_entry = dfg_scope.source_nodes()[0]
+
+        kernel_spec: KernelSpec = self._current_kernel_spec
+        kernel_name = kernel_spec.kernel_name
+        kernel_args_as_input = kernel_spec.args_as_input
+        kernel_launch_args_typed = kernel_spec.bridge_args_typed
+
+        # Get kernel dimensions and convert them to C++ strings
+        grid_dims = kernel_spec.grid_dims
+        block_dims = kernel_spec.block_dims
+        gdims = ', '.join(symbolic_to_cpp(grid_dims))
+        bdims = ', '.join(symbolic_to_cpp(block_dims))
+
+        # Declaration of the function which launches the kernel (CUDA code)
+        self._localcode.write(
+            """
+            DACE_EXPORTED void __dace_runkernel_{fname}({fargs});
+            void __dace_runkernel_{fname}({fargs})
+            {{
+            """.format(fname=kernel_name, fargs=', '.join(kernel_launch_args_typed)),
+            cfg, state_id, scope_entry
+        )
+
+        # Call of the kernel function (CUDA code)
+        self._localcode.write(
+            '''
+            void *{kname}_args[] = {{ {kargs} }};
+            gpuError_t __err = {backend}LaunchKernel( (void*){kname}, dim3({gdims}), dim3({bdims}), {kname}_args, {dynsmem}, {stream}
+            );
+            '''.format(
+                kname=kernel_name,
+                kargs=', '.join(['(void *)&' + arg for arg in kernel_args_as_input]),
+                gdims=gdims,
+                bdims=bdims,
+                dynsmem='0',
+                stream='__state->gpu_context->streams[0]',
+                backend=self.backend
+            ),
+            cfg, state_id, scope_entry
+        )
+
+        self._localcode.write(f'DACE_KERNEL_LAUNCH_CHECK(__err, "{kernel_name}", {gdims}, {bdims});')
+
+        self._localcode.write('}')
+
+
+
+################################# NESTED SDFG handling ############################################
+# testing phase
+
+
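+# Illustrative sketch (comments only, not executed): how state generation below is routed.
+# The program name and sizes are hypothetical. For a program such as
+#
+#   @dace.program
+#   def vecincrement(A: dace.float64[256] @ dace.dtypes.StorageType.GPU_Global):
+#       for i in dace.map[0:256] @ dace.dtypes.ScheduleType.GPU_Device:
+#           A[i] = A[i] + 1
+#
+# the top-level state is emitted through self._frame.generate_state, while any state that
+# ends up inside the kernel (e.g., the body of a nested SDFG placed in the GPU_Device map)
+# is emitted through generate_devicelevel_state, selected via the
+# ExperimentalCUDACodeGen._in_device_code flag that generate_scope toggles above.
+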
def generate_state(self, + sdfg: SDFG, + cfg: ControlFlowRegion, + state: SDFGState, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, + generate_state_footer: bool = False) -> None: + + if ExperimentalCUDACodeGen._in_device_code: + self.generate_devicelevel_state(sdfg, cfg, state, function_stream, callsite_stream) + else: + self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) + + + def generate_devicelevel_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + # Special case: if this is a GPU grid state and something is reading + # from a possible result of a collaborative write, sync first + if self._toplevel_schedule == dtypes.ScheduleType.GPU_Device: + for node in state.nodes(): + if (isinstance(node, nodes.AccessNode) and node.desc(sdfg).storage == dtypes.StorageType.GPU_Shared + and state.in_degree(node) == 0 and state.out_degree(node) > 0): + break + return + + self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream) + + + def _emit_sync(self, codestream: CodeIOStream): + if Config.get_bool('compiler', 'cuda', 'syncdebug'): + codestream.write('''DACE_GPU_CHECK({backend}GetLastError()); + DACE_GPU_CHECK({backend}DeviceSynchronize());'''.format(backend=self.backend)) + + def _begin_streams(self, sdfg, state): + result = set() + for node in state.source_nodes(): + if hasattr(node, '_cuda_stream'): + if (isinstance(node, nodes.AccessNode) and isinstance(sdfg.arrays[node.data], dt.View)): + continue + result.add(node._cuda_stream) + else: + # Collect other streams in state start + for e in state.out_edges(node): + if hasattr(e.dst, '_cuda_stream'): + if (isinstance(node, nodes.AccessNode) and isinstance(sdfg.arrays[node.data], dt.View)): + continue + result.add(e.dst._cuda_stream) + return result + + def state_dispatch_predicate(self, sdfg, state): + if self._toplevel_schedule in dtypes.GPU_SCHEDULES: + return True + for node in state.sink_nodes(): + if hasattr(node, '_cuda_stream'): + return True + else: + for e in state.in_edges(node): + if hasattr(e.src, '_cuda_stream'): + return True + for s, _ in self.pool_release.values(): + if s is state: + return True + return False + + def node_dispatch_predicate(self, sdfg, state, node): + if hasattr(node, 'schedule'): # NOTE: Works on nodes and scopes + if node.schedule in dtypes.GPU_SCHEDULES: + return True + if ExperimentalCUDACodeGen._in_device_code: + return True + return False + + def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + + + # get the generating function's name + gen = getattr(self, '_generate_' + type(node).__name__, False) + + # if it is not implemented, use generate node of cpu impl + if gen is not False: + gen(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + elif type(node).__name__ == 'MapExit': + return + else: + self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + + def generate_nsdfg_header(self, sdfg, cfg, state, state_id, node, memlet_references, sdfg_label): + return 'DACE_DFI ' + self._cpu_codegen.generate_nsdfg_header( + sdfg, cfg, state, state_id, node, memlet_references, sdfg_label, state_struct=False) + + def generate_nsdfg_call(self, sdfg, cfg, state, node, memlet_references, sdfg_label): + return 
self._cpu_codegen.generate_nsdfg_call(sdfg, + cfg, + state, + node, + memlet_references, + sdfg_label, + state_struct=False) + + def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node): + result = self._cpu_codegen.generate_nsdfg_arguments(sdfg, cfg, dfg, state, node) + return result + + def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.NestedSDFG, function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: + old_schedule = self._toplevel_schedule + self._toplevel_schedule = node.schedule + old_codegen = self._cpu_codegen.calling_codegen + self._cpu_codegen.calling_codegen = self + + self._cpu_codegen._generate_NestedSDFG(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + + self._cpu_codegen.calling_codegen = old_codegen + self._toplevel_schedule = old_schedule + + + + +####################################################################### + # Rather Minor "actual" changes, but much nicer to extend and maintain + + + # For Yakup: I like it when we first "guard" and then implement the logic sorrow free + def declare_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream) -> None: + + + ptrname = ptr(node.data, nodedesc, sdfg, self._frame) + fsymbols = self._frame.symbols_and_constants(sdfg) + + # ----------------- Guard checks -------------------- + + # NOTE: `dfg` is None iff `nodedesc` is non-free symbol dependent (see DaCeCodeGenerator.determine_allocation_lifetime). + # We avoid `is_nonfree_sym_dependent` when dfg is None and `nodedesc` is a View. + if dfg and not sdutil.is_nonfree_sym_dependent(node, nodedesc, dfg, fsymbols): + raise NotImplementedError( + "declare_array is only for variables that require separate declaration and allocation.") + + if nodedesc.storage == dtypes.StorageType.GPU_Shared: + raise NotImplementedError("Dynamic shared memory unsupported") + + if nodedesc.storage == dtypes.StorageType.Register: + raise ValueError("Dynamic allocation of registers is not allowed") + + if nodedesc.storage not in {dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned}: + raise NotImplementedError( + f"CUDA: Unimplemented storage type {nodedesc.storage.name}.") + + if self._dispatcher.declared_arrays.has(ptrname): + return # Already declared + + + # ----------------- Declaration -------------------- + dataname = node.data + array_ctype = f'{nodedesc.dtype.ctype} *' + declaration_stream.write(f'{array_ctype} {dataname};\n', cfg, state_id, node) + self._dispatcher.declared_arrays.add(dataname, DefinedType.Pointer, array_ctype) + + + def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: + """ + Maybe document here that this also does declaration and that declare_array only declares specific + kind of data + """ + + dataname = ptr(node.data, nodedesc, sdfg, self._frame) + + # ------------------- Guard checks ------------------- + + # Skip if variable is already defined + if self._dispatcher.defined_vars.has(dataname): + return + + if isinstance(nodedesc, (dace.data.View, dace.data.Reference)): + return NotImplementedError("Pointers and References not implemented in ExperimentalCUDACodeGen") + + if isinstance(nodedesc, dace.data.Stream): 
+ raise NotImplementedError("allocate_stream not implemented in ExperimentalCUDACodeGen") + + # No clue what is happening here + if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): + nodedesc = update_persistent_desc(nodedesc, sdfg) + + # ------------------- Allocation/Declaration ------------------- + + # Call the appropriate handler based on storage type + gen = getattr(self, f'_prepare_{nodedesc.storage.name}_array', None) + if gen: + gen(sdfg, cfg, dfg, state_id, node, nodedesc, function_stream, declaration_stream, allocation_stream) + else: + raise NotImplementedError(f'CUDA: Unimplemented storage type {nodedesc.storage}') + + + def _prepare_GPU_Global_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): + dataname = ptr(node.data, nodedesc, sdfg, self._frame) + + # ------------------- Declaration ------------------- + array_ctype = f'{nodedesc.dtype.ctype} *' + declared = self._dispatcher.declared_arrays.has(dataname) + + if not declared: + declaration_stream.write(f'{array_ctype} {dataname};\n', cfg, state_id, node) + + self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) + + # ------------------- Allocation ------------------- + arrsize = nodedesc.total_size + arrsize_malloc = f'{symbolic_to_cpp(arrsize)} * sizeof({nodedesc.dtype.ctype})' + + if nodedesc.pool: + cudastream = getattr(node, '_cuda_stream', 'nullptr') + if cudastream != 'nullptr': + cudastream = f'__state->gpu_context->streams[{cudastream}]' + allocation_stream.write( + f'DACE_GPU_CHECK({self.backend}MallocAsync((void**)&{dataname}, {arrsize_malloc}, {cudastream}));\n', + cfg, state_id, node + ) + self._emit_sync(allocation_stream) + else: + # Strides are left to the user's discretion + allocation_stream.write( + f'DACE_GPU_CHECK({self.backend}Malloc((void**)&{dataname}, {arrsize_malloc}));\n', + cfg, state_id, node + ) + + # ------------------- Initialization ------------------- + if node.setzero: + allocation_stream.write( + f'DACE_GPU_CHECK({self.backend}Memset({dataname}, 0, {arrsize_malloc}));\n', + cfg, state_id, node + ) + + if isinstance(nodedesc, dt.Array) and nodedesc.start_offset != 0: + allocation_stream.write( + f'{dataname} += {symbolic_to_cpp(nodedesc.start_offset)};\n', + cfg, state_id, node + ) + + + def _prepare_CPU_Pinned_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): + + dataname = ptr(node.data, nodedesc, sdfg, self._frame) + + # ------------------- Declaration ------------------- + array_ctype = f'{nodedesc.dtype.ctype} *' + declared = self._dispatcher.declared_arrays.has(dataname) + + if not declared: + declaration_stream.write(f'{array_ctype} {dataname};\n', cfg, state_id, node) + + self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) + + + # ------------------- Allocation ------------------- + arrsize = nodedesc.total_size + arrsize_malloc = f'{symbolic_to_cpp(arrsize)} * sizeof({nodedesc.dtype.ctype})' + + # Strides are left to the user's discretion + allocation_stream.write( + f'DACE_GPU_CHECK({self.backend}MallocHost(&{dataname}, {arrsize_malloc}));\n', + cfg, state_id, node + ) + if node.setzero: + allocation_stream.write( + 
+                f'memset({dataname}, 0, {arrsize_malloc});\n',
+                cfg, state_id, node
+            )
+
+        if nodedesc.start_offset != 0:
+            allocation_stream.write(
+                f'{dataname} += {symbolic_to_cpp(nodedesc.start_offset)};\n',
+                cfg, state_id, node
+            )
+
+    def _prepare_GPU_Shared_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                                  node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream,
+                                  declaration_stream: CodeIOStream, allocation_stream: CodeIOStream):
+
+        dataname = ptr(node.data, nodedesc, sdfg, self._frame)
+        arrsize = nodedesc.total_size
+
+        # ------------------- Guard checks -------------------
+        if symbolic.issymbolic(arrsize, sdfg.constants):
+            raise NotImplementedError('Dynamic shared memory unsupported')
+        if nodedesc.start_offset != 0:
+            raise NotImplementedError('Start offset unsupported for shared memory')
+
+        # ------------------- Declaration -------------------
+        array_ctype = f'{nodedesc.dtype.ctype} *'
+
+        declaration_stream.write(
+            f'__shared__ {nodedesc.dtype.ctype} {dataname}[{symbolic_to_cpp(arrsize)}];\n',
+            cfg, state_id, node
+        )
+
+        self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype)
+
+        # ------------------- Initialization -------------------
+        if node.setzero:
+            allocation_stream.write(
+                f'dace::ResetShared<{nodedesc.dtype.ctype}, {", ".join(symbolic_to_cpp(self._block_dims))}, {symbolic_to_cpp(arrsize)}, '
+                f'1, false>::Reset({dataname});\n',
+                cfg, state_id, node
+            )
+
+    def _prepare_Register_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int,
+                                node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream,
+                                declaration_stream: CodeIOStream, allocation_stream: CodeIOStream):
+
+        dataname = ptr(node.data, nodedesc, sdfg, self._frame)
+        arrsize = nodedesc.total_size
+
+        # ------------------- Guard checks -------------------
+        if symbolic.issymbolic(arrsize, sdfg.constants):
+            raise ValueError('Dynamic allocation of registers not allowed')
+        if nodedesc.start_offset != 0:
+            raise NotImplementedError('Start offset unsupported for registers')
+
+        # ------------------- Declaration & Initialization -------------------
+        array_ctype = f'{nodedesc.dtype.ctype} *'
+        init_clause = ' = {0}' if node.setzero else ''
+
+        declaration_stream.write(
+            f'{nodedesc.dtype.ctype} {dataname}[{symbolic_to_cpp(arrsize)}]{init_clause};\n',
+            cfg, state_id, node
+        )
+
+        self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype)
+
+    # I could also do deallocate based on type.. 
good for modularity, but may be an overkill here + def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: + + + dataname = ptr(node.data, nodedesc, sdfg, self._frame) + + # Adjust offset if needed + if isinstance(nodedesc, dt.Array) and nodedesc.start_offset != 0: + dataname = f'({dataname} - {symbolic_to_cpp(nodedesc.start_offset)})' + + # Remove declaration info + if self._dispatcher.declared_arrays.has(dataname): + is_global = nodedesc.lifetime in ( + dtypes.AllocationLifetime.Global, + dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External, + ) + self._dispatcher.declared_arrays.remove(dataname, is_global=is_global) + + + # Special case: Stream + if isinstance(nodedesc, dace.data.Stream): + raise NotImplementedError('stream code is not implemented in ExperimentalCUDACodeGen (yet)') + + # Special case: View - no deallocation + if isinstance(nodedesc, dace.data.View): + return + + + # Main deallocation logic by storage type + if nodedesc.storage == dtypes.StorageType.GPU_Global: + if not nodedesc.pool: # If pooled, will be freed somewhere else + callsite_stream.write( + f'DACE_GPU_CHECK({self.backend}Free({dataname}));\n', + cfg, state_id, node + ) + + elif nodedesc.storage == dtypes.StorageType.CPU_Pinned: + callsite_stream.write( + 'DACE_GPU_CHECK(%sFreeHost(%s));\n' % (self.backend, dataname), cfg, state_id, node) + + elif nodedesc.storage in {dtypes.StorageType.GPU_Shared, dtypes.StorageType.Register}: + # No deallocation needed + return + + else: + raise NotImplementedError(f'Deallocation not implemented for storage type: {nodedesc.storage.name}') + + + + + ####################################################################### + # Copy-pasted, might be changed in future + + + def get_generated_codeobjects(self): + + # My comment: first part creates the header and stores it in a object property + fileheader = CodeIOStream() + + self._frame.generate_fileheader(self._global_sdfg, fileheader, 'cuda') + + # My comment: takes codeblocks and transforms it nicely to code + initcode = CodeIOStream() + for sd in self._global_sdfg.all_sdfgs_recursive(): + if None in sd.init_code: + initcode.write(codeblock_to_cpp(sd.init_code[None]), sd) + if 'cuda' in sd.init_code: + initcode.write(codeblock_to_cpp(sd.init_code['cuda']), sd) + initcode.write(self._initcode.getvalue()) + + # My comment: takes codeblocks and transforms it nicely to code- probably same as before now for exit code + exitcode = CodeIOStream() + for sd in self._global_sdfg.all_sdfgs_recursive(): + if None in sd.exit_code: + exitcode.write(codeblock_to_cpp(sd.exit_code[None]), sd) + if 'cuda' in sd.exit_code: + exitcode.write(codeblock_to_cpp(sd.exit_code['cuda']), sd) + exitcode.write(self._exitcode.getvalue()) + + + # My comment: Uses GPU backend (NVIDIA or AMD) to get correct header files + if self.backend == 'cuda': + backend_header = 'cuda_runtime.h' + elif self.backend == 'hip': + backend_header = 'hip/hip_runtime.h' + else: + raise NameError('GPU backend "%s" not recognized' % self.backend) + + # My comment: Seems to get all function params, needed for later + params_comma = self._global_sdfg.init_signature(free_symbols=self._frame.free_symbols(self._global_sdfg)) + if params_comma: + params_comma = ', ' + params_comma + + #My comment looks life Memory information + pool_header = '' + if self.has_pool: + poolcfg = Config.get('compiler', 
+                                 'cuda', 'mempool_release_threshold')
+            pool_header = f'''
+    cudaMemPool_t mempool;
+    cudaDeviceGetDefaultMemPool(&mempool, 0);
+    uint64_t threshold = {poolcfg if poolcfg != -1 else 'UINT64_MAX'};
+    cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);
+'''
+
+        # My comment: Looks like a "base" template, where more details will probably be added later
+        self._codeobject.code = """
+#include <{backend_header}>
+#include <dace/dace.h>
+
+{file_header}
+
+DACE_EXPORTED int __dace_init_experimental_cuda({sdfg_state_name} *__state{params});
+DACE_EXPORTED int __dace_exit_experimental_cuda({sdfg_state_name} *__state);
+
+{other_globalcode}
+
+int __dace_init_experimental_cuda({sdfg_state_name} *__state{params}) {{
+    int count;
+
+    // Check that we are able to run {backend} code
+    if ({backend}GetDeviceCount(&count) != {backend}Success)
+    {{
+        printf("ERROR: GPU drivers are not configured or {backend}-capable device "
+               "not found\\n");
+        return 1;
+    }}
+    if (count == 0)
+    {{
+        printf("ERROR: No {backend}-capable devices found\\n");
+        return 2;
+    }}
+
+    // Initialize {backend} before we run the application
+    float *dev_X;
+    DACE_GPU_CHECK({backend}Malloc((void **) &dev_X, 1));
+    DACE_GPU_CHECK({backend}Free(dev_X));
+
+    {pool_header}
+
+    __state->gpu_context = new dace::cuda::Context({nstreams}, {nevents});
+
+    // Create {backend} streams and events
+    for(int i = 0; i < {nstreams}; ++i) {{
+        DACE_GPU_CHECK({backend}StreamCreateWithFlags(&__state->gpu_context->internal_streams[i], {backend}StreamNonBlocking));
+        __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams
+    }}
+    for(int i = 0; i < {nevents}; ++i) {{
+        DACE_GPU_CHECK({backend}EventCreateWithFlags(&__state->gpu_context->events[i], {backend}EventDisableTiming));
+    }}
+
+    {initcode}
+
+    return 0;
+}}
+
+int __dace_exit_experimental_cuda({sdfg_state_name} *__state) {{
+    {exitcode}
+
+    // Synchronize and check for CUDA errors
+    int __err = static_cast<int>(__state->gpu_context->lasterror);
+    if (__err == 0)
+        __err = static_cast<int>({backend}DeviceSynchronize());
+
+    // Destroy {backend} streams and events
+    for(int i = 0; i < {nstreams}; ++i) {{
+        DACE_GPU_CHECK({backend}StreamDestroy(__state->gpu_context->internal_streams[i]));
+    }}
+    for(int i = 0; i < {nevents}; ++i) {{
+        DACE_GPU_CHECK({backend}EventDestroy(__state->gpu_context->events[i]));
+    }}
+
+    delete __state->gpu_context;
+    return __err;
+}}
+
+DACE_EXPORTED bool __dace_gpu_set_stream({sdfg_state_name} *__state, int streamid, gpuStream_t stream)
+{{
+    if (streamid < 0 || streamid >= {nstreams})
+        return false;
+
+    __state->gpu_context->streams[streamid] = stream;
+
+    return true;
+}}
+
+DACE_EXPORTED void __dace_gpu_set_all_streams({sdfg_state_name} *__state, gpuStream_t stream)
+{{
+    for (int i = 0; i < {nstreams}; ++i)
+        __state->gpu_context->streams[i] = stream;
+}}
+
+{localcode}
+""".format(params=params_comma,
+           sdfg_state_name=mangle_dace_state_struct_name(self._global_sdfg),
+           initcode=initcode.getvalue(),
+           exitcode=exitcode.getvalue(),
+           other_globalcode=self._globalcode.getvalue(),
+           localcode=self._localcode.getvalue(),
+           file_header=fileheader.getvalue(),
+           nstreams=max(1, self._cuda_streams),
+           nevents=max(1, self._cuda_events),
+           backend=self.backend,
+           backend_header=backend_header,
+           pool_header=pool_header,
+           sdfg=self._global_sdfg)
+
+        return [self._codeobject]
+
+    @staticmethod
+    def cmake_options():
+        options = []
+
+        # Override CUDA toolkit
+        if Config.get('compiler', 'cuda', 'path'):
options.append("-DCUDA_TOOLKIT_ROOT_DIR=\"{}\"".format( + Config.get('compiler', 'cuda', 'path').replace('\\', '/'))) + + # Get CUDA architectures from configuration + backend = common.get_gpu_backend() + if backend == 'cuda': + cuda_arch = Config.get('compiler', 'cuda', 'cuda_arch').split(',') + cuda_arch = [ca for ca in cuda_arch if ca is not None and len(ca) > 0] + + cuda_arch = ';'.join(cuda_arch) + options.append(f'-DDACE_CUDA_ARCHITECTURES_DEFAULT="{cuda_arch}"') + + flags = Config.get("compiler", "cuda", "args") + options.append("-DCMAKE_CUDA_FLAGS=\"{}\"".format(flags)) + + if backend == 'hip': + hip_arch = Config.get('compiler', 'cuda', 'hip_arch').split(',') + hip_arch = [ha for ha in hip_arch if ha is not None and len(ha) > 0] + + flags = Config.get("compiler", "cuda", "hip_args") + flags += ' ' + ' '.join( + '--offload-arch={arch}'.format(arch=arch if arch.startswith("gfx") else "gfx" + arch) + for arch in hip_arch) + options.append("-DEXTRA_HIP_FLAGS=\"{}\"".format(flags)) + + if Config.get('compiler', 'cpu', 'executable'): + host_compiler = make_absolute(Config.get("compiler", "cpu", "executable")) + options.append("-DCUDA_HOST_COMPILER=\"{}\"".format(host_compiler)) + + return options + + def get_tb_maps_recursive(self, subgraph): + res = [] + for node in subgraph.nodes(): + if isinstance(node, nodes.NestedSDFG): + for state in node.sdfg.states(): + tbmaps = self.get_tb_maps_recursive(state) + for map, sym_map in tbmaps: + for k in sym_map.values(): + for kk, vv in node.symbol_mapping.items(): + sym_map[k] = sym_map[k].subs(dace.symbol(kk), vv) + res.append((map, sym_map)) + elif isinstance(node, nodes.MapEntry) and node.schedule in ( + dtypes.ScheduleType.GPU_Device, + dtypes.ScheduleType.GPU_ThreadBlock, + dtypes.ScheduleType.GPU_ThreadBlock_Dynamic, + ): + res.append((node.map, {dace.symbol(k): dace.symbol(k) for k in node.map.range.free_symbols})) + return res + + def get_kernel_dimensions(self, dfg_scope): + """ + Determines a GPU kernel's grid/block dimensions from map scopes. + + Ruleset for kernel dimensions: + + 1. If only one map (device-level) exists, of an integer set ``S``, + the block size is ``32x1x1`` and grid size is ``ceil(|S|/32)`` in + 1st dimension. + 2. If nested thread-block maps exist ``(T_1,...,T_n)``, grid + size is ``|S|`` and block size is ``max(|T_1|,...,|T_n|)`` with + block specialization. + 3. If block size can be overapproximated, it is (for + dynamically-sized blocks that are bounded by a + predefined size). + 4. If nested device maps exist, they generate extra grid dimensions (block size 1) + as the sum of all their sizes ``(|T_1| + ... + |T_n|)`` + + :note: Kernel dimensions are separate from the map + variables, and they should be treated as such. + :note: To make use of the grid/block 3D registers, we use multi- + dimensional kernels up to 3 dimensions, and flatten the + rest into the third dimension. 
+ """ + + kernelmap_entry: nodes.MapEntry = dfg_scope.source_nodes()[0] + grid_size = kernelmap_entry.map.range.size(True)[::-1] + block_size = None + is_persistent = (kernelmap_entry.map.schedule == dtypes.ScheduleType.GPU_Persistent) + int_ceil = symbolic.int_ceil + + # Obtain thread-block maps from nested SDFGs + subgraph = dfg_scope.scope_subgraph(kernelmap_entry) + sub_maps = self.get_tb_maps_recursive(subgraph) + + # Introduce extra grid dimensions based on device sub-maps + extra_dim_offsets: Dict[nodes.Map, symbolic.SymbolicType] = {} + extra_grid_dims: List[symbolic.SymbolicType] = None + for submap, sym_map in sub_maps: + submap: nodes.Map + if submap.schedule != dtypes.ScheduleType.GPU_Device or submap is kernelmap_entry.map: + continue + if extra_grid_dims is not None and len(submap.params) != len(extra_grid_dims): + raise NotImplementedError( + 'Multiple GPU_Device sub-ranges with different dimensionality not yet implemented (found: ' + f'{len(submap.params)}, existing: {len(extra_grid_dims)}, map: {kernelmap_entry})') + + # Add and overapproximate sizes + gsize = [s.subs(list(sym_map.items())) for s in submap.range.size()[::-1]] + gsize = [symbolic.overapproximate(s) for s in gsize] + if extra_grid_dims is None: + extra_grid_dims = gsize + extra_dim_offsets[submap] = [0] * len(submap.params) + else: + extra_dim_offsets[submap] = extra_grid_dims + extra_grid_dims = [(sz + gsz) for sz, gsz in zip(extra_grid_dims, gsize)] + if extra_grid_dims is None: + extra_grid_dims = [] + grid_size.extend(extra_grid_dims) + + # Linearize (flatten) rest of dimensions to third + if len(grid_size) > 3: + grid_size[2] = functools.reduce(sympy.Mul, grid_size[2:], 1) + del grid_size[3:] + + # Extend to 3 dimensions if necessary + grid_size = grid_size + [1] * (3 - len(grid_size)) + + # Thread-block map cases + has_dtbmap = len( + [tbmap for tbmap, _ in sub_maps if tbmap.schedule == dtypes.ScheduleType.GPU_ThreadBlock_Dynamic]) > 0 + + # keep only thread-block maps + tb_maps_sym_map = [(tbmap, sym_map) for tbmap, sym_map in sub_maps + if tbmap.schedule == dtypes.ScheduleType.GPU_ThreadBlock] + + # Map thread-block size override + block_size = kernelmap_entry.map.gpu_block_size + if block_size is not None: + # Complement to three dimensions + block_size += [1] * (3 - len(block_size)) + # Linearize (flatten) rest of dimensions to third + if len(block_size) > 3: + block_size[2] = functools.reduce(sympy.Mul, block_size[2:], 1) + del block_size[3:] + + # No thread-block maps + if len(tb_maps_sym_map) == 0: + if block_size is None: + if has_dtbmap: + if (Config.get('compiler', 'cuda', 'dynamic_map_block_size') == 'max'): + raise NotImplementedError('max dynamic block size unimplemented') + else: + block_size = [ + int(b) for b in Config.get('compiler', 'cuda', 'dynamic_map_block_size').split(',') + ] + else: + def_bsize = Config.get('compiler', 'cuda', 'default_block_size') + if (not self._ignore_warnings): # NOTE: remove the ignoring of warnings later + warnings.warn( + f'No `gpu_block_size` property specified on map "{kernelmap_entry.map.label}". ' + f'Falling back to the configuration entry `compiler.cuda.default_block_size`: {def_bsize}. ' + 'You can either specify the block size to use with the gpu_block_size property, ' + 'or by adding nested `GPU_ThreadBlock` maps, which map work to individual threads. 
' + 'For more information, see https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') + + if (Config.get('compiler', 'cuda', 'default_block_size') == 'max'): + raise NotImplementedError('max dynamic block size unimplemented') + else: + block_size = [int(b) for b in Config.get('compiler', 'cuda', 'default_block_size').split(',')] + + block_ndim = max(1, sum(1 if b != 1 else 0 for b in block_size)) + grid_ndim = max(1, sum(1 if g != 1 else 0 for g in grid_size)) + if block_ndim > grid_ndim: + linearized_remainder = prod(block_size[grid_ndim:]) + block_size = block_size[:grid_ndim] + [1] * (3 - grid_ndim) + block_size[grid_ndim - 1] *= linearized_remainder + warnings.warn(f'Default block size has more dimensions ({block_ndim}) than kernel dimensions ' + f'({grid_ndim}) in map "{kernelmap_entry.map.label}". Linearizing block ' + f'size to {block_size}. Consider setting the ``gpu_block_size`` property.') + + assert (len(block_size) >= 1 and len(block_size) <= 3) + + # Grid size = ceil(|S|/32) for first dimension, rest = |S| + grid_size = [int_ceil(gs, bs) for gs, bs in zip(grid_size, block_size)] + + else: + # Find all thread-block maps to determine overall block size + detected_block_sizes = [block_size] if block_size is not None else [] + for tbmap, sym_map in tb_maps_sym_map: + tbsize = [s.subs(list(sym_map.items())) for s in tbmap.range.size()[::-1]] + + # Over-approximate block size (e.g. min(N,(i+1)*32)-i*32 --> 32) + # The partial trailing thread-block is emitted as an if-condition + # that returns on some of the participating threads + tbsize = [symbolic.overapproximate(s) for s in tbsize] + + # Linearize (flatten) rest of dimensions to third + if len(tbsize) > 3: + tbsize[2] = functools.reduce(sympy.Mul, tbsize[2:], 1) + del tbsize[3:] + + # Extend to 3 dimensions if necessary + tbsize = tbsize + [1] * (3 - len(tbsize)) + + if len(detected_block_sizes) == 0: + block_size = tbsize + else: + block_size = [sympy.Max(sz, bbsz) for sz, bbsz in zip(block_size, tbsize)] + + if block_size != tbsize or len(detected_block_sizes) == 0: + detected_block_sizes.append(tbsize) + + # TODO: If grid/block sizes contain elements only defined within the + # kernel, raise an invalid SDFG exception and recommend + # overapproximation. + + if len(detected_block_sizes) > 1: + + # Error when both gpu_block_size and thread-block maps were defined and conflict + if kernelmap_entry.map.gpu_block_size is not None: + raise ValueError('Both the `gpu_block_size` property and internal thread-block ' + 'maps were defined with conflicting sizes for kernel ' + f'"{kernelmap_entry.map.label}" (sizes detected: {detected_block_sizes}). ' + 'Use `gpu_block_size` only if you do not need access to individual ' + 'thread-block threads, or explicit block-level synchronization (e.g., ' + '`__syncthreads`). Otherwise, use internal maps with the `GPU_Threadblock` or ' + '`GPU_ThreadBlock_Dynamic` schedules. For more information, see ' + 'https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') + + warnings.warn('Multiple thread-block maps with different sizes detected for ' + f'kernel "{kernelmap_entry.map.label}": {detected_block_sizes}. 
' + f'Over-approximating to block size {block_size}.\n' + 'If this was not the intent, try tiling one of the thread-block maps to match.') + + # both thread-block map and dynamic thread-block map exist at the same + # time + if has_dtbmap: + raise NotImplementedError("GPU_ThreadBlock and GPU_ThreadBlock_Dynamic are currently " + "not supported in the same scope") + + if is_persistent: + grid_size = ['gridDim.x', '1', '1'] + + # Check block size against configured maximum values, if those can be determined + total_bsize = prod(block_size) + total_limit = Config.get('compiler', 'cuda', 'block_size_limit') + lastdim_limit = Config.get('compiler', 'cuda', 'block_size_lastdim_limit') + if (total_bsize > total_limit) == True: + raise ValueError(f'Block size for kernel "{kernelmap_entry.map.label}" ({block_size}) ' + f'is larger than the possible number of threads per block ({total_limit}). ' + 'The kernel will potentially not run, please reduce the thread-block size. ' + 'To increase this limit, modify the `compiler.cuda.block_size_limit` ' + 'configuration entry.') + if (block_size[-1] > lastdim_limit) == True: + raise ValueError(f'Last block size dimension for kernel "{kernelmap_entry.map.label}" ({block_size}) ' + 'is larger than the possible number of threads in the last block dimension ' + f'({lastdim_limit}). The kernel will potentially not run, please reduce the ' + 'thread-block size. To increase this limit, modify the ' + '`compiler.cuda.block_size_lastdim_limit` configuration entry.') + + return grid_size, block_size, len(tb_maps_sym_map) > 0, has_dtbmap, extra_dim_offsets + + def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_dfg: StateSubgraphView, state_id: int, + src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge[Memlet], + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + self._cpu_codegen.define_out_memlet(sdfg, cfg, state_dfg, state_id, src_node, dst_node, edge, function_stream, + callsite_stream) + + def process_out_memlets(self, *args, **kwargs): + # Call CPU implementation with this code generator as callback + self._cpu_codegen.process_out_memlets(*args, codegen=self, **kwargs) + + def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.StorageType, dst_node: nodes.Node, + dst_storage: dtypes.StorageType, dst_schedule: dtypes.ScheduleType, + edge: Tuple[nodes.Node, str, nodes.Node, str, Memlet], sdfg: SDFG, cfg: ControlFlowRegion, + dfg: StateSubgraphView, callsite_stream: CodeIOStream) -> None: + u, uconn, v, vconn, memlet = edge + state_dfg = cfg.state(state_id) + + cpu_storage_types = [ + dtypes.StorageType.CPU_Heap, dtypes.StorageType.CPU_ThreadLocal, dtypes.StorageType.CPU_Pinned + ] + gpu_storage_types = [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared] + + copy_shape = memlet.subset.bounding_box_size() + copy_shape = [symbolic.overapproximate(s) for s in copy_shape] + # Determine directionality + if (isinstance(src_node, nodes.AccessNode) and memlet.data == src_node.data): + outgoing_memlet = True + elif (isinstance(dst_node, nodes.AccessNode) and memlet.data == dst_node.data): + outgoing_memlet = False + else: + raise LookupError('Memlet does not point to any of the nodes') + + if (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode) + and not ExperimentalCUDACodeGen._in_device_code + and (src_storage in [dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned] + or dst_storage in [dtypes.StorageType.GPU_Global, 
dtypes.StorageType.CPU_Pinned]) + and not (src_storage in cpu_storage_types and dst_storage in cpu_storage_types)): + src_location = 'Device' if src_storage == dtypes.StorageType.GPU_Global else 'Host' + dst_location = 'Device' if dst_storage == dtypes.StorageType.GPU_Global else 'Host' + + # Corner case: A stream is writing to an array + if (isinstance(sdfg.arrays[src_node.data], dt.Stream) and isinstance(sdfg.arrays[dst_node.data], + (dt.Scalar, dt.Array))): + return # Do nothing (handled by ArrayStreamView) + + syncwith = {} # Dictionary of {stream: event} + is_sync = False + max_streams = int(Config.get('compiler', 'cuda', 'max_concurrent_streams')) + + if hasattr(src_node, '_cuda_stream'): + cudastream = src_node._cuda_stream + if not hasattr(dst_node, '_cuda_stream'): + # Copy after which data is needed by the host + is_sync = True + elif dst_node._cuda_stream != src_node._cuda_stream: + syncwith[dst_node._cuda_stream] = getattr(edge, '_cuda_event', None) + else: + pass # Otherwise, no need to synchronize + elif hasattr(dst_node, '_cuda_stream'): + cudastream = dst_node._cuda_stream + else: + if max_streams >= 0: + print('WARNING: Undefined stream, reverting to default') + if dst_location == 'Host': + is_sync = True + cudastream = 'nullptr' + + # Handle case of impending kernel/tasklet on another stream + if max_streams >= 0: + for e in state_dfg.out_edges(dst_node): + if isinstance(e.dst, nodes.AccessNode): + continue + if not hasattr(e.dst, '_cuda_stream'): + is_sync = True + elif not hasattr(e, '_cuda_event'): + is_sync = True + elif e.dst._cuda_stream != cudastream: + syncwith[e.dst._cuda_stream] = e._cuda_event + + if cudastream != 'nullptr': + cudastream = '__state->gpu_context->streams[%d]' % cudastream + + if memlet.wcr is not None: + raise NotImplementedError('Accumulate %s to %s not implemented' % (src_location, dst_location)) + ############################# + + # Obtain copy information + copy_shape, src_strides, dst_strides, src_expr, dst_expr = (memlet_copy_to_absolute_strides( + self._dispatcher, sdfg, state_dfg, edge, src_node, dst_node, self._cpu_codegen._packed_types)) + dims = len(copy_shape) + + dtype = dst_node.desc(sdfg).dtype + + # Handle unsupported copy types + if dims == 2 and (src_strides[-1] != 1 or dst_strides[-1] != 1): + # NOTE: Special case of continuous copy + # Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J] + # with copy shape [I, J] and strides [J*K, K], [J, 1] + try: + is_src_cont = src_strides[0] / src_strides[1] == copy_shape[1] + is_dst_cont = dst_strides[0] / dst_strides[1] == copy_shape[1] + except (TypeError, ValueError): + is_src_cont = False + is_dst_cont = False + if is_src_cont and is_dst_cont: + dims = 1 + copy_shape = [copy_shape[0] * copy_shape[1]] + src_strides = [src_strides[1]] + dst_strides = [dst_strides[1]] + else: + raise NotImplementedError('2D copy only supported with one stride') + + # Currently we only support ND copies when they can be represented + # as a 1D copy or as a 2D strided copy + if dims > 2: + if src_strides[-1] != 1 or dst_strides[-1] != 1: + raise NotImplementedError( + 'GPU copies are not supported for N-dimensions if they cannot be represented by a strided copy\n' + f' Nodes: src {src_node} ({src_storage}), dst {dst_node}({dst_storage})\n' + f' Strides: src {src_strides}, dst {dst_strides}') + else: + # Write for-loop headers + for d in range(dims - 2): + callsite_stream.write(f"for (int __copyidx{d} = 0; " + f"__copyidx{d} < {copy_shape[d]};" + f"++__copyidx{d}) {{") + # Write Memcopy2DAsync + 
current_src_expr = src_expr + " + " + " + ".join( + ["(__copyidx{} * ({}))".format(d, sym2cpp(s)) for d, s in enumerate(src_strides[:-2])]) + current_dst_expr = dst_expr + " + " + "+ ".join( + ["(__copyidx{} * ({}))".format(d, sym2cpp(s)) for d, s in enumerate(dst_strides[:-2])]) + callsite_stream.write( + 'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' % + (self.backend, current_dst_expr, + symbolic_to_cpp(dst_strides[-2]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, current_src_expr, + sym2cpp(src_strides[-2]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype, + sym2cpp(copy_shape[-1]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, + sym2cpp(copy_shape[-2]), self.backend, src_location, dst_location, cudastream), cfg, state_id, + [src_node, dst_node]) + # Write for-loop footers + for d in range(dims - 2): + callsite_stream.write("}") + + if dims == 1 and not (src_strides[-1] != 1 or dst_strides[-1] != 1): + copysize = ' * '.join(symbolic_to_cpp(copy_shape)) + array_length = copysize + copysize += ' * sizeof(%s)' % dtype.ctype + + callsite_stream.write( + 'DACE_GPU_CHECK(%sMemcpyAsync(%s, %s, %s, %sMemcpy%sTo%s, %s));\n' % + (self.backend, dst_expr, src_expr, copysize, self.backend, src_location, dst_location, cudastream), + cfg, state_id, [src_node, dst_node]) + node_dtype = dst_node.desc(sdfg).dtype + if issubclass(node_dtype.type, ctypes.Structure): + callsite_stream.write('for (size_t __idx = 0; __idx < {arrlen}; ++__idx) ' + '{{'.format(arrlen=array_length)) + # TODO: Study further when tackling Structures on GPU. + for field_name, field_type in node_dtype._typeclass.fields.items(): + if isinstance(field_type, dtypes.pointer): + tclass = field_type.type + + length = node_dtype._typeclass._length[field_name] + size = 'sizeof({})*{}[__idx].{}'.format(dtypes._CTYPES[tclass], str(src_node), length) + callsite_stream.write('DACE_GPU_CHECK({backend}Malloc(&{dst}[__idx].{fname}, ' + '{sz}));'.format(dst=str(dst_node), + fname=field_name, + sz=size, + backend=self.backend)) + callsite_stream.write( + 'DACE_GPU_CHECK({backend}MemcpyAsync({dst}[__idx].{fname}, ' + '{src}[__idx].{fname}, {sz}, ' + '{backend}Memcpy{sloc}To{dloc}, {stream}));'.format(dst=str(dst_node), + src=str(src_node), + fname=field_name, + sz=size, + sloc=src_location, + dloc=dst_location, + stream=cudastream, + backend=self.backend), cfg, + state_id, [src_node, dst_node]) + callsite_stream.write('}') + elif dims == 1 and ((src_strides[-1] != 1 or dst_strides[-1] != 1)): + callsite_stream.write( + 'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' % + (self.backend, dst_expr, symbolic_to_cpp(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, + src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype, + 'sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp( + copy_shape[0]), self.backend, src_location, dst_location, cudastream), cfg, state_id, + [src_node, dst_node]) + elif dims == 2: + callsite_stream.write( + 'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' % + (self.backend, dst_expr, symbolic_to_cpp(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, + src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype, + sym2cpp(copy_shape[1]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp( + copy_shape[0]), self.backend, src_location, dst_location, cudastream), cfg, state_id, + [src_node, dst_node]) + + # Post-copy 
synchronization + if is_sync: + # Synchronize with host (done at destination) + pass + else: + # Synchronize with other streams as necessary + for streamid, event in syncwith.items(): + syncstream = '__state->gpu_context->streams[%d]' % streamid + callsite_stream.write( + ''' + DACE_GPU_CHECK({backend}EventRecord(__state->gpu_context->events[{ev}], {src_stream})); + DACE_GPU_CHECK({backend}StreamWaitEvent({dst_stream}, __state->gpu_context->events[{ev}], 0)); + '''.format(ev=event, src_stream=cudastream, dst_stream=syncstream, backend=self.backend), cfg, + state_id, [src_node, dst_node]) + + self._emit_sync(callsite_stream) + + # Copy within the GPU + elif (src_storage in gpu_storage_types and dst_storage in gpu_storage_types): + + state_dfg = cfg.state(state_id) + sdict = state_dfg.scope_dict() + schedule_node = src_node + if scope_contains_scope(sdict, src_node, dst_node): + schedule_node = dst_node + + state = state_dfg + while (schedule_node is None or not isinstance(schedule_node, nodes.MapEntry) + or schedule_node.map.schedule == dtypes.ScheduleType.Sequential): + ret = xfh.get_parent_map(state, schedule_node) + if ret is None: + schedule_node = None + break + schedule_node, state = ret + + if schedule_node is None: + inner_schedule = dtypes.SCOPEDEFAULT_SCHEDULE[None] + else: + inner_schedule = schedule_node.map.schedule + + # Collaborative load + if inner_schedule == dtypes.ScheduleType.GPU_Device: + # Obtain copy information + copy_shape, src_strides, dst_strides, src_expr, dst_expr = (memlet_copy_to_absolute_strides( + self._dispatcher, sdfg, state, edge, src_node, dst_node, self._cpu_codegen._packed_types)) + + dims = len(copy_shape) + + funcname = 'dace::%sTo%s%dD' % (_get_storagename(src_storage), _get_storagename(dst_storage), dims) + self._scope_has_collaborative_copy = True + accum = '' + custom_reduction = [] + if memlet.wcr is not None: + redtype = operations.detect_reduction_type(memlet.wcr) + reduction_tmpl = '' + # Special call for detected reduction types + if redtype != dtypes.ReductionType.Custom: + credtype = ('dace::ReductionType::' + str(redtype)[str(redtype).find('.') + 1:]) + reduction_tmpl = '<%s>' % credtype + else: + dtype = dst_node.desc(sdfg).dtype + custom_reduction = [unparse_cr(sdfg, memlet.wcr, dtype)] + accum = '::template Accum%s' % reduction_tmpl + + if any(symbolic.issymbolic(s, sdfg.constants) for s in copy_shape): + callsite_stream.write((' {func}Dynamic<{type}, {bdims}, {is_async}>{accum}({args});').format( + func=funcname, + type=dst_node.desc(sdfg).dtype.ctype, + bdims=', '.join(symbolic_to_cpp(self._block_dims)), + is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false', + accum=accum, + args=', '.join([src_expr] + symbolic_to_cpp(src_strides) + [dst_expr] + custom_reduction + + symbolic_to_cpp(dst_strides) + symbolic_to_cpp(copy_shape))), cfg, state_id, [src_node, dst_node]) + elif funcname == 'dace::SharedToGlobal1D': + # special case: use a new template struct that provides functions for copy and reduction + callsite_stream.write( + (' {func}<{type}, {bdims}, {copysize}, {is_async}>{accum}({args});').format( + func=funcname, + type=dst_node.desc(sdfg).dtype.ctype, + bdims=', '.join(symbolic_to_cpp(self._block_dims)), + copysize=', '.join(symbolic_to_cpp(copy_shape)), + is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false', + accum=accum or '::Copy', + args=', '.join([src_expr] + symbolic_to_cpp(src_strides) + [dst_expr] + symbolic_to_cpp(dst_strides) + + custom_reduction)), cfg, state_id, [src_node, dst_node]) + 
else: + callsite_stream.write( + (' {func}<{type}, {bdims}, {copysize}, ' + + '{dststrides}, {is_async}>{accum}({args});').format( + func=funcname, + type=dst_node.desc(sdfg).dtype.ctype, + bdims=', '.join(symbolic_to_cpp(self._block_dims)), + copysize=', '.join(symbolic_to_cpp(copy_shape)), + dststrides=', '.join(symbolic_to_cpp(dst_strides)), + is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false', + accum=accum, + args=', '.join([src_expr] + symbolic_to_cpp(src_strides) + [dst_expr] + custom_reduction)), cfg, + state_id, [src_node, dst_node]) + # Per-thread load (same as CPU copies) + else: + self._cpu_codegen.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream) + else: + self._cpu_codegen.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream) + + def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + src_node: Union[nodes.Tasklet, nodes.AccessNode], dst_node: Union[nodes.CodeNode, nodes.AccessNode], + memlet: Memlet, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + state = cfg.state(state_id) + if isinstance(src_node, nodes.Tasklet): + src_storage = dtypes.StorageType.Register + src_parent = state.entry_node(src_node) + dst_schedule = None if src_parent is None else src_parent.map.schedule + else: + src_storage = src_node.desc(sdfg).storage + + if isinstance(dst_node, nodes.Tasklet): + dst_storage = dtypes.StorageType.Register + else: + dst_storage = dst_node.desc(sdfg).storage + + dst_parent = state.entry_node(dst_node) + dst_schedule = None if dst_parent is None else dst_parent.map.schedule + + # Emit actual copy + self._emit_copy(state_id, src_node, src_storage, dst_node, dst_storage, dst_schedule, memlet, sdfg, cfg, dfg, + callsite_stream) + + + +######################################################################### +# helper functions from old CUDACodeGen + +def symbolic_to_cpp(arr): + """ Converts an array of symbolic variables (or one) to C++ strings. """ + if not isinstance(arr, list): + return cppunparse.pyexpr2cpp(symbolic.symstr(arr, cpp_mode=True)) + return [cppunparse.pyexpr2cpp(symbolic.symstr(d, cpp_mode=True)) for d in arr] + + +def _get_cuda_dim(idx): + """ Converts 0 to x, 1 to y, 2 to z, or raises an exception. """ + if idx < 0 or idx > 2: + raise ValueError('idx must be between 0 and 2, got %d' % idx) + return ('x', 'y', 'z')[idx] + + +def _get_storagename(storage): + """ Returns a string containing the name of the storage location. + Example: dtypes.StorageType.GPU_Shared will return "Shared". """ + sname = str(storage) + return sname[sname.rindex('_') + 1:] + + + +######################################################################### +# Functions I had to redefine locally to not modify other files and ensure backwards compatibility + + +def ptr(name: str, desc: dace.data.Data, sdfg: SDFG = None, framecode=None) -> str: + """ + Returns a string that points to the data based on its name and descriptor. + + This function should be in cpp.py, but for ExperimentalCUDACodeGen I defined + it here to not modify it there, s.t. we have backwards compatibility. + + :param name: Data name. + :param desc: Data descriptor. + :return: C-compatible name that can be used to access the data. + """ + from dace.codegen.targets.framecode import DaCeCodeGenerator # Avoid import loop + framecode: DaCeCodeGenerator = framecode + + if '.' 
in name: + root = name.split('.')[0] + if root in sdfg.arrays and isinstance(sdfg.arrays[root], dace.data.Structure): + name = name.replace('.', '->') + + # Special case: If memory is persistent and defined in this SDFG, add state + # struct to name + if (desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External)): + + if desc.storage == dtypes.StorageType.CPU_ThreadLocal: # Use unambiguous name for thread-local arrays + return f'__{sdfg.cfg_id}_{name}' + elif not ExperimentalCUDACodeGen._in_device_code: # GPU kernels cannot access state + return f'__state->__{sdfg.cfg_id}_{name}' + elif (sdfg, name) in framecode.where_allocated and framecode.where_allocated[(sdfg, name)] is not sdfg: + return f'__{sdfg.cfg_id}_{name}' + elif (desc.transient and sdfg is not None and framecode is not None and (sdfg, name) in framecode.where_allocated + and framecode.where_allocated[(sdfg, name)] is not sdfg): + # Array allocated for another SDFG, use unambiguous name + return f'__{sdfg.cfg_id}_{name}' + + return name + + + + +######################################################################### +# helper class + + +class KernelSpec: + """ + A helper class to encapsulate information required for working with kernels. + This class provides a structured way to store and retrieve kernel parameters. + """ + + def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: ControlFlowRegion, + dfg_scope: ScopeSubgraphView, state_id: int): + # Entry and exit nodes of the scope + scope_entry = dfg_scope.source_nodes()[0] + state = cfg.state(state_id) + + self._kernel_map: nodes.Map = scope_entry.map + + # Kernel name + self._kernel_name: str = '%s_%d_%d_%d' % (scope_entry.map.label, cfg.cfg_id, state.block_id, state.node_id(scope_entry)) + + # Kernel arguments + self._args: Dict = cudaCodeGen._arglists[scope_entry] + self._args_typed: list[str] = [adata.as_arg(name=aname) for aname, adata in self._args.items()] + self._args_as_input: list[str] = [ptr(aname, adata, sdfg, cudaCodeGen._frame) for aname, adata in self._args.items()] + + # Used for the bridging function, be careful: a change in the name __state will probably lead to compilation errors + state_param: list[str] = [f'{mangle_dace_state_struct_name(cudaCodeGen._global_sdfg)} *__state'] + + self._bridge_args: list[str] = ['__state'] + self._args_as_input + self._bridge_args_typed: list[str] = state_param + self._args_typed + + # Kernel dimensions + self._grid_dims, self._block_dims, self._has_tbmap, self._has_dtbmap, _ = cudaCodeGen.get_kernel_dimensions(dfg_scope) + + @property + def kernel_name(self) -> list[str]: + """Returns the kernel name.""" + return self._kernel_name + + @property + def kernel_map(self) -> nodes.Map: + """Returns the kernel map node""" + return self._kernel_map + + + @property + def args_as_input(self) -> list[str]: + """Returns the kernel function arguments + that can be used as an input for calling the function. + It is the __global__ kernel function, NOT the kernel launch function.""" + return self._args_as_input + + @property + def args_typed(self) -> list[str]: + """Returns the typed kernel function arguments + that can be used for declaring the __global__ kernel function. 
+ These arguments include their respective data types.""" + return self._args_typed + + @property + def bridge_args(self) -> list[str]: + return self._bridge_args + + @property + def bridge_args_typed(self) -> list[str]: + return self._bridge_args_typed + + @property + def grid_dims(self) -> list: + """Returns the grid dimensions of the kernel.""" + return self._grid_dims + + @property + def block_dims(self) -> list: + """Returns the block dimensions of the kernel.""" + return self._block_dims + + @property + def has_tbmap(self) -> bool: + """Returns whether the kernel has a thread-block map.""" + return self._has_tbmap + + @property + def has_dtbmap(self) -> bool: + """Returns whether the kernel has a dynamic thread-block map.""" + return self._has_dtbmap + + + +class KernelScopeManager: + """ + A helper class to manage opening and closing brackets in a structured way using the 'with' statement. + This class simplifies the process of correctly opening and closing brackets. It also supports an optional + debug mode to include comments in the generated code, which can help with debugging and understanding + the code structure. + """ + + def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, + cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream, comment: str = None, + debug: bool = True): + """ + Initializes the KernelScopeManager. + + :param cudaCodeGen: The ExperimentalCUDACodeGen instance for potential future use. + :param sdfg: The SDFG instance for context. + :param cfg: The ControlFlowRegion instance for context. + :param dfg_scope: The ScopeSubgraphView instance for context. + :param state_id: The ID of the current state for context. + :param function_stream: The CodeIOStream for function-level code. + :param callsite_stream: The CodeIOStream for callsite-level code. + :param comment: A descriptive comment explaining the purpose of the code block being opened. Default is None. + :param debug: Whether to include debug comments in the output. Defaults to False. + """ + self.cudaCodeGen = cudaCodeGen + self.sdfg = sdfg + self.cfg = cfg + self.dfg_scope = dfg_scope + self.state_id = state_id + self.function_stream = function_stream + self.callsite_stream = callsite_stream + self.comment = comment + self.debug = debug + self._opened = 0 + + self.entry_node = self.dfg_scope.source_nodes()[0] + self.exit_node = self.dfg_scope.sink_nodes()[0] + + def __enter__(self): + """ + Writes the opening bracket to the stream and allocates arrays in scope. + """ + self.open() + self.cudaCodeGen._frame.allocate_arrays_in_scope( + self.sdfg, self.cfg, self.entry_node, self.function_stream, self.callsite_stream + ) + return self + + def __exit__(self, exc_type, exc_value, traceback): + """ + Deallocates arrays in scope and writes the closing brackets to the stream. + """ + self.cudaCodeGen._frame.deallocate_arrays_in_scope( + self.sdfg, self.cfg, self.entry_node, self.function_stream, self.callsite_stream + ) + for i in range(self._opened): + line = "}" + if self.debug: + line += f" // {self.comment} (close {i + 1})" + self.callsite_stream.write(line, self.cfg, self.state_id, self.exit_node) + + def open(self, condition: str = None): + """ + Opens a bracket. If a condition is given, emits 'if (condition) {', otherwise just '{'. + Tracks the number of open brackets for closing later. + + :param condition: Optional condition for the opening bracket. 
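As a usage illustration only (not part of this patch): a scope generator is expected to drive
KernelScopeManager roughly as sketched below, mirroring how _generate_GPU_Warp_scope uses it later
in this series. The stream objects, the comment string and the guard condition are placeholders.

    with KernelScopeManager(cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope,
                            state_id=state_id, function_stream=function_stream,
                            callsite_stream=kernel_stream,
                            comment="ThreadBlock Scope") as scope:
        # __enter__ has already emitted one opening '{' and allocated the arrays
        # defined in this scope.
        scope.open(condition="i + j < N")  # optional guarded brace, e.g. for partial tiles
        # ... emit the body of the scope into kernel_stream ...
        # __exit__ deallocates the scope's arrays and emits one '}' per open() call.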
+ """ + line = f"if ({condition}) {{" if condition else "{" + if self.debug: + line += f" // {self.comment} (open {self._opened + 1})" + self.callsite_stream.write(line, self.cfg, self.state_id, self.entry_node) + self._opened += 1 + + + + + diff --git a/dace/config_schema.yml b/dace/config_schema.yml index b5a7914018..7ca237508e 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -448,6 +448,15 @@ required: range, or to reduce memory usage. + implementation: + type: str + title: CUDA codegen implementation + description: > + Choose between available CUDA code generation implementations. + "legacy" is stable, "experimental" is used by Berkay Aydogdu and + Yakup Koray Budanaz for Berkays master-thesis. + enum: [legacy, experimental] + default: experimental ############################################# # General FPGA flags fpga: diff --git a/dace/dtypes.py b/dace/dtypes.py index a400e179b1..6fa154e794 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -76,6 +76,10 @@ class ScheduleType(aenum.AutoNumberEnum): Snitch_Multicore = () FPGA_Multi_Pumped = () #: Used for double pumping + # TODO: Aprove + # Scope introduced in ExperimentalCudaCodeGen + GPU_Warp = () + # A subset of GPU schedule types GPU_SCHEDULES = [ @@ -202,7 +206,11 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.GPU_ThreadBlock_Dynamic: StorageType.Register, ScheduleType.FPGA_Device: StorageType.FPGA_Global, ScheduleType.SVE_Map: StorageType.CPU_Heap, - ScheduleType.Snitch: StorageType.Snitch_TCDM + ScheduleType.Snitch: StorageType.Snitch_TCDM, + + #TODO: Approve. + # Usually used in the context with shared memory.. + ScheduleType.GPU_Warp: StorageType.Register, } # Maps from ScheduleType to default ScheduleType for sub-scopes @@ -223,7 +231,11 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.FPGA_Multi_Pumped: ScheduleType.FPGA_Device, ScheduleType.SVE_Map: ScheduleType.Sequential, ScheduleType.Snitch: ScheduleType.Snitch, - ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore + ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore, + + #TODO: Approve. + # Usually no lower scopes + ScheduleType.GPU_Warp: ScheduleType.Sequential, } # Maps from StorageType to a preferred ScheduleType for helping determine schedules. diff --git a/dace/registry.py b/dace/registry.py index 08efeb65ed..de6634e29c 100644 --- a/dace/registry.py +++ b/dace/registry.py @@ -37,6 +37,15 @@ def autoregister(cls: Type, **kwargs): that automatically registers the subclass with the superclass registry upon creation. 
""" + + if 'name' in kwargs and (kwargs['name'] == 'cuda' or kwargs['name'] == 'experimental_cuda'): + from dace.config import Config + if Config.get('compiler', 'cuda', 'implementation') == 'experimental' and kwargs['name'] == 'cuda': + return + if Config.get('compiler', 'cuda', 'implementation') == 'legacy' and kwargs['name'] == 'experimental_cuda': + return + + registered = False for base in cls.__bases__: if hasattr(base, '_registry_') and hasattr(base, 'register'): From e1d75d8ba0312f4cd4f30f1ae2b713353c4f80ed Mon Sep 17 00:00:00 2001 From: aydogdub Date: Thu, 8 May 2025 14:40:48 +0200 Subject: [PATCH 02/94] clean-up my workspace --- berkay_workpace/berkay_testbed.ipynb | 591 ------------------ berkay_workpace/berkay_testbed2.ipynb | 99 --- berkay_workpace/{ => reports}/report.py | 0 .../{ => scratch}/berkay_testbed.py | 0 berkay_workpace/{ => scratch}/playfield.py | 0 berkay_workpace/{test.py => tests/tests.py} | 0 6 files changed, 690 deletions(-) delete mode 100644 berkay_workpace/berkay_testbed.ipynb delete mode 100644 berkay_workpace/berkay_testbed2.ipynb rename berkay_workpace/{ => reports}/report.py (100%) rename berkay_workpace/{ => scratch}/berkay_testbed.py (100%) rename berkay_workpace/{ => scratch}/playfield.py (100%) rename berkay_workpace/{test.py => tests/tests.py} (100%) diff --git a/berkay_workpace/berkay_testbed.ipynb b/berkay_workpace/berkay_testbed.ipynb deleted file mode 100644 index 00e2a7fb5a..0000000000 --- a/berkay_workpace/berkay_testbed.ipynb +++ /dev/null @@ -1,591 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "experimental\n" - ] - } - ], - "source": [ - "import dace\n", - "import random\n", - "import cupy as cp\n", - "\n", - "from dace import registry\n", - "from dace.sdfg.scope import ScopeSubgraphView\n", - "from dace.codegen.prettycode import CodeIOStream\n", - "from dace.codegen.targets.target import TargetCodeGenerator\n", - "from dace.codegen.targets.framecode import DaCeCodeGenerator\n", - "from dace.codegen.targets.cpp import sym2cpp\n", - "from IPython.display import Code\n", - "from dace.config import Config\n", - "\n", - "\n", - "print(Config.get('compiler', 'cuda', 'implementation'))" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (warpLevel)" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "@dace.program\n", - "def warpLevel(A: dace.float64[512] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[512] @ dace.dtypes.StorageType.GPU_Global):\n", - " for i in dace.map[0:512:512] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " for j in dace.map[0:512] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", - " for wi in dace.map[0:16] @ dace.dtypes.ScheduleType.GPU_Warp:\n", - " A[wi] = A[wi] + 1\n", - "\n", - "sdfg = warpLevel.to_sdfg()\n", - "sdfg" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
#include <cuda_runtime.h>\n",
-       "#include <dace/dace.h>\n",
-       "\n",
-       "\n",
-       "struct warpLevel_state_t {\n",
-       "    dace::cuda::Context *gpu_context;\n",
-       "};\n",
-       "\n",
-       "\n",
-       "\n",
-       "DACE_EXPORTED int __dace_init_experimental_cuda(warpLevel_state_t *__state);\n",
-       "DACE_EXPORTED int __dace_exit_experimental_cuda(warpLevel_state_t *__state);\n",
-       "\n",
-       "\n",
-       "\n",
-       "int __dace_init_experimental_cuda(warpLevel_state_t *__state) {\n",
-       "    int count;\n",
-       "\n",
-       "    // Check that we are able to run cuda code\n",
-       "    if (cudaGetDeviceCount(&count) != cudaSuccess)\n",
-       "    {\n",
-       "        printf("ERROR: GPU drivers are not configured or cuda-capable device "\n",
-       "               "not found\\n");\n",
-       "        return 1;\n",
-       "    }\n",
-       "    if (count == 0)\n",
-       "    {\n",
-       "        printf("ERROR: No cuda-capable devices found\\n");\n",
-       "        return 2;\n",
-       "    }\n",
-       "\n",
-       "    // Initialize cuda before we run the application\n",
-       "    float *dev_X;\n",
-       "    DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n",
-       "    DACE_GPU_CHECK(cudaFree(dev_X));\n",
-       "\n",
-       "    \n",
-       "\n",
-       "    __state->gpu_context = new dace::cuda::Context(1, 1);\n",
-       "\n",
-       "    // Create cuda streams and events\n",
-       "    for(int i = 0; i < 1; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n",
-       "        __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n",
-       "    }\n",
-       "    for(int i = 0; i < 1; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n",
-       "    }\n",
-       "\n",
-       "    \n",
-       "\n",
-       "    return 0;\n",
-       "}\n",
-       "\n",
-       "int __dace_exit_experimental_cuda(warpLevel_state_t *__state) {\n",
-       "    \n",
-       "\n",
-       "    // Synchronize and check for CUDA errors\n",
-       "    int __err = static_cast<int>(__state->gpu_context->lasterror);\n",
-       "    if (__err == 0)\n",
-       "        __err = static_cast<int>(cudaDeviceSynchronize());\n",
-       "\n",
-       "    // Destroy cuda streams and events\n",
-       "    for(int i = 0; i < 1; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n",
-       "    }\n",
-       "    for(int i = 0; i < 1; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n",
-       "    }\n",
-       "\n",
-       "    delete __state->gpu_context;\n",
-       "    return __err;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED bool __dace_gpu_set_stream(warpLevel_state_t *__state, int streamid, gpuStream_t stream)\n",
-       "{\n",
-       "    if (streamid < 0 || streamid >= 1)\n",
-       "        return false;\n",
-       "\n",
-       "    __state->gpu_context->streams[streamid] = stream;\n",
-       "\n",
-       "    return true;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED void __dace_gpu_set_all_streams(warpLevel_state_t *__state, gpuStream_t stream)\n",
-       "{\n",
-       "    for (int i = 0; i < 1; ++i)\n",
-       "        __state->gpu_context->streams[i] = stream;\n",
-       "}\n",
-       "\n",
-       "__global__ void warpLevel_3_0_0_0(double * __restrict__ A)\n",
-       "{ // Kernel scope (open 1)\n",
-       "    int i = (512 * blockIdx.x);\n",
-       "    { // ThreadBlock Scope (open 1)\n",
-       "        int j = threadIdx.x;\n",
-       "        { // WarpLevel Scope (open 1)\n",
-       "            double __tmp3;\n",
-       "\n",
-       "            int warpId = (threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z) / 32;\n",
-       "            if ( 0 < warpId && warpId < 15) { // WarpLevel Scope (open 2)\n",
-       "\n",
-       "\n",
-       "\n",
-       "                ----------------------------------\n",
-       "                // WarpLevel operations here\n",
-       "                ----------------------------------\n",
-       "\n",
-       "\n",
-       "\n",
-       "            } // WarpLevel Scope (close 1)\n",
-       "        } // WarpLevel Scope (close 2)\n",
-       "    } // ThreadBlock Scope (close 1)\n",
-       "} // Kernel scope (close 1)\n",
-       "\n",
-       "\n",
-       "DACE_EXPORTED void __dace_runkernel_warpLevel_3_0_0_0(warpLevel_state_t *__state, double * __restrict__ A);\n",
-       "void __dace_runkernel_warpLevel_3_0_0_0(warpLevel_state_t *__state, double * __restrict__ A)\n",
-       "{\n",
-       "\n",
-       "\n",
-       "    void  *warpLevel_3_0_0_0_args[] = { (void *)&A };\n",
-       "    gpuError_t __err = cudaLaunchKernel( (void*)warpLevel_3_0_0_0, dim3(1, 1, 1), dim3(512, 1, 1), warpLevel_3_0_0_0_args, 0, __state->gpu_context->streams[0]\n",
-       "    );\n",
-       "\n",
-       "    DACE_KERNEL_LAUNCH_CHECK(__err, "warpLevel_3_0_0_0", 1, 1, 1, 512, 1, 1);\n",
-       "}\n",
-       "
\n" - ], - "text/latex": [ - "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}cuda\\PYZus{}runtime.h\\PYZgt{}}\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}dace/dace.h\\PYZgt{}}\n", - "\n", - "\n", - "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", - "\n", - "\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\n", - "\n", - "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{count}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Check that we are able to run cuda code}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{cudaGetDeviceCount}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{count}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{!}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaSuccess}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{ERROR: GPU drivers are not configured or cuda\\PYZhy{}capable device }\\PY{l+s}{\\PYZdq{}}\n", - "\\PY{+w}{ }\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{not found}\\PY{l+s+se}{\\PYZbs{}n}\\PY{l+s}{\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{count}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{ERROR: No cuda\\PYZhy{}capable devices found}\\PY{l+s+se}{\\PYZbs{}n}\\PY{l+s}{\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Initialize cuda before we run the application}\n", - "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{dev\\PYZus{}X}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMalloc}\\PY{p}{(}\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{dev\\PYZus{}X}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaFree}\\PY{p}{(}\\PY{n}{dev\\PYZus{}X}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ 
}\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Create cuda streams and events}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaStreamNonBlocking}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{;}\\PY{+w}{ }\\PY{c+c1}{// Allow for externals to modify streams}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaEventDisableTiming}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Synchronize and check for CUDA errors}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{k+kt}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{lasterror}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{k+kt}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{cudaDeviceSynchronize}\\PY{p}{(}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Destroy cuda streams and events}\n", - "\\PY{+w}{ 
}\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{bool}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}stream}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{streamid}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{|}\\PY{+w}{ }\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZgt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{false}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{streamid}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{true}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}all\\PYZus{}streams}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\n", - "\\PY{+w}{ 
}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{\\PYZus{}\\PYZus{}global\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{warpLevel\\PYZus{}3\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// Kernel scope (open 1)}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{512}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{blockIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// ThreadBlock Scope (open 1)}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{j}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (open 1)}\n", - "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp3}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{warpId}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{blockDim}\\PY{p}{.}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{blockDim}\\PY{p}{.}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{blockDim}\\PY{p}{.}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{z}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{/}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{n}{warpId}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{warpId}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{15}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (open 2)}\n", - "\n", - "\n", - "\n", - "\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\n", - "\\PY{+w}{ }\\PY{c+c1}{// WarpLevel operations here}\n", - "\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZhy{}}\n", - "\n", - "\n", - "\n", - "\\PY{+w}{ 
}\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (close 1)}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (close 2)}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// ThreadBlock Scope (close 1)}\n", - "\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// Kernel scope (close 1)}\n", - "\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}warpLevel\\PYZus{}3\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}warpLevel\\PYZus{}3\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\n", - "\n", - "\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{warpLevel\\PYZus{}3\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZus{}args}\\PY{p}{[}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{gpuError\\PYZus{}t}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaLaunchKernel}\\PY{p}{(}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{o}{*}\\PY{p}{)}\\PY{n}{warpLevel\\PYZus{}3\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{512}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{warpLevel\\PYZus{}3\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZus{}args}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\n", - "\\PY{+w}{ }\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}KERNEL\\PYZus{}LAUNCH\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{,}\\PY{+w}{ }\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{warpLevel\\PYZus{}3\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{l+s}{\\PYZdq{}}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{512}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\\end{Verbatim}\n" - ], - "text/plain": [ - "\n", - "#include \n", - "#include \n", - "\n", - "\n", - "struct warpLevel_state_t {\n", - " dace::cuda::Context *gpu_context;\n", - "};\n", - "\n", - "\n", - "\n", - "DACE_EXPORTED int __dace_init_experimental_cuda(warpLevel_state_t *__state);\n", - "DACE_EXPORTED int __dace_exit_experimental_cuda(warpLevel_state_t *__state);\n", - "\n", - "\n", - "\n", - "int __dace_init_experimental_cuda(warpLevel_state_t *__state) {\n", - " int count;\n", - "\n", - " // Check that we are able to run cuda 
code\n", - " if (cudaGetDeviceCount(&count) != cudaSuccess)\n", - " {\n", - " printf(\"ERROR: GPU drivers are not configured or cuda-capable device \"\n", - " \"not found\\n\");\n", - " return 1;\n", - " }\n", - " if (count == 0)\n", - " {\n", - " printf(\"ERROR: No cuda-capable devices found\\n\");\n", - " return 2;\n", - " }\n", - "\n", - " // Initialize cuda before we run the application\n", - " float *dev_X;\n", - " DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n", - " DACE_GPU_CHECK(cudaFree(dev_X));\n", - "\n", - " \n", - "\n", - " __state->gpu_context = new dace::cuda::Context(1, 1);\n", - "\n", - " // Create cuda streams and events\n", - " for(int i = 0; i < 1; ++i) {\n", - " DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n", - " __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n", - " }\n", - " for(int i = 0; i < 1; ++i) {\n", - " DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n", - " }\n", - "\n", - " \n", - "\n", - " return 0;\n", - "}\n", - "\n", - "int __dace_exit_experimental_cuda(warpLevel_state_t *__state) {\n", - " \n", - "\n", - " // Synchronize and check for CUDA errors\n", - " int __err = static_cast(__state->gpu_context->lasterror);\n", - " if (__err == 0)\n", - " __err = static_cast(cudaDeviceSynchronize());\n", - "\n", - " // Destroy cuda streams and events\n", - " for(int i = 0; i < 1; ++i) {\n", - " DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n", - " }\n", - " for(int i = 0; i < 1; ++i) {\n", - " DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n", - " }\n", - "\n", - " delete __state->gpu_context;\n", - " return __err;\n", - "}\n", - "\n", - "DACE_EXPORTED bool __dace_gpu_set_stream(warpLevel_state_t *__state, int streamid, gpuStream_t stream)\n", - "{\n", - " if (streamid < 0 || streamid >= 1)\n", - " return false;\n", - "\n", - " __state->gpu_context->streams[streamid] = stream;\n", - "\n", - " return true;\n", - "}\n", - "\n", - "DACE_EXPORTED void __dace_gpu_set_all_streams(warpLevel_state_t *__state, gpuStream_t stream)\n", - "{\n", - " for (int i = 0; i < 1; ++i)\n", - " __state->gpu_context->streams[i] = stream;\n", - "}\n", - "\n", - "__global__ void warpLevel_3_0_0_0(double * __restrict__ A)\n", - "{ // Kernel scope (open 1)\n", - " int i = (512 * blockIdx.x);\n", - " { // ThreadBlock Scope (open 1)\n", - " int j = threadIdx.x;\n", - " { // WarpLevel Scope (open 1)\n", - " double __tmp3;\n", - "\n", - " int warpId = (threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z) / 32;\n", - " if ( 0 < warpId && warpId < 15) { // WarpLevel Scope (open 2)\n", - "\n", - "\n", - "\n", - " ----------------------------------\n", - " // WarpLevel operations here\n", - " ----------------------------------\n", - "\n", - "\n", - "\n", - " } // WarpLevel Scope (close 1)\n", - " } // WarpLevel Scope (close 2)\n", - " } // ThreadBlock Scope (close 1)\n", - "} // Kernel scope (close 1)\n", - "\n", - "\n", - "DACE_EXPORTED void __dace_runkernel_warpLevel_3_0_0_0(warpLevel_state_t *__state, double * __restrict__ A);\n", - "void __dace_runkernel_warpLevel_3_0_0_0(warpLevel_state_t *__state, double * __restrict__ A)\n", - "{\n", - "\n", - "\n", - " void *warpLevel_3_0_0_0_args[] = { (void *)&A };\n", - " gpuError_t __err = cudaLaunchKernel( (void*)warpLevel_3_0_0_0, dim3(1, 1, 1), dim3(512, 1, 1), 
warpLevel_3_0_0_0_args, 0, __state->gpu_context->streams[0]\n", - " );\n", - "\n", - " DACE_KERNEL_LAUNCH_CHECK(__err, \"warpLevel_3_0_0_0\", 1, 1, 1, 512, 1, 1);\n", - "}\n" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Code(sdfg.generate_code()[1].clean_code, language='cpp')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/berkay_workpace/berkay_testbed2.ipynb b/berkay_workpace/berkay_testbed2.ipynb deleted file mode 100644 index 508b6b94aa..0000000000 --- a/berkay_workpace/berkay_testbed2.ipynb +++ /dev/null @@ -1,99 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "323168ff", - "metadata": {}, - "outputs": [], - "source": [ - "import dace\n", - "from dace import registry\n", - "from dace.sdfg.scope import ScopeSubgraphView\n", - "from dace.codegen.prettycode import CodeIOStream\n", - "from dace.codegen.targets.target import TargetCodeGenerator\n", - "from dace.codegen.targets.framecode import DaCeCodeGenerator\n", - "from dace.codegen.targets.cpp import sym2cpp\n", - "from IPython.display import Code" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "851a8f17", - "metadata": {}, - "outputs": [], - "source": [ - "@dace.program\n", - "def vector_copy(A: dace.float32[10] @ dace.dtypes.StorageType.GPU_Global, B: dace.float32[10] @ dace.dtypes.StorageType.GPU_Global):\n", - " for i in dace.map[0:10] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " A[i] = B[i]\n", - "\n", - "sdfg = vector_copy.to_sdfg()\n", - "sdfg" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "69427604", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "Code(sdfg.generate_code()[0].clean_code, language='cpp')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ddd49236", - "metadata": {}, - "outputs": [], - "source": [ - "Code(sdfg.generate_code()[1].clean_code, language='cpp')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "567ceeff", - "metadata": {}, - "outputs": [], - "source": [ - "Code(sdfg.generate_code()[2].clean_code, language='cpp')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "80e4eea7", - "metadata": {}, - "outputs": [], - "source": [ - "Code(sdfg.generate_code()[3].clean_code, language='cpp')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python (dace_env)", - "language": "python", - "name": "dace_emv" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/berkay_workpace/report.py b/berkay_workpace/reports/report.py similarity index 100% rename from berkay_workpace/report.py rename to berkay_workpace/reports/report.py diff --git a/berkay_workpace/berkay_testbed.py b/berkay_workpace/scratch/berkay_testbed.py similarity index 100% rename from berkay_workpace/berkay_testbed.py rename to 
berkay_workpace/scratch/berkay_testbed.py diff --git a/berkay_workpace/playfield.py b/berkay_workpace/scratch/playfield.py similarity index 100% rename from berkay_workpace/playfield.py rename to berkay_workpace/scratch/playfield.py diff --git a/berkay_workpace/test.py b/berkay_workpace/tests/tests.py similarity index 100% rename from berkay_workpace/test.py rename to berkay_workpace/tests/tests.py From 7a9a6f7e3b89b1a9e7088750f90d34e3ea3ebacf Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 12 May 2025 20:53:38 +0200 Subject: [PATCH 03/94] initial complete implementation of GPU_Warp schedule --- dace/codegen/targets/experimental_cuda.py | 140 +++++++++++++++++----- 1 file changed, 108 insertions(+), 32 deletions(-) diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 3d298d659a..60c89131bb 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -36,14 +36,14 @@ -def prod(iterable): - return functools.reduce(sympy.Mul, iterable, 1) - # TODO: GENERAL, discuss with Yakup -# have a look at dtypes maybe + + +# TODO: I am not handling map with strided rights now, +# why? because This is handled somewhere else than in the scope @@ -723,42 +723,117 @@ def _generate_GPU_ThreadBlock_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, df - def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, - state_id: int, function_stream: CodeIOStream, kernel_stream: CodeIOStream) -> None: - - with KernelScopeManager(cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, - function_stream=function_stream, callsite_stream=kernel_stream, comment="WarpLevel Scope",) as scopeManager: - + + def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, kernel_stream: CodeIOStream) -> None: + + + with KernelScopeManager(cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, + function_stream=function_stream, callsite_stream=kernel_stream, comment="WarpLevel Scope") as scopeManager: + node = dfg_scope.source_nodes()[0] scope_map = node.map - map_range = subsets.Range(scope_map.range) - current_kernel_spec = self._current_kernel_spec - grid_dims = current_kernel_spec.grid_dims - block_dims = current_kernel_spec.block_dims + map_range = subsets.Range(scope_map.range[::-1]) # Reversed for potential better performance + block_dims = self._current_kernel_spec.block_dims + + + THREADS_PER_WARP = 32 + num_threads_in_block = prod(block_dims) + upper_bound_warp_ids = [max_elem + 1 for max_elem in map_range.max_element()] + num_warps = prod(upper_bound_warp_ids) + warp_dim = len(map_range) + state_dfg = cfg.state(state_id) + + + # ----------------- Guard checks ----------------------- + + #TODO: rename xfh, to cryptic + parent_map, _ = xfh.get_parent_map(state_dfg, node) + if parent_map.schedule != dtypes.ScheduleType.GPU_ThreadBlock: + raise ValueError("GPU_Warp map must be nested within a GPU_ThreadBlock map.") + + if warp_dim > 3: + raise NotImplementedError("GPU_Warp maps are limited to 3 dimensions.") + + if num_threads_in_block % THREADS_PER_WARP != 0: + raise ValueError(f"Block must be a multiple of {THREADS_PER_WARP} threads for GPU_Warp scheduling " + f"(got {num_threads_in_block}).") + + # TODO: This should be checked at get_kernel dim + if num_threads_in_block > 1024: + raise ValueError("CUDA does not support more than 1024 threads per block 
(hardware limit).") + + if num_warps * THREADS_PER_WARP > num_threads_in_block: + raise ValueError(f"Invalid configuration: {num_warps} warps x {THREADS_PER_WARP} threads exceed " + f"{num_threads_in_block} threads in the block.") - # TODO: Does it? - if len(map_range) > 1: - raise ValueError("The range for GPU_Warp maps must be one-dimensional.") + if not all(x >= 0 for x in map_range.min_element()): + raise ValueError("Warp IDs (from map range) must be non-negative.") - warpId = "int warpId = " - warpId += f"(threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z) / 32;" + + + # ----------------- Map unflattening and scope guards ----------------------- + + flattened_terms = [] + for i, dim_size in enumerate(block_dims): + if dim_size == 1: + continue + dim = _get_cuda_dim(i) + stride = [f"{block_dims[j]}" for j in range(i) if block_dims[j] > 1] + idx_expr = " * ".join(stride + [f"threadIdx.{_get_cuda_dim(i)}"]) if stride else f"threadIdx.{dim}" + flattened_terms.append(idx_expr) + + # NOTE: too ugly? + flat_thread_id_expr = " + ".join(flattened_terms) + warp_id_name = 'warpId_%s_%d_%d_%d' % (scope_map.label, cfg.cfg_id, state_dfg.block_id, state_dfg.node_id(node)) + + kernel_stream.write( + f"int {warp_id_name} = ({flat_thread_id_expr}) / {THREADS_PER_WARP};", + cfg, state_id, node + ) + self._dispatcher.defined_vars.add(warp_id_name, DefinedType.Scalar, 'int') + + - map_start = symbolic_to_cpp(map_range.min_element()[0]) - map_end = symbolic_to_cpp(map_range.max_element()[0]) + # ----------------- Compute flattened warp ID ----------------------- + range_max_elements = map_range.max_element() + range_min_elements = map_range.min_element() + warp_dim_bounds = [str(e + 1) for e in range_max_elements] - kernel_stream.write(f"\n", cfg, state_id, node) - kernel_stream.write(f"{warpId}", cfg, state_id, node) + for i in range(warp_dim): + var_name = scope_map.params[-i - 1] # reverse order + previous_sizes = warp_dim_bounds[:i] - condition = f" {map_start} < warpId && warpId < {map_end}" - scopeManager.open(condition=condition) + if len(previous_sizes) > 0: + divisor = " * ".join(previous_sizes) + divisor = f"({divisor})" if len(previous_sizes) > 1 else divisor + expr = f"({warp_id_name} / {divisor}) % {warp_dim_bounds[i]}" + else: + expr = f"{warp_id_name} % {warp_dim_bounds[i]}" - kernel_stream.write(f"\n\n\n", cfg, state_id, node) - kernel_stream.write(f"----------------------------------", cfg, state_id, node) - kernel_stream.write(f"// WarpLevel operations here", cfg, state_id, node) - kernel_stream.write(f"----------------------------------", cfg, state_id, node) - kernel_stream.write(f"\n\n\n", cfg, state_id, node) + kernel_stream.write(f"int {var_name} = {expr};", cfg, state_id, node) + self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, 'int') + + + # check conditions + # NOTE: WarpId coordinate can start at non-zero but never exceeds the upper range bound + # due to the combination of enforcing guard checks (32 * warps <= # threads in block) and the way + # we assign the coordinates + min_element = range_min_elements[i] + if range_min_elements[i] != 0: + conditions = f'{var_name} >= {min_element}' + scopeManager.open(condition=conditions) + + + + # ----------------- Warp Code Block ----------------------- + + self._dispatcher.dispatch_subgraph( + sdfg, cfg, dfg_scope, state_id, function_stream, + kernel_stream, skip_entry_node=True + ) @@ -862,7 +937,6 @@ def generate_state(self, else: self._frame.generate_state(sdfg, cfg, state, function_stream, 
callsite_stream, generate_state_footer=False) - def generate_devicelevel_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: # Special case: if this is a GPU grid state and something is reading @@ -876,7 +950,6 @@ def generate_devicelevel_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream) - def _emit_sync(self, codestream: CodeIOStream): if Config.get_bool('compiler', 'cuda', 'syncdebug'): codestream.write('''DACE_GPU_CHECK({backend}GetLastError()); @@ -2015,6 +2088,9 @@ def _get_storagename(storage): return sname[sname.rindex('_') + 1:] +# TODO: Just use product as name? +def prod(iterable): + return functools.reduce(sympy.Mul, iterable, 1) ######################################################################### # Functions I had to redefine locally to not modify other files and ensure backwards compatibility From 9d81a379bdc27adcd15b2201909b0d5fa54b0108 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 12 May 2025 20:57:05 +0200 Subject: [PATCH 04/94] small test of GPU_Warp schedule --- berkay_workpace/scratch/berkay_testbed.py | 33 - berkay_workpace/scratch/playfield.py | 50 +- berkay_workpace/scratch/tesetbed2.ipynb | 626 +++++++++++++++++++ berkay_workpace/scratch/testbed.ipynb | 718 ++++++++++++++++++++++ berkay_workpace/scratch/vis.ipynb | 492 +++++++++++++++ 5 files changed, 1866 insertions(+), 53 deletions(-) delete mode 100644 berkay_workpace/scratch/berkay_testbed.py create mode 100644 berkay_workpace/scratch/tesetbed2.ipynb create mode 100644 berkay_workpace/scratch/testbed.ipynb create mode 100644 berkay_workpace/scratch/vis.ipynb diff --git a/berkay_workpace/scratch/berkay_testbed.py b/berkay_workpace/scratch/berkay_testbed.py deleted file mode 100644 index 6923998953..0000000000 --- a/berkay_workpace/scratch/berkay_testbed.py +++ /dev/null @@ -1,33 +0,0 @@ -import dace -import cupy as cp -import random - -from dace import registry -from dace.sdfg.scope import ScopeSubgraphView -from dace.codegen.prettycode import CodeIOStream -from dace.codegen.targets.target import TargetCodeGenerator -from dace.codegen.targets.framecode import DaCeCodeGenerator -from dace.codegen.targets.cpp import sym2cpp -from IPython.display import Code - - - - -N = dace.symbol('N') - -@dace.program -def vector_copy4(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global): - for i in dace.map[0:N:32] @ dace.dtypes.ScheduleType.GPU_Device: - for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: - if i + j < N: - A[i + j] = B[i + j] - -n = random.randint(40, 150) -# Initialize random CUDA arrays -A = cp.zeros(n, dtype=cp.float64) # Output array -B = cp.random.rand(n).astype(cp.float64) # Random input array - -sdfg = vector_copy4.to_sdfg() -sdfg(A=A, B=B, N=n) -equal_at_end = cp.all(A == B) - diff --git a/berkay_workpace/scratch/playfield.py b/berkay_workpace/scratch/playfield.py index 40181faa74..b2456cdec3 100644 --- a/berkay_workpace/scratch/playfield.py +++ b/berkay_workpace/scratch/playfield.py @@ -1,6 +1,8 @@ import dace import random import cupy as cp +from dace.frontend.python.interface import inline + from dace import registry from dace.sdfg.scope import ScopeSubgraphView @@ -11,35 +13,43 @@ from IPython.display import Code from dace.config import Config -print(Config.get('compiler', 'cuda', 'implementation')) - @dace.program -def warpLevel(A: 
dace.float64[512] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[512] @ dace.dtypes.StorageType.GPU_Global): - for i in dace.map[0:512:512] @ dace.dtypes.ScheduleType.GPU_Device: - for j in dace.map[0:512] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: - for k in dace.map[0:16] @ dace.dtypes.ScheduleType.GPU_Warp: - A[k] = A[k] + 1 +def reduce_add_sync(mask: dace.uint32, value: dace.uint32): - -sdfg = warpLevel.to_sdfg() -Code(sdfg.generate_code()[0].clean_code, language='cpp') + result = dace.define_local_scalar(dace.uint32) + + with dace.tasklet(dace.Language.CPP): + inp_mask << mask + inp_value << value + out_result >> result + """ + out_result = __reduce_add_sync(inp_mask, inp_value); + """ + return result -""" -""" - -""" @dace.program -def vector_copy3(A: dace.float64[64] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[64] @ dace.dtypes.StorageType.GPU_Global): - for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: - A[j] = B[j] +def warpLevel(A: dace.uint32[512] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[512] @ dace.dtypes.StorageType.GPU_Global): + for _ in dace.map[0:512:512] @ dace.dtypes.ScheduleType.GPU_Device: + for j in dace.map[0:512] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: -sdfg = vector_copy3.to_sdfg() -Code(sdfg.generate_code()[0].clean_code, language='cpp') + value = A[j] + mask = 0xffffffff + result = 0 + for _ in dace.map[0:16] @ dace.dtypes.ScheduleType.GPU_Warp: + + result = reduce_add_sync(mask, value) + + B[j] = result +A = cp.ones(512, cp.uint32) +B = cp.random.rand(512).astype(cp.uint32) + +sdfg = warpLevel.to_sdfg() +sdfg(A=A, B=B) -""" \ No newline at end of file +print(B) \ No newline at end of file diff --git a/berkay_workpace/scratch/tesetbed2.ipynb b/berkay_workpace/scratch/tesetbed2.ipynb new file mode 100644 index 0000000000..45307e0ed4 --- /dev/null +++ b/berkay_workpace/scratch/tesetbed2.ipynb @@ -0,0 +1,626 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "1497afd7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import dace\n", + "\n", + "from dace import registry\n", + "from dace.sdfg.scope import ScopeSubgraphView\n", + "from dace.codegen.prettycode import CodeIOStream\n", + "from dace.codegen.targets.target import TargetCodeGenerator\n", + "from dace.codegen.targets.framecode import DaCeCodeGenerator\n", + "from dace.codegen.targets.cpp import sym2cpp\n", + "from IPython.display import Code\n", + "from dace.config import Config" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "130d986f", + "metadata": {}, + "outputs": [], + "source": [ + "@dace.program\n", + "def reduce_add_sync(mask: dace.uint32, value: dace.uint32):\n", + "\n", + " result = dace.define_local_scalar(dace.uint32)\n", + " \n", + " with dace.tasklet(dace.Language.CPP):\n", + " inp_mask << mask\n", + " inp_value << value\n", + " out_result >> result\n", + " \"\"\"\n", + " out_result = __reduce_add_sync(inp_mask, inp_value);\n", + " \"\"\"\n", + " return result\n", + "\n", + "\n", + "\n", + "@dace.program\n", + "def warpLevel(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global):\n", + " for _ in dace.map[0:1024:1024] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " for j in dace.map[0:1024] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", + "\n", + " for l, m, k in dace.map[0:4:2, 0:4, 0:2] @ 
dace.dtypes.ScheduleType.GPU_Warp:\n", + " mask = 0xffffffff\n", + " value = A[j]\n", + " result = dace.define_local_scalar(dace.uint32)\n", + " with dace.tasklet(dace.Language.CPP):\n", + " inp_mask << mask\n", + " inp_value << value\n", + " out_result >> result\n", + " \"\"\"\n", + " out_result = __reduce_add_sync(inp_mask, inp_value);\n", + " \"\"\"\n", + " \n", + " B[j] = result\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c0146590", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
#include <cuda_runtime.h>\n",
+       "#include <dace/dace.h>\n",
+       "\n",
+       "\n",
+       "struct warpLevel_state_t {\n",
+       "    dace::cuda::Context *gpu_context;\n",
+       "};\n",
+       "\n",
+       "\n",
+       "\n",
+       "DACE_EXPORTED int __dace_init_experimental_cuda(warpLevel_state_t *__state);\n",
+       "DACE_EXPORTED int __dace_exit_experimental_cuda(warpLevel_state_t *__state);\n",
+       "\n",
+       "DACE_DFI void warpLevel_19_4_20_8_22_12_0_0_8(const dace::uint&  __tmp_24_24_r, dace::uint* __restrict__ __tmp_34_16_w) {\n",
+       "    dace::uint value;\n",
+       "\n",
+       "\n",
+       "    value = __tmp_24_24_r;\n",
+       "    {\n",
+       "\n",
+       "        {\n",
+       "            dace::uint __out;\n",
+       "\n",
+       "            ///////////////////\n",
+       "            // Tasklet code (assign_34_16)\n",
+       "            __out = __reduce_add_sync(4294967295U, value);\n",
+       "            ///////////////////\n",
+       "\n",
+       "            __tmp_34_16_w[0] = __out;\n",
+       "        }\n",
+       "\n",
+       "    }\n",
+       "}\n",
+       "\n",
+       "\n",
+       "\n",
+       "int __dace_init_experimental_cuda(warpLevel_state_t *__state) {\n",
+       "    int count;\n",
+       "\n",
+       "    // Check that we are able to run cuda code\n",
+       "    if (cudaGetDeviceCount(&count) != cudaSuccess)\n",
+       "    {\n",
+       "        printf("ERROR: GPU drivers are not configured or cuda-capable device "\n",
+       "               "not found\\n");\n",
+       "        return 1;\n",
+       "    }\n",
+       "    if (count == 0)\n",
+       "    {\n",
+       "        printf("ERROR: No cuda-capable devices found\\n");\n",
+       "        return 2;\n",
+       "    }\n",
+       "\n",
+       "    // Initialize cuda before we run the application\n",
+       "    float *dev_X;\n",
+       "    DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n",
+       "    DACE_GPU_CHECK(cudaFree(dev_X));\n",
+       "\n",
+       "    \n",
+       "\n",
+       "    __state->gpu_context = new dace::cuda::Context(1, 1);\n",
+       "\n",
+       "    // Create cuda streams and events\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n",
+       "        __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n",
+       "    }\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n",
+       "    }\n",
+       "\n",
+       "    \n",
+       "\n",
+       "    return 0;\n",
+       "}\n",
+       "\n",
+       "int __dace_exit_experimental_cuda(warpLevel_state_t *__state) {\n",
+       "    \n",
+       "\n",
+       "    // Synchronize and check for CUDA errors\n",
+       "    int __err = static_cast<int>(__state->gpu_context->lasterror);\n",
+       "    if (__err == 0)\n",
+       "        __err = static_cast<int>(cudaDeviceSynchronize());\n",
+       "\n",
+       "    // Destroy cuda streams and events\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n",
+       "    }\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n",
+       "    }\n",
+       "\n",
+       "    delete __state->gpu_context;\n",
+       "    return __err;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED bool __dace_gpu_set_stream(warpLevel_state_t *__state, int streamid, gpuStream_t stream)\n",
+       "{\n",
+       "    if (streamid < 0 || streamid >= 1)\n",
+       "        return false;\n",
+       "\n",
+       "    __state->gpu_context->streams[streamid] = stream;\n",
+       "\n",
+       "    return true;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED void __dace_gpu_set_all_streams(warpLevel_state_t *__state, gpuStream_t stream)\n",
+       "{\n",
+       "    for (int i = 0; i < 1; ++i)\n",
+       "        __state->gpu_context->streams[i] = stream;\n",
+       "}\n",
+       "\n",
+       "__global__ void warpLevel_19_0_0_0(dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n",
+       "{ // Kernel scope (open 1)\n",
+       "    int _ = (1024 * blockIdx.x);\n",
+       "    { // ThreadBlock Scope (open 1)\n",
+       "        int j = threadIdx.x;\n",
+       "        { // WarpLevel Scope (open 1)\n",
+       "            int warpId_warpLevel_19_4_20_8_22_0_0_6 = (threadIdx.x) / 32;\n",
+       "            int k = warpId_warpLevel_19_4_20_8_22_0_0_6 % 2;\n",
+       "            int m = (warpId_warpLevel_19_4_20_8_22_0_0_6 / 2) % 4;\n",
+       "            int l = (warpId_warpLevel_19_4_20_8_22_0_0_6 / (2 * 4)) % 4;\n",
+       "            warpLevel_19_4_20_8_22_12_0_0_8(A[j], &B[j]);\n",
+       "        } // WarpLevel Scope (close 1)\n",
+       "    } // ThreadBlock Scope (close 1)\n",
+       "} // Kernel scope (close 1)\n",
+       "\n",
+       "\n",
+       "DACE_EXPORTED void __dace_runkernel_warpLevel_19_0_0_0(warpLevel_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n",
+       "void __dace_runkernel_warpLevel_19_0_0_0(warpLevel_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n",
+       "{\n",
+       "\n",
+       "\n",
+       "    void  *warpLevel_19_0_0_0_args[] = { (void *)&A, (void *)&B };\n",
+       "    gpuError_t __err = cudaLaunchKernel( (void*)warpLevel_19_0_0_0, dim3(1, 1, 1), dim3(1024, 1, 1), warpLevel_19_0_0_0_args, 0, __state->gpu_context->streams[0]\n",
+       "    );\n",
+       "\n",
+       "    DACE_KERNEL_LAUNCH_CHECK(__err, "warpLevel_19_0_0_0", 1, 1, 1, 1024, 1, 1);\n",
+       "}\n",
+       "
\n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}cuda\\PYZus{}runtime.h\\PYZgt{}}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}dace/dace.h\\PYZgt{}}\n", + "\n", + "\n", + "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", + "\n", + "\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}DFI}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{warpLevel\\PYZus{}19\\PYZus{}4\\PYZus{}20\\PYZus{}8\\PYZus{}22\\PYZus{}12\\PYZus{}0\\PYZus{}0\\PYZus{}8}\\PY{p}{(}\\PY{k}{const}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp\\PYZus{}24\\PYZus{}24\\PYZus{}r}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp\\PYZus{}34\\PYZus{}16\\PYZus{}w}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{value}\\PY{p}{;}\n", + "\n", + "\n", + "\\PY{+w}{ }\\PY{n}{value}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp\\PYZus{}24\\PYZus{}24\\PYZus{}r}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\\PY{+w}{ }\\PY{c+c1}{// Tasklet code (assign\\PYZus{}34\\PYZus{}16)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}reduce\\PYZus{}add\\PYZus{}sync}\\PY{p}{(}\\PY{l+m+mi}{4294967295U}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{value}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp\\PYZus{}34\\PYZus{}16\\PYZus{}w}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\n", + "\n", + "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{count}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Check that we are able to run cuda code}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{cudaGetDeviceCount}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{count}\\PY{p}{)}\\PY{+w}{ 
}\\PY{o}{!}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaSuccess}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{ERROR: GPU drivers are not configured or cuda\\PYZhy{}capable device }\\PY{l+s}{\\PYZdq{}}\n", + "\\PY{+w}{ }\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{not found}\\PY{l+s+se}{\\PYZbs{}n}\\PY{l+s}{\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{count}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{ERROR: No cuda\\PYZhy{}capable devices found}\\PY{l+s+se}{\\PYZbs{}n}\\PY{l+s}{\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Initialize cuda before we run the application}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{dev\\PYZus{}X}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMalloc}\\PY{p}{(}\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{dev\\PYZus{}X}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaFree}\\PY{p}{(}\\PY{n}{dev\\PYZus{}X}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Create cuda streams and events}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaStreamNonBlocking}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{;}\\PY{+w}{ }\\PY{c+c1}{// Allow for externals to modify streams}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ 
}\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaEventDisableTiming}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Synchronize and check for CUDA errors}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{k+kt}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{lasterror}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{k+kt}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{cudaDeviceSynchronize}\\PY{p}{(}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Destroy cuda streams and events}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{bool}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}stream}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ 
}\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{streamid}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{|}\\PY{+w}{ }\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZgt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{false}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{streamid}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{true}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}all\\PYZus{}streams}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{\\PYZus{}\\PYZus{}global\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{warpLevel\\PYZus{}19\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// Kernel scope (open 1)}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{1024}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{blockIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// ThreadBlock Scope (open 1)}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{j}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (open 1)}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{warpId\\PYZus{}warpLevel\\PYZus{}19\\PYZus{}4\\PYZus{}20\\PYZus{}8\\PYZus{}22\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{/}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{k}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{warpId\\PYZus{}warpLevel\\PYZus{}19\\PYZus{}4\\PYZus{}20\\PYZus{}8\\PYZus{}22\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{+w}{ }\\PY{o}{\\PYZpc{}}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{;}\n", + 
"\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{m}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{warpId\\PYZus{}warpLevel\\PYZus{}19\\PYZus{}4\\PYZus{}20\\PYZus{}8\\PYZus{}22\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{+w}{ }\\PY{o}{/}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZpc{}}\\PY{+w}{ }\\PY{l+m+mi}{4}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{l}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{warpId\\PYZus{}warpLevel\\PYZus{}19\\PYZus{}4\\PYZus{}20\\PYZus{}8\\PYZus{}22\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{+w}{ }\\PY{o}{/}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{2}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{l+m+mi}{4}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZpc{}}\\PY{+w}{ }\\PY{l+m+mi}{4}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{warpLevel\\PYZus{}19\\PYZus{}4\\PYZus{}20\\PYZus{}8\\PYZus{}22\\PYZus{}12\\PYZus{}0\\PYZus{}0\\PYZus{}8}\\PY{p}{(}\\PY{n}{A}\\PY{p}{[}\\PY{n}{j}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{B}\\PY{p}{[}\\PY{n}{j}\\PY{p}{]}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (close 1)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// ThreadBlock Scope (close 1)}\n", + "\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// Kernel scope (close 1)}\n", + "\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}warpLevel\\PYZus{}19\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}warpLevel\\PYZus{}19\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\n", + "\n", + "\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{warpLevel\\PYZus{}19\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZus{}args}\\PY{p}{[}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{B}\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{gpuError\\PYZus{}t}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaLaunchKernel}\\PY{p}{(}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{o}{*}\\PY{p}{)}\\PY{n}{warpLevel\\PYZus{}19\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{1024}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ 
}\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{warpLevel\\PYZus{}19\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZus{}args}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\n", + "\\PY{+w}{ }\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}KERNEL\\PYZus{}LAUNCH\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{,}\\PY{+w}{ }\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{warpLevel\\PYZus{}19\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{l+s}{\\PYZdq{}}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1024}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\\end{Verbatim}\n" + ], + "text/plain": [ + "\n", + "#include \n", + "#include \n", + "\n", + "\n", + "struct warpLevel_state_t {\n", + " dace::cuda::Context *gpu_context;\n", + "};\n", + "\n", + "\n", + "\n", + "DACE_EXPORTED int __dace_init_experimental_cuda(warpLevel_state_t *__state);\n", + "DACE_EXPORTED int __dace_exit_experimental_cuda(warpLevel_state_t *__state);\n", + "\n", + "DACE_DFI void warpLevel_19_4_20_8_22_12_0_0_8(const dace::uint& __tmp_24_24_r, dace::uint* __restrict__ __tmp_34_16_w) {\n", + " dace::uint value;\n", + "\n", + "\n", + " value = __tmp_24_24_r;\n", + " {\n", + "\n", + " {\n", + " dace::uint __out;\n", + "\n", + " ///////////////////\n", + " // Tasklet code (assign_34_16)\n", + " __out = __reduce_add_sync(4294967295U, value);\n", + " ///////////////////\n", + "\n", + " __tmp_34_16_w[0] = __out;\n", + " }\n", + "\n", + " }\n", + "}\n", + "\n", + "\n", + "\n", + "int __dace_init_experimental_cuda(warpLevel_state_t *__state) {\n", + " int count;\n", + "\n", + " // Check that we are able to run cuda code\n", + " if (cudaGetDeviceCount(&count) != cudaSuccess)\n", + " {\n", + " printf(\"ERROR: GPU drivers are not configured or cuda-capable device \"\n", + " \"not found\\n\");\n", + " return 1;\n", + " }\n", + " if (count == 0)\n", + " {\n", + " printf(\"ERROR: No cuda-capable devices found\\n\");\n", + " return 2;\n", + " }\n", + "\n", + " // Initialize cuda before we run the application\n", + " float *dev_X;\n", + " DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n", + " DACE_GPU_CHECK(cudaFree(dev_X));\n", + "\n", + " \n", + "\n", + " __state->gpu_context = new dace::cuda::Context(1, 1);\n", + "\n", + " // Create cuda streams and events\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n", + " __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n", + " }\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n", + " }\n", + "\n", + " \n", + "\n", + " return 0;\n", + "}\n", + "\n", + "int __dace_exit_experimental_cuda(warpLevel_state_t *__state) {\n", + " \n", + "\n", + " // Synchronize and check for CUDA errors\n", + " int __err = static_cast(__state->gpu_context->lasterror);\n", + " if (__err == 0)\n", + " __err = static_cast(cudaDeviceSynchronize());\n", + "\n", + " // Destroy cuda streams and events\n", + " for(int i = 0; i < 1; ++i) {\n", + " 
DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n", + " }\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n", + " }\n", + "\n", + " delete __state->gpu_context;\n", + " return __err;\n", + "}\n", + "\n", + "DACE_EXPORTED bool __dace_gpu_set_stream(warpLevel_state_t *__state, int streamid, gpuStream_t stream)\n", + "{\n", + " if (streamid < 0 || streamid >= 1)\n", + " return false;\n", + "\n", + " __state->gpu_context->streams[streamid] = stream;\n", + "\n", + " return true;\n", + "}\n", + "\n", + "DACE_EXPORTED void __dace_gpu_set_all_streams(warpLevel_state_t *__state, gpuStream_t stream)\n", + "{\n", + " for (int i = 0; i < 1; ++i)\n", + " __state->gpu_context->streams[i] = stream;\n", + "}\n", + "\n", + "__global__ void warpLevel_19_0_0_0(dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", + "{ // Kernel scope (open 1)\n", + " int _ = (1024 * blockIdx.x);\n", + " { // ThreadBlock Scope (open 1)\n", + " int j = threadIdx.x;\n", + " { // WarpLevel Scope (open 1)\n", + " int warpId_warpLevel_19_4_20_8_22_0_0_6 = (threadIdx.x) / 32;\n", + " int k = warpId_warpLevel_19_4_20_8_22_0_0_6 % 2;\n", + " int m = (warpId_warpLevel_19_4_20_8_22_0_0_6 / 2) % 4;\n", + " int l = (warpId_warpLevel_19_4_20_8_22_0_0_6 / (2 * 4)) % 4;\n", + " warpLevel_19_4_20_8_22_12_0_0_8(A[j], &B[j]);\n", + " } // WarpLevel Scope (close 1)\n", + " } // ThreadBlock Scope (close 1)\n", + "} // Kernel scope (close 1)\n", + "\n", + "\n", + "DACE_EXPORTED void __dace_runkernel_warpLevel_19_0_0_0(warpLevel_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n", + "void __dace_runkernel_warpLevel_19_0_0_0(warpLevel_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", + "{\n", + "\n", + "\n", + " void *warpLevel_19_0_0_0_args[] = { (void *)&A, (void *)&B };\n", + " gpuError_t __err = cudaLaunchKernel( (void*)warpLevel_19_0_0_0, dim3(1, 1, 1), dim3(1024, 1, 1), warpLevel_19_0_0_0_args, 0, __state->gpu_context->streams[0]\n", + " );\n", + "\n", + " DACE_KERNEL_LAUNCH_CHECK(__err, \"warpLevel_19_0_0_0\", 1, 1, 1, 1024, 1, 1);\n", + "}\n" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdfg = warpLevel.to_sdfg()\n", + "\n", + "Code(sdfg.generate_code()[1].clean_code, language='cpp')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ac768f2", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dace_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/berkay_workpace/scratch/testbed.ipynb b/berkay_workpace/scratch/testbed.ipynb new file mode 100644 index 0000000000..b67d6cbc56 --- /dev/null +++ b/berkay_workpace/scratch/testbed.ipynb @@ -0,0 +1,718 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a5aeb1f5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "experimental\n" + ] + } + ], + "source": [ + "import dace\n", + 
"import cupy as cp\n", + "\n", + "from IPython.display import Code\n", + "from dace.config import Config\n", + "\n", + "\n", + "print(Config.get('compiler', 'cuda', 'implementation'))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "58226f37", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (Warp_test_1)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "# Generate framework\n", + "sdfg = dace.SDFG(\"Warp_test_1\")\n", + "\n", + "state = sdfg.add_state(\"main\")\n", + "\n", + "# Generate access nodes\n", + "a_dev = sdfg.add_array(\"A\", (32,), dace.uint32, dace.dtypes.StorageType.GPU_Global)\n", + "b_dev = sdfg.add_array(\"B\", (32,), dace.uint32, dace.dtypes.StorageType.GPU_Global)\n", + "a_acc = state.add_access(\"A\")\n", + "b_acc = state.add_access(\"B\")\n", + "\n", + "\n", + "# Generate maps, connect entries with access data\n", + "gpu_map_entry, gpu_map_exit = state.add_map(name = \"GPU_Map\",\n", + " ndrange = dict(i='0:32:32'),\n", + " schedule = dace.dtypes.ScheduleType.GPU_Device)\n", + "state.add_edge(a_acc, None, gpu_map_entry, None, dace.memlet.Memlet('A[0:32]'))\n", + "\n", + "\n", + "tblock_map_entry, tblock_map_exit = state.add_map(name = \"Block_Map\",\n", + " ndrange = dict(j='0:32'),\n", + " schedule = dace.dtypes.ScheduleType.GPU_ThreadBlock)\n", + "state.add_edge(gpu_map_entry, None, tblock_map_entry, None, dace.memlet.Memlet('A[0:32]'))\n", + "\n", + "\n", + "\n", + "\n", + "tasklet, warp_scope_entry, warp_scope_exit = state.add_mapped_tasklet(\n", + " name='WarpLevel_Operation',\n", + " map_ranges=dict(_='0:1'),\n", + " inputs=dict(inp=dace.Memlet('A[0:32]', volume=32)),\n", + " code=\n", + "''' \n", + "value = inp[j]\n", + "out = __reduce_add_sync(0xFFFFFFFF, value);\n", + "''',\n", + " outputs=dict(out=dace.Memlet(\"B[j]\")),\n", + " schedule=dace.dtypes.ScheduleType.GPU_Warp\n", + ")\n", + "\n", + "state.add_edge(tblock_map_entry, None, warp_scope_entry, None, dace.memlet.Memlet('A[0:32]'))\n", + "\n", + "# Connect Exit nodes\n", + "state.add_edge(warp_scope_exit, None, tblock_map_exit, None, dace.memlet.Memlet('B[j]'))\n", + "state.add_edge(tblock_map_exit, None, gpu_map_exit, None, dace.memlet.Memlet('B[j]'))\n", + "state.add_edge(gpu_map_exit, None, b_acc, None, dace.memlet.Memlet('B[0:32]'))\n", + "\n", + "\n", + "\n", + "\n", + "#sdfg.fill_scope_connectors()\n", + "\n", + "\n", + "\n", + "sdfg" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9426fb29", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
#include <cuda_runtime.h>\n",
+       "#include <dace/dace.h>\n",
+       "\n",
+       "\n",
+       "struct Warp_test_1_state_t {\n",
+       "    dace::cuda::Context *gpu_context;\n",
+       "};\n",
+       "\n",
+       "\n",
+       "\n",
+       "DACE_EXPORTED int __dace_init_experimental_cuda(Warp_test_1_state_t *__state);\n",
+       "DACE_EXPORTED int __dace_exit_experimental_cuda(Warp_test_1_state_t *__state);\n",
+       "\n",
+       "\n",
+       "\n",
+       "int __dace_init_experimental_cuda(Warp_test_1_state_t *__state) {\n",
+       "    int count;\n",
+       "\n",
+       "    // Check that we are able to run cuda code\n",
+       "    if (cudaGetDeviceCount(&count) != cudaSuccess)\n",
+       "    {\n",
+       "        printf("ERROR: GPU drivers are not configured or cuda-capable device "\n",
+       "               "not found\\n");\n",
+       "        return 1;\n",
+       "    }\n",
+       "    if (count == 0)\n",
+       "    {\n",
+       "        printf("ERROR: No cuda-capable devices found\\n");\n",
+       "        return 2;\n",
+       "    }\n",
+       "\n",
+       "    // Initialize cuda before we run the application\n",
+       "    float *dev_X;\n",
+       "    DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n",
+       "    DACE_GPU_CHECK(cudaFree(dev_X));\n",
+       "\n",
+       "    \n",
+       "\n",
+       "    __state->gpu_context = new dace::cuda::Context(1, 1);\n",
+       "\n",
+       "    // Create cuda streams and events\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n",
+       "        __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n",
+       "    }\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n",
+       "    }\n",
+       "\n",
+       "    \n",
+       "\n",
+       "    return 0;\n",
+       "}\n",
+       "\n",
+       "int __dace_exit_experimental_cuda(Warp_test_1_state_t *__state) {\n",
+       "    \n",
+       "\n",
+       "    // Synchronize and check for CUDA errors\n",
+       "    int __err = static_cast<int>(__state->gpu_context->lasterror);\n",
+       "    if (__err == 0)\n",
+       "        __err = static_cast<int>(cudaDeviceSynchronize());\n",
+       "\n",
+       "    // Destroy cuda streams and events\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n",
+       "    }\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n",
+       "    }\n",
+       "\n",
+       "    delete __state->gpu_context;\n",
+       "    return __err;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED bool __dace_gpu_set_stream(Warp_test_1_state_t *__state, int streamid, gpuStream_t stream)\n",
+       "{\n",
+       "    if (streamid < 0 || streamid >= 1)\n",
+       "        return false;\n",
+       "\n",
+       "    __state->gpu_context->streams[streamid] = stream;\n",
+       "\n",
+       "    return true;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED void __dace_gpu_set_all_streams(Warp_test_1_state_t *__state, gpuStream_t stream)\n",
+       "{\n",
+       "    for (int i = 0; i < 1; ++i)\n",
+       "        __state->gpu_context->streams[i] = stream;\n",
+       "}\n",
+       "\n",
+       "__global__ void GPU_Map_0_0_2(dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n",
+       "{ // Kernel scope (open 1)\n",
+       "    int i = (32 * blockIdx.x);\n",
+       "    { // ThreadBlock Scope (open 1)\n",
+       "        int j = threadIdx.x;\n",
+       "        { // WarpLevel Scope (open 1)\n",
+       "\n",
+       "            int warpId = (threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z) / 32;\n",
+       "            if ( 0 <= warpId && warpId <= 0) { // WarpLevel Scope (open 2)\n",
+       "\n",
+       "\n",
+       "\n",
+       "                //////////////////////////////////////\n",
+       "                // WarpLevel operations start\n",
+       "                //////////////////////////////////////\n",
+       "                {\n",
+       "                    dace::uint* inp = &A[0];\n",
+       "                    dace::uint out;\n",
+       "\n",
+       "                    ///////////////////\n",
+       "                    // Tasklet code (WarpLevel_Operation)\n",
+       "                    auto value = inp[j];\n",
+       "                    out = __reduce_add_sync(4294967295U, value);\n",
+       "                    ///////////////////\n",
+       "\n",
+       "                    B[j] = out;\n",
+       "                }\n",
+       "                //////////////////////////////////////\n",
+       "                // WarpLevel operations end\n",
+       "                //////////////////////////////////////\n",
+       "\n",
+       "\n",
+       "\n",
+       "            } // WarpLevel Scope (close 1)\n",
+       "        } // WarpLevel Scope (close 2)\n",
+       "    } // ThreadBlock Scope (close 1)\n",
+       "} // Kernel scope (close 1)\n",
+       "\n",
+       "\n",
+       "DACE_EXPORTED void __dace_runkernel_GPU_Map_0_0_2(Warp_test_1_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n",
+       "void __dace_runkernel_GPU_Map_0_0_2(Warp_test_1_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n",
+       "{\n",
+       "\n",
+       "\n",
+       "    void  *GPU_Map_0_0_2_args[] = { (void *)&A, (void *)&B };\n",
+       "    gpuError_t __err = cudaLaunchKernel( (void*)GPU_Map_0_0_2, dim3(1, 1, 1), dim3(32, 1, 1), GPU_Map_0_0_2_args, 0, __state->gpu_context->streams[0]\n",
+       "    );\n",
+       "\n",
+       "    DACE_KERNEL_LAUNCH_CHECK(__err, "GPU_Map_0_0_2", 1, 1, 1, 32, 1, 1);\n",
+       "}\n",
+       "
\n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}cuda\\PYZus{}runtime.h\\PYZgt{}}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}dace/dace.h\\PYZgt{}}\n", + "\n", + "\n", + "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", + "\n", + "\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\n", + "\n", + "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{count}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Check that we are able to run cuda code}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{cudaGetDeviceCount}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{count}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{!}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaSuccess}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{ERROR: GPU drivers are not configured or cuda\\PYZhy{}capable device }\\PY{l+s}{\\PYZdq{}}\n", + "\\PY{+w}{ }\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{not found}\\PY{l+s+se}{\\PYZbs{}n}\\PY{l+s}{\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{count}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{ERROR: No cuda\\PYZhy{}capable devices found}\\PY{l+s+se}{\\PYZbs{}n}\\PY{l+s}{\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Initialize cuda before we run the application}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{dev\\PYZus{}X}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMalloc}\\PY{p}{(}\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{dev\\PYZus{}X}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaFree}\\PY{p}{(}\\PY{n}{dev\\PYZus{}X}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{+w}{ 
}\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Create cuda streams and events}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaStreamNonBlocking}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{;}\\PY{+w}{ }\\PY{c+c1}{// Allow for externals to modify streams}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaEventDisableTiming}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Synchronize and check for CUDA errors}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{k+kt}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{lasterror}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{k+kt}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{cudaDeviceSynchronize}\\PY{p}{(}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// 
Destroy cuda streams and events}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{bool}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}stream}\\PY{p}{(}\\PY{n}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{streamid}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{|}\\PY{+w}{ }\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZgt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{false}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{streamid}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{true}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}all\\PYZus{}streams}\\PY{p}{(}\\PY{n}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\n", + "\\PY{+w}{ 
}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{\\PYZus{}\\PYZus{}global\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{GPU\\PYZus{}Map\\PYZus{}0\\PYZus{}0\\PYZus{}2}\\PY{p}{(}\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// Kernel scope (open 1)}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{blockIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// ThreadBlock Scope (open 1)}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{j}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (open 1)}\n", + "\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{warpId}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{blockDim}\\PY{p}{.}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{blockDim}\\PY{p}{.}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{blockDim}\\PY{p}{.}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{z}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{/}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{warpId}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{warpId}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (open 2)}\n", + "\n", + "\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{//////////////////////////////////////}\n", + "\\PY{+w}{ }\\PY{c+c1}{// WarpLevel operations start}\n", + "\\PY{+w}{ }\\PY{c+c1}{//////////////////////////////////////}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{o}{*}\\PY{+w}{ }\\PY{n}{inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{out}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\\PY{+w}{ }\\PY{c+c1}{// Tasklet code (WarpLevel\\PYZus{}Operation)}\n", + "\\PY{+w}{ }\\PY{k}{auto}\\PY{+w}{ }\\PY{n}{value}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{inp}\\PY{p}{[}\\PY{n}{j}\\PY{p}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}reduce\\PYZus{}add\\PYZus{}sync}\\PY{p}{(}\\PY{l+m+mi}{4294967295U}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{value}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{B}\\PY{p}{[}\\PY{n}{j}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{out}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + 
"\\PY{+w}{ }\\PY{c+c1}{//////////////////////////////////////}\n", + "\\PY{+w}{ }\\PY{c+c1}{// WarpLevel operations end}\n", + "\\PY{+w}{ }\\PY{c+c1}{//////////////////////////////////////}\n", + "\n", + "\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (close 1)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (close 2)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// ThreadBlock Scope (close 1)}\n", + "\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// Kernel scope (close 1)}\n", + "\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}GPU\\PYZus{}Map\\PYZus{}0\\PYZus{}0\\PYZus{}2}\\PY{p}{(}\\PY{n}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}GPU\\PYZus{}Map\\PYZus{}0\\PYZus{}0\\PYZus{}2}\\PY{p}{(}\\PY{n}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\n", + "\n", + "\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{GPU\\PYZus{}Map\\PYZus{}0\\PYZus{}0\\PYZus{}2\\PYZus{}args}\\PY{p}{[}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{B}\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{gpuError\\PYZus{}t}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaLaunchKernel}\\PY{p}{(}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{o}{*}\\PY{p}{)}\\PY{n}{GPU\\PYZus{}Map\\PYZus{}0\\PYZus{}0\\PYZus{}2}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{GPU\\PYZus{}Map\\PYZus{}0\\PYZus{}0\\PYZus{}2\\PYZus{}args}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\n", + "\\PY{+w}{ }\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}KERNEL\\PYZus{}LAUNCH\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{,}\\PY{+w}{ }\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{GPU\\PYZus{}Map\\PYZus{}0\\PYZus{}0\\PYZus{}2}\\PY{l+s}{\\PYZdq{}}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ 
}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\\end{Verbatim}\n" + ], + "text/plain": [ + "\n", + "#include \n", + "#include \n", + "\n", + "\n", + "struct Warp_test_1_state_t {\n", + " dace::cuda::Context *gpu_context;\n", + "};\n", + "\n", + "\n", + "\n", + "DACE_EXPORTED int __dace_init_experimental_cuda(Warp_test_1_state_t *__state);\n", + "DACE_EXPORTED int __dace_exit_experimental_cuda(Warp_test_1_state_t *__state);\n", + "\n", + "\n", + "\n", + "int __dace_init_experimental_cuda(Warp_test_1_state_t *__state) {\n", + " int count;\n", + "\n", + " // Check that we are able to run cuda code\n", + " if (cudaGetDeviceCount(&count) != cudaSuccess)\n", + " {\n", + " printf(\"ERROR: GPU drivers are not configured or cuda-capable device \"\n", + " \"not found\\n\");\n", + " return 1;\n", + " }\n", + " if (count == 0)\n", + " {\n", + " printf(\"ERROR: No cuda-capable devices found\\n\");\n", + " return 2;\n", + " }\n", + "\n", + " // Initialize cuda before we run the application\n", + " float *dev_X;\n", + " DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n", + " DACE_GPU_CHECK(cudaFree(dev_X));\n", + "\n", + " \n", + "\n", + " __state->gpu_context = new dace::cuda::Context(1, 1);\n", + "\n", + " // Create cuda streams and events\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n", + " __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n", + " }\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n", + " }\n", + "\n", + " \n", + "\n", + " return 0;\n", + "}\n", + "\n", + "int __dace_exit_experimental_cuda(Warp_test_1_state_t *__state) {\n", + " \n", + "\n", + " // Synchronize and check for CUDA errors\n", + " int __err = static_cast(__state->gpu_context->lasterror);\n", + " if (__err == 0)\n", + " __err = static_cast(cudaDeviceSynchronize());\n", + "\n", + " // Destroy cuda streams and events\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n", + " }\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n", + " }\n", + "\n", + " delete __state->gpu_context;\n", + " return __err;\n", + "}\n", + "\n", + "DACE_EXPORTED bool __dace_gpu_set_stream(Warp_test_1_state_t *__state, int streamid, gpuStream_t stream)\n", + "{\n", + " if (streamid < 0 || streamid >= 1)\n", + " return false;\n", + "\n", + " __state->gpu_context->streams[streamid] = stream;\n", + "\n", + " return true;\n", + "}\n", + "\n", + "DACE_EXPORTED void __dace_gpu_set_all_streams(Warp_test_1_state_t *__state, gpuStream_t stream)\n", + "{\n", + " for (int i = 0; i < 1; ++i)\n", + " __state->gpu_context->streams[i] = stream;\n", + "}\n", + "\n", + "__global__ void GPU_Map_0_0_2(dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", + "{ // Kernel scope (open 1)\n", + " int i = (32 * blockIdx.x);\n", + " { // ThreadBlock Scope (open 1)\n", + " int j = threadIdx.x;\n", + " { // WarpLevel Scope (open 1)\n", + "\n", + " int warpId = (threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z) / 32;\n", + " if ( 0 <= warpId && warpId <= 0) { // WarpLevel 
Scope (open 2)\n", + "\n", + "\n", + "\n", + " //////////////////////////////////////\n", + " // WarpLevel operations start\n", + " //////////////////////////////////////\n", + " {\n", + " dace::uint* inp = &A[0];\n", + " dace::uint out;\n", + "\n", + " ///////////////////\n", + " // Tasklet code (WarpLevel_Operation)\n", + " auto value = inp[j];\n", + " out = __reduce_add_sync(4294967295U, value);\n", + " ///////////////////\n", + "\n", + " B[j] = out;\n", + " }\n", + " //////////////////////////////////////\n", + " // WarpLevel operations end\n", + " //////////////////////////////////////\n", + "\n", + "\n", + "\n", + " } // WarpLevel Scope (close 1)\n", + " } // WarpLevel Scope (close 2)\n", + " } // ThreadBlock Scope (close 1)\n", + "} // Kernel scope (close 1)\n", + "\n", + "\n", + "DACE_EXPORTED void __dace_runkernel_GPU_Map_0_0_2(Warp_test_1_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n", + "void __dace_runkernel_GPU_Map_0_0_2(Warp_test_1_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", + "{\n", + "\n", + "\n", + " void *GPU_Map_0_0_2_args[] = { (void *)&A, (void *)&B };\n", + " gpuError_t __err = cudaLaunchKernel( (void*)GPU_Map_0_0_2, dim3(1, 1, 1), dim3(32, 1, 1), GPU_Map_0_0_2_args, 0, __state->gpu_context->streams[0]\n", + " );\n", + "\n", + " DACE_KERNEL_LAUNCH_CHECK(__err, \"GPU_Map_0_0_2\", 1, 1, 1, 32, 1, 1);\n", + "}\n" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Code(sdfg.generate_code()[1].clean_code, language='cpp')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6d7c1429", + "metadata": {}, + "outputs": [], + "source": [ + "call_it = sdfg.compile()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9673dc1b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/berkay/master-thesis/dace/dace/sdfg/sdfg.py:2373: UserWarning: SDFG 'Warp_test_1' is already loaded by another object, recompiling under a different name 'Warp_test_1_0'.\n", + " warnings.warn(f\"SDFG '{self.name}' is already loaded by another object, recompiling under a different \"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32\n", + " 32 32 32 32 32 32 32 32]\n" + ] + } + ], + "source": [ + "A = cp.ones(32, dtype=cp.uint32) \n", + "B = cp.zeros(32, dtype=cp.uint32) \n", + "\n", + "sdfg(A=A, B=B)\n", + "\n", + "print(B)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dace_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/berkay_workpace/scratch/vis.ipynb b/berkay_workpace/scratch/vis.ipynb new file mode 100644 index 0000000000..efd0359347 --- /dev/null +++ b/berkay_workpace/scratch/vis.ipynb @@ -0,0 +1,492 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "2a7d72f5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import dace\n", + "import cupy as cp\n", + "\n", + "from IPython.display import 
Code\n", + "from dace.config import Config" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2166f4ee", + "metadata": {}, + "outputs": [], + "source": [ + "@dace.program\n", + "def cpuRed(A: dace.uint32[32], B:dace.uint32[32]):\n", + " for i in dace.map[0:32]:\n", + " B[i] = sum(A)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "baa4d9ba", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdfg = cpuRed.to_sdfg()\n", + "sdfg.compile()\n", + "sdfg\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f863ad50", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n",
+       "#include <dace/dace.h>\n",
+       "#include "../../include/hash.h"\n",
+       "\n",
+       "struct cpuRed_state_t {\n",
+       "\n",
+       "};\n",
+       "\n",
+       "inline void reduce_0_0_6(cpuRed_state_t *__state, dace::uint* __restrict__ _in, dace::uint* __restrict__ _out) {\n",
+       "\n",
+       "    {\n",
+       "\n",
+       "        {\n",
+       "            for (auto _o0 = 0; _o0 < 1; _o0 += 1) {\n",
+       "                {\n",
+       "                    dace::uint __out;\n",
+       "\n",
+       "                    ///////////////////\n",
+       "                    // Tasklet code (reduce_init)\n",
+       "                    __out = 0;\n",
+       "                    ///////////////////\n",
+       "\n",
+       "                    _out[_o0] = __out;\n",
+       "                }\n",
+       "            }\n",
+       "        }\n",
+       "\n",
+       "    }\n",
+       "    {\n",
+       "\n",
+       "        {\n",
+       "            for (auto _i0 = 0; _i0 < 32; _i0 += 1) {\n",
+       "                {\n",
+       "                    dace::uint __inp = _in[_i0];\n",
+       "                    dace::uint __out;\n",
+       "\n",
+       "                    ///////////////////\n",
+       "                    // Tasklet code (identity)\n",
+       "                    __out = __inp;\n",
+       "                    ///////////////////\n",
+       "\n",
+       "                    dace::wcr_fixed<dace::ReductionType::Sum, dace::uint>::reduce(_out, __out);\n",
+       "                }\n",
+       "            }\n",
+       "        }\n",
+       "\n",
+       "    }\n",
+       "}\n",
+       "\n",
+       "void __program_cpuRed_internal(cpuRed_state_t*__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n",
+       "{\n",
+       "\n",
+       "    {\n",
+       "\n",
+       "        {\n",
+       "            #pragma omp parallel for\n",
+       "            for (auto i = 0; i < 32; i += 1) {\n",
+       "                dace::uint __tmp1;\n",
+       "                reduce_0_0_6(__state, &A[0], &__tmp1);\n",
+       "                {\n",
+       "                    dace::uint __inp = __tmp1;\n",
+       "                    dace::uint __out;\n",
+       "\n",
+       "                    ///////////////////\n",
+       "                    // Tasklet code (assign_4_8)\n",
+       "                    __out = __inp;\n",
+       "                    ///////////////////\n",
+       "\n",
+       "                    B[i] = __out;\n",
+       "                }\n",
+       "            }\n",
+       "        }\n",
+       "\n",
+       "    }\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED void __program_cpuRed(cpuRed_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n",
+       "{\n",
+       "    __program_cpuRed_internal(__state, A, B);\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED cpuRed_state_t *__dace_init_cpuRed()\n",
+       "{\n",
+       "    int __result = 0;\n",
+       "    cpuRed_state_t *__state = new cpuRed_state_t;\n",
+       "\n",
+       "\n",
+       "\n",
+       "    if (__result) {\n",
+       "        delete __state;\n",
+       "        return nullptr;\n",
+       "    }\n",
+       "    return __state;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED int __dace_exit_cpuRed(cpuRed_state_t *__state)\n",
+       "{\n",
+       "    int __err = 0;\n",
+       "    delete __state;\n",
+       "    return __err;\n",
+       "}\n",
+       "
\n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "\\PY{c+cm}{/* DaCe AUTO\\PYZhy{}GENERATED FILE. DO NOT MODIFY */}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}dace/dace.h\\PYZgt{}}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}../../include/hash.h\\PYZdq{}}\n", + "\n", + "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{cpuRed\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\n", + "\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", + "\n", + "\\PY{k+kr}{inline}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n+nf}{reduce\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{p}{(}\\PY{n}{cpuRed\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{\\PYZus{}in}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{\\PYZus{}out}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k}{auto}\\PY{+w}{ }\\PY{n}{\\PYZus{}o0}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{\\PYZus{}o0}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{\\PYZus{}o0}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\\PY{+w}{ }\\PY{c+c1}{// Tasklet code (reduce\\PYZus{}init)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}out}\\PY{p}{[}\\PY{n}{\\PYZus{}o0}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k}{auto}\\PY{+w}{ }\\PY{n}{\\PYZus{}i0}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{\\PYZus{}i0}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{\\PYZus{}i0}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}in}\\PY{p}{[}\\PY{n}{\\PYZus{}i0}\\PY{p}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\\PY{+w}{ }\\PY{c+c1}{// Tasklet code (identity)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\n", + "\\PY{+w}{ 
}\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{wcr\\PYZus{}fixed}\\PY{o}{\\PYZlt{}}\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{ReductionType}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Sum}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{o}{\\PYZgt{}}\\PY{o}{:}\\PY{o}{:}\\PY{n}{reduce}\\PY{p}{(}\\PY{n}{\\PYZus{}out}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}program\\PYZus{}cpuRed\\PYZus{}internal}\\PY{p}{(}\\PY{n}{cpuRed\\PYZus{}state\\PYZus{}t}\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp parallel for}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k}{auto}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp1}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{reduce\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}tmp1}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp1}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\\PY{+w}{ }\\PY{c+c1}{// Tasklet code (assign\\PYZus{}4\\PYZus{}8)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{B}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}cpuRed}\\PY{p}{(}\\PY{n}{cpuRed\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ 
}\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}cpuRed\\PYZus{}internal}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{cpuRed\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}cpuRed}\\PY{p}{(}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{cpuRed\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n}{cpuRed\\PYZus{}state\\PYZus{}t}\\PY{p}{;}\n", + "\n", + "\n", + "\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{nullptr}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}cpuRed}\\PY{p}{(}\\PY{n}{cpuRed\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\\end{Verbatim}\n" + ], + "text/plain": [ + "/* DaCe AUTO-GENERATED FILE. 
DO NOT MODIFY */\n", + "#include \n", + "#include \"../../include/hash.h\"\n", + "\n", + "struct cpuRed_state_t {\n", + "\n", + "};\n", + "\n", + "inline void reduce_0_0_6(cpuRed_state_t *__state, dace::uint* __restrict__ _in, dace::uint* __restrict__ _out) {\n", + "\n", + " {\n", + "\n", + " {\n", + " for (auto _o0 = 0; _o0 < 1; _o0 += 1) {\n", + " {\n", + " dace::uint __out;\n", + "\n", + " ///////////////////\n", + " // Tasklet code (reduce_init)\n", + " __out = 0;\n", + " ///////////////////\n", + "\n", + " _out[_o0] = __out;\n", + " }\n", + " }\n", + " }\n", + "\n", + " }\n", + " {\n", + "\n", + " {\n", + " for (auto _i0 = 0; _i0 < 32; _i0 += 1) {\n", + " {\n", + " dace::uint __inp = _in[_i0];\n", + " dace::uint __out;\n", + "\n", + " ///////////////////\n", + " // Tasklet code (identity)\n", + " __out = __inp;\n", + " ///////////////////\n", + "\n", + " dace::wcr_fixed::reduce(_out, __out);\n", + " }\n", + " }\n", + " }\n", + "\n", + " }\n", + "}\n", + "\n", + "void __program_cpuRed_internal(cpuRed_state_t*__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", + "{\n", + "\n", + " {\n", + "\n", + " {\n", + " #pragma omp parallel for\n", + " for (auto i = 0; i < 32; i += 1) {\n", + " dace::uint __tmp1;\n", + " reduce_0_0_6(__state, &A[0], &__tmp1);\n", + " {\n", + " dace::uint __inp = __tmp1;\n", + " dace::uint __out;\n", + "\n", + " ///////////////////\n", + " // Tasklet code (assign_4_8)\n", + " __out = __inp;\n", + " ///////////////////\n", + "\n", + " B[i] = __out;\n", + " }\n", + " }\n", + " }\n", + "\n", + " }\n", + "}\n", + "\n", + "DACE_EXPORTED void __program_cpuRed(cpuRed_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", + "{\n", + " __program_cpuRed_internal(__state, A, B);\n", + "}\n", + "\n", + "DACE_EXPORTED cpuRed_state_t *__dace_init_cpuRed()\n", + "{\n", + " int __result = 0;\n", + " cpuRed_state_t *__state = new cpuRed_state_t;\n", + "\n", + "\n", + "\n", + " if (__result) {\n", + " delete __state;\n", + " return nullptr;\n", + " }\n", + " return __state;\n", + "}\n", + "\n", + "DACE_EXPORTED int __dace_exit_cpuRed(cpuRed_state_t *__state)\n", + "{\n", + " int __err = 0;\n", + " delete __state;\n", + " return __err;\n", + "}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Code(sdfg.generate_code()[0].clean_code, language='cpp')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dace_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From f47aba6b14c999babaf1286b04044c3ec9b7ffa0 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Tue, 13 May 2025 18:29:43 +0200 Subject: [PATCH 05/94] fix ThreadBlock scope generation and improve readability --- dace/codegen/targets/experimental_cuda.py | 166 +++++++++++++++++----- 1 file changed, 133 insertions(+), 33 deletions(-) diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 60c89131bb..36dd7e4795 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -40,6 +40,7 @@ # TODO: GENERAL, discuss with Yakup +# 1. 
Approval of dtypes # TODO: I am not handling map with strided rights now, @@ -658,37 +659,39 @@ def _generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: + """ - def _generate_GPU_ThreadBlock_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, - state_id: int, function_stream: CodeIOStream, kernel_stream: CodeIOStream) -> None: - - node = dfg_scope.source_nodes()[0] - scope_map = node.map - - - with KernelScopeManager(cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, - function_stream=function_stream, callsite_stream=kernel_stream, comment="ThreadBlock Scope",) as scopeManager: - - - brange = subsets.Range(scope_map.range[::-1]) - - dsym = [symbolic.symbol(f'__DAPT{i}', nonnegative=True, integer=True) for i in range(len(brange))] - bdims = self._current_kernel_spec.block_dims - dsym_end = [d + (bs * rng[2]) - 1 for d, bs, rng in zip(dsym, bdims, brange)] - tidx = brange.coord_at(dsym) - - # First three dimensions are evaluated directly - for i in range(min(len(brange), 3)): - - varname = scope_map.params[-i - 1] - block_expr = 'threadIdx.%s' % _get_cuda_dim(i) - - expr = symbolic_to_cpp(tidx[i]).replace(f'__DAPT{i}', block_expr) - kernel_stream.write(f'int {varname} = {expr};', cfg, state_id, node) - self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int') - + # First three dimensions are evaluated directly + for i in range(min(len(brange), 3)): + varname = scope_map.params[-i - 1] + # Delinearize third dimension if necessary + if i == 2 and len(brange) > 3: + block_expr = '(threadIdx.z / (%s))' % _topy(functools.reduce(sympy.Mul, kdims[3:], 1)) + else: + block_expr = 'threadIdx.%s' % _named_idx(i) + + expr = _topy(tidx[i]).replace('__DAPT%d' % i, block_expr) + callsite_stream.write('int %s = %s;' % (varname, expr), cfg, state_id, scope_entry) + self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int') + + # Delinearize beyond the third dimension + if len(brange) > 3: + for i in range(3, len(brange)): + varname = scope_map.params[-i - 1] + # true dim i = z / ('*'.join(kdims[i+1:])) % kdims[i] + block_expr = '(threadIdx.z / (%s)) %% (%s)' % ( + _topy(functools.reduce(sympy.Mul, kdims[i + 1:], 1)), + _topy(kdims[i]), + ) + + expr = _topy(tidx[i]).replace('__DAPT%d' % i, block_expr) + callsite_stream.write('int %s = %s;' % (varname, expr), cfg, state_id, scope_entry) + self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int') + + # Generate conditions for this block's execution using min and max + # element, e.g. 
skipping out-of-bounds threads in trailing block minels = brange.min_element() maxels = brange.max_element() for i, (v, minel, maxel) in enumerate(zip(scope_map.params[::-1], minels, maxels)): @@ -699,7 +702,7 @@ def _generate_GPU_ThreadBlock_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, df # Block range start if i >= 3 or (dsym[i] >= minel) != True: - condition += '%s >= %s' % (v, symbolic_to_cpp(minel)) + condition += '%s >= %s' % (v, _topy(minel)) # Special case: block size is exactly the range of the map (0:b) if i >= 3: @@ -711,15 +714,110 @@ def _generate_GPU_ThreadBlock_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, df if i >= 3 or (not skipcond and (dsym_end[i] < maxel) != True): if len(condition) > 0: condition += ' && ' - condition += '%s < %s' % (v, symbolic_to_cpp(maxel + 1)) + condition += '%s < %s' % (v, _topy(maxel + 1)) # Emit condition in code if len(condition) > 0: - scopeManager.open(condition=condition) + callsite_stream.write('if (%s) {' % condition, cfg, state_id, scope_entry) + else: + callsite_stream.write('{', cfg, state_id, scope_entry) + """ + + def _generate_GPU_ThreadBlock_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, kernel_stream: CodeIOStream) -> None: + + + # NOTE: not my code, but my insights. Approval for commenting this needed + with KernelScopeManager(cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, + function_stream=function_stream, callsite_stream=kernel_stream, comment="ThreadBlock Scope",) as scopeManager: + + node = dfg_scope.source_nodes()[0] + scope_map = node.map + + + # ----------------- Map Range Preprocessing ----------------------- + + # Reverse range for better performance (e.g. memory coalescing) + reversed_scope_range = scope_map.range[::-1] + map_range = subsets.Range(reversed_scope_range) + map_dimensions = len(map_range) + map_dim_sizes = map_range.size() + + kernel_block_dims = self._current_kernel_spec.block_dims + + + # ----------------- Symbolic Index Expressions ----------------------- + + symbolic_indices = [ symbolic.symbol(f'__SYM_IDX{dim}', nonnegative=True, integer=True) for dim in range(map_dimensions)] + symbolic_index_bounds = [idx + (block_dim * rng[2]) - 1 for idx, block_dim, rng in zip(symbolic_indices, kernel_block_dims, map_range)] + symbolic_coordinates = map_range.coord_at(symbolic_indices) + + + # ----------------- Generate Index Variable Definitions ----------------------- + + for dim in range(map_dimensions): + + var_name = scope_map.params[-dim - 1] # also reverse it here! + + if dim < 3: + # First three dimensions: direct mapping or partial delinearization + if dim == 2 and map_dimensions > 3: + tail_prod = prod(map_dim_sizes[3:]) + base_expr = f"(threadIdx.z / ({symbolic_to_cpp(tail_prod)}))" + else: + base_expr = f"threadIdx.{_get_cuda_dim(dim)}" + else: + # Dimensions beyond the third: full delinearization + tail_prod = prod(map_dim_sizes[dim + 1:]) + base_expr = (f"(threadIdx.z / ({symbolic_to_cpp(tail_prod)})) % "f"({symbolic_to_cpp(map_dim_sizes[dim])})") + + + var_def = symbolic_to_cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', base_expr) + kernel_stream.write(f'int {var_name} = {var_def};', cfg, state_id, node) + self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, 'int') + + + + # ----------------- Guard Conditions for Block Execution ----------------------- + + # Generate conditions for this block's execution using min and max + # element, e.g. 
skipping out-of-bounds threads in trailing block + minels = map_range.min_element() + maxels = map_range.max_element() + for dim, (var_name, start, end) in enumerate(zip(scope_map.params[::-1], minels, maxels)): + + # Optimize conditions if they are always true + ############################################# + + condition = '' + + # Block range start + if dim >= 3 or (symbolic_indices[dim] >= start) != True: + condition += f'{var_name} >= {symbolic_to_cpp(start)}' + + # Special case: block size is exactly the range of the map (0:b) + if dim >= 3: + skipcond = False + else: + skipcond = symbolic_index_bounds[dim].subs({symbolic_indices[dim]: start}) == end + + # Block range end + if dim >= 3 or (not skipcond and (symbolic_index_bounds[dim] < end) != True): + if len(condition) > 0: + condition += ' && ' + condition += f'{var_name} < {symbolic_to_cpp(end + 1)}' + + # Emit condition in code if any + if len(condition) > 0: + scopeManager.open(condition=condition) + + # ----------------- Dispatch Subgraph code generation ----------------------- self._dispatcher.dispatch_subgraph(sdfg, cfg, dfg_scope, state_id, function_stream, - kernel_stream, skip_entry_node=True) + kernel_stream, skip_entry_node=True) + + @@ -748,6 +846,8 @@ def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope # ----------------- Guard checks ----------------------- + #TODO: Move them to validation as well if possible + #TODO: rename xfh, to cryptic parent_map, _ = xfh.get_parent_map(state_dfg, node) if parent_map.schedule != dtypes.ScheduleType.GPU_ThreadBlock: @@ -828,7 +928,7 @@ def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope - # ----------------- Warp Code Block ----------------------- + # ----------------- Dispatch Subgraph code generation ----------------------- self._dispatcher.dispatch_subgraph( sdfg, cfg, dfg_scope, state_id, function_stream, From 2d4a8f36f5dc98fdb974e78e49b4ad449fccde10 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Fri, 16 May 2025 17:23:27 +0200 Subject: [PATCH 06/94] setting up testing environment --- berkay_workpace/scratch/A_output.txt | 4 + berkay_workpace/scratch/tesetbed2.ipynb | 626 ------------------ berkay_workpace/scratch/testbed.ipynb | 142 ++-- .../scratch/visualizingWarps.ipynb | 180 +++++ .../warp_level_test.py | 476 +++++++++++++ .../tests/gpu_map_tests/device_map_test.py | 155 +++++ .../tests/gpu_map_tests/threadBlock_test.py | 97 +++ .../tests/reusable_tests/cuda_block_test.py | 207 ++++++ .../cuda_highdim_kernel_test.py | 213 ++++++ berkay_workpace/tests/tests.py | 335 ---------- 10 files changed, 1382 insertions(+), 1053 deletions(-) create mode 100644 berkay_workpace/scratch/A_output.txt delete mode 100644 berkay_workpace/scratch/tesetbed2.ipynb create mode 100644 berkay_workpace/scratch/visualizingWarps.ipynb create mode 100644 berkay_workpace/tests/experimental_features_tests/warp_level_test.py create mode 100644 berkay_workpace/tests/gpu_map_tests/device_map_test.py create mode 100644 berkay_workpace/tests/gpu_map_tests/threadBlock_test.py create mode 100644 berkay_workpace/tests/reusable_tests/cuda_block_test.py create mode 100644 berkay_workpace/tests/reusable_tests/cuda_highdim_kernel_test.py delete mode 100644 berkay_workpace/tests/tests.py diff --git a/berkay_workpace/scratch/A_output.txt b/berkay_workpace/scratch/A_output.txt new file mode 100644 index 0000000000..38c368d3df --- /dev/null +++ b/berkay_workpace/scratch/A_output.txt @@ -0,0 +1,4 @@ +0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 +100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 +200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 +300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 diff --git 
a/berkay_workpace/scratch/tesetbed2.ipynb b/berkay_workpace/scratch/tesetbed2.ipynb deleted file mode 100644 index 45307e0ed4..0000000000 --- a/berkay_workpace/scratch/tesetbed2.ipynb +++ /dev/null @@ -1,626 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "1497afd7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import dace\n", - "\n", - "from dace import registry\n", - "from dace.sdfg.scope import ScopeSubgraphView\n", - "from dace.codegen.prettycode import CodeIOStream\n", - "from dace.codegen.targets.target import TargetCodeGenerator\n", - "from dace.codegen.targets.framecode import DaCeCodeGenerator\n", - "from dace.codegen.targets.cpp import sym2cpp\n", - "from IPython.display import Code\n", - "from dace.config import Config" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "130d986f", - "metadata": {}, - "outputs": [], - "source": [ - "@dace.program\n", - "def reduce_add_sync(mask: dace.uint32, value: dace.uint32):\n", - "\n", - " result = dace.define_local_scalar(dace.uint32)\n", - " \n", - " with dace.tasklet(dace.Language.CPP):\n", - " inp_mask << mask\n", - " inp_value << value\n", - " out_result >> result\n", - " \"\"\"\n", - " out_result = __reduce_add_sync(inp_mask, inp_value);\n", - " \"\"\"\n", - " return result\n", - "\n", - "\n", - "\n", - "@dace.program\n", - "def warpLevel(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global):\n", - " for _ in dace.map[0:1024:1024] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " for j in dace.map[0:1024] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", - "\n", - " for l, m, k in dace.map[0:4:2, 0:4, 0:2] @ dace.dtypes.ScheduleType.GPU_Warp:\n", - " mask = 0xffffffff\n", - " value = A[j]\n", - " result = dace.define_local_scalar(dace.uint32)\n", - " with dace.tasklet(dace.Language.CPP):\n", - " inp_mask << mask\n", - " inp_value << value\n", - " out_result >> result\n", - " \"\"\"\n", - " out_result = __reduce_add_sync(inp_mask, inp_value);\n", - " \"\"\"\n", - " \n", - " B[j] = result\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "c0146590", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
#include <cuda_runtime.h>\n",
-       "#include <dace/dace.h>\n",
-       "\n",
-       "\n",
-       "struct warpLevel_state_t {\n",
-       "    dace::cuda::Context *gpu_context;\n",
-       "};\n",
-       "\n",
-       "\n",
-       "\n",
-       "DACE_EXPORTED int __dace_init_experimental_cuda(warpLevel_state_t *__state);\n",
-       "DACE_EXPORTED int __dace_exit_experimental_cuda(warpLevel_state_t *__state);\n",
-       "\n",
-       "DACE_DFI void warpLevel_19_4_20_8_22_12_0_0_8(const dace::uint&  __tmp_24_24_r, dace::uint* __restrict__ __tmp_34_16_w) {\n",
-       "    dace::uint value;\n",
-       "\n",
-       "\n",
-       "    value = __tmp_24_24_r;\n",
-       "    {\n",
-       "\n",
-       "        {\n",
-       "            dace::uint __out;\n",
-       "\n",
-       "            ///////////////////\n",
-       "            // Tasklet code (assign_34_16)\n",
-       "            __out = __reduce_add_sync(4294967295U, value);\n",
-       "            ///////////////////\n",
-       "\n",
-       "            __tmp_34_16_w[0] = __out;\n",
-       "        }\n",
-       "\n",
-       "    }\n",
-       "}\n",
-       "\n",
-       "\n",
-       "\n",
-       "int __dace_init_experimental_cuda(warpLevel_state_t *__state) {\n",
-       "    int count;\n",
-       "\n",
-       "    // Check that we are able to run cuda code\n",
-       "    if (cudaGetDeviceCount(&count) != cudaSuccess)\n",
-       "    {\n",
-       "        printf("ERROR: GPU drivers are not configured or cuda-capable device "\n",
-       "               "not found\\n");\n",
-       "        return 1;\n",
-       "    }\n",
-       "    if (count == 0)\n",
-       "    {\n",
-       "        printf("ERROR: No cuda-capable devices found\\n");\n",
-       "        return 2;\n",
-       "    }\n",
-       "\n",
-       "    // Initialize cuda before we run the application\n",
-       "    float *dev_X;\n",
-       "    DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n",
-       "    DACE_GPU_CHECK(cudaFree(dev_X));\n",
-       "\n",
-       "    \n",
-       "\n",
-       "    __state->gpu_context = new dace::cuda::Context(1, 1);\n",
-       "\n",
-       "    // Create cuda streams and events\n",
-       "    for(int i = 0; i < 1; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n",
-       "        __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n",
-       "    }\n",
-       "    for(int i = 0; i < 1; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n",
-       "    }\n",
-       "\n",
-       "    \n",
-       "\n",
-       "    return 0;\n",
-       "}\n",
-       "\n",
-       "int __dace_exit_experimental_cuda(warpLevel_state_t *__state) {\n",
-       "    \n",
-       "\n",
-       "    // Synchronize and check for CUDA errors\n",
-       "    int __err = static_cast<int>(__state->gpu_context->lasterror);\n",
-       "    if (__err == 0)\n",
-       "        __err = static_cast<int>(cudaDeviceSynchronize());\n",
-       "\n",
-       "    // Destroy cuda streams and events\n",
-       "    for(int i = 0; i < 1; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n",
-       "    }\n",
-       "    for(int i = 0; i < 1; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n",
-       "    }\n",
-       "\n",
-       "    delete __state->gpu_context;\n",
-       "    return __err;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED bool __dace_gpu_set_stream(warpLevel_state_t *__state, int streamid, gpuStream_t stream)\n",
-       "{\n",
-       "    if (streamid < 0 || streamid >= 1)\n",
-       "        return false;\n",
-       "\n",
-       "    __state->gpu_context->streams[streamid] = stream;\n",
-       "\n",
-       "    return true;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED void __dace_gpu_set_all_streams(warpLevel_state_t *__state, gpuStream_t stream)\n",
-       "{\n",
-       "    for (int i = 0; i < 1; ++i)\n",
-       "        __state->gpu_context->streams[i] = stream;\n",
-       "}\n",
-       "\n",
-       "__global__ void warpLevel_19_0_0_0(dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n",
-       "{ // Kernel scope (open 1)\n",
-       "    int _ = (1024 * blockIdx.x);\n",
-       "    { // ThreadBlock Scope (open 1)\n",
-       "        int j = threadIdx.x;\n",
-       "        { // WarpLevel Scope (open 1)\n",
-       "            int warpId_warpLevel_19_4_20_8_22_0_0_6 = (threadIdx.x) / 32;\n",
-       "            int k = warpId_warpLevel_19_4_20_8_22_0_0_6 % 2;\n",
-       "            int m = (warpId_warpLevel_19_4_20_8_22_0_0_6 / 2) % 4;\n",
-       "            int l = (warpId_warpLevel_19_4_20_8_22_0_0_6 / (2 * 4)) % 4;\n",
-       "            warpLevel_19_4_20_8_22_12_0_0_8(A[j], &B[j]);\n",
-       "        } // WarpLevel Scope (close 1)\n",
-       "    } // ThreadBlock Scope (close 1)\n",
-       "} // Kernel scope (close 1)\n",
-       "\n",
-       "\n",
-       "DACE_EXPORTED void __dace_runkernel_warpLevel_19_0_0_0(warpLevel_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n",
-       "void __dace_runkernel_warpLevel_19_0_0_0(warpLevel_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n",
-       "{\n",
-       "\n",
-       "\n",
-       "    void  *warpLevel_19_0_0_0_args[] = { (void *)&A, (void *)&B };\n",
-       "    gpuError_t __err = cudaLaunchKernel( (void*)warpLevel_19_0_0_0, dim3(1, 1, 1), dim3(1024, 1, 1), warpLevel_19_0_0_0_args, 0, __state->gpu_context->streams[0]\n",
-       "    );\n",
-       "\n",
-       "    DACE_KERNEL_LAUNCH_CHECK(__err, "warpLevel_19_0_0_0", 1, 1, 1, 1024, 1, 1);\n",
-       "}\n",
-       "
\n" - ], - "text/latex": [ - "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}cuda\\PYZus{}runtime.h\\PYZgt{}}\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}dace/dace.h\\PYZgt{}}\n", - "\n", - "\n", - "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", - "\n", - "\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}DFI}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{warpLevel\\PYZus{}19\\PYZus{}4\\PYZus{}20\\PYZus{}8\\PYZus{}22\\PYZus{}12\\PYZus{}0\\PYZus{}0\\PYZus{}8}\\PY{p}{(}\\PY{k}{const}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp\\PYZus{}24\\PYZus{}24\\PYZus{}r}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp\\PYZus{}34\\PYZus{}16\\PYZus{}w}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{value}\\PY{p}{;}\n", - "\n", - "\n", - "\\PY{+w}{ }\\PY{n}{value}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp\\PYZus{}24\\PYZus{}24\\PYZus{}r}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\\PY{+w}{ }\\PY{c+c1}{// Tasklet code (assign\\PYZus{}34\\PYZus{}16)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}reduce\\PYZus{}add\\PYZus{}sync}\\PY{p}{(}\\PY{l+m+mi}{4294967295U}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{value}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp\\PYZus{}34\\PYZus{}16\\PYZus{}w}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\n", - "\n", - "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{count}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Check that we are able to run cuda code}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{cudaGetDeviceCount}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{count}\\PY{p}{)}\\PY{+w}{ 
}\\PY{o}{!}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaSuccess}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{ERROR: GPU drivers are not configured or cuda\\PYZhy{}capable device }\\PY{l+s}{\\PYZdq{}}\n", - "\\PY{+w}{ }\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{not found}\\PY{l+s+se}{\\PYZbs{}n}\\PY{l+s}{\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{count}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{ERROR: No cuda\\PYZhy{}capable devices found}\\PY{l+s+se}{\\PYZbs{}n}\\PY{l+s}{\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Initialize cuda before we run the application}\n", - "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{dev\\PYZus{}X}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMalloc}\\PY{p}{(}\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{dev\\PYZus{}X}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaFree}\\PY{p}{(}\\PY{n}{dev\\PYZus{}X}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Create cuda streams and events}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaStreamNonBlocking}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{;}\\PY{+w}{ }\\PY{c+c1}{// Allow for externals to modify streams}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ 
}\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaEventDisableTiming}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Synchronize and check for CUDA errors}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{k+kt}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{lasterror}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{k+kt}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{cudaDeviceSynchronize}\\PY{p}{(}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Destroy cuda streams and events}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{bool}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}stream}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ 
}\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{streamid}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{|}\\PY{+w}{ }\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZgt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{false}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{streamid}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{true}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}all\\PYZus{}streams}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{\\PYZus{}\\PYZus{}global\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{warpLevel\\PYZus{}19\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// Kernel scope (open 1)}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{1024}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{blockIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// ThreadBlock Scope (open 1)}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{j}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (open 1)}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{warpId\\PYZus{}warpLevel\\PYZus{}19\\PYZus{}4\\PYZus{}20\\PYZus{}8\\PYZus{}22\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{/}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{k}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{warpId\\PYZus{}warpLevel\\PYZus{}19\\PYZus{}4\\PYZus{}20\\PYZus{}8\\PYZus{}22\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{+w}{ }\\PY{o}{\\PYZpc{}}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{;}\n", - 
"\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{m}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{warpId\\PYZus{}warpLevel\\PYZus{}19\\PYZus{}4\\PYZus{}20\\PYZus{}8\\PYZus{}22\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{+w}{ }\\PY{o}{/}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZpc{}}\\PY{+w}{ }\\PY{l+m+mi}{4}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{l}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{warpId\\PYZus{}warpLevel\\PYZus{}19\\PYZus{}4\\PYZus{}20\\PYZus{}8\\PYZus{}22\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{+w}{ }\\PY{o}{/}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{2}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{l+m+mi}{4}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZpc{}}\\PY{+w}{ }\\PY{l+m+mi}{4}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{warpLevel\\PYZus{}19\\PYZus{}4\\PYZus{}20\\PYZus{}8\\PYZus{}22\\PYZus{}12\\PYZus{}0\\PYZus{}0\\PYZus{}8}\\PY{p}{(}\\PY{n}{A}\\PY{p}{[}\\PY{n}{j}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{B}\\PY{p}{[}\\PY{n}{j}\\PY{p}{]}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (close 1)}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// ThreadBlock Scope (close 1)}\n", - "\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// Kernel scope (close 1)}\n", - "\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}warpLevel\\PYZus{}19\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}warpLevel\\PYZus{}19\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{warpLevel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\n", - "\n", - "\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{warpLevel\\PYZus{}19\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZus{}args}\\PY{p}{[}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{B}\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{gpuError\\PYZus{}t}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaLaunchKernel}\\PY{p}{(}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{o}{*}\\PY{p}{)}\\PY{n}{warpLevel\\PYZus{}19\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{1024}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ 
}\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{warpLevel\\PYZus{}19\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZus{}args}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\n", - "\\PY{+w}{ }\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}KERNEL\\PYZus{}LAUNCH\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{,}\\PY{+w}{ }\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{warpLevel\\PYZus{}19\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{l+s}{\\PYZdq{}}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1024}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\\end{Verbatim}\n" - ], - "text/plain": [ - "\n", - "#include \n", - "#include \n", - "\n", - "\n", - "struct warpLevel_state_t {\n", - " dace::cuda::Context *gpu_context;\n", - "};\n", - "\n", - "\n", - "\n", - "DACE_EXPORTED int __dace_init_experimental_cuda(warpLevel_state_t *__state);\n", - "DACE_EXPORTED int __dace_exit_experimental_cuda(warpLevel_state_t *__state);\n", - "\n", - "DACE_DFI void warpLevel_19_4_20_8_22_12_0_0_8(const dace::uint& __tmp_24_24_r, dace::uint* __restrict__ __tmp_34_16_w) {\n", - " dace::uint value;\n", - "\n", - "\n", - " value = __tmp_24_24_r;\n", - " {\n", - "\n", - " {\n", - " dace::uint __out;\n", - "\n", - " ///////////////////\n", - " // Tasklet code (assign_34_16)\n", - " __out = __reduce_add_sync(4294967295U, value);\n", - " ///////////////////\n", - "\n", - " __tmp_34_16_w[0] = __out;\n", - " }\n", - "\n", - " }\n", - "}\n", - "\n", - "\n", - "\n", - "int __dace_init_experimental_cuda(warpLevel_state_t *__state) {\n", - " int count;\n", - "\n", - " // Check that we are able to run cuda code\n", - " if (cudaGetDeviceCount(&count) != cudaSuccess)\n", - " {\n", - " printf(\"ERROR: GPU drivers are not configured or cuda-capable device \"\n", - " \"not found\\n\");\n", - " return 1;\n", - " }\n", - " if (count == 0)\n", - " {\n", - " printf(\"ERROR: No cuda-capable devices found\\n\");\n", - " return 2;\n", - " }\n", - "\n", - " // Initialize cuda before we run the application\n", - " float *dev_X;\n", - " DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n", - " DACE_GPU_CHECK(cudaFree(dev_X));\n", - "\n", - " \n", - "\n", - " __state->gpu_context = new dace::cuda::Context(1, 1);\n", - "\n", - " // Create cuda streams and events\n", - " for(int i = 0; i < 1; ++i) {\n", - " DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n", - " __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n", - " }\n", - " for(int i = 0; i < 1; ++i) {\n", - " DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n", - " }\n", - "\n", - " \n", - "\n", - " return 0;\n", - "}\n", - "\n", - "int __dace_exit_experimental_cuda(warpLevel_state_t *__state) {\n", - " \n", - "\n", - " // Synchronize and check for CUDA errors\n", - " int __err = static_cast(__state->gpu_context->lasterror);\n", - " if (__err == 0)\n", - " __err = static_cast(cudaDeviceSynchronize());\n", - "\n", - " // Destroy cuda streams and events\n", - " for(int i = 0; i < 1; ++i) {\n", - " 
DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n", - " }\n", - " for(int i = 0; i < 1; ++i) {\n", - " DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n", - " }\n", - "\n", - " delete __state->gpu_context;\n", - " return __err;\n", - "}\n", - "\n", - "DACE_EXPORTED bool __dace_gpu_set_stream(warpLevel_state_t *__state, int streamid, gpuStream_t stream)\n", - "{\n", - " if (streamid < 0 || streamid >= 1)\n", - " return false;\n", - "\n", - " __state->gpu_context->streams[streamid] = stream;\n", - "\n", - " return true;\n", - "}\n", - "\n", - "DACE_EXPORTED void __dace_gpu_set_all_streams(warpLevel_state_t *__state, gpuStream_t stream)\n", - "{\n", - " for (int i = 0; i < 1; ++i)\n", - " __state->gpu_context->streams[i] = stream;\n", - "}\n", - "\n", - "__global__ void warpLevel_19_0_0_0(dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", - "{ // Kernel scope (open 1)\n", - " int _ = (1024 * blockIdx.x);\n", - " { // ThreadBlock Scope (open 1)\n", - " int j = threadIdx.x;\n", - " { // WarpLevel Scope (open 1)\n", - " int warpId_warpLevel_19_4_20_8_22_0_0_6 = (threadIdx.x) / 32;\n", - " int k = warpId_warpLevel_19_4_20_8_22_0_0_6 % 2;\n", - " int m = (warpId_warpLevel_19_4_20_8_22_0_0_6 / 2) % 4;\n", - " int l = (warpId_warpLevel_19_4_20_8_22_0_0_6 / (2 * 4)) % 4;\n", - " warpLevel_19_4_20_8_22_12_0_0_8(A[j], &B[j]);\n", - " } // WarpLevel Scope (close 1)\n", - " } // ThreadBlock Scope (close 1)\n", - "} // Kernel scope (close 1)\n", - "\n", - "\n", - "DACE_EXPORTED void __dace_runkernel_warpLevel_19_0_0_0(warpLevel_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n", - "void __dace_runkernel_warpLevel_19_0_0_0(warpLevel_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", - "{\n", - "\n", - "\n", - " void *warpLevel_19_0_0_0_args[] = { (void *)&A, (void *)&B };\n", - " gpuError_t __err = cudaLaunchKernel( (void*)warpLevel_19_0_0_0, dim3(1, 1, 1), dim3(1024, 1, 1), warpLevel_19_0_0_0_args, 0, __state->gpu_context->streams[0]\n", - " );\n", - "\n", - " DACE_KERNEL_LAUNCH_CHECK(__err, \"warpLevel_19_0_0_0\", 1, 1, 1, 1024, 1, 1);\n", - "}\n" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sdfg = warpLevel.to_sdfg()\n", - "\n", - "Code(sdfg.generate_code()[1].clean_code, language='cpp')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ac768f2", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "dace_env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/berkay_workpace/scratch/testbed.ipynb b/berkay_workpace/scratch/testbed.ipynb index b67d6cbc56..23161b92b2 100644 --- a/berkay_workpace/scratch/testbed.ipynb +++ b/berkay_workpace/scratch/testbed.ipynb @@ -48,15 +48,15 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" @@ -122,7 +122,7 @@ "\n", "\n", "\n", - "#sdfg.fill_scope_connectors()\n", + "sdfg.fill_scope_connectors()\n", "\n", "\n", "\n", @@ -308,35 +308,21 @@ " { // ThreadBlock Scope (open 1)\n", " int j = threadIdx.x;\n", " { // WarpLevel Scope (open 1)\n", - "\n", - " int warpId = (threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z) / 32;\n", - " if ( 0 <= warpId && warpId <= 0) { // WarpLevel Scope (open 2)\n", - "\n", - "\n", - "\n", - " //////////////////////////////////////\n", - " // WarpLevel operations start\n", - " //////////////////////////////////////\n", - " {\n", - " dace::uint* inp = &A[0];\n", - " dace::uint out;\n", - "\n", - " ///////////////////\n", - " // Tasklet code (WarpLevel_Operation)\n", - " auto value = inp[j];\n", - " out = __reduce_add_sync(4294967295U, value);\n", - " ///////////////////\n", - "\n", - " B[j] = out;\n", - " }\n", - " //////////////////////////////////////\n", - " // WarpLevel operations end\n", - " //////////////////////////////////////\n", - "\n", - "\n", - "\n", - " } // WarpLevel Scope (close 1)\n", - " } // WarpLevel Scope (close 2)\n", + " int warpId_WarpLevel_Operation_map_0_0_6 = (threadIdx.x) / 32;\n", + " int _ = warpId_WarpLevel_Operation_map_0_0_6 % 1;\n", + " {\n", + " dace::uint* inp = &A[0];\n", + " dace::uint out;\n", + "\n", + " ///////////////////\n", + " // Tasklet code (WarpLevel_Operation)\n", + " auto value = inp[j];\n", + " out = __reduce_add_sync(4294967295U, value);\n", + " ///////////////////\n", + "\n", + " B[j] = out;\n", + " }\n", + " } // WarpLevel Scope (close 1)\n", " } // ThreadBlock Scope (close 1)\n", "} // Kernel scope (close 1)\n", "\n", @@ -452,35 +438,21 @@ "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// ThreadBlock Scope (open 1)}\n", "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{j}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{;}\n", "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (open 1)}\n", - "\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{warpId}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{blockDim}\\PY{p}{.}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{blockDim}\\PY{p}{.}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{blockDim}\\PY{p}{.}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{z}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{/}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{warpId}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{warpId}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (open 2)}\n", - "\n", - "\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{//////////////////////////////////////}\n", - "\\PY{+w}{ }\\PY{c+c1}{// WarpLevel operations start}\n", - "\\PY{+w}{ }\\PY{c+c1}{//////////////////////////////////////}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{o}{*}\\PY{+w}{ }\\PY{n}{inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{out}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ 
}\\PY{c+c1}{///////////////////}\n", - "\\PY{+w}{ }\\PY{c+c1}{// Tasklet code (WarpLevel\\PYZus{}Operation)}\n", - "\\PY{+w}{ }\\PY{k}{auto}\\PY{+w}{ }\\PY{n}{value}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{inp}\\PY{p}{[}\\PY{n}{j}\\PY{p}{]}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}reduce\\PYZus{}add\\PYZus{}sync}\\PY{p}{(}\\PY{l+m+mi}{4294967295U}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{value}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{B}\\PY{p}{[}\\PY{n}{j}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{out}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{c+c1}{//////////////////////////////////////}\n", - "\\PY{+w}{ }\\PY{c+c1}{// WarpLevel operations end}\n", - "\\PY{+w}{ }\\PY{c+c1}{//////////////////////////////////////}\n", - "\n", - "\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (close 1)}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (close 2)}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{warpId\\PYZus{}WarpLevel\\PYZus{}Operation\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{/}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{warpId\\PYZus{}WarpLevel\\PYZus{}Operation\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{+w}{ }\\PY{o}{\\PYZpc{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{o}{*}\\PY{+w}{ }\\PY{n}{inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{out}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\\PY{+w}{ }\\PY{c+c1}{// Tasklet code (WarpLevel\\PYZus{}Operation)}\n", + "\\PY{+w}{ }\\PY{k}{auto}\\PY{+w}{ }\\PY{n}{value}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{inp}\\PY{p}{[}\\PY{n}{j}\\PY{p}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}reduce\\PYZus{}add\\PYZus{}sync}\\PY{p}{(}\\PY{l+m+mi}{4294967295U}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{value}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{B}\\PY{p}{[}\\PY{n}{j}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{out}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (close 1)}\n", "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// ThreadBlock Scope (close 1)}\n", "\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// Kernel scope (close 1)}\n", "\n", @@ -596,35 +568,21 @@ " { // ThreadBlock Scope (open 1)\n", " int j = threadIdx.x;\n", " { // WarpLevel Scope (open 1)\n", - "\n", - " int warpId = (threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z) / 32;\n", - " if ( 0 <= warpId && warpId <= 0) { // WarpLevel Scope (open 2)\n", - "\n", - "\n", - "\n", - " //////////////////////////////////////\n", - " // WarpLevel operations start\n", - " //////////////////////////////////////\n", - " {\n", - " dace::uint* inp = &A[0];\n", - " dace::uint out;\n", - "\n", - " ///////////////////\n", - " // Tasklet code (WarpLevel_Operation)\n", - " auto value = inp[j];\n", - " out = 
__reduce_add_sync(4294967295U, value);\n", - " ///////////////////\n", - "\n", - " B[j] = out;\n", - " }\n", - " //////////////////////////////////////\n", - " // WarpLevel operations end\n", - " //////////////////////////////////////\n", - "\n", - "\n", - "\n", - " } // WarpLevel Scope (close 1)\n", - " } // WarpLevel Scope (close 2)\n", + " int warpId_WarpLevel_Operation_map_0_0_6 = (threadIdx.x) / 32;\n", + " int _ = warpId_WarpLevel_Operation_map_0_0_6 % 1;\n", + " {\n", + " dace::uint* inp = &A[0];\n", + " dace::uint out;\n", + "\n", + " ///////////////////\n", + " // Tasklet code (WarpLevel_Operation)\n", + " auto value = inp[j];\n", + " out = __reduce_add_sync(4294967295U, value);\n", + " ///////////////////\n", + "\n", + " B[j] = out;\n", + " }\n", + " } // WarpLevel Scope (close 1)\n", " } // ThreadBlock Scope (close 1)\n", "} // Kernel scope (close 1)\n", "\n", diff --git a/berkay_workpace/scratch/visualizingWarps.ipynb b/berkay_workpace/scratch/visualizingWarps.ipynb new file mode 100644 index 0000000000..bb25667a8b --- /dev/null +++ b/berkay_workpace/scratch/visualizingWarps.ipynb @@ -0,0 +1,180 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 15, + "id": "1497afd7", + "metadata": {}, + "outputs": [], + "source": [ + "import dace\n", + "import cupy as cp\n", + "import numpy as np\n", + "\n", + "from dace import registry\n", + "\n", + "from dace.sdfg.scope import ScopeSubgraphView\n", + "from dace.codegen.prettycode import CodeIOStream\n", + "from dace.codegen.targets.target import TargetCodeGenerator\n", + "from dace.codegen.targets.framecode import DaCeCodeGenerator\n", + "from dace.codegen.targets.cpp import sym2cpp\n", + "from IPython.display import Code\n", + "from dace.config import Config" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3f6d665e", + "metadata": {}, + "outputs": [], + "source": [ + "@dace.program\n", + "def test(A: dace.uint32[32,32] @ dace.dtypes.StorageType.GPU_Global):\n", + " for i, j in dace.map[0:32:32, 0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " for ii, jj in dace.map[0:32, 0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", + " for wx,wy in dace.map[0:4, 0:8] @ dace.dtypes.ScheduleType.GPU_Warp:\n", + " r = wx\n", + " c = wy\n", + " result = dace.define_local_scalar(dace.uint32)\n", + " with dace.tasklet(dace.Language.CPP):\n", + " iwx << r\n", + " iwy << c\n", + " out_result >> result\n", + " \"\"\"\n", + " out_result = iwx * 100 + iwy;\n", + " \"\"\"\n", + " \n", + " A[i + ii, j + jj] = result\n", + "\n", + "sdfg = test.to_sdfg()\n", + "A = cp.zeros((32,32), dtype=cp.uint32)\n", + "sdfg(A=A)\n", + "\n", + "A_cpu = cp.asnumpy(A)\n", + "A_reshaped = A_cpu.reshape(-1, 256)\n", + "np.savetxt(\"A_output.txt\", A_reshaped, fmt='%d')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "130d986f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (Warp_test_1)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdfg = dace.SDFG(\"Warp_test_1\")\n", + "state = sdfg.add_state(\"main\")\n", + "\n", + "# Generate access nodes\n", + "a_dev = sdfg.add_array(\"A\", (32,32), dace.uint32, dace.dtypes.StorageType.GPU_Global)\n", + "a_acc = state.add_access(\"A\")\n", + "\n", + "\n", + "\n", + "# Generate maps, connect entries with access data\n", + "gpu_map_entry, gpu_map_exit = state.add_map(name = \"GPU_Map\",\n", + " ndrange = dict(i='0:32:32', j ='0:32:32'),\n", + " schedule = dace.dtypes.ScheduleType.GPU_Device)\n", + "\n", + "\n", + "\n", + "tblock_map_entry, tblock_map_exit = state.add_map(name = \"Block_Map\",\n", + " ndrange = dict(ii='0:32', jj='0:32'),\n", + " schedule = dace.dtypes.ScheduleType.GPU_ThreadBlock)\n", + "\n", + "state.add_edge(gpu_map_entry, None, tblock_map_entry, None, dace.memlet.Memlet())\n", + "\n", + "\n", + "\n", + "tasklet, warp_scope_entry, warp_scope_exit = state.add_mapped_tasklet(\n", + " name='WarpLevel_Operation',\n", + " map_ranges=dict(wi='0:32'),\n", + " inputs=dict(),\n", + " code=\n", + "\"\"\"\n", + "out = wi\n", + "\"\"\",\n", + " outputs=dict(out=dace.Memlet(\"A[i+ii, j+jj]\")),\n", + " schedule=dace.dtypes.ScheduleType.GPU_Warp\n", + ")\n", + "\n", + "state.add_edge(tblock_map_entry, None, warp_scope_entry, None, dace.memlet.Memlet())\n", + "\n", + "state.add_edge(warp_scope_exit, None, tblock_map_exit, None, dace.memlet.Memlet('A[i+ii, j+jj]'))\n", + "state.add_edge(tblock_map_exit, None, gpu_map_exit, None, dace.memlet.Memlet('A[i:i+32,j:j+32]'))\n", + "state.add_edge(gpu_map_exit, None, a_acc, None, dace.memlet.Memlet('A[0:32, 0:32]'))\n", + "\n", + "sdfg.fill_scope_connectors()\n", + "sdfg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0146590", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "Code(sdfg.generate_code()[1].clean_code, language='cpp')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "141d0c40", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dace_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/berkay_workpace/tests/experimental_features_tests/warp_level_test.py b/berkay_workpace/tests/experimental_features_tests/warp_level_test.py new file mode 100644 index 0000000000..750e64b270 --- /dev/null +++ b/berkay_workpace/tests/experimental_features_tests/warp_level_test.py @@ -0,0 +1,476 @@ +import dace +import pytest +import cupy as cp + +from IPython.display import Code +from dace.config import Config + + +####################### Testing correct mapping of indices to WarpIds ################## + +# NOTE: Focus in these section is not on the tasklet (just used to have a simple +# verification option) and the SDFG is not correct, dataFlow to warps includes 32 elements +# and not only 1 element. But there is no support for correct representation (yet). However, +# the construction of the warpIds is not affected by this. 
Correct SDFGs appear in the next +# test section + +@pytest.mark.gpu +@pytest.mark.parametrize("start, end, stride", [ + (0, 32, 1), + (3, 16, 1), + (5, 17, 3) +]) +def test_warp_map_single_TB(start, end, stride): + @dace.program + def simple_warp_map(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global): + """ + 1D check with different start, end and strides. + """ + for i in dace.map[0:1024:1024] @ dace.dtypes.ScheduleType.GPU_Device: + for j in dace.map[0:1024] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: + for _ in dace.map[start:end:stride] @ dace.dtypes.ScheduleType.GPU_Warp: + mask = 0xffffffff + value = A[j] + result = dace.define_local_scalar(dace.uint32) + with dace.tasklet(dace.Language.CPP): + inp_mask << mask + inp_value << value + out_result >> result + """ + out_result = __reduce_add_sync(inp_mask, inp_value); + """ + + B[j] = result + + + sdfg = simple_warp_map.to_sdfg() + + A = cp.ones(1024, dtype=cp.uint32) + B = cp.zeros(1024, dtype=cp.uint32) + + sdfg(A=A, B=B) + + expected = cp.full(1024, 0, dtype=cp.uint32) + for tid in range(1024): + warpId = tid // 32 + if warpId in range(start, end, stride): + expected[tid] = 32 + + cp.testing.assert_array_equal(B, expected) + + + + +@pytest.mark.gpu +@pytest.mark.parametrize("start, end, stride", [ + (2, 16, 6), + (3, 15, 3) +]) +def test_warp_map_multiple_TB(start, end, stride): + @dace.program + def multTB_warp_map(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global): + """ + The case where we have more than one ThreadBlock. + """ + for i in dace.map[0:1024:512] @ dace.dtypes.ScheduleType.GPU_Device: + for j in dace.map[0:512] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: + for _ in dace.map[start:end:stride] @ dace.dtypes.ScheduleType.GPU_Warp: + mask = 0xffffffff + value = A[i + j] + result = dace.define_local_scalar(dace.uint32) + with dace.tasklet(dace.Language.CPP): + inp_mask << mask + inp_value << value + out_result >> result + """ + out_result = __reduce_add_sync(inp_mask, inp_value); + """ + + B[i + j] = result + + + sdfg = multTB_warp_map.to_sdfg() + + A = cp.ones(1024, dtype=cp.uint32) + B = cp.zeros(1024, dtype=cp.uint32) + + sdfg(A=A, B=B) + + expected = cp.full(1024, 0, dtype=cp.uint32) + for block_start in range(0, 1024, 512): + for tid in range(512): + warpId = tid // 32 + if warpId in range(start, end, stride): + expected[block_start + tid] = 32 + + cp.testing.assert_array_equal(B, expected) + + + +@pytest.mark.gpu +@pytest.mark.parametrize("b1, e1, s1, b2, e2, s2", [ + (0, 4, 1, 0, 4, 1), + (0, 3, 2, 0, 5, 3), +]) +def test_warp_map_2D(b1, e1, s1, b2, e2, s2): + @dace.program + def multTB_warp_map_2D(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global): + """ + Simple functionality check of 2D maps, focus is on 2D and less on multible TB. 
+ """ + for i in dace.map[0:1024:512] @ dace.dtypes.ScheduleType.GPU_Device: + for j in dace.map[0:512] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: + for k, l in dace.map[b1:e1:s1, b2:e2:s2] @ dace.dtypes.ScheduleType.GPU_Warp: + mask = 0xffffffff + value = A[i + j] + result = dace.define_local_scalar(dace.uint32) + with dace.tasklet(dace.Language.CPP): + inp_mask << mask + inp_value << value + out_result >> result + """ + out_result = __reduce_add_sync(inp_mask, inp_value); + """ + + B[i + j] = result + + + sdfg = multTB_warp_map_2D.to_sdfg() + + A = cp.ones(1024, dtype=cp.uint32) + B = cp.zeros(1024, dtype=cp.uint32) + + sdfg(A=A, B=B) + + # Check whether result is as expected + expected = cp.full(1024, 0, dtype=cp.uint32) + for block_start in range(0, 1024, 512): + for tid in range(512): + warpId = (tid // 32) + if warpId >= e1 * e2: + continue + warpIdx = (warpId % e2 ) + warpIdy = (warpId // e2 ) % e1 + if (warpIdx - b2) % s2 == 0 and (warpIdy - b1) % s1 == 0: + expected[block_start + tid] = 32 + + + cp.testing.assert_array_equal(B, expected) + + + + +@pytest.mark.gpu +@pytest.mark.parametrize("b1, e1, s1, b2, e2, s2, b3, e3, s3", [ + (0, 4, 1, 0, 4, 2, 0, 2, 1), + (0, 3, 2, 1, 5, 3, 1, 2, 1), +]) +def test_warp_map_3D(b1, e1, s1, b2, e2, s2, b3, e3, s3): + @dace.program + def warp_map_3D(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global): + """ + Simple functionality check of 3D maps + """ + for i in dace.map[0:1024:1024] @ dace.dtypes.ScheduleType.GPU_Device: + for j in dace.map[0:1024] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: + for k, l, m in dace.map[b1:e1:s1, b2:e2:s2, b3:e3:s3] @ dace.dtypes.ScheduleType.GPU_Warp: + mask = 0xffffffff + value = A[i + j] + result = dace.define_local_scalar(dace.uint32) + with dace.tasklet(dace.Language.CPP): + inp_mask << mask + inp_value << value + out_result >> result + """ + out_result = __reduce_add_sync(inp_mask, inp_value); + """ + + B[i + j] = result + + + sdfg = warp_map_3D.to_sdfg() + + A = cp.ones(1024, dtype=cp.uint32) + B = cp.zeros(1024, dtype=cp.uint32) + + sdfg(A=A, B=B) + + # Check whether result is as expected + expected = cp.full(1024, 0, dtype=cp.uint32) + for block_start in range(0, 1024, 1024): + for tid in range(1024): + warpId = (tid // 32) + if warpId >= e1 * e2 * e3: + continue + warpIdx = warpId % e3 + warpIdy = (warpId // e3 ) % e2 + warpIdz = (warpId // (e3 * e2) ) % e1 + if ((warpIdx - b3) % s3 == 0 and warpIdx >= b3 and + (warpIdy - b2) % s2 == 0 and warpIdx >= b2 and + (warpIdz - b1) % s1 == 0 and warpIdx >= b1): + expected[block_start + tid] = 32 + + + cp.testing.assert_array_equal(B, expected) + + + + + +@pytest.mark.gpu +@pytest.mark.parametrize("bs, ns", [(512, 1024), (1024, 2048)]) +def test_symbolic_warp_map(bs, ns): + + BS = dace.symbol('BS') + NS = dace.symbol('NS') + + START = dace.symbol('START') + WS = dace.symbol('WS') + STRIDE = dace.symbol('STRIDE') + + start = 2 + stride = 3 + ws = bs // 32 + @dace.program + def symbolic_warp_map(A: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global): + """ + Focus is in the use of symbolic variables in the MAP. 
+ """ + for i in dace.map[0:NS:BS] @ dace.dtypes.ScheduleType.GPU_Device: + for j in dace.map[0:BS] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: + + for k in dace.map[START:WS:STRIDE] @ dace.dtypes.ScheduleType.GPU_Warp: + mask = 0xffffffff + value = A[i + j] + result = dace.define_local_scalar(dace.uint32) + with dace.tasklet(dace.Language.CPP): + inp_mask << mask + inp_value << value + out_result >> result + """ + out_result = __reduce_add_sync(inp_mask, inp_value); + """ + + B[i + j] = result + + + sdfg = symbolic_warp_map.to_sdfg() + + A = cp.ones(ns, dtype=cp.uint32) + B = cp.zeros(ns, dtype=cp.uint32) + + sdfg(A=A, B=B, START= start, WS=ws, STRIDE=stride, BS=bs, NS=ns) + + expected = cp.full(ns, 0, dtype=cp.uint32) + for block_start in range(0, ns, bs): + for tid in range(bs): + warpId = tid // 32 + if warpId in range(start, ws, stride): + expected[block_start + tid] = 32 + + cp.testing.assert_array_equal(B, expected) + + + + + + + +@pytest.mark.gpu +def test_dynamic_warpSize_warp_map(): + + STRIDE = 3 # just smth else than 1, 1 is easy to pass + BS = dace.symbol('BS') + NS = dace.symbol('NS') + + bs = 1024 + ns = 2024 + @dace.program + def symbolic_warp_map(A: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global): + """ + What if warpSize is determined at runtime. + """ + for i in dace.map[0:NS:BS] @ dace.dtypes.ScheduleType.GPU_Device: + for j in dace.map[0:BS] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: + ws = bs // 32 + for k in dace.map[0:ws:STRIDE] @ dace.dtypes.ScheduleType.GPU_Warp: + mask = 0xffffffff + value = A[i + j] + result = dace.define_local_scalar(dace.uint32) + with dace.tasklet(dace.Language.CPP): + inp_mask << mask + inp_value << value + out_result >> result + """ + out_result = __reduce_add_sync(inp_mask, inp_value); + """ + + B[i + j] = result + + + sdfg = symbolic_warp_map.to_sdfg() + + A = cp.ones(ns, dtype=cp.uint32) + B = cp.zeros(ns, dtype=cp.uint32) + + sdfg(A=A, B=B, BS=bs, NS=ns) + + expected = cp.full(ns, 0, dtype=cp.uint32) + for block_start in range(0, ns, bs): + for tid in range(bs): + ws = bs // 32 + warpId = tid // 32 + if warpId in range(0, ws, STRIDE): + expected[block_start + tid] = 32 + + cp.testing.assert_array_equal(B, expected) + +####################### Testing simple warplevel programs ################# + +@pytest.mark.gpu +def test_warp_reduce_add(): + """ + Best way to understand this is to copy paste it and + to look at the sdfg. A simple explanation: It tests whether + the most basic functionality of warp maps work and whether + we can use "__reduce_add_sync(mask, value)" on by definining a + custom tasklet. 
+ """ + + # Generate framework + sdfg = dace.SDFG("Warp_test_1") + state = sdfg.add_state("main") + + # Generate access nodes + a_dev = sdfg.add_array("A", (32,), dace.uint32, dace.dtypes.StorageType.GPU_Global) + b_dev = sdfg.add_array("B", (32,), dace.uint32, dace.dtypes.StorageType.GPU_Global) + a_acc = state.add_access("A") + b_acc = state.add_access("B") + + + # Generate maps, connect entries with access data + gpu_map_entry, gpu_map_exit = state.add_map(name = "GPU_Map", + ndrange = dict(i='0:32:32'), + schedule = dace.dtypes.ScheduleType.GPU_Device) + state.add_edge(a_acc, None, gpu_map_entry, None, dace.memlet.Memlet('A[0:32]')) + + + tblock_map_entry, tblock_map_exit = state.add_map(name = "Block_Map", + ndrange = dict(j='0:32'), + schedule = dace.dtypes.ScheduleType.GPU_ThreadBlock) + state.add_edge(gpu_map_entry, None, tblock_map_entry, None, dace.memlet.Memlet('A[0:32]')) + + + tasklet, warp_scope_entry, warp_scope_exit = state.add_mapped_tasklet( + name='WarpLevel_Operation', + map_ranges=dict(_='0:1'), + inputs=dict(inp=dace.Memlet('A[0:32]', volume=32)), + code= +""" +value = inp[j] +out = __reduce_add_sync(0xFFFFFFFF, value); +""", + outputs=dict(out=dace.Memlet("B[j]")), + schedule=dace.dtypes.ScheduleType.GPU_Warp + ) + + state.add_edge(tblock_map_entry, None, warp_scope_entry, None, dace.memlet.Memlet('A[0:32]')) + + # Connect Exit nodes + state.add_edge(warp_scope_exit, None, tblock_map_exit, None, dace.memlet.Memlet('B[j]')) + state.add_edge(tblock_map_exit, None, gpu_map_exit, None, dace.memlet.Memlet('B[j]')) + state.add_edge(gpu_map_exit, None, b_acc, None, dace.memlet.Memlet('B[0:32]')) + + sdfg.fill_scope_connectors() + + A = cp.ones(32, dtype=cp.uint32) + B = cp.zeros(32, dtype=cp.uint32) + + sdfg(A=A, B=B) + + all_32 = cp.full(32, 32, dtype=cp.uint32) + cp.testing.assert_array_equal(B, all_32) + + + +@pytest.mark.gpu +def test_warp_shfl_op(): + """ + Best way to understand this is to copy paste it and + to look at the sdfg. A simple explanation: It tests now another + warpLevel primitive, namely __shfl_down_sync and __shfl_up_sync. 
+ """ + sdfg = dace.SDFG("Warp_test_1") + state = sdfg.add_state("main") + + # Generate access nodes + a_dev = sdfg.add_array("A", (32,), dace.uint32, dace.dtypes.StorageType.GPU_Global) + b_dev = sdfg.add_array("B", (32,), dace.uint32, dace.dtypes.StorageType.GPU_Global) + a_acc = state.add_access("A") + b_acc = state.add_access("B") + + + # Generate maps, connect entries with access data + gpu_map_entry, gpu_map_exit = state.add_map(name = "GPU_Map", + ndrange = dict(i='0:32:32'), + schedule = dace.dtypes.ScheduleType.GPU_Device) + state.add_edge(a_acc, None, gpu_map_entry, None, dace.memlet.Memlet('A[0:32]')) + + + tblock_map_entry, tblock_map_exit = state.add_map(name = "Block_Map", + ndrange = dict(j='0:32'), + schedule = dace.dtypes.ScheduleType.GPU_ThreadBlock) + state.add_edge(gpu_map_entry, None, tblock_map_entry, None, dace.memlet.Memlet('A[0:32]')) + + + tasklet, warp_scope_entry, warp_scope_exit = state.add_mapped_tasklet( + name='WarpLevel_Operation', + map_ranges=dict(_='0:1'), + inputs=dict(inp=dace.Memlet('A[0:32]', volume=32)), + code= +""" +tid = j; +value = inp[tid]; +up = __shfl_down_sync(0xFFFFFFFF, value, 16); +low = __shfl_up_sync(0xFFFFFFFF, value, 16); +if tid < 16: + value = up; +else: + value = low +out= value + +""", + outputs=dict(out=dace.Memlet("B[j]")), + schedule=dace.dtypes.ScheduleType.GPU_Warp + ) + + state.add_edge(tblock_map_entry, None, warp_scope_entry, None, dace.memlet.Memlet('A[0:32]')) + + # Connect Exit nodes + state.add_edge(warp_scope_exit, None, tblock_map_exit, None, dace.memlet.Memlet('B[j]')) + state.add_edge(tblock_map_exit, None, gpu_map_exit, None, dace.memlet.Memlet('B[j]')) + state.add_edge(gpu_map_exit, None, b_acc, None, dace.memlet.Memlet('B[0:32]')) + + sdfg.fill_scope_connectors() + + A = cp.array([0 if False else i for i in range(32)], dtype=cp.uint32) + B = cp.zeros(32, dtype=cp.uint32) + + sdfg(A=A, B=B) + + expected = cp.array(cp.concatenate((A[16:32], A[0:16]))) + cp.testing.assert_array_equal(B,expected) + + + + + + +if __name__ == '__main__': + + # Warnings are ignored + #test_warp_map(0, 32, 1) + pytest.main(["-v", "-p", "no:warnings", __file__]) + + # Use this if you want to see the warning + # pytest.main(["-v", __file__]) \ No newline at end of file diff --git a/berkay_workpace/tests/gpu_map_tests/device_map_test.py b/berkay_workpace/tests/gpu_map_tests/device_map_test.py new file mode 100644 index 0000000000..2cbf2a51e4 --- /dev/null +++ b/berkay_workpace/tests/gpu_map_tests/device_map_test.py @@ -0,0 +1,155 @@ +import dace +import random +import cupy as cp +import pytest + +from dace.config import Config + + +@pytest.mark.gpu +@pytest.mark.parametrize("vec_size", [0, 15, 32, 67]) # default block size is 32, so these parameters handle interesting groups +def test_1d_maps_fixed_sizes(vec_size): + """ + Tests flat 1D vector copy from B to A using a single GPU_Device map (no thread blocking) for fixed size arrays. + The vector sizes are chosen to cover interesting cases considering a default block size is 32. 
+ """ + + @dace.program + def vector_copy_flat(A: dace.float64[vec_size] @ dace.dtypes.StorageType.GPU_Global, + B: dace.float64[vec_size] @ dace.dtypes.StorageType.GPU_Global): + for i in dace.map[0:vec_size] @ dace.dtypes.ScheduleType.GPU_Device: + A[i] = B[i] + + sdfg = vector_copy_flat.to_sdfg() + + # Initialize random CUDA arrays + A = cp.zeros(vec_size, dtype=cp.float64) # Output array + B = cp.random.rand(vec_size).astype(cp.float64) # Input array + + # Ensure arrays differ at start + if vec_size != 0: + assert not cp.allclose(A, B), "Arrays are unexpectedly equal before copy." + + # Run the SDFG + sdfg(A=A, B=B) + + # Assert values match + cp.testing.assert_array_equal(A, B) + + + + +@pytest.mark.gpu +@pytest.mark.parametrize("n", [0, 15, 32, 67]) +def test_1d_maps_dynamic_sizes(n): + """ + Tests flat 1D vector copy from B to A using a single GPU_Device map (no thread blocking) for variable size arrays. + The vector sizes are chosen to cover interesting cases considering a default block size is 32. + """ + N = dace.symbol('N') + + @dace.program + def vector_copy_dyn_sizes(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global): + for i in dace.map[0:N] @ dace.dtypes.ScheduleType.GPU_Device: + A[i] = B[i] + + sdfg = vector_copy_dyn_sizes.to_sdfg() + + # Initialize random CUDA arrays + A = cp.zeros(n, dtype=cp.float64) # Output array + B = cp.random.rand(n).astype(cp.float64) # Input array + + # Ensure arrays differ at start + if n != 0: + assert not cp.allclose(A, B), "Arrays are unexpectedly equal before copy." + + sdfg(A=A, B=B, N=n) + + # Assert values match + cp.testing.assert_array_equal(A, B) + + + +@pytest.mark.gpu +@pytest.mark.parametrize("s", [1, 2, 32, 33]) +def test_1d_maps_strides(s): + """ + Tests flat 1D vector copy from B to A using a single GPU_Device map (no thread blocking) for different strides. + N is variable in the sdfg/code but we just test for N = 67 here. + """ + N = dace.symbol('N') + n = 67 + + @dace.program + def vector_copy_strides(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global): + for i in dace.map[0:N:s] @ dace.dtypes.ScheduleType.GPU_Device: + A[i] = B[i] + + sdfg = vector_copy_strides.to_sdfg() + + # Initialize random CUDA arrays + A = cp.zeros(n, dtype=cp.float64) # Output array + B = cp.random.rand(n).astype(cp.float64) # Input array + + # Ensure arrays differ at start + if n != 0: + assert not cp.allclose(A, B), "Arrays are unexpectedly equal before copy." + + sdfg(A=A, B=B, N=n) + + # Check at stride positions: A[i] == B[i] + cp.testing.assert_array_equal(A[::s], B[::s]) + + # Check non-stride positions: A[i] == 0 + mask = cp.ones(n, dtype=bool) + mask[::s] = False + cp.testing.assert_array_equal(A[mask], cp.zeros_like(A[mask])) + + + +@pytest.mark.gpu +@pytest.mark.parametrize("shape", [(2, 16), (3, 32)]) +def test_2d_maps_dynamic_sizes(shape): + """ + Tests 2D matrix copy from B to A using a GPU_Device map for variable-sized matrices. 
+ """ + M = dace.symbol('M') + N = dace.symbol('N') + m, n = shape + + @dace.program + def matrix_copy(A: dace.float64[M, N] @ dace.dtypes.StorageType.GPU_Global, + B: dace.float64[M, N] @ dace.dtypes.StorageType.GPU_Global): + for i, j in dace.map[0:M, 0:N] @ dace.ScheduleType.GPU_Device: + A[i, j] = B[i, j] + + sdfg = matrix_copy.to_sdfg() + + # Initialize arrays + A = cp.zeros((m, n), dtype=cp.float64) + B = cp.random.rand(m, n).astype(cp.float64) + + # Ensure they differ at start + assert not cp.allclose(A, B), "Arrays are unexpectedly equal before copy." + + # Run the SDFG + sdfg(A=A, B=B, M=m, N=n) + + # Assert result + cp.testing.assert_array_equal(A, B) + + + +# higher dimensions in old tests + + + +if __name__ == '__main__': + + print(f"\n\n\033[94m[INFO] You are using the \033[92m{Config.get('compiler', 'cuda', 'implementation')}\033[94m CUDA implementation.\033[0m \n\n") + + # Warnings are ignored + pytest.main(["-v", "-p", "no:warnings", __file__]) + + # Use this if you want to see the warning + # pytest.main(["-v", __file__]) \ No newline at end of file diff --git a/berkay_workpace/tests/gpu_map_tests/threadBlock_test.py b/berkay_workpace/tests/gpu_map_tests/threadBlock_test.py new file mode 100644 index 0000000000..a321716f09 --- /dev/null +++ b/berkay_workpace/tests/gpu_map_tests/threadBlock_test.py @@ -0,0 +1,97 @@ +import dace +import random +import cupy as cp +import pytest + +from dace.config import Config + +# More tests at old tests, see /reusable_test + +@pytest.mark.gpu +@pytest.mark.parametrize("vec_size, block_size, stride", [ + (32, 32, 2), + (64, 32, 4), + (67, 32, 2), + (128, 64, 8), +]) +def test_tb_map_strided(vec_size, block_size, stride): + """ + Tests strided copy from B to A using nested GPU maps: outer map with GPU_Device and + inner map with GPU_ThreadBlock. Only indices matching the stride are written. + """ + + N = dace.symbol('N') + + @dace.program + def vector_copy_strided(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, + B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global): + for i in dace.map[0:N:block_size] @ dace.dtypes.ScheduleType.GPU_Device: + for j in dace.map[0:block_size:stride] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: + if i + j < N: + A[i + j] = B[i + j] + + sdfg = vector_copy_strided.to_sdfg() + + A = cp.zeros(vec_size, dtype=cp.float64) + B = cp.random.rand(vec_size).astype(cp.float64) + + assert not cp.allclose(A, B), "Arrays are unexpectedly equal at the start." + + sdfg(A=A, B=B, N=vec_size) + + # Check stride positions + cp.testing.assert_array_equal(A[::stride], B[::stride]) + + # Check untouched values (non-stride positions) + mask = cp.ones(vec_size, dtype=bool) + mask[::stride] = False + cp.testing.assert_array_equal(A[mask], cp.zeros_like(A[mask])) + + + + +@pytest.mark.gpu +@pytest.mark.parametrize("n", [40, 64, 100, 128, 149]) +def test_skewed_like_map_range_flat_add(n): + """ + Tests vector addition C = A + B using a skewed-style inner map: + outer GPU_Device map over blocks of size 32, and inner GPU_ThreadBlock map over absolute indices. 
+ """ + + N = dace.symbol('N') + + @dace.program + def vadd_flat_skew_like(A: dace.float32[N] @ dace.StorageType.GPU_Global, + B: dace.float32[N] @ dace.StorageType.GPU_Global, + C: dace.float32[N] @ dace.StorageType.GPU_Global): + for i in dace.map[0:N:32] @ dace.ScheduleType.GPU_Device: + for j in dace.map[i:(i + 32)] @ dace.ScheduleType.GPU_ThreadBlock: + if j < N: + C[j] = A[j] + B[j] + + sdfg = vadd_flat_skew_like.to_sdfg() + + # Allocate test data + A = cp.random.rand(n).astype(cp.float32) + B = cp.random.rand(n).astype(cp.float32) + C = cp.zeros(n, dtype=cp.float32) + C_expected = A + B + + # Run the program + sdfg(A=A, B=B, C=C, N=n) + + # Validate output + cp.testing.assert_allclose(C, C_expected, rtol=1e-5, err_msg=f"Mismatch in output vector C for n={n}") + + + + +if __name__ == '__main__': + + print(f"\n\n\033[94m[INFO] You are using the \033[92m{Config.get('compiler', 'cuda', 'implementation')}\033[94m CUDA implementation.\033[0m \n\n") + + # Warnings are ignored + pytest.main(["-v", "-p", "no:warnings", __file__]) + + # Use this if you want to see the warning + # pytest.main(["-v", __file__]) \ No newline at end of file diff --git a/berkay_workpace/tests/reusable_tests/cuda_block_test.py b/berkay_workpace/tests/reusable_tests/cuda_block_test.py new file mode 100644 index 0000000000..c716b7b117 --- /dev/null +++ b/berkay_workpace/tests/reusable_tests/cuda_block_test.py @@ -0,0 +1,207 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +import dace +from dace.transformation.dataflow import GPUTransformMap +from dace.transformation.interstate import GPUTransformSDFG +import numpy as np +import pytest + +N = dace.symbol('N') + + +@dace.program(dace.float64[N], dace.float64[N]) +def cudahello(V, Vout): + + @dace.mapscope(_[0:N:32]) + def multiplication(i): + + # I don't understand why this is here + # Probably will be removed later? 
+ @dace.map(_[0:32]) + def mult_block(bi): + in_V << V[i + bi] + out >> Vout[i + bi] + out = in_V * 2 + + @dace.map(_[0:32]) + def mult_block_2(bi): + in_V << V[i + bi] + out >> Vout[i + bi] + out = in_V * 2 + + +def _test(sdfg): + N = 128 + + print('Vector double CUDA (block) %d' % (N)) + + V = dace.ndarray([N], dace.float64) + Vout = dace.ndarray([N], dace.float64) + V[:] = np.random.rand(N).astype(dace.float64.type) + Vout[:] = dace.float64(0) + + cudahello(V=V, Vout=Vout, N=N) + + diff = np.linalg.norm(2 * V - Vout) / N + print("Difference:", diff) + assert diff <= 1e-5 + + +def test_cpu(): + _test(cudahello.to_sdfg()) + + +@pytest.mark.gpu +def test_gpu(): + sdfg = cudahello.to_sdfg() + assert sdfg.apply_transformations(GPUTransformMap) == 1 + _test(sdfg) + + +@pytest.mark.gpu +def test_different_block_sizes_nesting(): + + @dace.program + def nested(V: dace.float64[34], v1: dace.float64[1]): + with dace.tasklet: + o >> v1(-1) + # Tasklet that does nothing + pass + + for i in dace.map[0:34]: + with dace.tasklet: + inp << V[i] + out >> v1(1, lambda a, b: a + b)[0] + out = inp + inp + + @dace.program + def nested2(V: dace.float64[34], v1: dace.float64[1]): + with dace.tasklet: + o >> v1(-1) + # Tasklet that does nothing + pass + + nested(V, v1) + + @dace.program + def diffblocks(V: dace.float64[130], v1: dace.float64[4], v2: dace.float64[128]): + for bi in dace.map[1:129:32]: + for i in dace.map[0:32]: + with dace.tasklet: + in_V << V[i + bi] + out >> v2[i + bi - 1] + out = in_V * 3 + + nested2(V[bi - 1:bi + 33], v1[bi // 32:bi // 32 + 1]) + + sdfg = diffblocks.to_sdfg() + assert sdfg.apply_transformations(GPUTransformSDFG, dict(sequential_innermaps=False)) == 1 + V = np.random.rand(130) + v1 = np.zeros([4], np.float64) + v2 = np.random.rand(128) + expected_v2 = V[1:129] * 3 + expected_v1 = np.zeros([4], np.float64) + for i in range(4): + expected_v1[i] = np.sum(V[i * 32:(i + 1) * 32 + 2]) * 2 + + sdfg(V, v1, v2) + assert np.linalg.norm(v1 - expected_v1) <= 1e-6 + assert np.allclose(v2, expected_v2) + + +@pytest.mark.gpu +def test_custom_block_size_onemap(): + + @dace.program + def tester(A: dace.float64[400, 300]): + for i, j in dace.map[0:400, 0:300]: + with dace.tasklet: + a >> A[i, j] + a = 1 + + sdfg = tester.to_sdfg() + sdfg.apply_gpu_transformations() + mapentry: dace.nodes.MapEntry = next(n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry)) + + # Test 1: too many dimensions + mapentry.map.gpu_block_size = (13, 5, 3, 4) + code = sdfg.generate_code()[1].clean_code # Get GPU code (second file) + assert 'dim3(13, 5, 12)' in code + + # Test 2: too few dimensions + mapentry.map.gpu_block_size = (127, 5) + code = sdfg.generate_code()[1].clean_code # Get GPU code (second file) + assert 'dim3(127, 5, 1)' in code + + # Test 3: compilation + sdfg.compile() + + +@pytest.mark.gpu +def test_custom_block_size_twomaps(): + + @dace.program + def tester(A: dace.float64[400, 300, 2, 32]): + for i, j in dace.map[0:400, 0:300]: + for bi, bj in dace.map[0:2, 0:32]: + with dace.tasklet: + a >> A[i, j, bi, bj] + a = 1 + + sdfg = tester.to_sdfg() + sdfg.apply_gpu_transformations(sequential_innermaps=True) + mapentry: dace.nodes.MapEntry = next( + n for n, _ in sdfg.all_nodes_recursive() + if isinstance(n, dace.nodes.MapEntry) and n.map.schedule == dace.ScheduleType.GPU_Device) + + mapentry.map.gpu_block_size = (127, 5) + code = sdfg.generate_code()[1].clean_code # Get GPU code (second file) + assert 'dim3(127, 5, 1)' in code + + # Test 3: compilation + sdfg.compile() + + + +""" 
+# Not implemened +@pytest.mark.gpu +def test_block_thread_specialization(): + + @dace.program + def tester(A: dace.float64[200]): + for i in dace.map[0:200:32]: + for bi in dace.map[0:32]: + with dace.tasklet: + a >> A[i + bi] + a = 1 + with dace.tasklet: # Tasklet to be specialized + a >> A[i + bi] + a = 2 + + sdfg = tester.to_sdfg() + sdfg.apply_gpu_transformations(sequential_innermaps=False) + tasklet = next(n for n, _ in sdfg.all_nodes_recursive() + if isinstance(n, dace.nodes.Tasklet) and '2' in n.code.as_string) + tasklet.location['gpu_thread'] = dace.subsets.Range.from_string('2:9:3') + tasklet.location['gpu_block'] = 1 + + code = sdfg.generate_code()[1].clean_code # Get GPU code (second file) + assert '>= 2' in code and '<= 8' in code + assert ' == 1' in code + + a = np.random.rand(200) + ref = np.ones_like(a) + ref[32:64][2:9:3] = 2 + sdfg(a) + assert np.allclose(a, ref) +""" + + + +if __name__ == "__main__": + test_cpu() + test_gpu() + test_different_block_sizes_nesting() + test_custom_block_size_onemap() + test_custom_block_size_twomaps() + #test_block_thread_specialization() diff --git a/berkay_workpace/tests/reusable_tests/cuda_highdim_kernel_test.py b/berkay_workpace/tests/reusable_tests/cuda_highdim_kernel_test.py new file mode 100644 index 0000000000..88120f3324 --- /dev/null +++ b/berkay_workpace/tests/reusable_tests/cuda_highdim_kernel_test.py @@ -0,0 +1,213 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +import dace +from dace.transformation.dataflow import GPUTransformMap +import numpy as np +import pytest + +# Symbols +N = dace.symbol('N') +M = dace.symbol('M') +K = dace.symbol('K') +L = dace.symbol('L') + +X = dace.symbol('X') +Y = dace.symbol('Y') +Z = dace.symbol('Z') +W = dace.symbol('W') +U = dace.symbol('U') + + +@dace.program +def highdim(A: dace.uint64[N, M, K, L, X, Y, Z, W, U], B: dace.uint64[N, M, K, L]): + + @dace.mapscope + def kernel(i: _[5:N - 5], j: _[0:M], k: _[7:K - 1], l: _[0:L]): + + @dace.map + def block(a: _[0:X], b: _[0:Y], c: _[1:Z], d: _[2:W - 1], e: _[0:U]): + input << A[i, j, k, l, a, b, c, d, e] + output >> B(1, lambda a, b: a + b)[i, j, k, l] + output = input + + +def makendrange(*args): + result = [] + for i in range(0, len(args), 2): + result.append(slice(args[i], args[i + 1], 1)) + return result + + +def _test(sdfg): + # 4D kernel with 5D block + N = 12 + M = 3 + K = 14 + L = 15 + X = 1 + Y = 2 + Z = 3 + W = 4 + U = 5 + dims = tuple(s for s in (N, M, K, L, X, Y, Z, W, U)) + outdims = tuple(s for s in (N, M, K, L)) + print('High-dimensional GPU kernel test', dims) + + A = dace.ndarray((N, M, K, L, X, Y, Z, W, U), dtype=dace.uint64) + B = dace.ndarray((N, M, K, L), dtype=dace.uint64) + A[:] = np.random.randint(10, size=dims).astype(np.uint64) + B[:] = np.zeros(outdims, dtype=np.uint64) + B_regression = np.zeros(outdims, dtype=np.uint64) + + # Equivalent python code + for i, j, k, l in dace.ndrange(makendrange(5, N - 5, 0, M, 7, K - 1, 0, L)): + for a, b, c, d, e in dace.ndrange(makendrange(0, X, 0, Y, 1, Z, 2, W - 1, 0, U)): + B_regression[i, j, k, l] += A[i, j, k, l, a, b, c, d, e] + + sdfg(A=A, B=B, N=N, M=M, K=K, L=L, X=X, Y=Y, Z=Z, W=W, U=U) + + diff = np.linalg.norm(B_regression - B) / (N * M * K * L) + print('Difference:', diff) + assert diff <= 1e-5 + + +def test_cpu(): + _test(highdim.to_sdfg()) + + +@pytest.mark.gpu +def test_gpu(): + sdfg = highdim.to_sdfg() + assert sdfg.apply_transformations(GPUTransformMap, options=dict(fullcopy=True)) == 1 + _test(sdfg) + + +@pytest.mark.gpu +def 
test_highdim_implicit_block(): + + @dace.program + def tester(x: dace.float64[32, 90, 80, 70]): + for i, j, k, l in dace.map[0:32, 0:90, 0:80, 0:70]: + x[i, j, k, l] = 2.0 + + # Create GPU SDFG + sdfg = tester.to_sdfg() + sdfg.apply_gpu_transformations() + + # Change map implicit block size + for node, _ in sdfg.all_nodes_recursive(): + if isinstance(node, dace.nodes.MapEntry): + node.map.gpu_block_size = [8, 2, 4] + + a = np.random.rand(32, 90, 80, 70) + sdfg(a) + assert np.allclose(a, 2) + + +@pytest.mark.gpu +def test_highdim_implicit_block_threadsplit(): + + @dace.program + def tester(x: dace.float64[2, 2, 80, 70]): + for i, j, k, l in dace.map[0:2, 0:2, 0:80, 0:70]: + x[i, j, k, l] = 2.0 + + # Create GPU SDFG + sdfg = tester.to_sdfg() + sdfg.apply_gpu_transformations() + + # Change map implicit block size + for node, _ in sdfg.all_nodes_recursive(): + if isinstance(node, dace.nodes.MapEntry): + node.map.gpu_block_size = [8, 2, 3] + + a = np.random.rand(2, 2, 80, 70) + sdfg(a) + assert np.allclose(a, 2) + + +def test_highdim_default_block_size(): + + @dace.program + def tester(a: dace.float64[1024, 1024] @ dace.StorageType.GPU_Global): + for i, j in dace.map[0:1024, 0:1024] @ dace.ScheduleType.GPU_Device: + a[i, j] = 1 + + with dace.config.set_temporary('compiler', 'cuda', 'default_block_size', value='32, 8, 2'): + with pytest.warns(UserWarning, match='has more dimensions'): + sdfg = tester.to_sdfg() + gpu_code = sdfg.generate_code()[1] + assert 'dim3(32, 16, 1)' in gpu_code.code + + +def test_block_size_mismatch_warning(): + + @dace.program + def tester(a: dace.float64[1024, 1024] @ dace.StorageType.GPU_Global): + for i, j in dace.map[0:512:2, 0:512:2] @ dace.ScheduleType.GPU_Device: + for bi, bj in dace.map[0:2, 0:2] @ dace.ScheduleType.GPU_ThreadBlock: + a[i + bi, j + bj] = 1 + for bi, bj in dace.map[0:2, 0:1] @ dace.ScheduleType.GPU_ThreadBlock: + a[i + bi, j + bj] = 1 + + sdfg = tester.to_sdfg() + with pytest.warns(UserWarning, match='Multiple thread-block maps'): + sdfg.generate_code() + + +def test_block_size_mismatch_error(): + + @dace.program + def tester(a: dace.float64[1024, 1024] @ dace.StorageType.GPU_Global): + for i, j in dace.map[0:512:2, 0:512:2] @ dace.ScheduleType.GPU_Device: + for bi, bj in dace.map[0:2, 0:2] @ dace.ScheduleType.GPU_ThreadBlock: + a[i + bi, j + bj] = 1 + + sdfg = tester.to_sdfg() + for n, _ in sdfg.all_nodes_recursive(): + if isinstance(n, dace.nodes.MapEntry) and n.schedule == dace.ScheduleType.GPU_Device: + n.gpu_block_size = [4, 2, 1] + + with pytest.raises(ValueError): + sdfg.generate_code() + + +def test_block_size_too_large(): + + @dace.program + def tester(a: dace.float64[1024, 1024] @ dace.StorageType.GPU_Global): + for i, j in dace.map[0:1024, 0:1024] @ dace.ScheduleType.GPU_Device: + a[i, j] = 1 + + sdfg = tester.to_sdfg() + for n, _ in sdfg.all_nodes_recursive(): + if isinstance(n, dace.nodes.MapEntry) and n.schedule == dace.ScheduleType.GPU_Device: + n.gpu_block_size = [64, 32, 1] + + with pytest.raises(ValueError): + sdfg.generate_code() + + +def test_highdim_block_size_too_large(): + BX, BY, BZ, BW = 64, 2, 2, 2 + + @dace.program + def tester(a: dace.float64[1024, 2, 2, 20] @ dace.StorageType.GPU_Global): + for i, j, k, l in dace.map[0:16, 0:1, 0:1, 0:10:2] @ dace.ScheduleType.GPU_Device: + for bi, bj, bk, bl in dace.map[0:BX, 0:BY, 0:BZ, 0:BW] @ dace.ScheduleType.GPU_ThreadBlock: + a[i + bi, j + bj, k + bk, l + bl] = 1 + + sdfg = tester.to_sdfg() + with pytest.raises(ValueError): + sdfg.generate_code() + + +if __name__ == "__main__": 
+ test_cpu() + test_gpu() + test_highdim_implicit_block() + test_highdim_implicit_block_threadsplit() + test_highdim_default_block_size() + test_block_size_mismatch_warning() + test_block_size_mismatch_error() + test_block_size_too_large() + test_highdim_block_size_too_large() diff --git a/berkay_workpace/tests/tests.py b/berkay_workpace/tests/tests.py deleted file mode 100644 index 9095787b8b..0000000000 --- a/berkay_workpace/tests/tests.py +++ /dev/null @@ -1,335 +0,0 @@ -import dace -import random -import cupy as cp - -from dace import registry -from dace.sdfg.scope import ScopeSubgraphView -from dace.codegen.prettycode import CodeIOStream -from dace.codegen.targets.target import TargetCodeGenerator -from dace.codegen.targets.framecode import DaCeCodeGenerator -from dace.codegen.targets.cpp import sym2cpp -from IPython.display import Code -from dace.config import Config - - -def test_1(): - - vec_size = 66 - @dace.program - def vector_copy1(A: dace.float64[vec_size] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[vec_size] @ dace.dtypes.StorageType.GPU_Global): - for i in dace.map[0:vec_size] @ dace.dtypes.ScheduleType.GPU_Device: - A[i] = B[i] - - sdfg = vector_copy1.to_sdfg() - - # Initialize random CUDA arrays - A = cp.zeros(vec_size, dtype=cp.float64) # Output array - B = cp.random.rand(vec_size).astype(cp.float64) # Random input array - - equal_at_start = cp.all(A == B) - if equal_at_start: - print(f"{'Test 1: Vectors are equal at start. Test is skipped.':<70}\033[93m[WARNING]\033[0m") - return - - sdfg = vector_copy1.to_sdfg() - sdfg(A=A, B=B) - equal_at_end = cp.all(A == B) - - if equal_at_end: - print(f"{'Test 1: 1D vector copy simple':<70}\033[92m[PASSED]\033[0m") - else: - print(f"{'Test 1: 1D vector copy simple':<70}\033[91m[FAILED]\033[0m") - - -def test_2(): - - N = dace.symbol('N') - @dace.program - def vector_copy2(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global): - for i in dace.map[0:N] @ dace.dtypes.ScheduleType.GPU_Device: - A[i] = B[i] - - sdfg = vector_copy2.to_sdfg() - - n = random.randint(3, 100) - # Initialize random CUDA arrays - A = cp.zeros(n, dtype=cp.float64) # Output array - B = cp.random.rand(n).astype(cp.float64) # Random input array - - equal_at_start = cp.all(A == B) - if equal_at_start: - print(f"{'Test 2: Vectors are equal at start. Test is skipped.':<70}\033[93m[WARNING]\033[0m") - return - - sdfg = vector_copy2.to_sdfg() - sdfg(A=A, B=B, N=n) - equal_at_end = cp.all(A == B) - - if equal_at_end: - print(f"{'Test 2: 1D vector copy with symbolic size':<70}\033[92m[PASSED]\033[0m") - else: - print(f"{'Test 2: 1D vector copy with symbolic size':<70}\033[91m[FAILED]\033[0m") - - -def test_3(): - @dace.program - def vector_copy3(A: dace.float64[64] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[64] @ dace.dtypes.StorageType.GPU_Global): - for i in dace.map[0:64:32] @ dace.dtypes.ScheduleType.GPU_Device: - for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: - A[i + j] = B[i + j] - - sdfg = vector_copy3.to_sdfg() - - # Initialize random CUDA arrays - A = cp.zeros(64, dtype=cp.float64) # Output array - B = cp.random.rand(64).astype(cp.float64) # Random input array - - equal_at_start = cp.all(A == B) - if equal_at_start: - print(f"{'Test 3: Vectors are equal at start. 
Test is skipped.':<70}\033[93m[WARNING]\033[0m") - return - - sdfg = vector_copy3.to_sdfg() - sdfg(A=A, B=B) - equal_at_end = cp.all(A == B) - - if equal_at_end: - print(f"{'Test 3: 1D vector copy with threadblocking':<70}\033[92m[PASSED]\033[0m") - else: - print(f"{'Test 3: 1D vector copy with threadblocking':<70}\033[91m[FAILED]\033[0m") - - -def test_4(): - - N = dace.symbol('N') - - @dace.program - def vector_copy4(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global): - for i in dace.map[0:N:32] @ dace.dtypes.ScheduleType.GPU_Device: - for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: - if i + j < N: - A[i + j] = B[i + j] - - n = random.randint(40, 150) - # Initialize random CUDA arrays - A = cp.zeros(n, dtype=cp.float64) # Output array - B = cp.random.rand(n).astype(cp.float64) # Random input array - - - equal_at_start = cp.all(A == B) - if equal_at_start: - print(f"{'Test 4: Vectors are equal at start. Test is skipped.':<70}\033[93m[WARNING]\033[0m") - return - - sdfg = vector_copy4.to_sdfg() - sdfg(A=A, B=B, N=n) - equal_at_end = cp.all(A == B) - - if equal_at_end: - print(f"{'Test 4: 1D vector copy with threadblocking & smybolic size':<70}\033[92m[PASSED]\033[0m") - else: - print(f"{'Test 4: 1D vector copy with threadblocking & smybolic size':<70}\033[91m[FAILED]\033[0m") - - -def test_5(): - @dace.program - def matrix_copy1(A: dace.float64[64,64] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[64,64] @ dace.dtypes.StorageType.GPU_Global): - for i, j in dace.map[0:64, 0:64] @ dace.dtypes.ScheduleType.GPU_Device: - A[i][j] = B[i][j] - # Preview SDFG - sdfg = matrix_copy1.to_sdfg() - - - # Initialize random CUDA arrays - A = cp.zeros((64,64), dtype=cp.float64) # Output array - B = cp.random.rand(64,64).astype(cp.float64) # Random input array - - - equal_at_start = cp.all(A == B) - if equal_at_start: - print(f"{'Test 5: Matrices are equal at start. Test is skipped.':<70}\033[93m[WARNING]\033[0m") - return - - sdfg = matrix_copy1.to_sdfg() - sdfg(A=A, B=B) - equal_at_end = cp.all(A == B) - - if equal_at_end: - print(f"{'Test 5: Simple Matrix Copy':<70}\033[92m[PASSED]\033[0m") - else: - print(f"{'Test 5: Simple Matrix Copy':<70}\033[91m[FAILED]\033[0m") - - -def test_6(): - - N = dace.symbol('N') - M = dace.symbol('M') - - @dace.program - def matrix_copy2(A: dace.float64[M,N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[M,N] @ dace.dtypes.StorageType.GPU_Global): - for i, j in dace.map[0:M, 0:N] @ dace.dtypes.ScheduleType.GPU_Device: - A[i][j] = B[i][j] - # Preview SDFG - sdfg = matrix_copy2.to_sdfg() - - n = random.randint(40, 150) - m = random.randint(40, 150) - # Initialize random CUDA arrays - A = cp.zeros((m,n), dtype=cp.float64) # Output array - B = cp.random.rand(m,n).astype(cp.float64) # Random input array - - - equal_at_start = cp.all(A == B) - if equal_at_start: - print(f"{'Test 6: Matrices are equal at start. 
Test is skipped.':<70}\033[93m[WARNING]\033[0m") - return - - sdfg = matrix_copy2.to_sdfg() - sdfg(A=A, B=B, M=m, N=n) - equal_at_end = cp.all(A == B) - - if equal_at_end: - print(f"{'Test 6: Matrix Copy with symbolic sizes':<70}\033[92m[PASSED]\033[0m") - else: - print(f"{'Test 6: Matrix Copy with symbolic sizes':<70}\033[91m[FAILED]\033[0m") - - -def test_7(): - - N = dace.symbol('N') - M = dace.symbol('M') - - @dace.program - def matrix_copy3(A: dace.float64[M,N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[M,N] @ dace.dtypes.StorageType.GPU_Global): - for i, j in dace.map[0:M:32, 0:N:32] @ dace.dtypes.ScheduleType.GPU_Device: - for ii, jj in dace.map[0:32, 0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: - if i + ii < M and j + jj < N: - A[i + ii, j + jj] = B[i + ii, j + jj] - # Preview SDFG - sdfg = matrix_copy3.to_sdfg() - - n = random.randint(40, 150) - m = random.randint(40, 150) - # Initialize random CUDA arrays - A = cp.zeros((m,n), dtype=cp.float64) # Output array - B = cp.random.rand(m,n).astype(cp.float64) # Random input array - - - equal_at_start = cp.all(A == B) - if equal_at_start: - print(f"{'Test 6: Matrices are equal at start. Test is skipped.':<70}\033[93m[WARNING]\033[0m") - return - - sdfg = matrix_copy3.to_sdfg() - sdfg(A=A, B=B, M=m, N=n) - equal_at_end = cp.all(A == B) - - if equal_at_end: - print(f"{'Test 7: Matrix Copy with threadblocking & symbolic sizes':<70}\033[92m[PASSED]\033[0m") - else: - print(f"{'Test 7: Matrix Copy with threadblocking & symbolic sizes':<70}\033[91m[FAILED]\033[0m") - - -def test_8(): - - N = dace.symbol('N') - M = dace.symbol('M') - - @dace.program - def matrix_copy3(A: dace.float64[M,N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[M,N] @ dace.dtypes.StorageType.GPU_Global): - for i, j in dace.map[0:M:32, 0:N:32] @ dace.dtypes.ScheduleType.GPU_Device: - for ii, jj in dace.map[0:32, 0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: - sB = dace.define_local([32,32], dace.float64, storage=dace.StorageType.GPU_Shared) - sB[ii, jj] = B[i + ii, j + jj] - A[i + ii, j + jj] = sB[ii, jj] - - - # Preview SDFG - sdfg = matrix_copy3.to_sdfg() - - n = random.randint(40, 150) - m = random.randint(40, 150) - # Initialize random CUDA arrays - A = cp.zeros((m,n), dtype=cp.float64) # Output array - B = cp.random.rand(m,n).astype(cp.float64) # Random input array - - - equal_at_start = cp.all(A == B) - if equal_at_start: - print(f"{'Test 8: Matrices are equal at start. 
Test is skipped.':<70}\033[93m[WARNING]\033[0m") - return - - sdfg = matrix_copy3.to_sdfg() - sdfg(A=A, B=B, M=m, N=n) - equal_at_end = cp.all(A == B) - - if equal_at_end: - print(f"{'Test 8: Matrix Copy with shared memory':<70}\033[92m[PASSED]\033[0m") - else: - print(f"{'Test 8: Matrix Copy with shared memory':<70}\033[91m[FAILED]\033[0m") - - -def test_9(): - - N = dace.symbol('N') - - @dace.program - def notskewed(A: dace.float32[N] @ dace.dtypes.StorageType.GPU_Global, - B: dace.float32[N] @ dace.dtypes.StorageType.GPU_Global, - C: dace.float32[N] @ dace.dtypes.StorageType.GPU_Global): - for i in dace.map[0:N:32] @ dace.ScheduleType.GPU_Device: - for j in dace.map[i:(i+32)] @ dace.ScheduleType.GPU_ThreadBlock: - C[j] = A[j] + B[j] - - # Preview SDFG - sdfg = notskewed.to_sdfg() - - n = random.randint(40, 150) - # Initialize random CUDA arrays - A = cp.random.rand(n).astype(cp.float32) # Output array - B = cp.random.rand(n).astype(cp.float32) # Random input array - C = cp.zeros((n), dtype=cp.float32) - C_ref = cp.zeros((n), dtype=cp.float32) - - - C_ref = A + B - sdfg(A=A, B=B, C=C, N=n) - - - if cp.all(C == C_ref): - print(f"{'Test 9: Not skewed vadd3':<70}\033[92m[PASSED]\033[0m") - else: - print(f"{'Test 9: Not skewed vadd3':<70}\033[91m[FAILED]\033[0m") - - - - - -def selected(): - test_1() - test_4() - test_5() - -def all(): - test_1() - test_2() - test_3() - test_4() - test_5() - test_6() - test_7() - test_8() - test_9() - -if __name__ == '__main__': - - - print("\n" + "="*80) - print(f"Tests started: You are using the {Config.get('compiler', 'cuda', 'implementation')} CUDA implementation.") - print("="*80 + "\n") - - all() - - print("\n" + "="*80) - print(f"Tests ended.") - print("="*80 + "\n") \ No newline at end of file From 665b2178ca5ed1defb001e8d92bf2ecbbbfc4af5 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Fri, 16 May 2025 21:07:50 +0200 Subject: [PATCH 07/94] fix issues and enable launch bounds hint --- dace/codegen/targets/experimental_cuda.py | 333 ++++++++++++---------- 1 file changed, 186 insertions(+), 147 deletions(-) diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 36dd7e4795..28cd424ea0 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -37,8 +37,6 @@ - - # TODO: GENERAL, discuss with Yakup # 1. Approval of dtypes @@ -60,11 +58,6 @@ - - - - - # TODO : I got rid of ScheduleType.GPU_Persistent (not supported anymore). If this codeBase # actually replaces the old one, this should be defined in dtypes.py and also accessed from # there. 
Also change GPU_SCHEDULES accesses to dtypes.GPU_SCHEDULES @@ -75,6 +68,8 @@ ] +THREADS_PER_WARP = 32 + @registry.autoregister_params(name='experimental_cuda') class ExperimentalCUDACodeGen(TargetCodeGenerator): """ Experimental CUDA code generator.""" @@ -510,25 +505,40 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id ) - # + self._generate_gpu_bridge(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) #--------------- Generate Kernel Function ---------------- + ExperimentalCUDACodeGen._in_device_code = True kernel_stream = CodeIOStream() kernel_name = self._current_kernel_spec.kernel_name kernel_args = self._current_kernel_spec.args_typed - scope_entry = dfg_scope.source_nodes()[0] + block_dims = self._current_kernel_spec.block_dims + node = dfg_scope.source_nodes()[0] + + # Conditionally add __launch_bounds__ for block size optimization. + launch_bounds = '' + if node.gpu_launch_bounds != '-1': + if node.gpu_launch_bounds == "0": + if not any(symbolic.issymbolic(b) for b in block_dims): + launch_bounds = f'__launch_bounds__({prod(block_dims)})' + else: + launch_bounds = f'__launch_bounds__({node.gpu_launch_bounds})' + # Emit kernel function signature kernel_stream.write( - f'__global__ void {kernel_name}({", ".join(kernel_args)}) ', - cfg, state_id, scope_entry + f'__global__ void {launch_bounds} {kernel_name}({", ".join(kernel_args)}) ', + cfg, state_id, node ) + + # generate kernel scope self._generate_kernel_scope( sdfg, cfg, dfg_scope, state_id, self._globalcode, kernel_stream ) + self._localcode.write(kernel_stream.getvalue() + '\n') ExperimentalCUDACodeGen._in_device_code = False # -------------------------------------------------------------- @@ -536,23 +546,27 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub # Generate the actual launch call (host-side) self._generate_kernel_launch(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + else: - # We are already inside a kernel — this will be nested scope + # Nested scope: already inside a GPU kernel node = dfg_scope.source_nodes()[0] schedule_type = node.map.schedule.name - gen = getattr(self, f'_generate_{schedule_type}_scope', False) + + if schedule_type == dace.ScheduleType.GPU_Device: + raise NotImplementedError( + "Dynamic parallelism (nested GPU_Device schedules) is not supported." + ) + + gen = getattr(self, f'_generate_{schedule_type}_scope', None) if gen: gen(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) else: raise NotImplementedError( f"Scope generation for schedule type '{schedule_type}' is not implemented in ExperimentalCUDACodeGen. " - "Please ensure that the schedule type is supported or implement the required functionality." + "Please check for supported schedule types or implement the corresponding generator." 
) - - - ####################### helper functions to generate_scope ###################################### @@ -659,69 +673,6 @@ def _generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: - """ - - - # First three dimensions are evaluated directly - for i in range(min(len(brange), 3)): - varname = scope_map.params[-i - 1] - - # Delinearize third dimension if necessary - if i == 2 and len(brange) > 3: - block_expr = '(threadIdx.z / (%s))' % _topy(functools.reduce(sympy.Mul, kdims[3:], 1)) - else: - block_expr = 'threadIdx.%s' % _named_idx(i) - - expr = _topy(tidx[i]).replace('__DAPT%d' % i, block_expr) - callsite_stream.write('int %s = %s;' % (varname, expr), cfg, state_id, scope_entry) - self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int') - - # Delinearize beyond the third dimension - if len(brange) > 3: - for i in range(3, len(brange)): - varname = scope_map.params[-i - 1] - # true dim i = z / ('*'.join(kdims[i+1:])) % kdims[i] - block_expr = '(threadIdx.z / (%s)) %% (%s)' % ( - _topy(functools.reduce(sympy.Mul, kdims[i + 1:], 1)), - _topy(kdims[i]), - ) - - expr = _topy(tidx[i]).replace('__DAPT%d' % i, block_expr) - callsite_stream.write('int %s = %s;' % (varname, expr), cfg, state_id, scope_entry) - self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int') - - # Generate conditions for this block's execution using min and max - # element, e.g. skipping out-of-bounds threads in trailing block - minels = brange.min_element() - maxels = brange.max_element() - for i, (v, minel, maxel) in enumerate(zip(scope_map.params[::-1], minels, maxels)): - condition = '' - - # Optimize conditions if they are always true - ############################################# - - # Block range start - if i >= 3 or (dsym[i] >= minel) != True: - condition += '%s >= %s' % (v, _topy(minel)) - - # Special case: block size is exactly the range of the map (0:b) - if i >= 3: - skipcond = False - else: - skipcond = dsym_end[i].subs({dsym[i]: minel}) == maxel - - # Block range end - if i >= 3 or (not skipcond and (dsym_end[i] < maxel) != True): - if len(condition) > 0: - condition += ' && ' - condition += '%s < %s' % (v, _topy(maxel + 1)) - - # Emit condition in code - if len(condition) > 0: - callsite_stream.write('if (%s) {' % condition, cfg, state_id, scope_entry) - else: - callsite_stream.write('{', cfg, state_id, scope_entry) - """ def _generate_GPU_ThreadBlock_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, kernel_stream: CodeIOStream) -> None: @@ -820,9 +771,6 @@ def _generate_GPU_ThreadBlock_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, df - - - def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, kernel_stream: CodeIOStream) -> None: @@ -830,106 +778,103 @@ def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope with KernelScopeManager(cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, function_stream=function_stream, callsite_stream=kernel_stream, comment="WarpLevel Scope") as scopeManager: + + + block_dims = self._current_kernel_spec.block_dims + + state_dfg = cfg.state(state_id) node = dfg_scope.source_nodes()[0] scope_map = node.map + map_range = subsets.Range(scope_map.range[::-1]) # Reversed for potential better performance - block_dims = self._current_kernel_spec.block_dims + warp_dim = len(map_range) + # The following sizes and bounds 
are be symbolic + num_threads_in_block = prod(block_dims) + warp_dim_bounds = [max_elem + 1 for max_elem in map_range.max_element()] + num_warps = prod(warp_dim_bounds) - THREADS_PER_WARP = 32 - num_threads_in_block = prod(block_dims) - upper_bound_warp_ids = [max_elem + 1 for max_elem in map_range.max_element()] - num_warps = prod(upper_bound_warp_ids) - warp_dim = len(map_range) - state_dfg = cfg.state(state_id) # ----------------- Guard checks ----------------------- - #TODO: Move them to validation as well if possible - #TODO: rename xfh, to cryptic - parent_map, _ = xfh.get_parent_map(state_dfg, node) - if parent_map.schedule != dtypes.ScheduleType.GPU_ThreadBlock: - raise ValueError("GPU_Warp map must be nested within a GPU_ThreadBlock map.") - - if warp_dim > 3: - raise NotImplementedError("GPU_Warp maps are limited to 3 dimensions.") - - if num_threads_in_block % THREADS_PER_WARP != 0: - raise ValueError(f"Block must be a multiple of {THREADS_PER_WARP} threads for GPU_Warp scheduling " - f"(got {num_threads_in_block}).") - - # TODO: This should be checked at get_kernel dim - if num_threads_in_block > 1024: - raise ValueError("CUDA does not support more than 1024 threads per block (hardware limit).") - - if num_warps * THREADS_PER_WARP > num_threads_in_block: - raise ValueError(f"Invalid configuration: {num_warps} warps x {THREADS_PER_WARP} threads exceed " - f"{num_threads_in_block} threads in the block.") - - if not all(x >= 0 for x in map_range.min_element()): - raise ValueError("Warp IDs (from map range) must be non-negative.") + # handles checks either at compile time or runtime (i.e. checks in the generated code) + self._hanlde_GPU_Warp_scope_guards(state_dfg, node, map_range, warp_dim, num_threads_in_block, num_warps, + kernel_stream, scopeManager) + - - # ----------------- Map unflattening and scope guards ----------------------- + # ----------------- Define (flat) Thread ID within Block ----------------------- flattened_terms = [] + for i, dim_size in enumerate(block_dims): + if dim_size == 1: continue + dim = _get_cuda_dim(i) stride = [f"{block_dims[j]}" for j in range(i) if block_dims[j] > 1] idx_expr = " * ".join(stride + [f"threadIdx.{_get_cuda_dim(i)}"]) if stride else f"threadIdx.{dim}" flattened_terms.append(idx_expr) - # NOTE: too ugly? - flat_thread_id_expr = " + ".join(flattened_terms) - warp_id_name = 'warpId_%s_%d_%d_%d' % (scope_map.label, cfg.cfg_id, state_dfg.block_id, state_dfg.node_id(node)) - kernel_stream.write( - f"int {warp_id_name} = ({flat_thread_id_expr}) / {THREADS_PER_WARP};", - cfg, state_id, node - ) - self._dispatcher.defined_vars.add(warp_id_name, DefinedType.Scalar, 'int') + joined_terms = " + ".join(flattened_terms) + flat_thread_idx_expr = f"({joined_terms})" if len(flattened_terms) > 1 else joined_terms + # NOTE: name too ugly? How shorter but still unique ? 
+ threadID_name = 'ThreadId_%s_%d_%d_%d' % (scope_map.label, cfg.cfg_id, state_dfg.block_id, state_dfg.node_id(node)) + + kernel_stream.write(f"int {threadID_name} = ({flat_thread_idx_expr}) / {THREADS_PER_WARP};", cfg, state_id, node) + self._dispatcher.defined_vars.add(threadID_name, DefinedType.Scalar, 'int') - - - # ----------------- Compute flattened warp ID ----------------------- - range_max_elements = map_range.max_element() - range_min_elements = map_range.min_element() - warp_dim_bounds = [str(e + 1) for e in range_max_elements] + + # ----------------- Compute Map indices (= Warp indices) ----------------------- for i in range(warp_dim): var_name = scope_map.params[-i - 1] # reverse order previous_sizes = warp_dim_bounds[:i] if len(previous_sizes) > 0: - divisor = " * ".join(previous_sizes) - divisor = f"({divisor})" if len(previous_sizes) > 1 else divisor - expr = f"({warp_id_name} / {divisor}) % {warp_dim_bounds[i]}" + divisor = prod(previous_sizes) + expr = f"({threadID_name} / {divisor}) % {warp_dim_bounds[i]}" else: - expr = f"{warp_id_name} % {warp_dim_bounds[i]}" + expr = f"{threadID_name} % {warp_dim_bounds[i]}" kernel_stream.write(f"int {var_name} = {expr};", cfg, state_id, node) self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, 'int') - # check conditions - # NOTE: WarpId coordinate can start at non-zero but never exceeds the upper range bound - # due to the combination of enforcing guard checks (32 * warps <= # threads in block) and the way - # we assign the coordinates - min_element = range_min_elements[i] - if range_min_elements[i] != 0: - conditions = f'{var_name} >= {min_element}' - scopeManager.open(condition=conditions) + # ----------------- Guard Conditions for Warp Execution ----------------------- + + + if num_warps * THREADS_PER_WARP != num_threads_in_block: + condition = f'{threadID_name} < {num_warps}' + scopeManager.open(condition) + + warp_range = [(start, end + 1, stride) for start, end, stride in map_range.ranges] + + for dim, (var_name, (start, _, stride)) in enumerate(zip(scope_map.params[::-1], warp_range)): + + condition_terms = [] + + if start != 0: + condition_terms.append(f"{var_name} >= {start}") + + if stride != 1: + expr = var_name if start == 0 else f"({var_name} - {start})" + condition_terms.append(f'{expr} % {stride} == 0' ) + + if condition_terms: + condition = " && ".join(condition_terms) + scopeManager.open(condition) # ----------------- Dispatch Subgraph code generation ----------------------- + self._dispatcher.dispatch_subgraph( sdfg, cfg, dfg_scope, state_id, function_stream, kernel_stream, skip_entry_node=True @@ -937,11 +882,73 @@ def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope + + def _hanlde_GPU_Warp_scope_guards(self, state_dfg: SDFGState, node: nodes.MapEntry, map_range: subsets.Range, + warp_dim: int, num_threads_in_block, num_warps, kernel_stream: CodeIOStream, + scopeManager: 'KernelScopeManager'): + #TODO: Move them to sdfg validation as well if possible + + #TODO: rename xfh, to cryptic + parent_map, _ = xfh.get_parent_map(state_dfg, node) + if parent_map.schedule != dtypes.ScheduleType.GPU_ThreadBlock: + raise ValueError("GPU_Warp map must be nested within a GPU_ThreadBlock map.") + + if warp_dim > 3: + raise NotImplementedError("GPU_Warp maps are limited to 3 dimensions.") + + + # Guard against invalid thread/block configurations. + # - For concrete (compile-time) values, raise Python errors early. 
+ # - For symbolic values, insert runtime CUDA checks (guards) into the generated kernel. + # These will emit meaningful error messages and abort execution if violated. + if isinstance(num_threads_in_block, symbolic.symbol): + condition = ( + f"{num_threads_in_block} % {THREADS_PER_WARP} != 0 || " + f"{num_threads_in_block} > 1024 || " + f"{num_warps} * {THREADS_PER_WARP} > {num_threads_in_block}" + ) + kernel_stream.write(f"""\ + if ({condition}) {{ + printf("CUDA error:\\n" + "1. Block must be a multiple of {THREADS_PER_WARP} threads (DaCe requirement for GPU_Warp scheduling).\\n" + "2. Block size must not exceed 1024 threads (CUDA hardware limit).\\n" + "3. Number of warps x {THREADS_PER_WARP} must fit in the block (otherwise logic is unclear).\\n"); + asm("trap;"); + }} + """) + + else: + if isinstance(num_warps, symbolic.symbol): + condition = f"{num_warps} * {THREADS_PER_WARP} > {num_threads_in_block}" + scopeManager.open(condition=condition) + + elif num_warps * THREADS_PER_WARP > num_threads_in_block: + raise ValueError(f"Invalid configuration: {num_warps} warps x {THREADS_PER_WARP} threads exceed " + f"{num_threads_in_block} threads in the block.") + + if num_threads_in_block % THREADS_PER_WARP != 0: + raise ValueError(f"Block must be a multiple of {THREADS_PER_WARP} threads for GPU_Warp scheduling " + f"(got {num_threads_in_block}).") + + if num_threads_in_block > 1024: + raise ValueError("CUDA does not support more than 1024 threads per block (hardware limit).") + + + for x in map_range.min_element(): + if isinstance(x, symbolic.symbol): + kernel_stream.write(f'if ({x} < 0) {{\n' + f' printf("Runtime error: Warp ID symbol {x} must be non-negative.\\n");\n' + f' asm("trap;");\n' + f'}}\n') + elif x < 0: + raise ValueError(f"Warp ID value {x} must be non-negative.") + + def _generate_gpu_bridge(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, - state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: scope_entry = dfg_scope.source_nodes()[0] @@ -960,9 +967,14 @@ def _generate_gpu_bridge(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: Sc (kernel_name, ', '.join(kernel_bridge_args)), cfg, state_id, scope_entry) + + def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + # NOTE: This generates the function that launches the kernel. + # Do not confuse it with CUDA's internal "LaunchKernel" API — + # the generated function *calls* that API, but we also refer to it as a "launch function". 
scope_entry = dfg_scope.source_nodes()[0] @@ -978,7 +990,8 @@ def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: bdims = ', '.join(symbolic_to_cpp(block_dims)) - # Declaration of the function which launches the kernel (CUDA code) + + # ----------------- Kernel Launch Function Declaration ----------------------- self._localcode.write( """ DACE_EXPORTED void __dace_runkernel_{fname}({fargs}); @@ -989,9 +1002,34 @@ def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ) - + + # ----------------- Guard Checks handling ----------------------- - # Calling kernel function (CUDA code) + # Ensure that iteration space is neither empty nor negative sized + single_dimchecks = [] + for gdim in grid_dims: + # Only emit a guard if we can't statically prove gdim > 0 + if (gdim > 0) != True: + single_dimchecks.append(f'(({symbolic_to_cpp(gdim)}) <= 0)') + + dimcheck = ' || '.join(single_dimchecks) + + if dimcheck: + emptygrid_warning = '' + if Config.get('debugprint') == 'verbose' or Config.get_bool('compiler', 'cuda', 'syncdebug'): + emptygrid_warning = (f'printf("Warning: Skipping launching kernel \\"{kernel_name}\\" ' + 'due to an empty grid.\\n");') + + self._localcode.write( + f''' + if ({dimcheck}) {{ + {emptygrid_warning} + return; + }}''', cfg, state_id, scope_entry) + + + + # ----------------- Kernel Launch Invocation ----------------------- self._localcode.write( ''' void *{kname}_args[] = {{ {kargs} }}; @@ -1011,7 +1049,6 @@ def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: self._localcode.write(f'DACE_KERNEL_LAUNCH_CHECK(__err, "{kernel_name}", {gdims}, {bdims});') - self._localcode.write('}') @@ -1104,7 +1141,9 @@ def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphVi # if it is not implemented, use generate node of cpu impl if gen is not False: gen(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) - elif type(node).__name__ == 'MapExit': + elif type(node).__name__ == 'MapExit' and node.schedule in GPU_SCHEDULES: + # Special case: It is a MapExit but from a GPU_schedule- the MapExit is already + # handled by a KernelScopeManager instance. Otherwise cpu_codegen will close it return else: self._cpu_codegen.generate_node(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) From 797df83c9535f3b81c41e0318fcac57746fcacb9 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Tue, 20 May 2025 18:22:53 +0200 Subject: [PATCH 08/94] refactoring computation of kernel dimensions --- .../warp_level_test.py | 1 - dace/codegen/targets/experimental_cuda.py | 513 ++++++++++-------- 2 files changed, 281 insertions(+), 233 deletions(-) diff --git a/berkay_workpace/tests/experimental_features_tests/warp_level_test.py b/berkay_workpace/tests/experimental_features_tests/warp_level_test.py index 750e64b270..89374322f2 100644 --- a/berkay_workpace/tests/experimental_features_tests/warp_level_test.py +++ b/berkay_workpace/tests/experimental_features_tests/warp_level_test.py @@ -13,7 +13,6 @@ # and not only 1 element. But there is no support for correct representation (yet). However, # the construction of the warpIds is not affected by this. 
Correct SDFGs appear in the next # test section - @pytest.mark.gpu @pytest.mark.parametrize("start, end, stride", [ (0, 32, 1), diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 28cd424ea0..a96ea7aad4 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -139,8 +139,6 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self.has_pool = False - self._ignore_warnings = True - # INFO: # Register GPU schedules and storage types for ExperimentalCUDACodeGen. # The dispatcher maps GPU-related schedules and storage types to the @@ -1654,228 +1652,6 @@ def cmake_options(): return options - def get_tb_maps_recursive(self, subgraph): - res = [] - for node in subgraph.nodes(): - if isinstance(node, nodes.NestedSDFG): - for state in node.sdfg.states(): - tbmaps = self.get_tb_maps_recursive(state) - for map, sym_map in tbmaps: - for k in sym_map.values(): - for kk, vv in node.symbol_mapping.items(): - sym_map[k] = sym_map[k].subs(dace.symbol(kk), vv) - res.append((map, sym_map)) - elif isinstance(node, nodes.MapEntry) and node.schedule in ( - dtypes.ScheduleType.GPU_Device, - dtypes.ScheduleType.GPU_ThreadBlock, - dtypes.ScheduleType.GPU_ThreadBlock_Dynamic, - ): - res.append((node.map, {dace.symbol(k): dace.symbol(k) for k in node.map.range.free_symbols})) - return res - - def get_kernel_dimensions(self, dfg_scope): - """ - Determines a GPU kernel's grid/block dimensions from map scopes. - - Ruleset for kernel dimensions: - - 1. If only one map (device-level) exists, of an integer set ``S``, - the block size is ``32x1x1`` and grid size is ``ceil(|S|/32)`` in - 1st dimension. - 2. If nested thread-block maps exist ``(T_1,...,T_n)``, grid - size is ``|S|`` and block size is ``max(|T_1|,...,|T_n|)`` with - block specialization. - 3. If block size can be overapproximated, it is (for - dynamically-sized blocks that are bounded by a - predefined size). - 4. If nested device maps exist, they generate extra grid dimensions (block size 1) - as the sum of all their sizes ``(|T_1| + ... + |T_n|)`` - - :note: Kernel dimensions are separate from the map - variables, and they should be treated as such. - :note: To make use of the grid/block 3D registers, we use multi- - dimensional kernels up to 3 dimensions, and flatten the - rest into the third dimension. 
- """ - - kernelmap_entry: nodes.MapEntry = dfg_scope.source_nodes()[0] - grid_size = kernelmap_entry.map.range.size(True)[::-1] - block_size = None - is_persistent = (kernelmap_entry.map.schedule == dtypes.ScheduleType.GPU_Persistent) - int_ceil = symbolic.int_ceil - - # Obtain thread-block maps from nested SDFGs - subgraph = dfg_scope.scope_subgraph(kernelmap_entry) - sub_maps = self.get_tb_maps_recursive(subgraph) - - # Introduce extra grid dimensions based on device sub-maps - extra_dim_offsets: Dict[nodes.Map, symbolic.SymbolicType] = {} - extra_grid_dims: List[symbolic.SymbolicType] = None - for submap, sym_map in sub_maps: - submap: nodes.Map - if submap.schedule != dtypes.ScheduleType.GPU_Device or submap is kernelmap_entry.map: - continue - if extra_grid_dims is not None and len(submap.params) != len(extra_grid_dims): - raise NotImplementedError( - 'Multiple GPU_Device sub-ranges with different dimensionality not yet implemented (found: ' - f'{len(submap.params)}, existing: {len(extra_grid_dims)}, map: {kernelmap_entry})') - - # Add and overapproximate sizes - gsize = [s.subs(list(sym_map.items())) for s in submap.range.size()[::-1]] - gsize = [symbolic.overapproximate(s) for s in gsize] - if extra_grid_dims is None: - extra_grid_dims = gsize - extra_dim_offsets[submap] = [0] * len(submap.params) - else: - extra_dim_offsets[submap] = extra_grid_dims - extra_grid_dims = [(sz + gsz) for sz, gsz in zip(extra_grid_dims, gsize)] - if extra_grid_dims is None: - extra_grid_dims = [] - grid_size.extend(extra_grid_dims) - - # Linearize (flatten) rest of dimensions to third - if len(grid_size) > 3: - grid_size[2] = functools.reduce(sympy.Mul, grid_size[2:], 1) - del grid_size[3:] - - # Extend to 3 dimensions if necessary - grid_size = grid_size + [1] * (3 - len(grid_size)) - - # Thread-block map cases - has_dtbmap = len( - [tbmap for tbmap, _ in sub_maps if tbmap.schedule == dtypes.ScheduleType.GPU_ThreadBlock_Dynamic]) > 0 - - # keep only thread-block maps - tb_maps_sym_map = [(tbmap, sym_map) for tbmap, sym_map in sub_maps - if tbmap.schedule == dtypes.ScheduleType.GPU_ThreadBlock] - - # Map thread-block size override - block_size = kernelmap_entry.map.gpu_block_size - if block_size is not None: - # Complement to three dimensions - block_size += [1] * (3 - len(block_size)) - # Linearize (flatten) rest of dimensions to third - if len(block_size) > 3: - block_size[2] = functools.reduce(sympy.Mul, block_size[2:], 1) - del block_size[3:] - - # No thread-block maps - if len(tb_maps_sym_map) == 0: - if block_size is None: - if has_dtbmap: - if (Config.get('compiler', 'cuda', 'dynamic_map_block_size') == 'max'): - raise NotImplementedError('max dynamic block size unimplemented') - else: - block_size = [ - int(b) for b in Config.get('compiler', 'cuda', 'dynamic_map_block_size').split(',') - ] - else: - def_bsize = Config.get('compiler', 'cuda', 'default_block_size') - if (not self._ignore_warnings): # NOTE: remove the ignoring of warnings later - warnings.warn( - f'No `gpu_block_size` property specified on map "{kernelmap_entry.map.label}". ' - f'Falling back to the configuration entry `compiler.cuda.default_block_size`: {def_bsize}. ' - 'You can either specify the block size to use with the gpu_block_size property, ' - 'or by adding nested `GPU_ThreadBlock` maps, which map work to individual threads. 
' - 'For more information, see https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') - - if (Config.get('compiler', 'cuda', 'default_block_size') == 'max'): - raise NotImplementedError('max dynamic block size unimplemented') - else: - block_size = [int(b) for b in Config.get('compiler', 'cuda', 'default_block_size').split(',')] - - block_ndim = max(1, sum(1 if b != 1 else 0 for b in block_size)) - grid_ndim = max(1, sum(1 if g != 1 else 0 for g in grid_size)) - if block_ndim > grid_ndim: - linearized_remainder = prod(block_size[grid_ndim:]) - block_size = block_size[:grid_ndim] + [1] * (3 - grid_ndim) - block_size[grid_ndim - 1] *= linearized_remainder - warnings.warn(f'Default block size has more dimensions ({block_ndim}) than kernel dimensions ' - f'({grid_ndim}) in map "{kernelmap_entry.map.label}". Linearizing block ' - f'size to {block_size}. Consider setting the ``gpu_block_size`` property.') - - assert (len(block_size) >= 1 and len(block_size) <= 3) - - # Grid size = ceil(|S|/32) for first dimension, rest = |S| - grid_size = [int_ceil(gs, bs) for gs, bs in zip(grid_size, block_size)] - - else: - # Find all thread-block maps to determine overall block size - detected_block_sizes = [block_size] if block_size is not None else [] - for tbmap, sym_map in tb_maps_sym_map: - tbsize = [s.subs(list(sym_map.items())) for s in tbmap.range.size()[::-1]] - - # Over-approximate block size (e.g. min(N,(i+1)*32)-i*32 --> 32) - # The partial trailing thread-block is emitted as an if-condition - # that returns on some of the participating threads - tbsize = [symbolic.overapproximate(s) for s in tbsize] - - # Linearize (flatten) rest of dimensions to third - if len(tbsize) > 3: - tbsize[2] = functools.reduce(sympy.Mul, tbsize[2:], 1) - del tbsize[3:] - - # Extend to 3 dimensions if necessary - tbsize = tbsize + [1] * (3 - len(tbsize)) - - if len(detected_block_sizes) == 0: - block_size = tbsize - else: - block_size = [sympy.Max(sz, bbsz) for sz, bbsz in zip(block_size, tbsize)] - - if block_size != tbsize or len(detected_block_sizes) == 0: - detected_block_sizes.append(tbsize) - - # TODO: If grid/block sizes contain elements only defined within the - # kernel, raise an invalid SDFG exception and recommend - # overapproximation. - - if len(detected_block_sizes) > 1: - - # Error when both gpu_block_size and thread-block maps were defined and conflict - if kernelmap_entry.map.gpu_block_size is not None: - raise ValueError('Both the `gpu_block_size` property and internal thread-block ' - 'maps were defined with conflicting sizes for kernel ' - f'"{kernelmap_entry.map.label}" (sizes detected: {detected_block_sizes}). ' - 'Use `gpu_block_size` only if you do not need access to individual ' - 'thread-block threads, or explicit block-level synchronization (e.g., ' - '`__syncthreads`). Otherwise, use internal maps with the `GPU_Threadblock` or ' - '`GPU_ThreadBlock_Dynamic` schedules. For more information, see ' - 'https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') - - warnings.warn('Multiple thread-block maps with different sizes detected for ' - f'kernel "{kernelmap_entry.map.label}": {detected_block_sizes}. 
' - f'Over-approximating to block size {block_size}.\n' - 'If this was not the intent, try tiling one of the thread-block maps to match.') - - # both thread-block map and dynamic thread-block map exist at the same - # time - if has_dtbmap: - raise NotImplementedError("GPU_ThreadBlock and GPU_ThreadBlock_Dynamic are currently " - "not supported in the same scope") - - if is_persistent: - grid_size = ['gridDim.x', '1', '1'] - - # Check block size against configured maximum values, if those can be determined - total_bsize = prod(block_size) - total_limit = Config.get('compiler', 'cuda', 'block_size_limit') - lastdim_limit = Config.get('compiler', 'cuda', 'block_size_lastdim_limit') - if (total_bsize > total_limit) == True: - raise ValueError(f'Block size for kernel "{kernelmap_entry.map.label}" ({block_size}) ' - f'is larger than the possible number of threads per block ({total_limit}). ' - 'The kernel will potentially not run, please reduce the thread-block size. ' - 'To increase this limit, modify the `compiler.cuda.block_size_limit` ' - 'configuration entry.') - if (block_size[-1] > lastdim_limit) == True: - raise ValueError(f'Last block size dimension for kernel "{kernelmap_entry.map.label}" ({block_size}) ' - 'is larger than the possible number of threads in the last block dimension ' - f'({lastdim_limit}). The kernel will potentially not run, please reduce the ' - 'thread-block size. To increase this limit, modify the ' - '`compiler.cuda.block_size_lastdim_limit` configuration entry.') - - return grid_size, block_size, len(tb_maps_sym_map) > 0, has_dtbmap, extra_dim_offsets - def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_dfg: StateSubgraphView, state_id: int, src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge[Memlet], function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -2286,17 +2062,19 @@ class KernelSpec: def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int): - # Entry and exit nodes of the scope - scope_entry = dfg_scope.source_nodes()[0] - state = cfg.state(state_id) + + + kernel_entry_node = dfg_scope.source_nodes()[0] + state: SDFGState = cfg.state(state_id) - self._kernel_map: nodes.Map = scope_entry.map + self._kernel_entry_node: nodes.MapEntry = kernel_entry_node + self._kernel_map: nodes.Map = kernel_entry_node.map # Kernel name - self._kernel_name: str = '%s_%d_%d_%d' % (scope_entry.map.label, cfg.cfg_id, state.block_id, state.node_id(scope_entry)) + self._kernel_name: str = '%s_%d_%d_%d' % (kernel_entry_node.map.label, cfg.cfg_id, state.block_id, state.node_id(kernel_entry_node)) # Kernel arguments - self._args: Dict = cudaCodeGen._arglists[scope_entry] + self._args: Dict = cudaCodeGen._arglists[kernel_entry_node] self._args_typed: list[str] = [adata.as_arg(name=aname) for aname, adata in self._args.items()] self._args_as_input: list[str] = [ptr(aname, adata, sdfg, cudaCodeGen._frame) for aname, adata in self._args.items()] @@ -2307,7 +2085,279 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro self._bridge_args_typed: list[str] = state_param + self._args_typed # Kernel dimensions - self._grid_dims, self._block_dims, self._has_tbmap, self._has_dtbmap, _ = cudaCodeGen.get_kernel_dimensions(dfg_scope) + self._grid_dims, self._block_dims, self._has_tbmap, self._has_dtbmap, _ = self._get_kernel_dimensions(dfg_scope) + + + + def _get_kernel_dimensions(self, dfg_scope: ScopeSubgraphView): + """ + 
Determines a GPU kernel's grid/block dimensions from map scopes. + + Ruleset for kernel dimensions: + + 1. If only one map (device-level) exists, of an integer set ``S``, + the block size is ``32x1x1`` and grid size is ``ceil(|S|/32)`` in + 1st dimension. + 2. If nested thread-block maps exist ``(T_1,...,T_n)``, grid + size is ``|S|`` and block size is ``max(|T_1|,...,|T_n|)`` with + block specialization. + 3. If block size can be overapproximated, it is (for + dynamically-sized blocks that are bounded by a + predefined size). + 4. If nested device maps exist, behavior is unknown but an error is thrown + in the generate_scope function. This is not supported here + + :note: Kernel dimensions are separate from the map + variables, and they should be treated as such. + :note: To make use of the grid/block 3D registers, we use multi- + dimensional kernels up to 3 dimensions, and flatten the + rest into the third dimension. + """ + + + # Extract the subgraph of the kernel entry map + launch_scope = dfg_scope.scope_subgraph(self._kernel_entry_node) + + # Collect all relevant maps affecting launch (i.e. grid and block) dimensions + affecting_maps = self._get_maps_affecting_launch_dims(launch_scope) + + # Filter for ThreadBlock maps + threadblock_maps = [(tbmap, sym_map) for tbmap, sym_map in affecting_maps + if tbmap.schedule == dtypes.ScheduleType.GPU_ThreadBlock] + + # Determine if we fall back to default block size (which also affects grid size) + no_block_info: bool = len(threadblock_maps) == 0 and self._kernel_map.gpu_block_size is None + + if no_block_info: + block_size, grid_size = self._compute_default_block_and_grid() + else: + block_size, grid_size = self._compute_block_and_grid_from_maps(threadblock_maps) + + + return grid_size, block_size, len(threadblock_maps) > 0, False, 0 + + + def _compute_default_block_and_grid(self): + """ + Fallback when no gpu_block_size (i.e. self._kernel_map.gpu_block_size is None) + or GPU_ThreadBlock maps are defined: + + Uses default_block_size (e.g. [32,1,1]) on the whole domain S (assuming 1 dimensional), + producing block=[32,1,1] and grid=[ceil(|S|/32),1,1]. + + Special case: if the block has more active (non-1) dimensions than S, + extra block dimensions are collapsed into the last active slot. + """ + + kernel_map_label = self._kernel_entry_node.map.label + default_block_size_config = Config.get('compiler', 'cuda', 'default_block_size') + + # 1) Reject unsupported 'max' setting + if default_block_size_config == 'max': + # TODO: does this make sense? what is meant with dynamic here? + raise NotImplementedError('max dynamic block size unimplemented') + + # 2) Warn that we're falling back to config + warnings.warn( + f'No `gpu_block_size` property specified on map "{kernel_map_label}". ' + f'Falling back to the configuration entry `compiler.cuda.default_block_size`: {default_block_size_config}. ' + 'You can either specify the block size to use with the gpu_block_size property, ' + 'or by adding nested `GPU_ThreadBlock` maps, which map work to individual threads. 
' + 'For more information, see https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') + + + # 3) Normalize the total iteration space size (len(X),len(Y),len(Z)…) to 3D + raw_domain = list(self._kernel_map.range.size(True))[::-1] + kernel_domain_size = self._to_3d_dims(raw_domain) + + # 4) Parse & normalize the default block size to 3D + default_block_size = [int(x) for x in default_block_size_config.split(',')] + default_block_size = self._to_3d_dims(default_block_size) + + # 5) If block has more "active" dims than the domain, collapse extras + active_block_dims = max(1, sum(1 for b in default_block_size if b != 1)) + active_grid_dims = max(1, sum(1 for g in kernel_domain_size if g != 1)) + + if active_block_dims > active_grid_dims: + tail_product = prod(default_block_size[active_grid_dims:]) + block_size = default_block_size[:active_grid_dims] + [1] * (3 - active_grid_dims) + block_size[active_grid_dims - 1] *= tail_product + warnings.warn(f'Default block size has more dimensions ({active_block_dims}) than kernel dimensions ' + f'({active_grid_dims}) in map "{kernel_map_label}". Linearizing block ' + f'size to {block_size}. Consider setting the ``gpu_block_size`` property.') + else: + block_size = default_block_size + + # 6) Compute the final grid size per axis: ceil(domain / block) + grid_size = [symbolic.int_ceil(gs, bs) for gs, bs in zip(kernel_domain_size, block_size)] + + + # 7) Check block size against configured CUDA hardware limits + self._validate_block_size_limits(block_size) + + return block_size, grid_size + + + def _compute_block_and_grid_from_maps(self, tb_maps_sym_map): + # TODO: also provide a description here in docstring + + + kernel_entry_node = self._kernel_entry_node + + # Compute kernel grid size + raw_grid_size = self._kernel_map.range.size(True)[::-1] + grid_size = self._to_3d_dims(raw_grid_size) + + # Determine block size, using gpu_block_size override if specified + # NOTE: this must be done on the original list! otherwise error + block_size = self._kernel_map.gpu_block_size + if block_size is not None: + block_size = self._to_3d_dims(block_size) + + + # Find all thread-block maps to determine overall block size + detected_block_sizes = [block_size] if block_size is not None else [] + for tbmap, sym_map in tb_maps_sym_map: + tbsize = [s.subs(list(sym_map.items())) for s in tbmap.range.size()[::-1]] + + # Over-approximate block size (e.g. min(N,(i+1)*32)-i*32 --> 32) + # The partial trailing thread-block is emitted as an if-condition + # that returns on some of the participating threads + tbsize = [symbolic.overapproximate(s) for s in tbsize] + + # To Cuda compatible block dimension description + tbsize = self._to_3d_dims(tbsize) + + if len(detected_block_sizes) == 0: + block_size = tbsize + else: + block_size = [sympy.Max(sz, bbsz) for sz, bbsz in zip(block_size, tbsize)] + + if block_size != tbsize or len(detected_block_sizes) == 0: + detected_block_sizes.append(tbsize) + + + + #-------------- Error handling and warnings ------------------------ + + # TODO: If grid/block sizes contain elements only defined within the + # kernel, raise an invalid SDFG exception and recommend + # overapproximation. 
+ + kernel_map_label = kernel_entry_node.map.label + if len(detected_block_sizes) > 1: + # Error when both gpu_block_size and thread-block maps were defined and conflict + if kernel_entry_node.map.gpu_block_size is not None: + raise ValueError('Both the `gpu_block_size` property and internal thread-block ' + 'maps were defined with conflicting sizes for kernel ' + f'"{kernel_map_label}" (sizes detected: {detected_block_sizes}). ' + 'Use `gpu_block_size` only if you do not need access to individual ' + 'thread-block threads, or explicit block-level synchronization (e.g., ' + '`__syncthreads`). Otherwise, use internal maps with the `GPU_Threadblock` or ' + '`GPU_ThreadBlock_Dynamic` schedules. For more information, see ' + 'https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') + + warnings.warn('Multiple thread-block maps with different sizes detected for ' + f'kernel "{kernel_map_label}": {detected_block_sizes}. ' + f'Over-approximating to block size {block_size}.\n' + 'If this was not the intent, try tiling one of the thread-block maps to match.') + + # Check block size against configured CUDA hardware limits + self._validate_block_size_limits(block_size) + + return block_size, grid_size + + + def _validate_block_size_limits(self, block_size): + """ + Check block size against configured maximum values, if those can be determined + """ + + kernel_map_label = self._kernel_map.label + + total_block_size = prod(block_size) + limit = Config.get('compiler', 'cuda', 'block_size_limit') + lastdim_limit = Config.get('compiler', 'cuda', 'block_size_lastdim_limit') + + if (total_block_size > limit) == True: + raise ValueError(f'Block size for kernel "{kernel_map_label}" ({block_size}) ' + f'is larger than the possible number of threads per block ({limit}). ' + 'The kernel will potentially not run, please reduce the thread-block size. ' + 'To increase this limit, modify the `compiler.cuda.block_size_limit` ' + 'configuration entry.') + if (block_size[-1] > lastdim_limit) == True: + raise ValueError(f'Last block size dimension for kernel "{kernel_map_label}" ({block_size}) ' + 'is larger than the possible number of threads in the last block dimension ' + f'({lastdim_limit}). The kernel will potentially not run, please reduce the ' + 'thread-block size. To increase this limit, modify the ' + '`compiler.cuda.block_size_lastdim_limit` configuration entry.') + + + def _to_3d_dims(self, dim_sizes: List) -> List: + """ + Given a list representing the size of each dimension, this function modifies + the list in-place by collapsing all dimensions beyond the second into the + third entry. If the list has fewer than three entries, it is padded with 1's + to ensure it always contains exactly three elements. This is used to format + grid and block size parameters for a kernel launch. + + Examples: + [x] → [x, 1, 1] + [x, y] → [x, y, 1] + [x, y, z] → [x, y, z] + [x, y, z, u, v] → [x, y, z * u * v] + """ + + if len(dim_sizes) > 3: + # multiply everything from the 3rd onward into d[2] + dim_sizes[2] = prod(dim_sizes[2:]) + dim_sizes = dim_sizes[:3] + + # pad with 1s if necessary + dim_sizes += [1] * (3 - len(dim_sizes)) + + return dim_sizes + + + def _get_maps_affecting_launch_dims(self, graph: ScopeSubgraphView) -> List[Tuple[nodes.MapEntry, Dict[dace.symbol, dace.symbol]]]: + """ + Recursively collects all GPU_Device and GPU_ThreadBlock maps within the given graph, + including those inside nested SDFGs. 
For each relevant map, returns a tuple containing + the map object and an identity mapping of its free symbols. + + Args: + graph (ScopeSubgraphView): The subgraph to search for relevant maps. + + Returns: + List[Tuple[nodes.MapEntry, Dict[dace.symbol, dace.symbol]]]: + A list of tuples, each consisting of a MapEntry object and a dictionary mapping + each free symbol in the map's range to itself (identity mapping). + + NOTE: + Currently, dynamic parallelism (nested GPU_Device schedules) is not supported. + The GPU_Device is only used for the top level map, where it is allowed and required. + """ + + relevant_maps = [] + + for node in graph.nodes(): + + # Recurse into nested SDFGs + if isinstance(node, nodes.NestedSDFG): + for state in node.sdfg.states(): + relevant_maps.extend(self._get_maps_affecting_launch_dims(state)) + continue + + # MapEntry with schedule affecting launch dimensions + if (isinstance(node, nodes.MapEntry) and + node.schedule in {dtypes.ScheduleType.GPU_Device, dtypes.ScheduleType.GPU_ThreadBlock}): + identity_map = { dace.symbol(sym): dace.symbol(sym) for sym in node.map.range.free_symbols} + relevant_maps.append((node.map, identity_map)) + + return relevant_maps + + + @property def kernel_name(self) -> list[str]: @@ -2319,7 +2369,6 @@ def kernel_map(self) -> nodes.Map: """Returns the kernel map node""" return self._kernel_map - @property def args_as_input(self) -> list[str]: """Returns the kernel function arguments From c575a326f8aee5797f0bfa01a114d401acc8bfc8 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Fri, 23 May 2025 21:21:59 +0200 Subject: [PATCH 09/94] report 1 --- berkay_workpace/reports/report_1.md | 60 +++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 berkay_workpace/reports/report_1.md diff --git a/berkay_workpace/reports/report_1.md b/berkay_workpace/reports/report_1.md new file mode 100644 index 0000000000..040cef38ac --- /dev/null +++ b/berkay_workpace/reports/report_1.md @@ -0,0 +1,60 @@ +# Master's Thesis Report + +**Thesis Title:** Code-generation for Modern GPUs in DaCe +**Student:** Berkay Aydogdu +**Supervisor:** Yakup Koray Budanaz +**Date:** 2025-05-23 +**Short description:** The objectives of this Master's thesis are to refactor the CUDA code generator in DaCe and to extend it with new features. The refactoring focuses on improving the structure, readability, and maintainability of the code. + +## Progress Overview + +By inspecting the source code of the CUDA code generator, we identified several poor coding +practices. These included, among others, intertwined functionality, non-descriptive variable +and function names, and numerous code fragments that appeared more like quick fixes or hacks +than thoughtfully designed solutions. + +To address these issues, we implemented a new CUDA code generator class `ExperimentalCUDACodeGen`, which can be enabled via configuration settings. We began by +running simple programs using the new generator, reusing parts of the existing code to get +minimal examples working. + +We deliberately chose not to build a completely new generator from scratch, as improving code +quality is only one part of the overall goal. Moreover, the existing implementation contains +well-designed components that are worth preserving—there is no need to reinvent the wheel. 
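As a rough illustration of the minimal examples mentioned above, the sketch below selects the experimental generator through DaCe's configuration (the `compiler.cuda.implementation` entry and the `set_temporary` helper both appear in the accompanying tests) and runs a trivial `GPU_Device` map. The program, array size, and names are illustrative only, not a prescribed workflow.

```python
import dace
import cupy as cp

@dace.program
def scale(A: dace.float64[256] @ dace.dtypes.StorageType.GPU_Global):
    for i in dace.map[0:256] @ dace.dtypes.ScheduleType.GPU_Device:
        A[i] = 2.0 * A[i]

# Temporarily route code generation through ExperimentalCUDACodeGen.
with dace.config.set_temporary('compiler', 'cuda', 'implementation', value='experimental'):
    sdfg = scale.to_sdfg()
    A = cp.ones(256, dtype=cp.float64)
    sdfg(A=A)
    assert cp.allclose(A, 2.0)
```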
+ +The following section highlights the notable aspects of the new implementation: + +- Only simple features are supported for now, in order to eliminate the complexity introduced + by rarely used features such as dynamic parallelism. +- The generation of scopes — specifically GPU maps— has been almost completely reworked. + In the existing CUDA code generator, this component has major issues, with several hundred + lines of dense code packed into just a few functions, even though it could be logically + split. For example, the generation of different map types (based on schedule types), the + kernel launch, and the kernel wrapper function are now implemented in separate functions. + We also improved naming throughout the code by replacing vague variable names with more + meaningful ones. +- The existing CUDA code generator opens and closes brackets in inconsistent + locations—sometimes even at another file. This is not only error-prone, but also makes + the code appear more complex than necessary. To address this, we implemented a Python + class (`KernelScopeManager`) that uses the `with` construct to clearly define when scopes + are entered and exited, making bracket management more structured and easier to control. +- In our view, the existing CUDA code generator class relies on too many attributes, some of + which are specific to individual kernels—such as inputs, block and grid dimensions. These + are currently derived ad hoc and stored directly on the generator, leading to clutter and + reduced clarity. To address this, we introduced a `KernelSpec` class that encapsulates all + kernel-specific information. This allows such attributes to be accessed cleanly from a + KernelSpec instance, reducing the number of attributes in the code generator and improving + structure and maintainability. +- We also implemented a first extension, namely the support of WarpLevel schedules, by + introducing a new GPU schedule type called `GPU_Warp`. With this, the we can specify which + warps are selected to perform a task. + + +## Next Steps + +The next steps include enabling asynchronous memory copies and continuing to refactor the +remaining parts of the code generator. This will require support for shared memory and +further discussions around key design decisions. + + + + From 0270047627134767d118e5ab875959f54e232539 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Sun, 25 May 2025 14:06:00 +0200 Subject: [PATCH 10/94] clean-up and finishing small TODO's --- dace/codegen/targets/experimental_cuda.py | 177 ++++++++++++---------- dace/config_schema.yml | 22 ++- dace/dtypes.py | 9 +- 3 files changed, 122 insertions(+), 86 deletions(-) diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index a96ea7aad4..6218217361 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -27,7 +27,7 @@ from dace.sdfg import utils as sdutil from dace.sdfg.graph import MultiConnectorEdge from dace.sdfg.state import ControlFlowRegion, StateSubgraphView -from dace.transformation import helpers as xfh +from dace.transformation import helpers from dace.transformation.passes import analysis as ap if TYPE_CHECKING: @@ -36,21 +36,12 @@ +# TODO's: +# 1. Yakup: Approval of dtypes extensions (e.g. mapping default sub scope scheduleTypes) +# 2. Berkay: Include constant expresssions +# 3. Berkay: Warning if sync property in maps is used +# 4. 
Berkay: Warning/Error that GPU_device must be used before other GPU schedule types -# TODO: GENERAL, discuss with Yakup -# 1. Approval of dtypes - - -# TODO: I am not handling map with strided rights now, -# why? because This is handled somewhere else than in the scope - - - -# My personal TODO's -# TODO: when tired -# include constant expressions + launch bounds logic -# insert warnings that gpu device must be first -# 4 dimensional example # TODO: depending on what happens next # change in_device_code to maybe in_kernel_code? @@ -58,18 +49,6 @@ -# TODO : I got rid of ScheduleType.GPU_Persistent (not supported anymore). If this codeBase -# actually replaces the old one, this should be defined in dtypes.py and also accessed from -# there. Also change GPU_SCHEDULES accesses to dtypes.GPU_SCHEDULES -GPU_SCHEDULES = [ - dace.ScheduleType.GPU_Device, - dace.ScheduleType.GPU_ThreadBlock, - dace.ScheduleType.GPU_Warp -] - - -THREADS_PER_WARP = 32 - @registry.autoregister_params(name='experimental_cuda') class ExperimentalCUDACodeGen(TargetCodeGenerator): """ Experimental CUDA code generator.""" @@ -96,7 +75,7 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._block_dims = None self._grid_dims = None - # NOTE: Type may be wrong! + self._kernel_map: Optional[nodes.MapEntry] = None # Indicates whether the code generation is currently within a "kernel" map. # NOTE: Moved from preprossessing to here @@ -148,10 +127,11 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._cpu_codegen = self._dispatcher.get_generic_node_dispatcher() self._dispatcher = frame_codegen.dispatcher - self._dispatcher.register_map_dispatcher(GPU_SCHEDULES, self) + self._dispatcher.register_map_dispatcher(dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN, self) self._dispatcher.register_node_dispatcher(self, self.node_dispatch_predicate) self._dispatcher.register_state_dispatcher(self, self.state_dispatch_predicate) + # TODO: Add this to dtypes as well gpu_storage = [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared, dtypes.StorageType.CPU_Pinned] self._dispatcher.register_array_dispatcher(gpu_storage, self) @@ -335,7 +315,7 @@ def _compute_pool_release(self, top_sdfg: SDFG): self.pool_release[(sdfg, arr)] = (sink, set()) - # NOTE: Used during preprocess. Seems good as is + # NOTE: SHould be a transformation to some part def _compute_cudastreams(self, sdfg: SDFG, default_stream=0, default_event=0): """ Annotates an SDFG (and all nested ones) to include a `_cuda_stream` field. This field is applied to all GPU maps, tasklets, and copies @@ -374,7 +354,7 @@ def increment(streams): if isinstance(node, nodes.NestedSDFG): if node.schedule == dtypes.ScheduleType.GPU_Device: continue - if node.schedule not in dtypes.GPU_SCHEDULES: + if node.schedule not in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: max_streams, max_events = self._compute_cudastreams(node.sdfg, max_streams, max_events + 1) node._cuda_stream = max_streams node._cs_childpath = False @@ -382,6 +362,7 @@ def increment(streams): # Maintain the same CUDA stream in DFS order, add more when # possible. 
+ # NOTE: Either all have an attribute or none, tell yakup if you see stuff like this for e in state.dfs_edges(source_nodes): if hasattr(e.dst, '_cuda_stream'): continue @@ -400,11 +381,11 @@ def increment(streams): e.src._cs_childpath = True # Do not create multiple streams within GPU scopes - if (isinstance(e.src, nodes.EntryNode) and e.src.schedule in dtypes.GPU_SCHEDULES): + if (isinstance(e.src, nodes.EntryNode) and e.src.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN): e.src._cs_childpath = False elif state.entry_node(e.src) is not None: parent = state.entry_node(e.src) - if parent.schedule in dtypes.GPU_SCHEDULES: + if parent.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: e.src._cs_childpath = False else: c = max_streams @@ -417,7 +398,7 @@ def increment(streams): if not hasattr(e.dst, '_cs_childpath'): e.dst._cs_childpath = False if isinstance(e.dst, nodes.NestedSDFG): - if e.dst.schedule not in dtypes.GPU_SCHEDULES: + if e.dst.schedule not in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: max_streams, max_events = self._compute_cudastreams(e.dst.sdfg, e.dst._cuda_stream, max_events + 1) @@ -429,7 +410,7 @@ def increment(streams): if isinstance(graph, SDFGState): cur_sdfg = graph.parent - if (isinstance(node, (nodes.EntryNode, nodes.ExitNode)) and node.schedule in dtypes.GPU_SCHEDULES): + if (isinstance(node, (nodes.EntryNode, nodes.ExitNode)) and node.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN): # Node must have GPU stream, remove childpath and continue if hasattr(node, '_cs_childpath'): delattr(node, '_cs_childpath') @@ -504,7 +485,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub ) - self._generate_gpu_bridge(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + self._generate_kernel_wrapper(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) #--------------- Generate Kernel Function ---------------- @@ -521,7 +502,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub if node.gpu_launch_bounds != '-1': if node.gpu_launch_bounds == "0": if not any(symbolic.issymbolic(b) for b in block_dims): - launch_bounds = f'__launch_bounds__({prod(block_dims)})' + launch_bounds = f'__launch_bounds__({product(block_dims)})' else: launch_bounds = f'__launch_bounds__({node.gpu_launch_bounds})' @@ -586,6 +567,7 @@ def _generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: with KernelScopeManager(cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, function_stream=function_stream, callsite_stream=kernel_stream, comment="Kernel scope",) as scopeManager: + # TODO: If time allows, maybe rename in configs somehow, discuss with Yakup # Get the thread/block index type ttype = Config.get('compiler', 'cuda', 'thread_id_type') tidtype = getattr(dtypes, ttype, False) @@ -711,19 +693,19 @@ def _generate_GPU_ThreadBlock_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, df if dim < 3: # First three dimensions: direct mapping or partial delinearization if dim == 2 and map_dimensions > 3: - tail_prod = prod(map_dim_sizes[3:]) + tail_prod = product(map_dim_sizes[3:]) base_expr = f"(threadIdx.z / ({symbolic_to_cpp(tail_prod)}))" else: base_expr = f"threadIdx.{_get_cuda_dim(dim)}" else: # Dimensions beyond the third: full delinearization - tail_prod = prod(map_dim_sizes[dim + 1:]) + tail_prod = product(map_dim_sizes[dim + 1:]) base_expr = (f"(threadIdx.z / ({symbolic_to_cpp(tail_prod)})) % "f"({symbolic_to_cpp(map_dim_sizes[dim])})") 
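+                # Worked illustration (assumed example sizes, not taken from any SDFG here):
+                # for a 5-D thread-block map with map_dim_sizes = [4, 4, 2, 2, 2], dim 3 is
+                # recovered as (threadIdx.z / 2) % 2 -- divide threadIdx.z by the product of
+                # the remaining faster-varying sizes (map_dim_sizes[4:]) and wrap around by
+                # this dimension's own size.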
var_def = symbolic_to_cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', base_expr) kernel_stream.write(f'int {var_name} = {var_def};', cfg, state_id, node) - self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, 'int') + self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, 'int') # TODO: get varname type @@ -777,8 +759,10 @@ def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope function_stream=function_stream, callsite_stream=kernel_stream, comment="WarpLevel Scope") as scopeManager: - - block_dims = self._current_kernel_spec.block_dims + # Get kernel specifications + kernel_spec = self._current_kernel_spec + block_dims = kernel_spec.block_dims + warpSize = kernel_spec.warpSize state_dfg = cfg.state(state_id) node = dfg_scope.source_nodes()[0] @@ -788,9 +772,9 @@ def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope warp_dim = len(map_range) # The following sizes and bounds are be symbolic - num_threads_in_block = prod(block_dims) + num_threads_in_block = product(block_dims) warp_dim_bounds = [max_elem + 1 for max_elem in map_range.max_element()] - num_warps = prod(warp_dim_bounds) + num_warps = product(warp_dim_bounds) @@ -820,11 +804,11 @@ def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope joined_terms = " + ".join(flattened_terms) flat_thread_idx_expr = f"({joined_terms})" if len(flattened_terms) > 1 else joined_terms - # NOTE: name too ugly? How shorter but still unique ? + threadID_name = 'ThreadId_%s_%d_%d_%d' % (scope_map.label, cfg.cfg_id, state_dfg.block_id, state_dfg.node_id(node)) - kernel_stream.write(f"int {threadID_name} = ({flat_thread_idx_expr}) / {THREADS_PER_WARP};", cfg, state_id, node) - self._dispatcher.defined_vars.add(threadID_name, DefinedType.Scalar, 'int') + kernel_stream.write(f"int {threadID_name} = ({flat_thread_idx_expr}) / {warpSize};", cfg, state_id, node) + self._dispatcher.defined_vars.add(threadID_name, DefinedType.Scalar, 'int') # TODO: Fix type? @@ -835,7 +819,7 @@ def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope previous_sizes = warp_dim_bounds[:i] if len(previous_sizes) > 0: - divisor = prod(previous_sizes) + divisor = product(previous_sizes) expr = f"({threadID_name} / {divisor}) % {warp_dim_bounds[i]}" else: expr = f"{threadID_name} % {warp_dim_bounds[i]}" @@ -848,7 +832,7 @@ def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope # ----------------- Guard Conditions for Warp Execution ----------------------- - if num_warps * THREADS_PER_WARP != num_threads_in_block: + if num_warps * warpSize != num_threads_in_block: condition = f'{threadID_name} < {num_warps}' scopeManager.open(condition) @@ -886,9 +870,11 @@ def _hanlde_GPU_Warp_scope_guards(self, state_dfg: SDFGState, node: nodes.MapEnt scopeManager: 'KernelScopeManager'): #TODO: Move them to sdfg validation as well if possible + + # Get warpSize from the kernel specification + warpSize = self._current_kernel_spec.warpSize - #TODO: rename xfh, to cryptic - parent_map, _ = xfh.get_parent_map(state_dfg, node) + parent_map, _ = helpers.get_parent_map(state_dfg, node) if parent_map.schedule != dtypes.ScheduleType.GPU_ThreadBlock: raise ValueError("GPU_Warp map must be nested within a GPU_ThreadBlock map.") @@ -902,50 +888,49 @@ def _hanlde_GPU_Warp_scope_guards(self, state_dfg: SDFGState, node: nodes.MapEnt # These will emit meaningful error messages and abort execution if violated. 
if isinstance(num_threads_in_block, symbolic.symbol): condition = ( - f"{num_threads_in_block} % {THREADS_PER_WARP} != 0 || " + f"{num_threads_in_block} % {warpSize} != 0 || " f"{num_threads_in_block} > 1024 || " - f"{num_warps} * {THREADS_PER_WARP} > {num_threads_in_block}" + f"{num_warps} * {warpSize} > {num_threads_in_block}" ) kernel_stream.write(f"""\ if ({condition}) {{ printf("CUDA error:\\n" - "1. Block must be a multiple of {THREADS_PER_WARP} threads (DaCe requirement for GPU_Warp scheduling).\\n" + "1. Block must be a multiple of {warpSize} threads (DaCe requirement for GPU_Warp scheduling).\\n" "2. Block size must not exceed 1024 threads (CUDA hardware limit).\\n" - "3. Number of warps x {THREADS_PER_WARP} must fit in the block (otherwise logic is unclear).\\n"); + "3. Number of warps x {warpSize} must fit in the block (otherwise logic is unclear).\\n"); asm("trap;"); }} """) else: if isinstance(num_warps, symbolic.symbol): - condition = f"{num_warps} * {THREADS_PER_WARP} > {num_threads_in_block}" + condition = f"{num_warps} * {warpSize} > {num_threads_in_block}" scopeManager.open(condition=condition) - elif num_warps * THREADS_PER_WARP > num_threads_in_block: - raise ValueError(f"Invalid configuration: {num_warps} warps x {THREADS_PER_WARP} threads exceed " + elif num_warps * warpSize > num_threads_in_block: + raise ValueError(f"Invalid configuration: {num_warps} warps x {warpSize} threads exceed " f"{num_threads_in_block} threads in the block.") - if num_threads_in_block % THREADS_PER_WARP != 0: - raise ValueError(f"Block must be a multiple of {THREADS_PER_WARP} threads for GPU_Warp scheduling " + if num_threads_in_block % warpSize != 0: + raise ValueError(f"Block must be a multiple of {warpSize} threads for GPU_Warp scheduling " f"(got {num_threads_in_block}).") if num_threads_in_block > 1024: raise ValueError("CUDA does not support more than 1024 threads per block (hardware limit).") - for x in map_range.min_element(): - if isinstance(x, symbolic.symbol): - kernel_stream.write(f'if ({x} < 0) {{\n' - f' printf("Runtime error: Warp ID symbol {x} must be non-negative.\\n");\n' + for min_element in map_range.min_element(): + if isinstance(min_element, symbolic.symbol): + kernel_stream.write(f'if ({min_element} < 0) {{\n' + f' printf("Runtime error: Warp ID symbol {min_element} must be non-negative.\\n");\n' f' asm("trap;");\n' f'}}\n') - elif x < 0: - raise ValueError(f"Warp ID value {x} must be non-negative.") + elif min_element < 0: + raise ValueError(f"Warp ID value {min_element} must be non-negative.") - - def _generate_gpu_bridge(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -1107,7 +1092,7 @@ def _begin_streams(self, sdfg, state): return result def state_dispatch_predicate(self, sdfg, state): - if self._toplevel_schedule in dtypes.GPU_SCHEDULES: + if self._toplevel_schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: return True for node in state.sink_nodes(): if hasattr(node, '_cuda_stream'): @@ -1123,7 +1108,7 @@ def state_dispatch_predicate(self, sdfg, state): def node_dispatch_predicate(self, sdfg, state, node): if hasattr(node, 'schedule'): # NOTE: Works on nodes and scopes - if node.schedule in dtypes.GPU_SCHEDULES: + if node.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: return True if ExperimentalCUDACodeGen._in_device_code: return 
True @@ -1139,7 +1124,7 @@ def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphVi # if it is not implemented, use generate node of cpu impl if gen is not False: gen(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) - elif type(node).__name__ == 'MapExit' and node.schedule in GPU_SCHEDULES: + elif type(node).__name__ == 'MapExit' and node.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: # Special case: It is a MapExit but from a GPU_schedule- the MapExit is already # handled by a KernelScopeManager instance. Otherwise cpu_codegen will close it return @@ -1879,7 +1864,7 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St state = state_dfg while (schedule_node is None or not isinstance(schedule_node, nodes.MapEntry) or schedule_node.map.schedule == dtypes.ScheduleType.Sequential): - ret = xfh.get_parent_map(state, schedule_node) + ret = helpers.get_parent_map(state, schedule_node) if ret is None: schedule_node = None break @@ -2003,8 +1988,15 @@ def _get_storagename(storage): return sname[sname.rindex('_') + 1:] -# TODO: Just use product as name? -def prod(iterable): + +def product(iterable): + """ + Computes the symbolic product of elements in the iterable using sympy.Mul. + + This is equivalent to: ```functools.reduce(sympy.Mul, iterable, 1)```. + + Purpose: This function is used to improve readability of the codeGen. + """ return functools.reduce(sympy.Mul, iterable, 1) ######################################################################### @@ -2085,7 +2077,19 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro self._bridge_args_typed: list[str] = state_param + self._args_typed # Kernel dimensions - self._grid_dims, self._block_dims, self._has_tbmap, self._has_dtbmap, _ = self._get_kernel_dimensions(dfg_scope) + self._grid_dims, self._block_dims, self._has_tbmap = self._get_kernel_dimensions(dfg_scope) + + + # Set warp size of the kernel + if cudaCodeGen.backend not in ['cuda', 'hip']: + raise ValueError( + f"Unsupported backend '{cudaCodeGen.backend}' in ExperimentalCUDACodeGen. " + "Only 'cuda' and 'hip' are supported." + ) + + warp_size_key = 'cuda_warp_size' if cudaCodeGen.backend == 'cuda' else 'hip_warp_size' + self._warpSize = Config.get('compiler', 'cuda', warp_size_key) + @@ -2104,8 +2108,9 @@ def _get_kernel_dimensions(self, dfg_scope: ScopeSubgraphView): 3. If block size can be overapproximated, it is (for dynamically-sized blocks that are bounded by a predefined size). - 4. If nested device maps exist, behavior is unknown but an error is thrown - in the generate_scope function. This is not supported here + 4. If nested device maps exist, behavior is an error is thrown + in the generate_scope function. Nested device maps are not supported + anymore. :note: Kernel dimensions are separate from the map variables, and they should be treated as such. 
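
To make the docstring rules above concrete, a rough sketch of how grid and block dimensions
follow from the maps (illustrative only; the program name, sizes and the exact rounding are
assumptions, not part of this change):

```python
import dace

N = dace.symbol('N')

@dace.program
def add_one(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global):
    for i in dace.map[0:N:128] @ dace.dtypes.ScheduleType.GPU_Device:          # -> grid
        for j in dace.map[0:128] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:   # -> block
            A[i + j] = A[i + j] + 1

# Roughly: block_dims = (128, 1, 1) comes from the thread-block map, while
# grid_dims = (ceil(N / 128), 1, 1) follows from the device map's range.
```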
@@ -2134,7 +2139,7 @@ def _get_kernel_dimensions(self, dfg_scope: ScopeSubgraphView): block_size, grid_size = self._compute_block_and_grid_from_maps(threadblock_maps) - return grid_size, block_size, len(threadblock_maps) > 0, False, 0 + return grid_size, block_size, len(threadblock_maps) > 0 def _compute_default_block_and_grid(self): @@ -2179,7 +2184,7 @@ def _compute_default_block_and_grid(self): active_grid_dims = max(1, sum(1 for g in kernel_domain_size if g != 1)) if active_block_dims > active_grid_dims: - tail_product = prod(default_block_size[active_grid_dims:]) + tail_product = product(default_block_size[active_grid_dims:]) block_size = default_block_size[:active_grid_dims] + [1] * (3 - active_grid_dims) block_size[active_grid_dims - 1] *= tail_product warnings.warn(f'Default block size has more dimensions ({active_block_dims}) than kernel dimensions ' @@ -2275,7 +2280,7 @@ def _validate_block_size_limits(self, block_size): kernel_map_label = self._kernel_map.label - total_block_size = prod(block_size) + total_block_size = product(block_size) limit = Config.get('compiler', 'cuda', 'block_size_limit') lastdim_limit = Config.get('compiler', 'cuda', 'block_size_lastdim_limit') @@ -2310,7 +2315,7 @@ def _to_3d_dims(self, dim_sizes: List) -> List: if len(dim_sizes) > 3: # multiply everything from the 3rd onward into d[2] - dim_sizes[2] = prod(dim_sizes[2:]) + dim_sizes[2] = product(dim_sizes[2:]) dim_sizes = dim_sizes[:3] # pad with 1s if necessary @@ -2407,9 +2412,13 @@ def has_tbmap(self) -> bool: return self._has_tbmap @property - def has_dtbmap(self) -> bool: - """Returns whether the kernel has a dynamic thread-block map.""" - return self._has_dtbmap + def warpSize(self) -> int: + """ + Returns the warp size used in this kernel. + This value depends on the selected backend (CUDA or HIP) + and is retrieved from the configuration. + """ + return self._warpSize diff --git a/dace/config_schema.yml b/dace/config_schema.yml index 7ca237508e..86ec8555a7 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -447,7 +447,7 @@ required: index types are needed to address memory offsets that are beyond the 32-bit range, or to reduce memory usage. - + # New configs, needed for ExperimentalCUDACodeGen implementation: type: str title: CUDA codegen implementation @@ -457,6 +457,26 @@ required: Yakup Koray Budanaz for Berkays master-thesis. enum: [legacy, experimental] default: experimental + + cuda_warp_size: + type: int + title: CUDA warp size + description: > + Defines the warp size used during CUDA code generation. The default and current + standard value for CUDA is 32. This should only be changed if future CUDA + architectures explicitly alter the warp size. Modifying this value arbitrarily may + result in incorrect or unknown behavior, and is therefore strongly discouraged. + default: 32 + + hip_warp_size: + type: int + title: HIP warp size + description: > + Specifies the warp size (also known as wavefront size) for HIP code generation. + The default value for AMD GPUs is typically 64. This setting should only be modified + if you have a clear understanding of what you are doing. 
+ default: 64 + ############################################# # General FPGA flags fpga: diff --git a/dace/dtypes.py b/dace/dtypes.py index 6fa154e794..d7b382b4ff 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -89,6 +89,13 @@ class ScheduleType(aenum.AutoNumberEnum): ScheduleType.GPU_Persistent, ] +# A subset of GPU schedule types for ExperimentalCUDACodeGen +GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN = [ + ScheduleType.GPU_Device, + ScheduleType.GPU_ThreadBlock, + ScheduleType.GPU_Warp, +] + # A subset of CPU schedule types CPU_SCHEDULES = [ ScheduleType.CPU_Multicore, @@ -209,7 +216,7 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.Snitch: StorageType.Snitch_TCDM, #TODO: Approve. - # Usually used in the context with shared memory.. + # Should be registers in my opinion ScheduleType.GPU_Warp: StorageType.Register, } From 9be949f1f21624b6f789a3d92f308028d25e2813 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 26 May 2025 18:17:05 +0200 Subject: [PATCH 11/94] Some TODO's and improving _generate_kernel_scope function --- dace/codegen/targets/experimental_cuda.py | 261 ++++++++++++---------- 1 file changed, 141 insertions(+), 120 deletions(-) diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 6218217361..edadc144b2 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -43,8 +43,6 @@ # 4. Berkay: Warning/Error that GPU_device must be used before other GPU schedule types -# TODO: depending on what happens next -# change in_device_code to maybe in_kernel_code? @@ -55,7 +53,6 @@ class ExperimentalCUDACodeGen(TargetCodeGenerator): target_name = 'experimental_cuda' title = 'CUDA' - _in_device_code = False ######################## Initilization and Preprocessing related start ######################################################### @@ -64,19 +61,11 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._frame: DaCeCodeGenerator = frame_codegen # creates the frame code, orchestrates the code generation for targets self._dispatcher: TargetDispatcher= frame_codegen.dispatcher # responsible for dispatching code generation to the appropriate target - # dispatcher = self._dispatcher - self.create_grid_barrier: bool = False # Used for grid level synchronization - - self.dynamic_tbmap_type = None - self.extra_nsdfg_args = [] - ExperimentalCUDACodeGen._in_device_code = False # TODO: Isn't this double? + ExperimentalCUDACodeGen._in_kernel_code = False # TODO: Isn't this double? self._cpu_codegen: Optional['CPUCodeGen'] = None - self._block_dims = None - self._grid_dims = None - self._kernel_map: Optional[nodes.MapEntry] = None # Indicates whether the code generation is currently within a "kernel" map. # NOTE: Moved from preprossessing to here self.backend: str = common.get_gpu_backend() @@ -89,8 +78,8 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): 'CUDA', target_type=target_type) - self._kernel_state = None - self._kernel_grid_conditions: List[str] = [] + + self._scope_has_collaborative_copy = False self._localcode = CodeIOStream() @@ -477,7 +466,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: # Are we generating host (launch) code or device (kernel) code? - if not ExperimentalCUDACodeGen._in_device_code: + if not ExperimentalCUDACodeGen._in_kernel_code: # Prepare and cache kernel metadata (name, grid dims, arguments, etc.) 
self._current_kernel_spec = KernelSpec( @@ -489,7 +478,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub #--------------- Generate Kernel Function ---------------- - ExperimentalCUDACodeGen._in_device_code = True + ExperimentalCUDACodeGen._in_kernel_code = True kernel_stream = CodeIOStream() kernel_name = self._current_kernel_spec.kernel_name @@ -519,7 +508,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub ) self._localcode.write(kernel_stream.getvalue() + '\n') - ExperimentalCUDACodeGen._in_device_code = False + ExperimentalCUDACodeGen._in_kernel_code = False # -------------------------------------------------------------- # Generate the actual launch call (host-side) @@ -552,101 +541,100 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub def _generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, kernel_stream: CodeIOStream) -> None: - - # Get the Map Node (sould be a Map Node?) - node = dfg_scope.source_nodes()[0] - - # Get kernel specifications - kernel_spec = self._current_kernel_spec - kernel_map = kernel_spec.kernel_map - has_tbmap = kernel_spec.has_tbmap - block_dims = kernel_spec.block_dims - with KernelScopeManager(cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, function_stream=function_stream, callsite_stream=kernel_stream, comment="Kernel scope",) as scopeManager: - - # TODO: If time allows, maybe rename in configs somehow, discuss with Yakup - # Get the thread/block index type - ttype = Config.get('compiler', 'cuda', 'thread_id_type') - tidtype = getattr(dtypes, ttype, False) - if not isinstance(tidtype, dtypes.typeclass): - raise ValueError(f'Configured type "{ttype}" for ``thread_id_type`` does not match any DaCe data type. 
' - 'See ``dace.dtypes`` for available types (for example ``int32``).') - # Generate all index arguments for kernel grid - krange = subsets.Range(kernel_map.range[::-1]) - kdims = krange.size() - dsym = [symbolic.symbol(f'__DAPB{i}', nonnegative=True, integer=True) for i in range(len(krange))] - bidx = krange.coord_at(dsym) + # ----------------- Retrieve kernel configuration ----------------------- + kernel_spec = self._current_kernel_spec + kernel_entry_node = kernel_spec._kernel_entry_node # = dfg_scope.source_nodes()[0] + kernel_map = kernel_spec.kernel_map + has_tbmap = kernel_spec.has_tbmap + kernel_block_dims = self._current_kernel_spec.block_dims + + # ----------------- Kernel/Map Range Preprocessing ----------------------- - # First three dimensions are evaluated directly - for i in range(min(len(krange), 3)): - varname = kernel_map.params[-i - 1] + reversed_kernel_range = kernel_map.range[::-1] # also reverse it + kernel_range = subsets.Range(reversed_kernel_range) + kernel_dimensions = len(kernel_range) + kernel_dim_sizes = kernel_range.size() - # If we defaulted to a fixed number of threads per block, offset by thread ID - block_expr = f'blockIdx.{_get_cuda_dim(min(i, 2))}' - if not has_tbmap: - block_expr = f'({block_expr} * {symbolic_to_cpp(block_dims[i])} + threadIdx.{_get_cuda_dim(i)})' - # Delinearize third dimension if necessary - if i == 2 and len(krange) > 3: - block_expr = f'({block_expr} / ({symbolic_to_cpp(functools.reduce(sympy.Mul, kdims[3:], 1))}))' + # ----------------- Set up symbolic index expressions ----------------------- - expr = symbolic_to_cpp(bidx[i]).replace(f'__DAPB{i}', block_expr) + symbolic_indices = [ symbolic.symbol(f'__SYM_IDX{dim}', nonnegative=True, integer=True) for dim in range(kernel_dimensions)] + symbolic_index_bounds = [ idx + block_dim - 1 for idx, block_dim in zip(symbolic_indices, kernel_block_dims)] + symbolic_coordinates = kernel_range.coord_at(symbolic_indices) - kernel_stream.write(f'{tidtype.ctype} {varname} = {expr};', cfg, state_id, node) - self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, tidtype.ctype) + # ----------------- Generate Thread or Block index Definitions ----------------------- - # Delinearize beyond the third dimension - if len(krange) > 3: - for i in range(3, len(krange)): - varname = kernel_map.params[-i - 1] - block_expr = 'blockIdx.z' - if not has_tbmap: - block_expr = f'({block_expr} * {symbolic_to_cpp(block_dims[2])} + threadIdx.z)' + thread_id_ctype = kernel_spec.gpu_index_ctype # Data type of CUDA thread/block indices - block_expr = '((%s / (%s)) %% (%s))' % ( - block_expr, - symbolic_to_cpp(functools.reduce(sympy.Mul, kdims[i + 1:], 1)), - symbolic_to_cpp(kdims[i]), - ) - expr = symbolic_to_cpp(bidx[i]).replace(f'__DAPB{i}', block_expr) - kernel_stream.write(f'{tidtype.ctype} {varname} = {expr};', cfg, state_id, node) - self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, tidtype.ctype) + # In case there is no ThreadBlock map used in a submap, the map variables will + # be mapped to thread IDs instead of block IDs + for dim in range(kernel_dimensions): + var_name = kernel_map.params[-dim - 1] # also reverse it here! 
+ + # Compute index expressions for up to 3 dimensions (x, y, z) + if dim < 3: + if has_tbmap: + index_expr = f'blockIdx.{_get_cuda_dim(dim)}' + else: + index_expr = f'(blockIdx.{_get_cuda_dim(dim)} * {symbolic_to_cpp(kernel_block_dims[dim])} + threadIdx.{_get_cuda_dim(dim)})' + + # Delinearize third dimension if more than 3D (used in 3D+ mapping) + if dim == 2 and kernel_dimensions > 3: + tail_prod = product(kernel_dim_sizes[3:]) + index_expr = f"({index_expr} / ({symbolic_to_cpp(tail_prod)}))" + + else: # Handle dimensions beyond the third (delinearize and modulo) + if has_tbmap: + index_expr = f'blockIdx.z' + else: + index_expr = f'(blockIdx.z * {symbolic_to_cpp(kernel_block_dims[2])} + threadIdx.z)' + + tail_prod = product(kernel_dim_sizes[dim + 1:]) + index_expr = (f"({index_expr} / ({symbolic_to_cpp(tail_prod)})) % ({symbolic_to_cpp(kernel_dim_sizes[dim])})") + + + # Define thread/Block index + var_def = symbolic_to_cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', index_expr) + kernel_stream.write(f'{thread_id_ctype} {var_name} = {var_def};', cfg, state_id, kernel_entry_node) + self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, thread_id_ctype) + + + # ----------------- Guard Conditions for Block Execution ----------------------- - - # handle conditions if not has_tbmap: - dsym_end = [d + bs - 1 for d, bs in zip(dsym, block_dims)] - minels = krange.min_element() - maxels = krange.max_element() - for i, (v, minel, maxel) in enumerate(zip(kernel_map.params[::-1], minels, maxels)): + minels = kernel_range.min_element() + maxels = kernel_range.max_element() + + for dim, (var_name, start, end) in enumerate(zip(kernel_map.params[::-1], minels, maxels)): condition = '' # Optimize conditions if they are always true - if i >= 3 or (dsym[i] >= minel) != True: - condition += f'{v} >= {symbolic_to_cpp(minel)}' + if dim >= 3 or (symbolic_indices[dim] >= start) != True: + condition += f'{var_name} >= {symbolic_to_cpp(start)}' - if (i >= 3 or ((dsym_end[i] < maxel) != False and ((dsym_end[i] % block_dims[i]) != 0) == True) - or (block_dims[i] > maxel) == True): + if (dim >= 3 or ((symbolic_index_bounds[dim] < end) != False + and ((symbolic_index_bounds[dim] % kernel_block_dims[dim]) != 0) == True) or (kernel_block_dims[dim] > end) == True): if len(condition) > 0: condition += ' && ' - condition += f'{v} < {symbolic_to_cpp(maxel + 1)}' + condition += f'{var_name} < {symbolic_to_cpp(end + 1)}' if len(condition) > 0: - scopeManager.open(condition= condition) - + scopeManager.open(condition=condition) + # ----------------- Dispatch Subgraph code generation ----------------------- self._dispatcher.dispatch_subgraph(sdfg, cfg, dfg_scope, state_id, function_stream, kernel_stream, skip_entry_node=True) @@ -686,8 +674,10 @@ def _generate_GPU_ThreadBlock_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, df # ----------------- Generate Index Variable Definitions ----------------------- - for dim in range(map_dimensions): + # Get the block's index dace data type + block_id_ctype = self._current_kernel_spec.gpu_index_ctype + for dim in range(map_dimensions): var_name = scope_map.params[-dim - 1] # also reverse it here! 
if dim < 3: @@ -704,10 +694,9 @@ def _generate_GPU_ThreadBlock_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, df var_def = symbolic_to_cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', base_expr) - kernel_stream.write(f'int {var_name} = {var_def};', cfg, state_id, node) - self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, 'int') # TODO: get varname type + kernel_stream.write(f'{block_id_ctype} {var_name} = {var_def};', cfg, state_id, node) + self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, block_id_ctype) - # ----------------- Guard Conditions for Block Execution ----------------------- @@ -777,12 +766,14 @@ def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope num_warps = product(warp_dim_bounds) + # The C type used to define the (flat) threadId and warpId variables + ids_ctype = kernel_spec.gpu_index_ctype # ----------------- Guard checks ----------------------- # handles checks either at compile time or runtime (i.e. checks in the generated code) - self._hanlde_GPU_Warp_scope_guards(state_dfg, node, map_range, warp_dim, num_threads_in_block, num_warps, + self._handle_GPU_Warp_scope_guards(state_dfg, node, map_range, warp_dim, num_threads_in_block, num_warps, kernel_stream, scopeManager) @@ -807,8 +798,8 @@ def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope threadID_name = 'ThreadId_%s_%d_%d_%d' % (scope_map.label, cfg.cfg_id, state_dfg.block_id, state_dfg.node_id(node)) - kernel_stream.write(f"int {threadID_name} = ({flat_thread_idx_expr}) / {warpSize};", cfg, state_id, node) - self._dispatcher.defined_vars.add(threadID_name, DefinedType.Scalar, 'int') # TODO: Fix type? + kernel_stream.write(f"{ids_ctype} {threadID_name} = ({flat_thread_idx_expr}) / {warpSize};", cfg, state_id, node) + self._dispatcher.defined_vars.add(threadID_name, DefinedType.Scalar, ids_ctype) @@ -824,8 +815,8 @@ def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope else: expr = f"{threadID_name} % {warp_dim_bounds[i]}" - kernel_stream.write(f"int {var_name} = {expr};", cfg, state_id, node) - self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, 'int') + kernel_stream.write(f"{ids_ctype} {var_name} = {expr};", cfg, state_id, node) + self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, ids_ctype) @@ -865,7 +856,7 @@ def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope - def _hanlde_GPU_Warp_scope_guards(self, state_dfg: SDFGState, node: nodes.MapEntry, map_range: subsets.Range, + def _handle_GPU_Warp_scope_guards(self, state_dfg: SDFGState, node: nodes.MapEntry, map_range: subsets.Range, warp_dim: int, num_threads_in_block, num_warps, kernel_stream: CodeIOStream, scopeManager: 'KernelScopeManager'): @@ -938,16 +929,16 @@ def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope kernel_spec: KernelSpec = self._current_kernel_spec kernel_name = kernel_spec.kernel_name - kernel_bridge_args = kernel_spec.bridge_args - kernel_bridge_args_typed = kernel_spec.bridge_args_typed + kernel_wrapper_args = kernel_spec.kernel_wrapper_args + kernel_wrapper_args_typed = kernel_spec.kernel_wrapper_args_typed # Declaration of the function which launches the kernel (C++ code) function_stream.write('DACE_EXPORTED void __dace_runkernel_%s(%s);\n' % - (kernel_name, ', '.join(kernel_bridge_args_typed)), cfg, state_id, scope_entry) + (kernel_name, ', '.join(kernel_wrapper_args_typed)), cfg, state_id, scope_entry) # Calling he function which launches 
the kernel (C++ code) callsite_stream.write( '__dace_runkernel_%s(%s);\n' % - (kernel_name, ', '.join(kernel_bridge_args)), cfg, state_id, scope_entry) + (kernel_name, ', '.join(kernel_wrapper_args)), cfg, state_id, scope_entry) @@ -964,7 +955,7 @@ def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: kernel_spec: KernelSpec = self._current_kernel_spec kernel_name = kernel_spec.kernel_name kernel_args_as_input = kernel_spec.args_as_input - kernel_launch_args_typed = kernel_spec.bridge_args_typed + kernel_launch_args_typed = kernel_spec.kernel_wrapper_args_typed # get kernel dimensions and transform into a c++ string grid_dims = kernel_spec.grid_dims @@ -1052,7 +1043,7 @@ def generate_state(self, callsite_stream: CodeIOStream, generate_state_footer: bool = False) -> None: - if ExperimentalCUDACodeGen._in_device_code: + if ExperimentalCUDACodeGen._in_kernel_code: self.generate_devicelevel_state(sdfg, cfg, state, function_stream, callsite_stream) else: self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) @@ -1110,7 +1101,7 @@ def node_dispatch_predicate(self, sdfg, state, node): if hasattr(node, 'schedule'): # NOTE: Works on nodes and scopes if node.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: return True - if ExperimentalCUDACodeGen._in_device_code: + if ExperimentalCUDACodeGen._in_kernel_code: return True return False @@ -1136,13 +1127,8 @@ def generate_nsdfg_header(self, sdfg, cfg, state, state_id, node, memlet_referen sdfg, cfg, state, state_id, node, memlet_references, sdfg_label, state_struct=False) def generate_nsdfg_call(self, sdfg, cfg, state, node, memlet_references, sdfg_label): - return self._cpu_codegen.generate_nsdfg_call(sdfg, - cfg, - state, - node, - memlet_references, - sdfg_label, - state_struct=False) + return self._cpu_codegen.generate_nsdfg_call(sdfg, cfg, state, node, memlet_references, + sdfg_label, state_struct=False) def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node): result = self._cpu_codegen.generate_nsdfg_arguments(sdfg, cfg, dfg, state, node) @@ -1168,7 +1154,6 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub # Rather Minor "actual" changes, but much nicer to extend and maintain - # For Yakup: I like it when we first "guard" and then implement the logic sorrow free def declare_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, declaration_stream: CodeIOStream) -> None: @@ -1357,7 +1342,7 @@ def _prepare_GPU_Shared_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta # ------------------- Initialization ------------------- if node.setzero: allocation_stream.write( - f'dace::ResetShared<{nodedesc.dtype.ctype}, {", ".join(symbolic_to_cpp(self._block_dims))}, {symbolic_to_cpp(arrsize)}, ' + f'dace::ResetShared<{nodedesc.dtype.ctype}, {", ".join(symbolic_to_cpp(self._current_kernel_spec.block_dims))}, {symbolic_to_cpp(arrsize)}, ' f'1, false>::Reset({dataname});\n', cfg, state_id, node ) @@ -1389,7 +1374,6 @@ def _prepare_Register_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: State self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) - # I could also do deallocate based on type.. 
good for modularity, but may be an overkill here def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -1430,7 +1414,7 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap elif nodedesc.storage == dtypes.StorageType.CPU_Pinned: callsite_stream.write( - 'DACE_GPU_CHECK(%sFreeHost(%s));\n' % (self.backend, dataname), cfg, state_id, node) + f'DACE_GPU_CHECK({self.backend}FreeHost({dataname}));\n', cfg, state_id, node) elif nodedesc.storage in {dtypes.StorageType.GPU_Shared, dtypes.StorageType.Register}: # No deallocation needed @@ -1670,7 +1654,7 @@ def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.St raise LookupError('Memlet does not point to any of the nodes') if (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode) - and not ExperimentalCUDACodeGen._in_device_code + and not ExperimentalCUDACodeGen._in_kernel_code and (src_storage in [dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned] or dst_storage in [dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned]) and not (src_storage in cpu_storage_types and dst_storage in cpu_storage_types)): @@ -1988,7 +1972,6 @@ def _get_storagename(storage): return sname[sname.rindex('_') + 1:] - def product(iterable): """ Computes the symbolic product of elements in the iterable using sympy.Mul. @@ -2028,7 +2011,7 @@ def ptr(name: str, desc: dace.data.Data, sdfg: SDFG = None, framecode=None) -> s if desc.storage == dtypes.StorageType.CPU_ThreadLocal: # Use unambiguous name for thread-local arrays return f'__{sdfg.cfg_id}_{name}' - elif not ExperimentalCUDACodeGen._in_device_code: # GPU kernels cannot access state + elif not ExperimentalCUDACodeGen._in_kernel_code: # GPU kernels cannot access state return f'__state->__{sdfg.cfg_id}_{name}' elif (sdfg, name) in framecode.where_allocated and framecode.where_allocated[(sdfg, name)] is not sdfg: return f'__{sdfg.cfg_id}_{name}' @@ -2070,15 +2053,17 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro self._args_typed: list[str] = [adata.as_arg(name=aname) for aname, adata in self._args.items()] self._args_as_input: list[str] = [ptr(aname, adata, sdfg, cudaCodeGen._frame) for aname, adata in self._args.items()] - # Used for the bridging function, be careful: a change in the name __state will probably lead to compilation errors + # Used for the kernel wrapper function, be careful: a change in the name __state will probably lead to compilation errors state_param: list[str] = [f'{mangle_dace_state_struct_name(cudaCodeGen._global_sdfg)} *__state'] - self._bridge_args: list[str] = ['__state'] + self._args_as_input - self._bridge_args_typed: list[str] = state_param + self._args_typed + self._kernel_wrapper_args: list[str] = ['__state'] + self._args_as_input + self._kernel_wrapper_args_typed: list[str] = state_param + self._args_typed # Kernel dimensions self._grid_dims, self._block_dims, self._has_tbmap = self._get_kernel_dimensions(dfg_scope) + # C type (as string) of thread, block and warp indices + self._gpu_index_ctype: str = self.get_gpu_index_ctype() # Set warp size of the kernel if cudaCodeGen.backend not in ['cuda', 'hip']: @@ -2091,7 +2076,29 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro self._warpSize = Config.get('compiler', 'cuda', warp_size_key) - + def 
get_gpu_index_ctype(self, config_key='gpu_index_type') -> str: + """ + Retrieves the GPU index data type as a C type string (for thread, block, warp indices) + from the configuration and if it matches a DaCe data type. + + Raises: + ValueError: If the configured type does not match a DaCe data type. + + Returns: + str: + The C type string corresponding to the configured GPU index type. + Used for defining thread, block, and warp indices in the generated code. + """ + type_name = Config.get('compiler', 'cuda', config_key) + dtype = getattr(dtypes, type_name, None) + if not isinstance(dtype, dtypes.typeclass): + raise ValueError( + f'Invalid {config_key} "{type_name}" configured (used for thread, block, and warp indices): ' + 'no matching DaCe data type found.\n' + 'Please use a valid type from dace.dtypes (e.g., "int32", "uint64").' + ) + return dtype.ctype + def _get_kernel_dimensions(self, dfg_scope: ScopeSubgraphView): """ @@ -2368,6 +2375,11 @@ def _get_maps_affecting_launch_dims(self, graph: ScopeSubgraphView) -> List[Tupl def kernel_name(self) -> list[str]: """Returns the kernel name.""" return self._kernel_name + + @property + def kernel_entry_node(self) -> nodes.MapEntry: + """Returns the kernels entry node""" + return self._kernel_entry_node @property def kernel_map(self) -> nodes.Map: @@ -2389,12 +2401,12 @@ def args_typed(self) -> list[str]: return self._args_typed @property - def bridge_args(self) -> list[str]: - return self._bridge_args + def kernel_wrapper_args(self) -> list[str]: + return self._kernel_wrapper_args @property - def bridge_args_typed(self) -> list[str]: - return self._bridge_args_typed + def kernel_wrapper_args_typed(self) -> list[str]: + return self._kernel_wrapper_args_typed @property def grid_dims(self) -> list: @@ -2420,6 +2432,15 @@ def warpSize(self) -> int: """ return self._warpSize + @property + def gpu_index_ctype(self) -> str: + """ + Returns the C data type used for GPU indices (thread, block, warp) + in generated code. This type is determined by the 'gpu_index_type' + setting in the configuration and matches with a DaCe typeclass. + """ + return self._gpu_index_ctype + class KernelScopeManager: From c1ec70a1ce74d77c91e3b123d0d2bb49c222a82e Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 26 May 2025 18:42:53 +0200 Subject: [PATCH 12/94] new configs --- dace/config_schema.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/dace/config_schema.yml b/dace/config_schema.yml index 86ec8555a7..0a8afa3b3c 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -458,6 +458,18 @@ required: enum: [legacy, experimental] default: experimental + gpu_index_type: + type: str + title: Thread/block/warp index data type + default: int32 + description: > + Defines the data type for a thread, block and warp index in the generated code. + The type is based on the type-classes in ``dace.dtypes``. For example, + ``uint64`` is equivalent to ``dace.uint64``. Change this setting when large + index types are needed to address memory offsets that are beyond the 32-bit + range, or to reduce memory usage. This replaces ``thread_id_type`` in + ``ExperimentalCUDACodeGen`` , as the new name more accurately reflects its broader + usage. 
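+    # Example override in a user's .dace.conf (illustrative; any integer type name
+    # from dace.dtypes, e.g. int32 or uint64, is accepted):
+    #   compiler:
+    #     cuda:
+    #       gpu_index_type: int64
+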
cuda_warp_size: type: int title: CUDA warp size From 8f2a5daa74334b68d65f0a489cd2370775c9c2e6 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Sun, 1 Jun 2025 18:07:20 +0200 Subject: [PATCH 13/94] Refactoring: Getting rid of complicated cuda stream handling and replacing it by simple function call through a cudastream manager object. Implementing copy_memory using a strategy approach. --- berkay_workpace/reports/notes.txt | 36 + berkay_workpace/scratch/A_output.txt | 4 - berkay_workpace/scratch/notes.md | 17 + berkay_workpace/scratch/playfield.py | 55 +- berkay_workpace/scratch/testbed.ipynb | 561 +++++++++------- berkay_workpace/scratch/vis.ipynb | 492 -------------- .../scratch/visualizingWarps.ipynb | 180 ----- .../tests/gpu_map_tests/threadBlock_test.py | 1 - .../out_of_kernel_memcpy_test.py | 81 +++ .../cuda_highdim_kernel_test.py | 16 +- .../reusable_tests/gpu_launch_bounds_test.py | 70 ++ dace/codegen/targets/__init__.py | 2 +- dace/codegen/targets/cuda.py | 1 - .../new_cuda_codegen/copy_strategies.py | 376 +++++++++++ .../experimental_cuda.py | 625 +++--------------- dace/dtypes.py | 9 - 16 files changed, 996 insertions(+), 1530 deletions(-) create mode 100644 berkay_workpace/reports/notes.txt delete mode 100644 berkay_workpace/scratch/A_output.txt create mode 100644 berkay_workpace/scratch/notes.md delete mode 100644 berkay_workpace/scratch/vis.ipynb delete mode 100644 berkay_workpace/scratch/visualizingWarps.ipynb create mode 100644 berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py create mode 100644 berkay_workpace/tests/reusable_tests/gpu_launch_bounds_test.py create mode 100644 dace/codegen/targets/new_cuda_codegen/copy_strategies.py rename dace/codegen/targets/{ => new_cuda_codegen}/experimental_cuda.py (74%) diff --git a/berkay_workpace/reports/notes.txt b/berkay_workpace/reports/notes.txt new file mode 100644 index 0000000000..767275669d --- /dev/null +++ b/berkay_workpace/reports/notes.txt @@ -0,0 +1,36 @@ +What was bad: + + +- Also dead code at "copy_memory", the first "dst_schedule" in the src_node if-else case it dead code. + In Fact, "dst_schedule" is not even needed at all. So we have "double dead code", once a computation + which gets overwritten, and once we compute smth we don't need. + +- Damn, even the copy_memory input named "memlet" is wrong.. this should be an edge type, not a memlet type. + +- Also, in "_emit_copy" there is a code snippet that "determines directionality", but part which may be useful + is raising an error in the else case. Again, dead code- setting variables that are never used. + + +- Again dead code: Computes "copy_shape" twice, first definition seems wrong and is not even used. + +- Stream handling in CudaCodeGen check is just random- streams are not handled by the codegen. + +- again, define local variables but then not use it. In this case: dtype + + +- yeah I realize that in the codegen the same function is implemented twice (one locally, + once in a utility file) + + +Tell Yakup: + +- I removed any logic that should handle cudaStream synchronization since I am not responsible for it. + In order to help to extend it in the future, I have two options. + 1. I can add function calls (which are empty i.e. do nothing) that signal what should be implemented once + there is a good solution of handling cuda streams in DaCe's new version + 2. Document it and say that several streams are not supported (people could come up with completely new + approaches to handle streams maybe) + 3. We got smth wrong. 
CopyToMap handles only GPU<->GPU code cases. + 4. I tried to handle "special case" as I understood... maybe worth to look at it closer with you + + diff --git a/berkay_workpace/scratch/A_output.txt b/berkay_workpace/scratch/A_output.txt deleted file mode 100644 index 38c368d3df..0000000000 --- a/berkay_workpace/scratch/A_output.txt +++ /dev/null @@ -1,4 +0,0 @@ -0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 -100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 100 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 102 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 103 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 104 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 105 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 107 -200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 200 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 201 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 202 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 203 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 204 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 205 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 206 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 207 -300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 300 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 301 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 302 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 303 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 304 
304 304 304 304 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 305 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 306 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 307 diff --git a/berkay_workpace/scratch/notes.md b/berkay_workpace/scratch/notes.md new file mode 100644 index 0000000000..fd34688b2c --- /dev/null +++ b/berkay_workpace/scratch/notes.md @@ -0,0 +1,17 @@ +# Preprocess notes + +**1. Application of "CopyToMap" transformation for certain cases of GPU<->GPU copy that cannot be done using a single copy command:** + +So I left it there because I did not in which kind of situation this will be needed, i.e. +a concrete example/situation. It seems to be interwined with the _emit_copy() function, where similar checks +are also performed (i.e. nobody cleaned this, the checks there actually make no sense after preprocessing). + + +**2. _compute_pool_release() Function:** + +I left it because it looks useful - helps to free memory. But it seems like the actual freeing is not performed anymore +in my code, so I maybe should remove it as well? Kind of happens if + +**3. _compute_cudastreams()** + +Also remove it for now? I mean, stream allocation/deallocation are not handled anyways. \ No newline at end of file diff --git a/berkay_workpace/scratch/playfield.py b/berkay_workpace/scratch/playfield.py index b2456cdec3..c8e44a7675 100644 --- a/berkay_workpace/scratch/playfield.py +++ b/berkay_workpace/scratch/playfield.py @@ -14,42 +14,25 @@ from dace.config import Config +bs = 512 +ns = 1024 +BS = dace.symbol('BS') +NS = dace.symbol('NS') + +START = dace.symbol('START') +WS = dace.symbol('WS') +STRIDE = dace.symbol('STRIDE') + +start = 2 +stride = 3 +ws = 16 @dace.program -def reduce_add_sync(mask: dace.uint32, value: dace.uint32): +def symbolic_warp_map(A: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global): + """ + Focus is in the use of symbolic variables in the MAP. 
+ """ + A[:] = B[:] - result = dace.define_local_scalar(dace.uint32) - - with dace.tasklet(dace.Language.CPP): - inp_mask << mask - inp_value << value - out_result >> result - """ - out_result = __reduce_add_sync(inp_mask, inp_value); - """ - return result +sdfg = symbolic_warp_map.to_sdfg() - - -@dace.program -def warpLevel(A: dace.uint32[512] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[512] @ dace.dtypes.StorageType.GPU_Global): - for _ in dace.map[0:512:512] @ dace.dtypes.ScheduleType.GPU_Device: - for j in dace.map[0:512] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: - - value = A[j] - mask = 0xffffffff - result = 0 - - for _ in dace.map[0:16] @ dace.dtypes.ScheduleType.GPU_Warp: - - result = reduce_add_sync(mask, value) - - B[j] = result - - -A = cp.ones(512, cp.uint32) -B = cp.random.rand(512).astype(cp.uint32) - -sdfg = warpLevel.to_sdfg() -sdfg(A=A, B=B) - -print(B) \ No newline at end of file +Code(sdfg.generate_code()[0].clean_code, language='cpp') \ No newline at end of file diff --git a/berkay_workpace/scratch/testbed.ipynb b/berkay_workpace/scratch/testbed.ipynb index 23161b92b2..65eecf1343 100644 --- a/berkay_workpace/scratch/testbed.ipynb +++ b/berkay_workpace/scratch/testbed.ipynb @@ -42,99 +42,318 @@ "execution_count": 2, "id": "58226f37", "metadata": {}, + "outputs": [], + "source": [ + "bs = 512\n", + "ns = 1024\n", + "BS = dace.symbol('BS')\n", + "NS = dace.symbol('NS')\n", + "\n", + "START = dace.symbol('START')\n", + "WS = dace.symbol('WS')\n", + "STRIDE = dace.symbol('STRIDE')\n", + "\n", + "start = 2\n", + "stride = 3\n", + "ws = 16\n", + "@dace.program\n", + "def symbolic_warp_map(A: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global):\n", + " \"\"\"\n", + " Focus is in the use of symbolic variables in the MAP.\n", + " \"\"\"\n", + " A[:] = B[:]\n", + "\n", + "sdfg = symbolic_warp_map.to_sdfg()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a755b788", + "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "not_in_kernel_code: True\n", + "is_between_access_nodes: True\n", + "involves_gpu_or_pinned: True\n", + "\n", + "\n", + "copy_context.src_storage: StorageType.GPU_Global\n", + "copy_context.dst_storage: StorageType.GPU_Global\n", + "is_not_cpu_to_cpu: True\n" + ] + }, { "data": { "text/html": [ + "
/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n",
+       "#include <dace/dace.h>\n",
+       "#include "../../include/hash.h"\n",
+       "\n",
+       "struct symbolic_warp_map_state_t {\n",
+       "    dace::cuda::Context *gpu_context;\n",
+       "};\n",
+       "\n",
+       "void __program_symbolic_warp_map_internal(symbolic_warp_map_state_t*__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B, int NS)\n",
+       "{\n",
+       "\n",
+       "    {\n",
+       "\n",
+       "        DACE_GPU_CHECK(cudaMemcpyAsync(A, B, NS * sizeof(dace::uint), cudaMemcpyDeviceToDevice, __state->gpu_context->streams[0]));\n",
+       "\n",
+       "    }\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED void __program_symbolic_warp_map(symbolic_warp_map_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B, int NS)\n",
+       "{\n",
+       "    __program_symbolic_warp_map_internal(__state, A, B, NS);\n",
+       "}\n",
+       "DACE_EXPORTED int __dace_init_experimental_cuda(symbolic_warp_map_state_t *__state, int NS);\n",
+       "DACE_EXPORTED int __dace_exit_experimental_cuda(symbolic_warp_map_state_t *__state);\n",
+       "\n",
+       "DACE_EXPORTED symbolic_warp_map_state_t *__dace_init_symbolic_warp_map(int NS)\n",
+       "{\n",
+       "    int __result = 0;\n",
+       "    symbolic_warp_map_state_t *__state = new symbolic_warp_map_state_t;\n",
+       "\n",
+       "\n",
+       "    __result |= __dace_init_experimental_cuda(__state, NS);\n",
+       "\n",
+       "    if (__result) {\n",
+       "        delete __state;\n",
+       "        return nullptr;\n",
+       "    }\n",
+       "    return __state;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED int __dace_exit_symbolic_warp_map(symbolic_warp_map_state_t *__state)\n",
+       "{\n",
+       "    int __err = 0;\n",
+       "\n",
+       "    int __err_experimental_cuda = __dace_exit_experimental_cuda(__state);\n",
+       "    if (__err_experimental_cuda) {\n",
+       "        __err = __err_experimental_cuda;\n",
+       "    }\n",
+       "    delete __state;\n",
+       "    return __err;\n",
+       "}\n",
+       "
\n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "\\PY{c+cm}{/* DaCe AUTO\\PYZhy{}GENERATED FILE. DO NOT MODIFY */}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}dace/dace.h\\PYZgt{}}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}../../include/hash.h\\PYZdq{}}\n", + "\n", + "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", "\n", - "
\n", - "
\n", - "
\n", - "\n", - "" + "\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}program\\PYZus{}symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}internal}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{NS}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMemcpyAsync}\\PY{p}{(}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{NS}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{k}{sizeof}\\PY{p}{(}\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaMemcpyDeviceToDevice}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}symbolic\\PYZus{}warp\\PYZus{}map}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{NS}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}internal}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{NS}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{NS}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}symbolic\\PYZus{}warp\\PYZus{}map}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{NS}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + "\\PY{+w}{ 
}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{p}{;}\n", + "\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{NS}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{nullptr}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}symbolic\\PYZus{}warp\\PYZus{}map}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\\end{Verbatim}\n" ], "text/plain": [ - "SDFG (Warp_test_1)" + "/* DaCe AUTO-GENERATED FILE. 
DO NOT MODIFY */\n", + "#include \n", + "#include \"../../include/hash.h\"\n", + "\n", + "struct symbolic_warp_map_state_t {\n", + " dace::cuda::Context *gpu_context;\n", + "};\n", + "\n", + "void __program_symbolic_warp_map_internal(symbolic_warp_map_state_t*__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B, int NS)\n", + "{\n", + "\n", + " {\n", + "\n", + " DACE_GPU_CHECK(cudaMemcpyAsync(A, B, NS * sizeof(dace::uint), cudaMemcpyDeviceToDevice, __state->gpu_context->streams[0]));\n", + "\n", + " }\n", + "}\n", + "\n", + "DACE_EXPORTED void __program_symbolic_warp_map(symbolic_warp_map_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B, int NS)\n", + "{\n", + " __program_symbolic_warp_map_internal(__state, A, B, NS);\n", + "}\n", + "DACE_EXPORTED int __dace_init_experimental_cuda(symbolic_warp_map_state_t *__state, int NS);\n", + "DACE_EXPORTED int __dace_exit_experimental_cuda(symbolic_warp_map_state_t *__state);\n", + "\n", + "DACE_EXPORTED symbolic_warp_map_state_t *__dace_init_symbolic_warp_map(int NS)\n", + "{\n", + " int __result = 0;\n", + " symbolic_warp_map_state_t *__state = new symbolic_warp_map_state_t;\n", + "\n", + "\n", + " __result |= __dace_init_experimental_cuda(__state, NS);\n", + "\n", + " if (__result) {\n", + " delete __state;\n", + " return nullptr;\n", + " }\n", + " return __state;\n", + "}\n", + "\n", + "DACE_EXPORTED int __dace_exit_symbolic_warp_map(symbolic_warp_map_state_t *__state)\n", + "{\n", + " int __err = 0;\n", + "\n", + " int __err_experimental_cuda = __dace_exit_experimental_cuda(__state);\n", + " if (__err_experimental_cuda) {\n", + " __err = __err_experimental_cuda;\n", + " }\n", + " delete __state;\n", + " return __err;\n", + "}" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "\n", - "# Generate framework\n", - "sdfg = dace.SDFG(\"Warp_test_1\")\n", - "\n", - "state = sdfg.add_state(\"main\")\n", - "\n", - "# Generate access nodes\n", - "a_dev = sdfg.add_array(\"A\", (32,), dace.uint32, dace.dtypes.StorageType.GPU_Global)\n", - "b_dev = sdfg.add_array(\"B\", (32,), dace.uint32, dace.dtypes.StorageType.GPU_Global)\n", - "a_acc = state.add_access(\"A\")\n", - "b_acc = state.add_access(\"B\")\n", - "\n", - "\n", - "# Generate maps, connect entries with access data\n", - "gpu_map_entry, gpu_map_exit = state.add_map(name = \"GPU_Map\",\n", - " ndrange = dict(i='0:32:32'),\n", - " schedule = dace.dtypes.ScheduleType.GPU_Device)\n", - "state.add_edge(a_acc, None, gpu_map_entry, None, dace.memlet.Memlet('A[0:32]'))\n", - "\n", - "\n", - "tblock_map_entry, tblock_map_exit = state.add_map(name = \"Block_Map\",\n", - " ndrange = dict(j='0:32'),\n", - " schedule = dace.dtypes.ScheduleType.GPU_ThreadBlock)\n", - "state.add_edge(gpu_map_entry, None, tblock_map_entry, None, dace.memlet.Memlet('A[0:32]'))\n", - "\n", - "\n", - "\n", - "\n", - "tasklet, warp_scope_entry, warp_scope_exit = state.add_mapped_tasklet(\n", - " name='WarpLevel_Operation',\n", - " map_ranges=dict(_='0:1'),\n", - " inputs=dict(inp=dace.Memlet('A[0:32]', volume=32)),\n", - " code=\n", - "''' \n", - "value = inp[j]\n", - "out = __reduce_add_sync(0xFFFFFFFF, value);\n", - "''',\n", - " outputs=dict(out=dace.Memlet(\"B[j]\")),\n", - " schedule=dace.dtypes.ScheduleType.GPU_Warp\n", - ")\n", - "\n", - "state.add_edge(tblock_map_entry, None, warp_scope_entry, None, dace.memlet.Memlet('A[0:32]'))\n", - "\n", - "# Connect Exit nodes\n", - "state.add_edge(warp_scope_exit, None, 
tblock_map_exit, None, dace.memlet.Memlet('B[j]'))\n", - "state.add_edge(tblock_map_exit, None, gpu_map_exit, None, dace.memlet.Memlet('B[j]'))\n", - "state.add_edge(gpu_map_exit, None, b_acc, None, dace.memlet.Memlet('B[0:32]'))\n", - "\n", - "\n", - "\n", - "\n", - "sdfg.fill_scope_connectors()\n", - "\n", - "\n", - "\n", - "sdfg" + "Code(sdfg.generate_code()[0].clean_code, language='cpp')" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "9426fb29", "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "not_in_kernel_code: True\n", + "is_between_access_nodes: True\n", + "involves_gpu_or_pinned: True\n", + "\n", + "\n", + "copy_context.src_storage: StorageType.GPU_Global\n", + "copy_context.dst_storage: StorageType.GPU_Global\n", + "is_not_cpu_to_cpu: True\n" + ] + }, { "data": { "text/html": [ @@ -216,18 +435,18 @@ "#include <dace/dace.h>\n", "\n", "\n", - "struct Warp_test_1_state_t {\n", + "struct symbolic_warp_map_state_t {\n", " dace::cuda::Context *gpu_context;\n", "};\n", "\n", "\n", "\n", - "DACE_EXPORTED int __dace_init_experimental_cuda(Warp_test_1_state_t *__state);\n", - "DACE_EXPORTED int __dace_exit_experimental_cuda(Warp_test_1_state_t *__state);\n", + "DACE_EXPORTED int __dace_init_experimental_cuda(symbolic_warp_map_state_t *__state, int NS);\n", + "DACE_EXPORTED int __dace_exit_experimental_cuda(symbolic_warp_map_state_t *__state);\n", "\n", "\n", "\n", - "int __dace_init_experimental_cuda(Warp_test_1_state_t *__state) {\n", + "int __dace_init_experimental_cuda(symbolic_warp_map_state_t *__state, int NS) {\n", " int count;\n", "\n", " // Check that we are able to run cuda code\n", @@ -266,7 +485,7 @@ " return 0;\n", "}\n", "\n", - "int __dace_exit_experimental_cuda(Warp_test_1_state_t *__state) {\n", + "int __dace_exit_experimental_cuda(symbolic_warp_map_state_t *__state) {\n", " \n", "\n", " // Synchronize and check for CUDA errors\n", @@ -286,7 +505,7 @@ " return __err;\n", "}\n", "\n", - "DACE_EXPORTED bool __dace_gpu_set_stream(Warp_test_1_state_t *__state, int streamid, gpuStream_t stream)\n", + "DACE_EXPORTED bool __dace_gpu_set_stream(symbolic_warp_map_state_t *__state, int streamid, gpuStream_t stream)\n", "{\n", " if (streamid < 0 || streamid >= 1)\n", " return false;\n", @@ -296,48 +515,11 @@ " return true;\n", "}\n", "\n", - "DACE_EXPORTED void __dace_gpu_set_all_streams(Warp_test_1_state_t *__state, gpuStream_t stream)\n", + "DACE_EXPORTED void __dace_gpu_set_all_streams(symbolic_warp_map_state_t *__state, gpuStream_t stream)\n", "{\n", " for (int i = 0; i < 1; ++i)\n", " __state->gpu_context->streams[i] = stream;\n", "}\n", - "\n", - "__global__ void GPU_Map_0_0_2(dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", - "{ // Kernel scope (open 1)\n", - " int i = (32 * blockIdx.x);\n", - " { // ThreadBlock Scope (open 1)\n", - " int j = threadIdx.x;\n", - " { // WarpLevel Scope (open 1)\n", - " int warpId_WarpLevel_Operation_map_0_0_6 = (threadIdx.x) / 32;\n", - " int _ = warpId_WarpLevel_Operation_map_0_0_6 % 1;\n", - " {\n", - " dace::uint* inp = &A[0];\n", - " dace::uint out;\n", - "\n", - " ///////////////////\n", - " // Tasklet code (WarpLevel_Operation)\n", - " auto value = inp[j];\n", - " out = __reduce_add_sync(4294967295U, value);\n", - " ///////////////////\n", - "\n", - " B[j] = out;\n", - " }\n", - " } // WarpLevel Scope (close 1)\n", - " } // ThreadBlock Scope (close 1)\n", - "} // Kernel scope (close 1)\n", - "\n", - "\n", - "DACE_EXPORTED void 
__dace_runkernel_GPU_Map_0_0_2(Warp_test_1_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n", - "void __dace_runkernel_GPU_Map_0_0_2(Warp_test_1_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", - "{\n", - "\n", - "\n", - " void *GPU_Map_0_0_2_args[] = { (void *)&A, (void *)&B };\n", - " gpuError_t __err = cudaLaunchKernel( (void*)GPU_Map_0_0_2, dim3(1, 1, 1), dim3(32, 1, 1), GPU_Map_0_0_2_args, 0, __state->gpu_context->streams[0]\n", - " );\n", - "\n", - " DACE_KERNEL_LAUNCH_CHECK(__err, "GPU_Map_0_0_2", 1, 1, 1, 32, 1, 1);\n", - "}\n", "\n" ], "text/latex": [ @@ -346,18 +528,18 @@ "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}dace/dace.h\\PYZgt{}}\n", "\n", "\n", - "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", "\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", "\n", "\n", "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{NS}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", "\n", "\n", "\n", - "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{NS}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{count}\\PY{p}{;}\n", "\n", "\\PY{+w}{ }\\PY{c+c1}{// Check that we are able to run cuda code}\n", @@ -396,7 +578,7 @@ "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", "\\PY{p}{\\PYZcb{}}\n", "\n", - "\\PY{k+kt}{int}\\PY{+w}{ 
}\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", "\\PY{+w}{ }\n", "\n", "\\PY{+w}{ }\\PY{c+c1}{// Synchronize and check for CUDA errors}\n", @@ -416,7 +598,7 @@ "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", "\\PY{p}{\\PYZcb{}}\n", "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{bool}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}stream}\\PY{p}{(}\\PY{n}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{streamid}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{bool}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}stream}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{streamid}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", "\\PY{p}{\\PYZob{}}\n", "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{|}\\PY{+w}{ }\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZgt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{false}\\PY{p}{;}\n", @@ -426,48 +608,11 @@ "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{true}\\PY{p}{;}\n", "\\PY{p}{\\PYZcb{}}\n", "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}all\\PYZus{}streams}\\PY{p}{(}\\PY{n}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}all\\PYZus{}streams}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", "\\PY{p}{\\PYZob{}}\n", "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\n", "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{\\PYZus{}\\PYZus{}global\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ 
}\\PY{n}{GPU\\PYZus{}Map\\PYZus{}0\\PYZus{}0\\PYZus{}2}\\PY{p}{(}\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// Kernel scope (open 1)}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{blockIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// ThreadBlock Scope (open 1)}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{j}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (open 1)}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{warpId\\PYZus{}WarpLevel\\PYZus{}Operation\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{/}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{warpId\\PYZus{}WarpLevel\\PYZus{}Operation\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{+w}{ }\\PY{o}{\\PYZpc{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{o}{*}\\PY{+w}{ }\\PY{n}{inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{out}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\\PY{+w}{ }\\PY{c+c1}{// Tasklet code (WarpLevel\\PYZus{}Operation)}\n", - "\\PY{+w}{ }\\PY{k}{auto}\\PY{+w}{ }\\PY{n}{value}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{inp}\\PY{p}{[}\\PY{n}{j}\\PY{p}{]}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}reduce\\PYZus{}add\\PYZus{}sync}\\PY{p}{(}\\PY{l+m+mi}{4294967295U}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{value}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{B}\\PY{p}{[}\\PY{n}{j}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{out}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// WarpLevel Scope (close 1)}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// ThreadBlock Scope (close 1)}\n", - "\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{c+c1}{// Kernel scope (close 1)}\n", - "\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}GPU\\PYZus{}Map\\PYZus{}0\\PYZus{}0\\PYZus{}2}\\PY{p}{(}\\PY{n}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{k+kt}{void}\\PY{+w}{ 
}\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}GPU\\PYZus{}Map\\PYZus{}0\\PYZus{}0\\PYZus{}2}\\PY{p}{(}\\PY{n}{Warp\\PYZus{}test\\PYZus{}1\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\n", - "\n", - "\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{GPU\\PYZus{}Map\\PYZus{}0\\PYZus{}0\\PYZus{}2\\PYZus{}args}\\PY{p}{[}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{B}\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{gpuError\\PYZus{}t}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaLaunchKernel}\\PY{p}{(}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{o}{*}\\PY{p}{)}\\PY{n}{GPU\\PYZus{}Map\\PYZus{}0\\PYZus{}0\\PYZus{}2}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{GPU\\PYZus{}Map\\PYZus{}0\\PYZus{}0\\PYZus{}2\\PYZus{}args}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\n", - "\\PY{+w}{ }\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}KERNEL\\PYZus{}LAUNCH\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{,}\\PY{+w}{ }\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{GPU\\PYZus{}Map\\PYZus{}0\\PYZus{}0\\PYZus{}2}\\PY{l+s}{\\PYZdq{}}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", "\\end{Verbatim}\n" ], "text/plain": [ @@ -476,18 +621,18 @@ "#include \n", "\n", "\n", - "struct Warp_test_1_state_t {\n", + "struct symbolic_warp_map_state_t {\n", " dace::cuda::Context *gpu_context;\n", "};\n", "\n", "\n", "\n", - "DACE_EXPORTED int __dace_init_experimental_cuda(Warp_test_1_state_t *__state);\n", - "DACE_EXPORTED int __dace_exit_experimental_cuda(Warp_test_1_state_t *__state);\n", + "DACE_EXPORTED int __dace_init_experimental_cuda(symbolic_warp_map_state_t *__state, int NS);\n", + "DACE_EXPORTED int __dace_exit_experimental_cuda(symbolic_warp_map_state_t *__state);\n", "\n", "\n", "\n", - "int __dace_init_experimental_cuda(Warp_test_1_state_t *__state) {\n", + "int __dace_init_experimental_cuda(symbolic_warp_map_state_t *__state, int NS) {\n", " int count;\n", "\n", " // Check that we are able to run cuda code\n", @@ -526,7 +671,7 @@ " return 0;\n", "}\n", "\n", - "int __dace_exit_experimental_cuda(Warp_test_1_state_t *__state) {\n", + "int __dace_exit_experimental_cuda(symbolic_warp_map_state_t *__state) {\n", " \n", "\n", " // 
Synchronize and check for CUDA errors\n", @@ -546,7 +691,7 @@ " return __err;\n", "}\n", "\n", - "DACE_EXPORTED bool __dace_gpu_set_stream(Warp_test_1_state_t *__state, int streamid, gpuStream_t stream)\n", + "DACE_EXPORTED bool __dace_gpu_set_stream(symbolic_warp_map_state_t *__state, int streamid, gpuStream_t stream)\n", "{\n", " if (streamid < 0 || streamid >= 1)\n", " return false;\n", @@ -556,51 +701,15 @@ " return true;\n", "}\n", "\n", - "DACE_EXPORTED void __dace_gpu_set_all_streams(Warp_test_1_state_t *__state, gpuStream_t stream)\n", + "DACE_EXPORTED void __dace_gpu_set_all_streams(symbolic_warp_map_state_t *__state, gpuStream_t stream)\n", "{\n", " for (int i = 0; i < 1; ++i)\n", " __state->gpu_context->streams[i] = stream;\n", "}\n", - "\n", - "__global__ void GPU_Map_0_0_2(dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", - "{ // Kernel scope (open 1)\n", - " int i = (32 * blockIdx.x);\n", - " { // ThreadBlock Scope (open 1)\n", - " int j = threadIdx.x;\n", - " { // WarpLevel Scope (open 1)\n", - " int warpId_WarpLevel_Operation_map_0_0_6 = (threadIdx.x) / 32;\n", - " int _ = warpId_WarpLevel_Operation_map_0_0_6 % 1;\n", - " {\n", - " dace::uint* inp = &A[0];\n", - " dace::uint out;\n", - "\n", - " ///////////////////\n", - " // Tasklet code (WarpLevel_Operation)\n", - " auto value = inp[j];\n", - " out = __reduce_add_sync(4294967295U, value);\n", - " ///////////////////\n", - "\n", - " B[j] = out;\n", - " }\n", - " } // WarpLevel Scope (close 1)\n", - " } // ThreadBlock Scope (close 1)\n", - "} // Kernel scope (close 1)\n", - "\n", - "\n", - "DACE_EXPORTED void __dace_runkernel_GPU_Map_0_0_2(Warp_test_1_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n", - "void __dace_runkernel_GPU_Map_0_0_2(Warp_test_1_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", - "{\n", - "\n", - "\n", - " void *GPU_Map_0_0_2_args[] = { (void *)&A, (void *)&B };\n", - " gpuError_t __err = cudaLaunchKernel( (void*)GPU_Map_0_0_2, dim3(1, 1, 1), dim3(32, 1, 1), GPU_Map_0_0_2_args, 0, __state->gpu_context->streams[0]\n", - " );\n", - "\n", - " DACE_KERNEL_LAUNCH_CHECK(__err, \"GPU_Map_0_0_2\", 1, 1, 1, 32, 1, 1);\n", - "}\n" + "\n" ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -608,48 +717,6 @@ "source": [ "Code(sdfg.generate_code()[1].clean_code, language='cpp')" ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "6d7c1429", - "metadata": {}, - "outputs": [], - "source": [ - "call_it = sdfg.compile()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "9673dc1b", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/berkay/master-thesis/dace/dace/sdfg/sdfg.py:2373: UserWarning: SDFG 'Warp_test_1' is already loaded by another object, recompiling under a different name 'Warp_test_1_0'.\n", - " warnings.warn(f\"SDFG '{self.name}' is already loaded by another object, recompiling under a different \"\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32\n", - " 32 32 32 32 32 32 32 32]\n" - ] - } - ], - "source": [ - "A = cp.ones(32, dtype=cp.uint32) \n", - "B = cp.zeros(32, dtype=cp.uint32) \n", - "\n", - "sdfg(A=A, B=B)\n", - "\n", - "print(B)" - ] } ], "metadata": { diff --git a/berkay_workpace/scratch/vis.ipynb b/berkay_workpace/scratch/vis.ipynb deleted file mode 100644 index efd0359347..0000000000 
--- a/berkay_workpace/scratch/vis.ipynb +++ /dev/null @@ -1,492 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "2a7d72f5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import dace\n", - "import cupy as cp\n", - "\n", - "from IPython.display import Code\n", - "from dace.config import Config" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "2166f4ee", - "metadata": {}, - "outputs": [], - "source": [ - "@dace.program\n", - "def cpuRed(A: dace.uint32[32], B:dace.uint32[32]):\n", - " for i in dace.map[0:32]:\n", - " B[i] = sum(A)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "baa4d9ba", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sdfg = cpuRed.to_sdfg()\n", - "sdfg.compile()\n", - "sdfg\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f863ad50", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n",
-       "#include <dace/dace.h>\n",
-       "#include "../../include/hash.h"\n",
-       "\n",
-       "struct cpuRed_state_t {\n",
-       "\n",
-       "};\n",
-       "\n",
-       "inline void reduce_0_0_6(cpuRed_state_t *__state, dace::uint* __restrict__ _in, dace::uint* __restrict__ _out) {\n",
-       "\n",
-       "    {\n",
-       "\n",
-       "        {\n",
-       "            for (auto _o0 = 0; _o0 < 1; _o0 += 1) {\n",
-       "                {\n",
-       "                    dace::uint __out;\n",
-       "\n",
-       "                    ///////////////////\n",
-       "                    // Tasklet code (reduce_init)\n",
-       "                    __out = 0;\n",
-       "                    ///////////////////\n",
-       "\n",
-       "                    _out[_o0] = __out;\n",
-       "                }\n",
-       "            }\n",
-       "        }\n",
-       "\n",
-       "    }\n",
-       "    {\n",
-       "\n",
-       "        {\n",
-       "            for (auto _i0 = 0; _i0 < 32; _i0 += 1) {\n",
-       "                {\n",
-       "                    dace::uint __inp = _in[_i0];\n",
-       "                    dace::uint __out;\n",
-       "\n",
-       "                    ///////////////////\n",
-       "                    // Tasklet code (identity)\n",
-       "                    __out = __inp;\n",
-       "                    ///////////////////\n",
-       "\n",
-       "                    dace::wcr_fixed<dace::ReductionType::Sum, dace::uint>::reduce(_out, __out);\n",
-       "                }\n",
-       "            }\n",
-       "        }\n",
-       "\n",
-       "    }\n",
-       "}\n",
-       "\n",
-       "void __program_cpuRed_internal(cpuRed_state_t*__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n",
-       "{\n",
-       "\n",
-       "    {\n",
-       "\n",
-       "        {\n",
-       "            #pragma omp parallel for\n",
-       "            for (auto i = 0; i < 32; i += 1) {\n",
-       "                dace::uint __tmp1;\n",
-       "                reduce_0_0_6(__state, &A[0], &__tmp1);\n",
-       "                {\n",
-       "                    dace::uint __inp = __tmp1;\n",
-       "                    dace::uint __out;\n",
-       "\n",
-       "                    ///////////////////\n",
-       "                    // Tasklet code (assign_4_8)\n",
-       "                    __out = __inp;\n",
-       "                    ///////////////////\n",
-       "\n",
-       "                    B[i] = __out;\n",
-       "                }\n",
-       "            }\n",
-       "        }\n",
-       "\n",
-       "    }\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED void __program_cpuRed(cpuRed_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n",
-       "{\n",
-       "    __program_cpuRed_internal(__state, A, B);\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED cpuRed_state_t *__dace_init_cpuRed()\n",
-       "{\n",
-       "    int __result = 0;\n",
-       "    cpuRed_state_t *__state = new cpuRed_state_t;\n",
-       "\n",
-       "\n",
-       "\n",
-       "    if (__result) {\n",
-       "        delete __state;\n",
-       "        return nullptr;\n",
-       "    }\n",
-       "    return __state;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED int __dace_exit_cpuRed(cpuRed_state_t *__state)\n",
-       "{\n",
-       "    int __err = 0;\n",
-       "    delete __state;\n",
-       "    return __err;\n",
-       "}\n",
-       "
\n" - ], - "text/latex": [ - "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", - "\\PY{c+cm}{/* DaCe AUTO\\PYZhy{}GENERATED FILE. DO NOT MODIFY */}\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}dace/dace.h\\PYZgt{}}\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}../../include/hash.h\\PYZdq{}}\n", - "\n", - "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{cpuRed\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\n", - "\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", - "\n", - "\\PY{k+kr}{inline}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n+nf}{reduce\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{p}{(}\\PY{n}{cpuRed\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{\\PYZus{}in}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{\\PYZus{}out}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k}{auto}\\PY{+w}{ }\\PY{n}{\\PYZus{}o0}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{\\PYZus{}o0}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{\\PYZus{}o0}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\\PY{+w}{ }\\PY{c+c1}{// Tasklet code (reduce\\PYZus{}init)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}out}\\PY{p}{[}\\PY{n}{\\PYZus{}o0}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k}{auto}\\PY{+w}{ }\\PY{n}{\\PYZus{}i0}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{\\PYZus{}i0}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{\\PYZus{}i0}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}in}\\PY{p}{[}\\PY{n}{\\PYZus{}i0}\\PY{p}{]}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\\PY{+w}{ }\\PY{c+c1}{// Tasklet code (identity)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\n", - "\\PY{+w}{ 
}\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{wcr\\PYZus{}fixed}\\PY{o}{\\PYZlt{}}\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{ReductionType}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Sum}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{o}{\\PYZgt{}}\\PY{o}{:}\\PY{o}{:}\\PY{n}{reduce}\\PY{p}{(}\\PY{n}{\\PYZus{}out}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}program\\PYZus{}cpuRed\\PYZus{}internal}\\PY{p}{(}\\PY{n}{cpuRed\\PYZus{}state\\PYZus{}t}\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp parallel for}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k}{auto}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp1}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{reduce\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}tmp1}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp1}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\\PY{+w}{ }\\PY{c+c1}{// Tasklet code (assign\\PYZus{}4\\PYZus{}8)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{B}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}cpuRed}\\PY{p}{(}\\PY{n}{cpuRed\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ 
}\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}cpuRed\\PYZus{}internal}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{cpuRed\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}cpuRed}\\PY{p}{(}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{cpuRed\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n}{cpuRed\\PYZus{}state\\PYZus{}t}\\PY{p}{;}\n", - "\n", - "\n", - "\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{nullptr}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}cpuRed}\\PY{p}{(}\\PY{n}{cpuRed\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\\end{Verbatim}\n" - ], - "text/plain": [ - "/* DaCe AUTO-GENERATED FILE. 
DO NOT MODIFY */\n", - "#include \n", - "#include \"../../include/hash.h\"\n", - "\n", - "struct cpuRed_state_t {\n", - "\n", - "};\n", - "\n", - "inline void reduce_0_0_6(cpuRed_state_t *__state, dace::uint* __restrict__ _in, dace::uint* __restrict__ _out) {\n", - "\n", - " {\n", - "\n", - " {\n", - " for (auto _o0 = 0; _o0 < 1; _o0 += 1) {\n", - " {\n", - " dace::uint __out;\n", - "\n", - " ///////////////////\n", - " // Tasklet code (reduce_init)\n", - " __out = 0;\n", - " ///////////////////\n", - "\n", - " _out[_o0] = __out;\n", - " }\n", - " }\n", - " }\n", - "\n", - " }\n", - " {\n", - "\n", - " {\n", - " for (auto _i0 = 0; _i0 < 32; _i0 += 1) {\n", - " {\n", - " dace::uint __inp = _in[_i0];\n", - " dace::uint __out;\n", - "\n", - " ///////////////////\n", - " // Tasklet code (identity)\n", - " __out = __inp;\n", - " ///////////////////\n", - "\n", - " dace::wcr_fixed::reduce(_out, __out);\n", - " }\n", - " }\n", - " }\n", - "\n", - " }\n", - "}\n", - "\n", - "void __program_cpuRed_internal(cpuRed_state_t*__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", - "{\n", - "\n", - " {\n", - "\n", - " {\n", - " #pragma omp parallel for\n", - " for (auto i = 0; i < 32; i += 1) {\n", - " dace::uint __tmp1;\n", - " reduce_0_0_6(__state, &A[0], &__tmp1);\n", - " {\n", - " dace::uint __inp = __tmp1;\n", - " dace::uint __out;\n", - "\n", - " ///////////////////\n", - " // Tasklet code (assign_4_8)\n", - " __out = __inp;\n", - " ///////////////////\n", - "\n", - " B[i] = __out;\n", - " }\n", - " }\n", - " }\n", - "\n", - " }\n", - "}\n", - "\n", - "DACE_EXPORTED void __program_cpuRed(cpuRed_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", - "{\n", - " __program_cpuRed_internal(__state, A, B);\n", - "}\n", - "\n", - "DACE_EXPORTED cpuRed_state_t *__dace_init_cpuRed()\n", - "{\n", - " int __result = 0;\n", - " cpuRed_state_t *__state = new cpuRed_state_t;\n", - "\n", - "\n", - "\n", - " if (__result) {\n", - " delete __state;\n", - " return nullptr;\n", - " }\n", - " return __state;\n", - "}\n", - "\n", - "DACE_EXPORTED int __dace_exit_cpuRed(cpuRed_state_t *__state)\n", - "{\n", - " int __err = 0;\n", - " delete __state;\n", - " return __err;\n", - "}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Code(sdfg.generate_code()[0].clean_code, language='cpp')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "dace_env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/berkay_workpace/scratch/visualizingWarps.ipynb b/berkay_workpace/scratch/visualizingWarps.ipynb deleted file mode 100644 index bb25667a8b..0000000000 --- a/berkay_workpace/scratch/visualizingWarps.ipynb +++ /dev/null @@ -1,180 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 15, - "id": "1497afd7", - "metadata": {}, - "outputs": [], - "source": [ - "import dace\n", - "import cupy as cp\n", - "import numpy as np\n", - "\n", - "from dace import registry\n", - "\n", - "from dace.sdfg.scope import ScopeSubgraphView\n", - "from dace.codegen.prettycode import CodeIOStream\n", - "from dace.codegen.targets.target import TargetCodeGenerator\n", - "from dace.codegen.targets.framecode 
import DaCeCodeGenerator\n", - "from dace.codegen.targets.cpp import sym2cpp\n", - "from IPython.display import Code\n", - "from dace.config import Config" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "3f6d665e", - "metadata": {}, - "outputs": [], - "source": [ - "@dace.program\n", - "def test(A: dace.uint32[32,32] @ dace.dtypes.StorageType.GPU_Global):\n", - " for i, j in dace.map[0:32:32, 0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " for ii, jj in dace.map[0:32, 0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", - " for wx,wy in dace.map[0:4, 0:8] @ dace.dtypes.ScheduleType.GPU_Warp:\n", - " r = wx\n", - " c = wy\n", - " result = dace.define_local_scalar(dace.uint32)\n", - " with dace.tasklet(dace.Language.CPP):\n", - " iwx << r\n", - " iwy << c\n", - " out_result >> result\n", - " \"\"\"\n", - " out_result = iwx * 100 + iwy;\n", - " \"\"\"\n", - " \n", - " A[i + ii, j + jj] = result\n", - "\n", - "sdfg = test.to_sdfg()\n", - "A = cp.zeros((32,32), dtype=cp.uint32)\n", - "sdfg(A=A)\n", - "\n", - "A_cpu = cp.asnumpy(A)\n", - "A_reshaped = A_cpu.reshape(-1, 256)\n", - "np.savetxt(\"A_output.txt\", A_reshaped, fmt='%d')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "130d986f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (Warp_test_1)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sdfg = dace.SDFG(\"Warp_test_1\")\n", - "state = sdfg.add_state(\"main\")\n", - "\n", - "# Generate access nodes\n", - "a_dev = sdfg.add_array(\"A\", (32,32), dace.uint32, dace.dtypes.StorageType.GPU_Global)\n", - "a_acc = state.add_access(\"A\")\n", - "\n", - "\n", - "\n", - "# Generate maps, connect entries with access data\n", - "gpu_map_entry, gpu_map_exit = state.add_map(name = \"GPU_Map\",\n", - " ndrange = dict(i='0:32:32', j ='0:32:32'),\n", - " schedule = dace.dtypes.ScheduleType.GPU_Device)\n", - "\n", - "\n", - "\n", - "tblock_map_entry, tblock_map_exit = state.add_map(name = \"Block_Map\",\n", - " ndrange = dict(ii='0:32', jj='0:32'),\n", - " schedule = dace.dtypes.ScheduleType.GPU_ThreadBlock)\n", - "\n", - "state.add_edge(gpu_map_entry, None, tblock_map_entry, None, dace.memlet.Memlet())\n", - "\n", - "\n", - "\n", - "tasklet, warp_scope_entry, warp_scope_exit = state.add_mapped_tasklet(\n", - " name='WarpLevel_Operation',\n", - " map_ranges=dict(wi='0:32'),\n", - " inputs=dict(),\n", - " code=\n", - "\"\"\"\n", - "out = wi\n", - "\"\"\",\n", - " outputs=dict(out=dace.Memlet(\"A[i+ii, j+jj]\")),\n", - " schedule=dace.dtypes.ScheduleType.GPU_Warp\n", - ")\n", - "\n", - "state.add_edge(tblock_map_entry, None, warp_scope_entry, None, dace.memlet.Memlet())\n", - "\n", - "state.add_edge(warp_scope_exit, None, tblock_map_exit, None, dace.memlet.Memlet('A[i+ii, j+jj]'))\n", - "state.add_edge(tblock_map_exit, None, gpu_map_exit, None, dace.memlet.Memlet('A[i:i+32,j:j+32]'))\n", - "state.add_edge(gpu_map_exit, None, a_acc, None, dace.memlet.Memlet('A[0:32, 0:32]'))\n", - "\n", - "sdfg.fill_scope_connectors()\n", - "sdfg" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c0146590", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "Code(sdfg.generate_code()[1].clean_code, language='cpp')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "141d0c40", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "dace_env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/berkay_workpace/tests/gpu_map_tests/threadBlock_test.py b/berkay_workpace/tests/gpu_map_tests/threadBlock_test.py index a321716f09..93ca7e757f 100644 --- a/berkay_workpace/tests/gpu_map_tests/threadBlock_test.py +++ b/berkay_workpace/tests/gpu_map_tests/threadBlock_test.py @@ -1,5 +1,4 @@ import dace -import random import cupy as cp import pytest diff --git a/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py b/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py new file mode 100644 index 0000000000..9dc9323f9f --- /dev/null +++ b/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py @@ -0,0 +1,81 @@ +import dace +import cupy as cp +import numpy as np +import pytest + +from dace.codegen import common +from IPython.display import Code + +BACKEND = common.get_gpu_backend() + +''' +@pytest.mark.gpu +def test_1d_out_of_kernel_memcpy(): + """ + Test 1D out-of-kernel memcpy using DaCe and CuPy. 
+ Verifies that device-to-device memcpy is performed. + """ + n = 100 + + @dace.program + def simple_1d_memcpy(dst: dace.uint32[n] @ dace.dtypes.StorageType.GPU_Global, + src: dace.uint32[n] @ dace.dtypes.StorageType.GPU_Global): + dst[:] = src[:] + + sdfg = simple_1d_memcpy.to_sdfg() + + + # Initialize arrays on GPU + src = cp.ones(n, dtype=cp.uint32) + dst = cp.zeros(n, dtype=cp.uint32) + + # Run SDFG + sdfg(dst, src, N=n) + + # Check correctness + cp.testing.assert_array_equal(dst, src) + + # Check generated code for correct memcpy usage + func_name = f"{BACKEND}MemcpyAsync" + kind = f"{BACKEND}MemcpyDeviceToDevice" + code = sdfg.generate_code()[0].code + + assert func_name in code and kind in code + +''' + +@pytest.mark.gpu +def test_1d_out_of_kernel_memcpy_strided(): + """ + Test 1D out-of-kernel memcpy using DaCe and CuPy. + Here, the copy shape is strided and we use symbolic sizes. + Furthermore, we have a CPU-to-GPU copy. + """ + N = dace.symbol('N') + n = 10 + + @dace.program + def strided_1d_memcpy(dst: dace.uint32[2*N] @ dace.dtypes.StorageType.GPU_Global, + src: dace.uint32[4*N]): + dst[::2] = src[::4] + + sdfg = strided_1d_memcpy.to_sdfg(validate=False) + + # Initialize arrays (src on the host, dst on the GPU) + src = np.ones(4*n, dtype=np.uint32) + dst = cp.zeros(2*n, dtype=cp.uint32) + + # Run SDFG + sdfg(dst, src, N=n) + + # Check correctness + expected = cp.zeros(2*n, dtype=cp.uint32) + expected[::2] = 1 # since src[::4] are all ones + cp.testing.assert_array_equal(dst, expected) + + # Check generated code for correct memcpy usage + func_name = f"{BACKEND}Memcpy2DAsync" + kind = f"{BACKEND}MemcpyHostToDevice" + code = sdfg.generate_code()[0].code + + assert func_name in code and kind in code diff --git a/berkay_workpace/tests/reusable_tests/cuda_highdim_kernel_test.py b/berkay_workpace/tests/reusable_tests/cuda_highdim_kernel_test.py index 88120f3324..fed6f72fe1 100644 --- a/berkay_workpace/tests/reusable_tests/cuda_highdim_kernel_test.py +++ b/berkay_workpace/tests/reusable_tests/cuda_highdim_kernel_test.py @@ -202,12 +202,12 @@ def tester(a: dace.float64[1024, 2, 2, 20] @ dace.StorageType.GPU_Global): if __name__ == "__main__": - test_cpu() - test_gpu() + # test_cpu() + # test_gpu() test_highdim_implicit_block() - test_highdim_implicit_block_threadsplit() - test_highdim_default_block_size() - test_block_size_mismatch_warning() - test_block_size_mismatch_error() - test_block_size_too_large() - test_highdim_block_size_too_large() + # test_highdim_implicit_block_threadsplit() + # test_highdim_default_block_size() + # test_block_size_mismatch_warning() + # test_block_size_mismatch_error() + # test_block_size_too_large() + # test_highdim_block_size_too_large() diff --git a/berkay_workpace/tests/reusable_tests/gpu_launch_bounds_test.py b/berkay_workpace/tests/reusable_tests/gpu_launch_bounds_test.py new file mode 100644 index 0000000000..48ae32665c --- /dev/null +++ b/berkay_workpace/tests/reusable_tests/gpu_launch_bounds_test.py @@ -0,0 +1,70 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
+ +import dace +import pytest + + +@pytest.mark.gpu +def test_launch_bounds_default(): + + @dace.program + def prog(a: dace.float64[100, 20] @ dace.StorageType.GPU_Global): + for i, j in dace.map[0:100, 0:20] @ dace.ScheduleType.GPU_Device: + a[i, j] = 1 + + with dace.config.set_temporary('compiler', 'cuda', 'default_block_size', value='32,2,1'): + assert '__launch_bounds__(64)' in prog.to_sdfg().generate_code()[1].code + + +@pytest.mark.gpu +def test_launch_bounds_implicit(): + + @dace.program + def prog(a: dace.float64[100, 20] @ dace.StorageType.GPU_Global): + for i, j in dace.map[0:50, 0:10] @ dace.ScheduleType.GPU_Device: + for bi, bj in dace.map[0:2, 0:2] @ dace.ScheduleType.GPU_ThreadBlock: + a[i * 2 + bi, j * 2 + bj] = 1 + + assert '__launch_bounds__(4)' in prog.to_sdfg().generate_code()[1].code + + +@pytest.mark.gpu +def test_launch_bounds_implicit_sym(): + B = dace.symbol('B') + + @dace.program + def prog(a: dace.float64[100, 20] @ dace.StorageType.GPU_Global): + for i, j in dace.map[0:50, 0:10] @ dace.ScheduleType.GPU_Device: + for bi, bj in dace.map[0:B, 0:B] @ dace.ScheduleType.GPU_ThreadBlock: + a[i * B + bi, j * B + bj] = 1 + + assert '__launch_bounds__' not in prog.to_sdfg().generate_code()[1].code + + +@pytest.mark.gpu +def test_launch_bounds_explicit(): + B = 2 + + @dace.program + def prog(a: dace.float64[100, 20] @ dace.StorageType.GPU_Global): + for i, j in dace.map[0:50, 0:10] @ dace.ScheduleType.GPU_Device: + for bi, bj in dace.map[0:B, 0:B] @ dace.ScheduleType.GPU_ThreadBlock: + a[i * B + bi, j * B + bj] = 1 + + sdfg = prog.to_sdfg() + for n, _ in sdfg.all_nodes_recursive(): + if isinstance(n, dace.nodes.MapEntry) and n.map.schedule == dace.ScheduleType.GPU_Device: + mapentry = n + break + + mapentry.map.gpu_launch_bounds = '-1' + assert '__launch_bounds__' not in sdfg.generate_code()[1].code + mapentry.map.gpu_launch_bounds = '5, 1' + assert '__launch_bounds__(5, 1)' in sdfg.generate_code()[1].code + + +if __name__ == '__main__': + test_launch_bounds_default() + test_launch_bounds_implicit() + test_launch_bounds_implicit_sym() + test_launch_bounds_explicit() diff --git a/dace/codegen/targets/__init__.py b/dace/codegen/targets/__init__.py index a0c2065524..9ac7561c10 100644 --- a/dace/codegen/targets/__init__.py +++ b/dace/codegen/targets/__init__.py @@ -9,4 +9,4 @@ from .mlir.mlir import MLIRCodeGen from .sve.codegen import SVECodeGen from .snitch import SnitchCodeGen -from .experimental_cuda import ExperimentalCUDACodeGen \ No newline at end of file +from .new_cuda_codegen.experimental_cuda import ExperimentalCUDACodeGen \ No newline at end of file diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index aaba068da3..3dd50667cb 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -1263,7 +1263,6 @@ def generate_state(self, ptrname = cpp.ptr(name, desc, sd, self._frame) if isinstance(desc, dt.Array) and desc.start_offset != 0: ptrname = f'({ptrname} - {cpp.sym2cpp(desc.start_offset)})' - callsite_stream.write(f'DACE_GPU_CHECK({backend}Free({ptrname}));\n', sd) self._emit_sync(callsite_stream) to_remove.add((sd, name)) diff --git a/dace/codegen/targets/new_cuda_codegen/copy_strategies.py b/dace/codegen/targets/new_cuda_codegen/copy_strategies.py new file mode 100644 index 0000000000..f89e882647 --- /dev/null +++ b/dace/codegen/targets/new_cuda_codegen/copy_strategies.py @@ -0,0 +1,376 @@ +from abc import ABC, abstractmethod +from typing import Tuple + +from dace import symbolic +from dace import Memlet, dtypes +from 
dace.dtypes import StorageType +from dace.codegen.targets.new_cuda_codegen.experimental_cuda import ExperimentalCUDACodeGen, CUDAStreamManager, product + + + +from dace.codegen.prettycode import CodeIOStream +from dace.sdfg import SDFG, nodes +from dace.sdfg.nodes import Node +from dace.sdfg.state import ControlFlowRegion, StateSubgraphView + +from dace.codegen.targets.cpp import memlet_copy_to_absolute_strides + + +# TODO: Adapt documentation if src strides is None! +class CopyContext: + """ + Stores and derives all information required for memory copy operations on GPUs. + + This class exists because memory copy logic often requires a large amount of context, + including node references, expressions, layout, and backend details. Handling all this + ad hoc makes the code harder to follow and maintain. + + CopyContext centralizes this information and provides helper functions to clarify + what values are needed for code generation and why. This improves readability, + simplifies copy emission logic, and makes future extensions easier. + """ + def __init__(self, codegen: ExperimentalCUDACodeGen, cuda_stream_manager: CUDAStreamManager, state_id: int, + src_node: Node, dst_node: Node, edge: Tuple[Node, str, Node, str, Memlet], sdfg: SDFG, + cfg: ControlFlowRegion, dfg: StateSubgraphView, callsite_stream: CodeIOStream): + + # Store general context information for the copy operation, such as: + # - which code generator is responsible, + # - which edge and SDFG/state context related to the copy, + # - and where the generated code is written (callsite stream). + self.codegen = codegen + self.state_id = state_id + self.src_node = src_node + self.dst_node = dst_node + self.edge = edge + self.sdfg = sdfg + self.cfg = cfg + self.dfg = dfg + self.callsite_stream = callsite_stream + + # Additional information frequently needed + self.backend = codegen.backend + self.state_dfg = cfg.state(state_id) + self.cudastream = cuda_stream_manager.get_stream_edge(src_node, dst_node) + self.src_storage = self.get_storage_type(src_node) + self.dst_storage = self.get_storage_type(dst_node) + + + if isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode): + copy_shape, src_strides, dst_strides, src_expr, dst_expr = memlet_copy_to_absolute_strides( + codegen._dispatcher, sdfg, self.state_dfg, edge, src_node, dst_node, codegen._cpu_codegen._packed_types) + else: + _, _, _, _, memlet = edge + copy_shape = [symbolic.overapproximate(s) for s in memlet.subset.bounding_box_size()] + + # if src and dst node are not AccessNodes, these are undefined + src_strides = dst_strides = src_expr = dst_expr = None + + + self.copy_shape = copy_shape + self.src_strides = src_strides + self.dst_strides = dst_strides + self.src_expr = src_expr + self.dst_expr = dst_expr + + self.num_dims = len(copy_shape) + + def get_storage_type(self, node: Node): + + if isinstance(node, nodes.Tasklet): + storage_type = StorageType.Register + else: + storage_type = node.desc(self.sdfg).storage + + return storage_type + + def get_copy_call_parameters(self) -> Tuple[str, str, str, str, str, str, any]: + """ + Returns all essential parameters required to emit a backend memory copy call. + + This method determines both structural and backend-specific information + needed to perform a memory copy, including memory locations, pointer + expressions, and data types. In cases where either the source or + destination is not a data access node, pointer expressions may be unavailable. 
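+
+        For instance, a host-to-device copy of doubles on the default stream might
+        return (illustrative values only):
+
+            ('cuda', '<src pointer expr>', '<dst pointer expr>', 'Host', 'Device',
+             '__state->gpu_context->streams[0]', 'double')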
+ + Returns + ------- + Tuple[str, Optional[str], Optional[str], str, str, str, any] + A tuple containing: + - backend (str): Name of the backend used (e.g., 'cuda', 'hip'). + - src_expr (Optional[str]): Source pointer expression, or None if unavailable. + - dst_expr (Optional[str]): Destination pointer expression, or None if unavailable. + - src_location (str): Memory location of the source ('Host' or 'Device'). + - dst_location (str): Memory location of the destination ('Host' or 'Device'). + - cudastream (str): Backend-specific stream identifier. + - ctype (any): The C type of the data being copied. + """ + src_location = 'Device' if self.src_storage == dtypes.StorageType.GPU_Global else 'Host' + dst_location = 'Device' if self.dst_storage == dtypes.StorageType.GPU_Global else 'Host' + + # Use the destination data type + ctype = self.dst_node.desc(self.sdfg).ctype + + # NOTE: I implicitly assume it is the same dtype as of the src. + assert ctype == self.src_node.desc(self.sdfg).dtype.ctype, \ + "Source and destination data types must match for the memory copy." + + return self.backend, self.src_expr, self.dst_expr, src_location, dst_location, self.cudastream, ctype + + def get_transfer_layout(self) -> Tuple[list, list, list]: + """ + Returns layout information required for emitting a memory copy. + + Returns + ------- + copy_shape : List + The size (extent) of each dimension to be copied. + Singleton dimensions (i.e., dimensions of size 1) are omitted. + Example: [J, K, 1] becomes [J, K] + src_strides : List or None + Stride values of the source expression, per dimension if + source and destination are of type AccessNode, else None. + dst_strides : List or None + Stride values of the destination expression, per dimension if + source and destination are of type AccessNode, else None. + """ + return self.copy_shape, self.src_strides, self.dst_strides + + def get_write_context(self) -> Tuple[CodeIOStream, ControlFlowRegion, int, Node, Node]: + """ + Returns all context required to emit code into the callsite stream with proper SDFG annotations. + + Returns + ------- + callsite_stream : CodeIOStream + The output stream where backend code is written. + cfg : ControlFlowRegion + The control flow region containing the current state. + state_id : int + The ID of the SDFG state being generated. + src_node : Node + The source node involved in the copy. + dst_node : Node + The destination node involved in the copy. + """ + return self.callsite_stream, self.cfg, self.state_id, self.src_node, self.dst_node + + def is_contiguous_copy(self) -> bool: + """ + Returns True if the memory copy is contiguous in the last dimension + for both source and destination. + """ + return (self.src_strides[-1] == 1) and (self.dst_strides[-1] == 1) + + + + +class CopyStrategy(ABC): + + @abstractmethod + def applicable(self, copy_context: CopyContext) -> bool: + """ + Return True if this strategy can handle the given memory copy. + """ + raise NotImplementedError('Abstract class') + + @abstractmethod + def generate_copy(self, copy_context: CopyContext) -> None: + """ + Generates the copy code for the supported pattern. + """ + raise NotImplementedError('Abstract class') + + +class OutOfKernelCopyStrategy(CopyStrategy): + + def applicable(self, copy_context: CopyContext) -> bool: + """ + Determines whether the data movement is a host<->device memory copy. 
+ + This function returns True if: + - We are not currently generating kernel code + - The copy occurs between two AccessNodes + - The storage types involve a CPU and a GPU (but not CPU-to-CPU or GPU-to-GPU) + + This check is used to detect and handle transfers between host and device memory spaces. + """ + + # TODO: I don't understand why all of these conditions are needed, look into it + + cpu_storage_types = [StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned] + not_in_kernel_code = not ExperimentalCUDACodeGen._in_kernel_code + + is_between_access_nodes = ( + isinstance(copy_context.src_node, nodes.AccessNode) and + isinstance(copy_context.dst_node, nodes.AccessNode) + ) + + + involves_gpu_or_pinned = ( + copy_context.src_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned) or + copy_context.dst_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned) + ) + + is_not_cpu_to_cpu = not ( + copy_context.src_storage in cpu_storage_types and + copy_context.dst_storage in cpu_storage_types + ) + + is_gpu_host_copy = ( + not_in_kernel_code and + is_between_access_nodes and + involves_gpu_or_pinned and + is_not_cpu_to_cpu + ) + + return is_gpu_host_copy + + def generate_copy(self, copy_context: CopyContext) -> None: + """Execute host-device copy with CUDA memory operations""" + + + num_dims = copy_context.num_dims + + if num_dims == 1: + self._generate_1d_copy(copy_context) + elif num_dims == 2: + self._generate_2d_copy(copy_context) + elif num_dims > 2: + self._generate_nd_copy(copy_context) + else: # num_dims = 0 + raise NotImplementedError( + f"ExternalCudaCopyStrategy does not support memory copies with {num_dims} dimensions " + f"(copy shape: {copy_context.copy_shape}). " + ) + + def _generate_1d_copy(self, copy_context: CopyContext) -> None: + """ + Emits code for a 1D memory copy between host and device using GPU backend. + Uses {backend}MemcpyAsync for contiguous memory and uses {backend}Memcpy2DAsync + for strided memory copies. 
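+
+        Illustrative emitted calls (editorial sketch; pointer expressions, copy kind
+        and stream are taken from the copy context):
+
+            DACE_GPU_CHECK(cudaMemcpyAsync(dst, src, N * sizeof(double),
+                                           cudaMemcpyHostToDevice, stream));
+            DACE_GPU_CHECK(cudaMemcpy2DAsync(dst, dst_stride * sizeof(double),
+                                             src, src_stride * sizeof(double),
+                                             sizeof(double), N,
+                                             cudaMemcpyHostToDevice, stream));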
+ """ + + # ----------- Extract relevant copy parameters -------------- + copy_shape, src_strides, dst_strides= copy_context.get_transfer_layout() + + backend, src_expr, dst_expr, src_location, dst_location, cudastream, ctype = \ + copy_context.get_copy_call_parameters() + + # ----------------- Generate backend call -------------------- + if copy_context.is_contiguous_copy(): + # Memory is linear: can use {backend}MemcpyAsync + num_bytes = f'{product(copy_shape)} * sizeof({ctype})' + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}MemcpyAsync({dst_expr}, {src_expr}, {num_bytes}, {kind}, {cudastream}));\n' + + else: + # Memory is strided: use {backend}Memcpy2DAsync with dpitch/spitch + # This allows copying a strided 1D region + dpitch = f'{dst_strides[0]} * sizeof({ctype})' + spitch = f'{src_strides[0]} * sizeof({ctype})' + width = f'sizeof({ctype})' + height = copy_shape[0] + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {cudastream}));\n' + + # ----------------- Write copy call to code stream -------------------- + callsite_stream, cfg, state_id, src_node, dst_node = copy_context.get_write_context() + callsite_stream.write(call, cfg, state_id, [src_node, dst_node]) + + def _generate_2d_copy(self, copy_context: CopyContext) -> None: + """Generates code for a 2D copy, falling back to 1D flattening if applicable.""" + + # ----------- Extract relevant copy parameters -------------- + copy_shape, src_strides, dst_strides= copy_context.get_transfer_layout() + + backend, src_expr, dst_expr, src_location, dst_location, cudastream, ctype = \ + copy_context.get_copy_call_parameters() + + + # ----------------- Generate backend call if supported -------------------- + + if copy_context.is_contiguous_copy(): + dpitch = f'{dst_strides[0]} * sizeof({ctype})' + spitch = f'{src_strides[0]} * sizeof({ctype})' + width = f'{copy_shape[1]} * sizeof({ctype})' + height = f'{copy_shape[0]}' + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {cudastream}));\n' + + elif src_strides[-1] == 1 or dst_strides[-1] == 1: + # TODO: Checks this, I am not sure but the old code and its description + # seems to be more complicated here than necessary.. 
# But worth to mention: we essentially flatten the copy into an element-wide 2D transfer
+
+            # NOTE: Special case of contiguous copy
+            # Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J]
+            # with copy shape [I, J] and strides [J*K, K], [J, 1]
+
+            dpitch = f'{dst_strides[1]} * sizeof({ctype})'
+            spitch = f'{src_strides[1]} * sizeof({ctype})'
+            width = f'sizeof({ctype})'
+            height = copy_shape[0] * copy_shape[1]
+            kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+            call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {cudastream}));\n'
+
+        else:
+            raise NotImplementedError('2D copy only supported with one stride')
+
+
+        # ----------------- Write copy call to code stream --------------------
+        callsite_stream, cfg, state_id, src_node, dst_node = copy_context.get_write_context()
+        callsite_stream.write(call, cfg, state_id, [src_node, dst_node])
+
+    def _generate_nd_copy(self, copy_context: CopyContext) -> None:
+
+        # ----------- Guard for unsupported pattern --------------
+        if not copy_context.is_contiguous_copy():
+            raise NotImplementedError(
+                "Strided GPU memory copies for N-dimensional arrays are not currently supported.\n"
+                f"  Source node: {copy_context.src_node} (storage: {copy_context.src_storage})\n"
+                f"  Destination node: {copy_context.dst_node} (storage: {copy_context.dst_storage})\n"
+                f"  Source strides: {copy_context.src_strides}\n"
+                f"  Destination strides: {copy_context.dst_strides}\n"
+            )
+
+        # ----------- Extract relevant copy parameters --------------
+        copy_shape, src_strides, dst_strides = copy_context.get_transfer_layout()
+
+        backend, src_expr, dst_expr, src_location, dst_location, cudastream, ctype = \
+            copy_context.get_copy_call_parameters()
+
+        num_dims = copy_context.num_dims
+        # ----------------- Generate and write backend call(s) --------------------
+
+        callsite_stream, cfg, state_id, src_node, dst_node = copy_context.get_write_context()
+
+        # Write for-loop headers
+        for dim in range(num_dims - 2):
+            callsite_stream.write(
+                f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{")
+
+        # Write Memcpy2DAsync
+        offset_src = ' + '.join(f'(__copyidx{d} * ({s}))' for d, s in enumerate(src_strides[:-2]))
+        offset_dst = ' + '.join(f'(__copyidx{d} * ({s}))' for d, s in enumerate(dst_strides[:-2]))
+
+        src = f'{src_expr} + {offset_src}'
+        dst = f'{dst_expr} + {offset_dst}'
+
+        dpitch = f'{dst_strides[-2]} * sizeof({ctype})'
+        spitch = f'{src_strides[-2]} * sizeof({ctype})'
+        width = f'{copy_shape[-1]} * sizeof({ctype})'
+        height = copy_shape[-2]
+        kind = f'{backend}Memcpy{src_location}To{dst_location}'
+
+        # Generate call and write it
+        call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dpitch}, {src}, {spitch}, {width}, {height}, {kind}, {cudastream}));\n'
+        callsite_stream.write(call, cfg, state_id, [src_node, dst_node])
+
+        # Write for-loop footers
+        for d in range(num_dims - 2):
+            callsite_stream.write("}")
+
diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/new_cuda_codegen/experimental_cuda.py
similarity index 74%
rename from dace/codegen/targets/experimental_cuda.py
rename to dace/codegen/targets/new_cuda_codegen/experimental_cuda.py
index edadc144b2..11cc955d63 100644
--- a/dace/codegen/targets/experimental_cuda.py
+++ b/dace/codegen/targets/new_cuda_codegen/experimental_cuda.py
@@ -1,7 +1,7 @@
 import ctypes
 import functools
 import warnings
-from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, 
Set, Tuple, Union, Any import networkx as nx import sympy @@ -30,6 +30,7 @@ from dace.transformation import helpers from dace.transformation.passes import analysis as ap + if TYPE_CHECKING: from dace.codegen.targets.framecode import DaCeCodeGenerator from dace.codegen.targets.cpu import CPUCodeGen @@ -37,7 +38,6 @@ # TODO's: -# 1. Yakup: Approval of dtypes extensions (e.g. mapping default sub scope scheduleTypes) # 2. Berkay: Include constant expresssions # 3. Berkay: Warning if sync property in maps is used # 4. Berkay: Warning/Error that GPU_device must be used before other GPU schedule types @@ -45,15 +45,12 @@ - - @registry.autoregister_params(name='experimental_cuda') class ExperimentalCUDACodeGen(TargetCodeGenerator): """ Experimental CUDA code generator.""" target_name = 'experimental_cuda' title = 'CUDA' - ######################## Initilization and Preprocessing related start ######################################################### def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): @@ -62,11 +59,10 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._dispatcher: TargetDispatcher= frame_codegen.dispatcher # responsible for dispatching code generation to the appropriate target - ExperimentalCUDACodeGen._in_kernel_code = False # TODO: Isn't this double? + ExperimentalCUDACodeGen._in_kernel_code = False self._cpu_codegen: Optional['CPUCodeGen'] = None - # NOTE: Moved from preprossessing to here self.backend: str = common.get_gpu_backend() self.language = 'cu' if self.backend == 'cuda' else 'cpp' @@ -93,15 +89,6 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._toplevel_schedule = None - self._arglists: Dict[nodes.MapEntry, Dict[str, dt.Data]] = {} - - # Keep track of current "scope entry/exit" code streams for extra - # code generation - self.scope_entry_stream = self._initcode - self.scope_exit_stream = self._exitcode - - self._cuda_streams, self._cuda_events = 0, 0 - # Positions at which to deallocate memory pool arrays self.pool_release: Dict[Tuple[SDFG, str], Tuple[SDFGState, Set[nodes.Node]]] = {} self.has_pool = False @@ -131,18 +118,6 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._dispatcher.register_copy_dispatcher(other_storage, storage, None, self) - # NOTE: Moved it here from preprocessing, I think it fits better - self._backend = common.get_gpu_backend() - self._language = 'cu' if self.backend == 'cuda' else 'cpp' - target_type = "" if self.backend == 'cuda' else self.backend - self._codeobject= CodeObject(sdfg.name + '_' + 'cuda', - '', - self._language, - ExperimentalCUDACodeGen, - 'CUDA', - target_type=target_type) - - # NOTE: # "Register illegal copies" code NOT copied from cuda.py # Behavior unclear for me yet. 
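         #
         # Editorial sketch (not part of this diff, names approximate): in the original
         # cuda.py that section builds an IllegalCopy target whose copy_memory raises,
         # and registers it for storage pairs that cannot be copied directly, roughly:
         #
         #     illegal_copy = IllegalCopy()  # TargetCodeGenerator whose copy_memory raises
         #     for cpu_storage in [dtypes.StorageType.CPU_Heap, dtypes.StorageType.CPU_ThreadLocal]:
         #         for gpu_private in [dtypes.StorageType.GPU_Shared]:
         #             self._dispatcher.register_copy_dispatcher(cpu_storage, gpu_private, None, illegal_copy)
         #             self._dispatcher.register_copy_dispatcher(gpu_private, cpu_storage, None, illegal_copy)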
@@ -151,8 +126,11 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): ################## New variables ########################## self._current_kernel_spec: Optional[KernelSpec] = None + self._cuda_stream_manager: CUDAStreamManager = CUDAStreamManager(sdfg) + + + - # NOTE: I think this is good as is def preprocess(self, sdfg: SDFG) -> None: # Find GPU<->GPU strided copies that cannot be represented by a single copy command @@ -194,9 +172,7 @@ def preprocess(self, sdfg: SDFG) -> None: except ValueError: # If transformation doesn't match, continue normally continue - # Annotate CUDA streams and events - self._cuda_streams, self._cuda_events = self._compute_cudastreams(sdfg) - + # Find points where memory should be released to the memory pool self._compute_pool_release(sdfg) @@ -204,17 +180,6 @@ def preprocess(self, sdfg: SDFG) -> None: self._frame.statestruct.append('dace::cuda::Context *gpu_context;') - # Collect all defined symbols and argument lists with one traversal - shared_transients = {} - for state, node, defined_syms in sdutil.traverse_sdfg_with_defined_symbols(sdfg, recursive=True): - if (isinstance(node, nodes.MapEntry) - and node.map.schedule == dtypes.ScheduleType.GPU_Device): # NOTE: Removed dtypes.ScheduleType.GPU_Persistent comparision - if state.parent not in shared_transients: - shared_transients[state.parent] = state.parent.shared_transients() - self._arglists[node] = state.scope_subgraph(node).arglist(defined_syms, shared_transients[state.parent]) - - - # NOTE: Used during preprocess. Seems good as is def _compute_pool_release(self, top_sdfg: SDFG): """ Computes positions in the code generator where a memory pool array is no longer used and @@ -304,152 +269,6 @@ def _compute_pool_release(self, top_sdfg: SDFG): self.pool_release[(sdfg, arr)] = (sink, set()) - # NOTE: SHould be a transformation to some part - def _compute_cudastreams(self, sdfg: SDFG, default_stream=0, default_event=0): - """ Annotates an SDFG (and all nested ones) to include a `_cuda_stream` - field. This field is applied to all GPU maps, tasklets, and copies - that can be executed in parallel. - - :param sdfg: The sdfg to modify. - :param default_stream: The stream ID to start counting from (used - in recursion to nested SDFGs). - :param default_event: The event ID to start counting from (used - in recursion to nested SDFGs). - :return: 2-tuple of the number of streams, events to create. - """ - concurrent_streams = int(Config.get('compiler', 'cuda', 'max_concurrent_streams')) - if concurrent_streams < 0: - return 0, 0 - - def increment(streams): - if concurrent_streams > 0: - return (streams + 1) % concurrent_streams - return streams + 1 - - state_streams = [] - state_subsdfg_events = [] - - for state in sdfg.states(): - # Start by annotating source nodes - source_nodes = state.source_nodes() - - # Concurrency can only be found in each state - max_streams = default_stream - max_events = default_event - - for i, node in enumerate(source_nodes): - if isinstance(node, nodes.AccessNode): - continue - if isinstance(node, nodes.NestedSDFG): - if node.schedule == dtypes.ScheduleType.GPU_Device: - continue - if node.schedule not in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: - max_streams, max_events = self._compute_cudastreams(node.sdfg, max_streams, max_events + 1) - node._cuda_stream = max_streams - node._cs_childpath = False - max_streams = increment(max_streams) - - # Maintain the same CUDA stream in DFS order, add more when - # possible. 
- # NOTE: Either all have an attribute or none, tell yakup if you see stuff like this - for e in state.dfs_edges(source_nodes): - if hasattr(e.dst, '_cuda_stream'): - continue - if hasattr(e.src, '_cuda_stream'): - c = e.src._cuda_stream - - if (isinstance(e.dst, nodes.AccessNode) and isinstance(sdfg.arrays[e.dst.data], dt.View)): - # Skip views - e.dst._cuda_stream = c - e.dst._cs_childpath = False - continue - - if e.src._cs_childpath == True: - c = max_streams - max_streams = increment(max_streams) - e.src._cs_childpath = True - - # Do not create multiple streams within GPU scopes - if (isinstance(e.src, nodes.EntryNode) and e.src.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN): - e.src._cs_childpath = False - elif state.entry_node(e.src) is not None: - parent = state.entry_node(e.src) - if parent.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: - e.src._cs_childpath = False - else: - c = max_streams - if (isinstance(e.dst, nodes.AccessNode) and isinstance(sdfg.arrays[e.dst.data], dt.View)): - # Skip views - pass - else: - max_streams = increment(max_streams) - e.dst._cuda_stream = c - if not hasattr(e.dst, '_cs_childpath'): - e.dst._cs_childpath = False - if isinstance(e.dst, nodes.NestedSDFG): - if e.dst.schedule not in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: - max_streams, max_events = self._compute_cudastreams(e.dst.sdfg, e.dst._cuda_stream, - max_events + 1) - - state_streams.append(max_streams if concurrent_streams == 0 else concurrent_streams) - state_subsdfg_events.append(max_events) - - # Remove CUDA streams from paths of non-gpu copies and CPU tasklets - for node, graph in sdfg.all_nodes_recursive(): - if isinstance(graph, SDFGState): - cur_sdfg = graph.parent - - if (isinstance(node, (nodes.EntryNode, nodes.ExitNode)) and node.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN): - # Node must have GPU stream, remove childpath and continue - if hasattr(node, '_cs_childpath'): - delattr(node, '_cs_childpath') - continue - - for e in graph.all_edges(node): - path = graph.memlet_path(e) - # If leading from/to a GPU memory node, keep stream - if ((isinstance(path[0].src, nodes.AccessNode) - and path[0].src.desc(cur_sdfg).storage == dtypes.StorageType.GPU_Global) - or (isinstance(path[-1].dst, nodes.AccessNode) - and path[-1].dst.desc(cur_sdfg).storage == dtypes.StorageType.GPU_Global)): - break - # If leading from/to a GPU tasklet, keep stream - if ((isinstance(path[0].src, nodes.CodeNode) and is_devicelevel_gpu(cur_sdfg, graph, path[0].src)) - or (isinstance(path[-1].dst, nodes.CodeNode) - and is_devicelevel_gpu(cur_sdfg, graph, path[-1].dst))): - break - else: # If we did not break, we do not need a CUDA stream - if hasattr(node, '_cuda_stream'): - delattr(node, '_cuda_stream') - # In any case, remove childpath - if hasattr(node, '_cs_childpath'): - delattr(node, '_cs_childpath') - - # Compute maximal number of events by counting edges (within the same - # state) that point from one stream to another - state_events = [] - for i, state in enumerate(sdfg.states()): - events = state_subsdfg_events[i] - - for e in state.edges(): - if hasattr(e.src, '_cuda_stream'): - # If there are two or more CUDA streams involved in this - # edge, or the destination is unrelated to CUDA - if (not hasattr(e.dst, '_cuda_stream') or e.src._cuda_stream != e.dst._cuda_stream): - for mpe in state.memlet_path(e): - mpe._cuda_event = events - events += 1 - - state_events.append(events) - - # Maximum over all states - max_streams = max(state_streams) - max_events = 
max(state_events) - - return max_streams, max_events - - ######################## Initilization and Preprocessing related end ######################################################### - @property def has_initializer(self) -> bool: return True @@ -1034,7 +853,6 @@ def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: # testing phase - def generate_state(self, sdfg: SDFG, cfg: ControlFlowRegion, @@ -1066,43 +884,32 @@ def _emit_sync(self, codestream: CodeIOStream): codestream.write('''DACE_GPU_CHECK({backend}GetLastError()); DACE_GPU_CHECK({backend}DeviceSynchronize());'''.format(backend=self.backend)) - def _begin_streams(self, sdfg, state): - result = set() - for node in state.source_nodes(): - if hasattr(node, '_cuda_stream'): - if (isinstance(node, nodes.AccessNode) and isinstance(sdfg.arrays[node.data], dt.View)): - continue - result.add(node._cuda_stream) - else: - # Collect other streams in state start - for e in state.out_edges(node): - if hasattr(e.dst, '_cuda_stream'): - if (isinstance(node, nodes.AccessNode) and isinstance(sdfg.arrays[node.data], dt.View)): - continue - result.add(e.dst._cuda_stream) - return result - def state_dispatch_predicate(self, sdfg, state): - if self._toplevel_schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: - return True - for node in state.sink_nodes(): - if hasattr(node, '_cuda_stream'): - return True - else: - for e in state.in_edges(node): - if hasattr(e.src, '_cuda_stream'): - return True - for s, _ in self.pool_release.values(): - if s is state: - return True - return False + """ + Determines whether a state should be handled by this + code generator (`ExperimentalCUDACodeGen`). + + Returns True if the generator is currently generating kernel code. + """ + return ExperimentalCUDACodeGen._in_kernel_code def node_dispatch_predicate(self, sdfg, state, node): - if hasattr(node, 'schedule'): # NOTE: Works on nodes and scopes - if node.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: - return True + """ + Determines whether a node should be handled by this + code generator (`ExperimentalCUDACodeGen`). + + Returns True if: + - The node has a GPU schedule handled by this backend, or + - The generator is currently generating kernel code. 
+ """ + schedule = getattr(node, 'schedule', None) + + if schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: + return True + if ExperimentalCUDACodeGen._in_kernel_code: return True + return False def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node, @@ -1148,8 +955,6 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub self._toplevel_schedule = old_schedule - - ####################################################################### # Rather Minor "actual" changes, but much nicer to extend and maintain @@ -1575,8 +1380,8 @@ def get_generated_codeobjects(self): other_globalcode=self._globalcode.getvalue(), localcode=self._localcode.getvalue(), file_header=fileheader.getvalue(), - nstreams=max(1, self._cuda_streams), - nevents=max(1, self._cuda_events), + nstreams=max(1, self._cuda_stream_manager.cuda_streams), + nevents=max(1, self._cuda_stream_manager.cuda_events), backend=self.backend, backend_header=backend_header, pool_header=pool_header, @@ -1631,322 +1436,22 @@ def process_out_memlets(self, *args, **kwargs): # Call CPU implementation with this code generator as callback self._cpu_codegen.process_out_memlets(*args, codegen=self, **kwargs) - def _emit_copy(self, state_id: int, src_node: nodes.Node, src_storage: dtypes.StorageType, dst_node: nodes.Node, - dst_storage: dtypes.StorageType, dst_schedule: dtypes.ScheduleType, - edge: Tuple[nodes.Node, str, nodes.Node, str, Memlet], sdfg: SDFG, cfg: ControlFlowRegion, - dfg: StateSubgraphView, callsite_stream: CodeIOStream) -> None: - u, uconn, v, vconn, memlet = edge - state_dfg = cfg.state(state_id) - - cpu_storage_types = [ - dtypes.StorageType.CPU_Heap, dtypes.StorageType.CPU_ThreadLocal, dtypes.StorageType.CPU_Pinned - ] - gpu_storage_types = [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared] - - copy_shape = memlet.subset.bounding_box_size() - copy_shape = [symbolic.overapproximate(s) for s in copy_shape] - # Determine directionality - if (isinstance(src_node, nodes.AccessNode) and memlet.data == src_node.data): - outgoing_memlet = True - elif (isinstance(dst_node, nodes.AccessNode) and memlet.data == dst_node.data): - outgoing_memlet = False - else: - raise LookupError('Memlet does not point to any of the nodes') - - if (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode) - and not ExperimentalCUDACodeGen._in_kernel_code - and (src_storage in [dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned] - or dst_storage in [dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned]) - and not (src_storage in cpu_storage_types and dst_storage in cpu_storage_types)): - src_location = 'Device' if src_storage == dtypes.StorageType.GPU_Global else 'Host' - dst_location = 'Device' if dst_storage == dtypes.StorageType.GPU_Global else 'Host' - - # Corner case: A stream is writing to an array - if (isinstance(sdfg.arrays[src_node.data], dt.Stream) and isinstance(sdfg.arrays[dst_node.data], - (dt.Scalar, dt.Array))): - return # Do nothing (handled by ArrayStreamView) - - syncwith = {} # Dictionary of {stream: event} - is_sync = False - max_streams = int(Config.get('compiler', 'cuda', 'max_concurrent_streams')) - - if hasattr(src_node, '_cuda_stream'): - cudastream = src_node._cuda_stream - if not hasattr(dst_node, '_cuda_stream'): - # Copy after which data is needed by the host - is_sync = True - elif dst_node._cuda_stream != src_node._cuda_stream: - syncwith[dst_node._cuda_stream] = getattr(edge, 
'_cuda_event', None) - else: - pass # Otherwise, no need to synchronize - elif hasattr(dst_node, '_cuda_stream'): - cudastream = dst_node._cuda_stream - else: - if max_streams >= 0: - print('WARNING: Undefined stream, reverting to default') - if dst_location == 'Host': - is_sync = True - cudastream = 'nullptr' - - # Handle case of impending kernel/tasklet on another stream - if max_streams >= 0: - for e in state_dfg.out_edges(dst_node): - if isinstance(e.dst, nodes.AccessNode): - continue - if not hasattr(e.dst, '_cuda_stream'): - is_sync = True - elif not hasattr(e, '_cuda_event'): - is_sync = True - elif e.dst._cuda_stream != cudastream: - syncwith[e.dst._cuda_stream] = e._cuda_event - - if cudastream != 'nullptr': - cudastream = '__state->gpu_context->streams[%d]' % cudastream - - if memlet.wcr is not None: - raise NotImplementedError('Accumulate %s to %s not implemented' % (src_location, dst_location)) - ############################# - - # Obtain copy information - copy_shape, src_strides, dst_strides, src_expr, dst_expr = (memlet_copy_to_absolute_strides( - self._dispatcher, sdfg, state_dfg, edge, src_node, dst_node, self._cpu_codegen._packed_types)) - dims = len(copy_shape) - - dtype = dst_node.desc(sdfg).dtype - - # Handle unsupported copy types - if dims == 2 and (src_strides[-1] != 1 or dst_strides[-1] != 1): - # NOTE: Special case of continuous copy - # Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J] - # with copy shape [I, J] and strides [J*K, K], [J, 1] - try: - is_src_cont = src_strides[0] / src_strides[1] == copy_shape[1] - is_dst_cont = dst_strides[0] / dst_strides[1] == copy_shape[1] - except (TypeError, ValueError): - is_src_cont = False - is_dst_cont = False - if is_src_cont and is_dst_cont: - dims = 1 - copy_shape = [copy_shape[0] * copy_shape[1]] - src_strides = [src_strides[1]] - dst_strides = [dst_strides[1]] - else: - raise NotImplementedError('2D copy only supported with one stride') - - # Currently we only support ND copies when they can be represented - # as a 1D copy or as a 2D strided copy - if dims > 2: - if src_strides[-1] != 1 or dst_strides[-1] != 1: - raise NotImplementedError( - 'GPU copies are not supported for N-dimensions if they cannot be represented by a strided copy\n' - f' Nodes: src {src_node} ({src_storage}), dst {dst_node}({dst_storage})\n' - f' Strides: src {src_strides}, dst {dst_strides}') - else: - # Write for-loop headers - for d in range(dims - 2): - callsite_stream.write(f"for (int __copyidx{d} = 0; " - f"__copyidx{d} < {copy_shape[d]};" - f"++__copyidx{d}) {{") - # Write Memcopy2DAsync - current_src_expr = src_expr + " + " + " + ".join( - ["(__copyidx{} * ({}))".format(d, sym2cpp(s)) for d, s in enumerate(src_strides[:-2])]) - current_dst_expr = dst_expr + " + " + "+ ".join( - ["(__copyidx{} * ({}))".format(d, sym2cpp(s)) for d, s in enumerate(dst_strides[:-2])]) - callsite_stream.write( - 'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' % - (self.backend, current_dst_expr, - symbolic_to_cpp(dst_strides[-2]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, current_src_expr, - sym2cpp(src_strides[-2]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype, - sym2cpp(copy_shape[-1]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, - sym2cpp(copy_shape[-2]), self.backend, src_location, dst_location, cudastream), cfg, state_id, - [src_node, dst_node]) - # Write for-loop footers - for d in range(dims - 2): - callsite_stream.write("}") - - if dims == 1 and not (src_strides[-1] != 1 or dst_strides[-1] != 1): 
- copysize = ' * '.join(symbolic_to_cpp(copy_shape)) - array_length = copysize - copysize += ' * sizeof(%s)' % dtype.ctype - - callsite_stream.write( - 'DACE_GPU_CHECK(%sMemcpyAsync(%s, %s, %s, %sMemcpy%sTo%s, %s));\n' % - (self.backend, dst_expr, src_expr, copysize, self.backend, src_location, dst_location, cudastream), - cfg, state_id, [src_node, dst_node]) - node_dtype = dst_node.desc(sdfg).dtype - if issubclass(node_dtype.type, ctypes.Structure): - callsite_stream.write('for (size_t __idx = 0; __idx < {arrlen}; ++__idx) ' - '{{'.format(arrlen=array_length)) - # TODO: Study further when tackling Structures on GPU. - for field_name, field_type in node_dtype._typeclass.fields.items(): - if isinstance(field_type, dtypes.pointer): - tclass = field_type.type - - length = node_dtype._typeclass._length[field_name] - size = 'sizeof({})*{}[__idx].{}'.format(dtypes._CTYPES[tclass], str(src_node), length) - callsite_stream.write('DACE_GPU_CHECK({backend}Malloc(&{dst}[__idx].{fname}, ' - '{sz}));'.format(dst=str(dst_node), - fname=field_name, - sz=size, - backend=self.backend)) - callsite_stream.write( - 'DACE_GPU_CHECK({backend}MemcpyAsync({dst}[__idx].{fname}, ' - '{src}[__idx].{fname}, {sz}, ' - '{backend}Memcpy{sloc}To{dloc}, {stream}));'.format(dst=str(dst_node), - src=str(src_node), - fname=field_name, - sz=size, - sloc=src_location, - dloc=dst_location, - stream=cudastream, - backend=self.backend), cfg, - state_id, [src_node, dst_node]) - callsite_stream.write('}') - elif dims == 1 and ((src_strides[-1] != 1 or dst_strides[-1] != 1)): - callsite_stream.write( - 'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' % - (self.backend, dst_expr, symbolic_to_cpp(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, - src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype, - 'sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp( - copy_shape[0]), self.backend, src_location, dst_location, cudastream), cfg, state_id, - [src_node, dst_node]) - elif dims == 2: - callsite_stream.write( - 'DACE_GPU_CHECK(%sMemcpy2DAsync(%s, %s, %s, %s, %s, %s, %sMemcpy%sTo%s, %s));\n' % - (self.backend, dst_expr, symbolic_to_cpp(dst_strides[0]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, - src_expr, sym2cpp(src_strides[0]) + ' * sizeof(%s)' % src_node.desc(sdfg).dtype.ctype, - sym2cpp(copy_shape[1]) + ' * sizeof(%s)' % dst_node.desc(sdfg).dtype.ctype, sym2cpp( - copy_shape[0]), self.backend, src_location, dst_location, cudastream), cfg, state_id, - [src_node, dst_node]) - - # Post-copy synchronization - if is_sync: - # Synchronize with host (done at destination) - pass - else: - # Synchronize with other streams as necessary - for streamid, event in syncwith.items(): - syncstream = '__state->gpu_context->streams[%d]' % streamid - callsite_stream.write( - ''' - DACE_GPU_CHECK({backend}EventRecord(__state->gpu_context->events[{ev}], {src_stream})); - DACE_GPU_CHECK({backend}StreamWaitEvent({dst_stream}, __state->gpu_context->events[{ev}], 0)); - '''.format(ev=event, src_stream=cudastream, dst_stream=syncstream, backend=self.backend), cfg, - state_id, [src_node, dst_node]) - - self._emit_sync(callsite_stream) - - # Copy within the GPU - elif (src_storage in gpu_storage_types and dst_storage in gpu_storage_types): - - state_dfg = cfg.state(state_id) - sdict = state_dfg.scope_dict() - schedule_node = src_node - if scope_contains_scope(sdict, src_node, dst_node): - schedule_node = dst_node - - state = state_dfg - while (schedule_node is None 
or not isinstance(schedule_node, nodes.MapEntry) - or schedule_node.map.schedule == dtypes.ScheduleType.Sequential): - ret = helpers.get_parent_map(state, schedule_node) - if ret is None: - schedule_node = None - break - schedule_node, state = ret - - if schedule_node is None: - inner_schedule = dtypes.SCOPEDEFAULT_SCHEDULE[None] - else: - inner_schedule = schedule_node.map.schedule - - # Collaborative load - if inner_schedule == dtypes.ScheduleType.GPU_Device: - # Obtain copy information - copy_shape, src_strides, dst_strides, src_expr, dst_expr = (memlet_copy_to_absolute_strides( - self._dispatcher, sdfg, state, edge, src_node, dst_node, self._cpu_codegen._packed_types)) - - dims = len(copy_shape) - - funcname = 'dace::%sTo%s%dD' % (_get_storagename(src_storage), _get_storagename(dst_storage), dims) - self._scope_has_collaborative_copy = True - accum = '' - custom_reduction = [] - if memlet.wcr is not None: - redtype = operations.detect_reduction_type(memlet.wcr) - reduction_tmpl = '' - # Special call for detected reduction types - if redtype != dtypes.ReductionType.Custom: - credtype = ('dace::ReductionType::' + str(redtype)[str(redtype).find('.') + 1:]) - reduction_tmpl = '<%s>' % credtype - else: - dtype = dst_node.desc(sdfg).dtype - custom_reduction = [unparse_cr(sdfg, memlet.wcr, dtype)] - accum = '::template Accum%s' % reduction_tmpl - - if any(symbolic.issymbolic(s, sdfg.constants) for s in copy_shape): - callsite_stream.write((' {func}Dynamic<{type}, {bdims}, {is_async}>{accum}({args});').format( - func=funcname, - type=dst_node.desc(sdfg).dtype.ctype, - bdims=', '.join(symbolic_to_cpp(self._block_dims)), - is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false', - accum=accum, - args=', '.join([src_expr] + symbolic_to_cpp(src_strides) + [dst_expr] + custom_reduction + - symbolic_to_cpp(dst_strides) + symbolic_to_cpp(copy_shape))), cfg, state_id, [src_node, dst_node]) - elif funcname == 'dace::SharedToGlobal1D': - # special case: use a new template struct that provides functions for copy and reduction - callsite_stream.write( - (' {func}<{type}, {bdims}, {copysize}, {is_async}>{accum}({args});').format( - func=funcname, - type=dst_node.desc(sdfg).dtype.ctype, - bdims=', '.join(symbolic_to_cpp(self._block_dims)), - copysize=', '.join(symbolic_to_cpp(copy_shape)), - is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false', - accum=accum or '::Copy', - args=', '.join([src_expr] + symbolic_to_cpp(src_strides) + [dst_expr] + symbolic_to_cpp(dst_strides) + - custom_reduction)), cfg, state_id, [src_node, dst_node]) - else: - callsite_stream.write( - (' {func}<{type}, {bdims}, {copysize}, ' + - '{dststrides}, {is_async}>{accum}({args});').format( - func=funcname, - type=dst_node.desc(sdfg).dtype.ctype, - bdims=', '.join(symbolic_to_cpp(self._block_dims)), - copysize=', '.join(symbolic_to_cpp(copy_shape)), - dststrides=', '.join(symbolic_to_cpp(dst_strides)), - is_async='true' if state_dfg.out_degree(dst_node) == 0 else 'false', - accum=accum, - args=', '.join([src_expr] + symbolic_to_cpp(src_strides) + [dst_expr] + custom_reduction)), cfg, - state_id, [src_node, dst_node]) - # Per-thread load (same as CPU copies) - else: - self._cpu_codegen.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream) - else: - self._cpu_codegen.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream) - def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, src_node: 
Union[nodes.Tasklet, nodes.AccessNode], dst_node: Union[nodes.CodeNode, nodes.AccessNode], - memlet: Memlet, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - state = cfg.state(state_id) - if isinstance(src_node, nodes.Tasklet): - src_storage = dtypes.StorageType.Register - src_parent = state.entry_node(src_node) - dst_schedule = None if src_parent is None else src_parent.map.schedule - else: - src_storage = src_node.desc(sdfg).storage + edge: Tuple[nodes.Node, str, nodes.Node, str, Memlet], + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + + from dace.codegen.targets.new_cuda_codegen.copy_strategies import CopyContext, OutOfKernelCopyStrategy + + context = CopyContext(self, self._cuda_stream_manager, state_id, src_node, dst_node, edge, + sdfg, cfg, dfg, callsite_stream) + + strategy = OutOfKernelCopyStrategy() - if isinstance(dst_node, nodes.Tasklet): - dst_storage = dtypes.StorageType.Register + if strategy.applicable(context): + strategy.generate_copy(context) else: - dst_storage = dst_node.desc(sdfg).storage - - dst_parent = state.entry_node(dst_node) - dst_schedule = None if dst_parent is None else dst_parent.map.schedule - - # Emit actual copy - self._emit_copy(state_id, src_node, src_storage, dst_node, dst_storage, dst_schedule, memlet, sdfg, cfg, dfg, - callsite_stream) - - + self._cpu_codegen.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream) ######################################################################### # helper functions from old CUDACodeGen @@ -1957,21 +1462,12 @@ def symbolic_to_cpp(arr): return cppunparse.pyexpr2cpp(symbolic.symstr(arr, cpp_mode=True)) return [cppunparse.pyexpr2cpp(symbolic.symstr(d, cpp_mode=True)) for d in arr] - def _get_cuda_dim(idx): """ Converts 0 to x, 1 to y, 2 to z, or raises an exception. """ if idx < 0 or idx > 2: raise ValueError('idx must be between 0 and 2, got %d' % idx) return ('x', 'y', 'z')[idx] - -def _get_storagename(storage): - """ Returns a string containing the name of the storage location. - Example: dtypes.StorageType.GPU_Shared will return "Shared". """ - sname = str(storage) - return sname[sname.rindex('_') + 1:] - - def product(iterable): """ Computes the symbolic product of elements in the iterable using sympy.Mul. @@ -1985,7 +1481,6 @@ def product(iterable): ######################################################################### # Functions I had to redefine locally to not modify other files and ensure backwards compatibility - def ptr(name: str, desc: dace.data.Data, sdfg: SDFG = None, framecode=None) -> str: """ Returns a string that points to the data based on its name and descriptor. @@ -2023,12 +1518,11 @@ def ptr(name: str, desc: dace.data.Data, sdfg: SDFG = None, framecode=None) -> s return name - - ######################################################################### # helper class + class KernelSpec: """ A helper class to encapsulate information required for working with kernels. 
@@ -2049,7 +1543,13 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro self._kernel_name: str = '%s_%d_%d_%d' % (kernel_entry_node.map.label, cfg.cfg_id, state.block_id, state.node_id(kernel_entry_node)) # Kernel arguments - self._args: Dict = cudaCodeGen._arglists[kernel_entry_node] + arglist = {} + for state, node, defined_syms in sdutil.traverse_sdfg_with_defined_symbols(sdfg, recursive=True): + if node is kernel_entry_node: + shared_transients = state.parent.shared_transients() + arglist = state.scope_subgraph(node).arglist(defined_syms, shared_transients) + break + self._args: Dict = arglist self._args_typed: list[str] = [adata.as_arg(name=aname) for aname, adata in self._args.items()] self._args_as_input: list[str] = [ptr(aname, adata, sdfg, cudaCodeGen._frame) for aname, adata in self._args.items()] @@ -2520,5 +2020,28 @@ def open(self, condition: str = None): +class CUDAStreamManager: + + def __init__(self, sdfg: SDFG): + + self.cuda_streams = 0 + self.cuda_events = 0 + + + def get_stream_node(self, node: nodes.Node) -> Any: + """ + Returns the CUDA stream assigned to the given node. + Currently just uses the default (0) cudastream. + """ + return '__state->gpu_context->streams[0]' + + def get_stream_edge(self, src_node: nodes.Node, dst_node: nodes.Node) -> Any: + """ + Returns the CUDA stream assigned to the given edge. + Currently just uses the default (0) cudastream. + """ + return '__state->gpu_context->streams[0]' + + diff --git a/dace/dtypes.py b/dace/dtypes.py index d7b382b4ff..a94521f728 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -75,9 +75,6 @@ class ScheduleType(aenum.AutoNumberEnum): Snitch = () Snitch_Multicore = () FPGA_Multi_Pumped = () #: Used for double pumping - - # TODO: Aprove - # Scope introduced in ExperimentalCudaCodeGen GPU_Warp = () @@ -214,9 +211,6 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.FPGA_Device: StorageType.FPGA_Global, ScheduleType.SVE_Map: StorageType.CPU_Heap, ScheduleType.Snitch: StorageType.Snitch_TCDM, - - #TODO: Approve. - # Should be registers in my opinion ScheduleType.GPU_Warp: StorageType.Register, } @@ -239,9 +233,6 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.SVE_Map: ScheduleType.Sequential, ScheduleType.Snitch: ScheduleType.Snitch, ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore, - - #TODO: Approve. - # Usually no lower scopes ScheduleType.GPU_Warp: ScheduleType.Sequential, } From 52a6394b5df9df1660fd72f957cf84efee4bd7d4 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 2 Jun 2025 14:43:07 +0200 Subject: [PATCH 14/94] test for out of kernel memory copies --- .../out_of_kernel_memcpy_test.py | 246 +++++++++++++++--- 1 file changed, 211 insertions(+), 35 deletions(-) diff --git a/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py b/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py index 9dc9323f9f..1f156f110d 100644 --- a/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py +++ b/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py @@ -6,76 +6,252 @@ from dace.codegen import common from IPython.display import Code +""" +NOTE: +This test suite focuses on GPU memory copies that are generated outside the kernel code using DaCe and aims to +remain backend-agnostic (CUDA/HIP). While HIP support has not been verified, care was taken to ensure tests are +not backend-specific. + +Design notes: +- A small number of test cases is used intentionally to avoid redundancy while still covering a broad set of scenarios. 
+- The test set alternates between different offsets, symbolic sizes, fixed sizes and different locations of the source and destination + (GPU or CPU) to simulate common usage patterns. +- At the time of writing, the DaCe Python frontend does not correctly translate some advanced slicing patterns + (e.g., `dst[b1:e1:s1] = src[b2:e2:s2]`) into valid SDFG representations. + Therefore, such cases are implemented directly through the SDFG API for full control and correctness. +""" + BACKEND = common.get_gpu_backend() -''' + +#------------------ 1D Memory Copy Tests ----------------------- @pytest.mark.gpu def test_1d_out_of_kernel_memcpy(): """ - Test 1D out-of-kernel memcpy using DaCe and CuPy. - Verifies that device-to-device memcpy is performed. + Test simple 1D out-of-kernel memory copy. + The size of both arrays is symbolic, both are defined on + the GPU. """ - n = 100 + # Symbolic array size + N = dace.symbol('N') - @dace.program - def simple_1d_memcpy(dst: dace.uint32[n] @ dace.dtypes.StorageType.GPU_Global, - src: dace.uint32[n] @ dace.dtypes.StorageType.GPU_Global): - dst[:] = src[:] + sdfg = dace.SDFG("simple_1D_memory_copy") + state = sdfg.add_state("main") - sdfg = simple_1d_memcpy.to_sdfg() + # Access nodes + sdfg.add_array("src", (N,), dace.uint32, dace.dtypes.StorageType.GPU_Global) + sdfg.add_array("dst", (N,), dace.uint32, dace.dtypes.StorageType.GPU_Global) + src_acc = state.add_access("src") + dst_acc = state.add_access("dst") + # Create memlet/edge + state.add_edge(src_acc, None, dst_acc, None, dace.memlet.Memlet(expr='[0:N] -> dst[0:N]', volume=N)) + sdfg.fill_scope_connectors() + # Check correctness + # Initialize arrays on GPU + n = 100 src = cp.ones(n, dtype=cp.uint32) dst = cp.zeros(n, dtype=cp.uint32) # Run SDFG - sdfg(dst, src, N=n) - - # Check correctness - cp.testing.assert_array_equal(dst, src) + sdfg(src=src, dst=dst, N=n) # Check generated code for correct memcpy usage func_name = f"{BACKEND}MemcpyAsync" kind = f"{BACKEND}MemcpyDeviceToDevice" code = sdfg.generate_code()[0].code - assert func_name in code and kind in code -''' + # Check correctness + cp.testing.assert_array_equal(dst, src) @pytest.mark.gpu def test_1d_out_of_kernel_memcpy_strided(): """ - Test 1D out-of-kernel memcpy using DaCe and CuPy. - Here, the copy shape is strided and we use symbolic sizes. - Furthermore, we have a CPU to GPU copy + Test strided 1D out-of-kernel memcpy. + Here, the copy shape is strided (different strides for source and destination) + and we use fixed sizes. Src is a CPU array, dst a GPU one. 
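+
+    Expected lowering (editorial sketch): the strided 1D copy is emitted as a single
+    2D copy, roughly
+
+        cudaMemcpy2DAsync(dst, 2 * sizeof(T), src, 4 * sizeof(T),
+                          sizeof(T), 10, cudaMemcpyHostToDevice, stream);
+
+    i.e. width = one element, height = number of elements, pitches = strides in bytes.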
""" - N = dace.symbol('N') - n = 10 - @dace.program - def strided_1d_memcpy(dst: dace.uint32[2*N] @ dace.dtypes.StorageType.GPU_Global, - src: dace.uint32[4*N]): - dst[::2] = src[::4] + sdfg = dace.SDFG("strided_1D_memory_copy") + state = sdfg.add_state("main") - sdfg = strided_1d_memcpy.to_sdfg(validate=False) + # Access nodes of fixed shapes + sdfg.add_array("src", (40,), dace.uint32) + sdfg.add_array("dst", (20,), dace.uint32, dace.dtypes.StorageType.GPU_Global) + src_acc = state.add_access("src") + dst_acc = state.add_access("dst") - # Initialize arrays on GPU - src = np.ones(4*n, dtype=np.uint32) - dst = cp.zeros(2*n, dtype=cp.uint32) - - # Run SDFG - sdfg(dst, src, N=n) + # copy is of the form: src[0:40:4] -> dst[0:20:2], Volume 10 + state.add_edge(src_acc, None, dst_acc, None, dace.memlet.Memlet('[0:40:4] -> dst[0:20:2]')) + sdfg.fill_scope_connectors() # Check correctness - expected = cp.zeros(2*n, dtype=cp.uint32) - expected[::2] = 1 # since src[::4] are all ones - cp.testing.assert_array_equal(dst, expected) - # Check generated code for correct memcpy usage + # Initialize arrays + src = np.ones(40, dtype=cp.uint32) + dst = cp.zeros(20, dtype=cp.uint32) + + # Run program + sdfg(src=src, dst=dst) + + # Check generated code for expected memcpy usage + # NOTE: Memcpy2DAsync is used! Check the codegen, neat trick :) func_name = f"{BACKEND}Memcpy2DAsync" kind = f"{BACKEND}MemcpyHostToDevice" code = sdfg.generate_code()[0].code + assert func_name in code and kind in code + + #Check whether result is as expected + expected = cp.zeros(20, dtype=cp.uint32) + expected[::2] = 1 + cp.testing.assert_array_equal(expected, dst) + +#------------------ 2D Memory Copy Tests ----------------------- +@pytest.mark.gpu +def test_2d_out_of_kernel_memcpy(): + """ + Test 2D out-of-kernel memcpy. + Here, the copy shape is contigous (copy contiguous src to contigous dst), + we use fixed sizes and only copy a subset of the array. + Source is on GPU, destination an array on CPU. + """ + sdfg = dace.SDFG("simple_2D_memory_copy") + state = sdfg.add_state("main") + + # Access nodes of fixed shape (5,10) + sdfg.add_array("src", (5,10,), dace.uint32, dace.dtypes.StorageType.GPU_Global) + sdfg.add_array("dst", (5,10,), dace.uint32) + src_acc = state.add_access("src") + dst_acc = state.add_access("dst") + + # Copying only subset of src to dst, i.e. src[2:4,5:8] -> dst[2:4,5:8] + state.add_edge(src_acc, None, dst_acc, None, dace.memlet.Memlet('[2:4,5:8] -> dst[2:4,5:8]')) + sdfg.fill_scope_connectors() + + # Check correctness + # Initialize arrays + src = cp.ones((5,10), dtype=cp.uint32) + dst = np.zeros((5,10), dtype=cp.uint32) + + # Run program + sdfg(src=src, dst=dst) + + # Check generated code for expected memcpy usage + func_name = f"{BACKEND}Memcpy2DAsync" + kind = f"{BACKEND}MemcpyDeviceToHost" + code = sdfg.generate_code()[0].code assert func_name in code and kind in code + + #Check whether result is as expected + expected = np.zeros((5,10), dtype=cp.uint32) + expected[2:4, 5:8] = 1 + np.testing.assert_array_equal(dst, expected) + +@pytest.mark.gpu +def test_2d_out_of_kernel_memcpy_one_strided(): + """ + Test strided 2D out-of-kernel memcpy. + Symbolic sizes are used, stride is non-contigous + only in one access node. 
+ """ + + N = dace.symbol('N') + M = dace.symbol('M') + sdfg = dace.SDFG("one_strided_2D_memory_copy") + state = sdfg.add_state("main") + + # Access nodes + sdfg.add_array("src", (N,2*M,), dace.uint32) + sdfg.add_array("dst", (N,M,), dace.uint32, dace.dtypes.StorageType.GPU_Global) + src_acc = state.add_access("src") + dst_acc = state.add_access("dst") + + # the edge/memlet + state.add_edge(src_acc, None, dst_acc, None, dace.memlet.Memlet('[0:N,0:2*M:2] -> dst[0:N,0:M]')) + sdfg.fill_scope_connectors() + + # Check correctness + + # Initialize arrays + n = 3 + m = 10 + src = np.ones((n,2*m), dtype=cp.uint32) + dst = cp.zeros((n,m), dtype=cp.uint32) + + # Run program + sdfg(src=src, dst=dst, N=n, M=m) + + # Check generated code for expected memcpy usage + func_name = f"{BACKEND}Memcpy2DAsync" + kind = f"{BACKEND}MemcpyHostToDevice" + code = sdfg.generate_code()[0].code + assert func_name in code and kind in code + + #Check whether result is as expected + expected = cp.ones((n,m), dtype=cp.uint32) + cp.testing.assert_array_equal(dst, expected) + +@pytest.mark.gpu +def test_2d_oofkmemcpy_two_strided_fail(): + """ + Test strided 2D out-of-kernel memcpy. + This test should fail (notImplementedError). + """ + + sdfg = dace.SDFG("failing_2D_memory_copy") + state = sdfg.add_state("main") + + # Access nodes + sdfg.add_array("src", (2,20,), dace.uint32, dace.dtypes.StorageType.GPU_Global) + sdfg.add_array("dst", (2,10,), dace.uint32, dace.dtypes.StorageType.GPU_Global) + src_acc = state.add_access("src") + dst_acc = state.add_access("dst") + + # the edge/memlet + state.add_edge(src_acc, None, dst_acc, None, dace.memlet.Memlet('[0:2,0:20:10] -> dst[0:2,0:10:5]')) + sdfg.fill_scope_connectors() + + # Check correctness + + # Initialize arrays + src = cp.ones((2,20), dtype=cp.uint32) + dst = cp.zeros((2,10), dtype=cp.uint32) + + # notImplementedError should be raised + with pytest.raises(NotImplementedError): + sdfg(src=src, dst=dst) + +# ---------- Higher-Dimensional (>2D) Memory Copy Tests -------- + +@pytest.mark.gpu +def test_3d_oofkmemcpy(): + """ + Test simple 3D out-of-kernel memcpy. 
+ """ + + sdfg = dace.SDFG("simple_3D_memory_copy") + state = sdfg.add_state("main") + + # Access nodes + sdfg.add_array("src", (2,2,4), dace.uint32, dace.dtypes.StorageType.GPU_Global) + sdfg.add_array("dst", (2,2,4), dace.uint32, dace.dtypes.StorageType.GPU_Global) + src_acc = state.add_access("src") + dst_acc = state.add_access("dst") + + # the edge/memlet + state.add_edge(src_acc, None, dst_acc, None, dace.memlet.Memlet('[0:2,0:2,0:4] -> dst[0:2,0:2,0:4]')) + sdfg.fill_scope_connectors() + + # Check correctness + + # Initialize arrays + src = cp.ones((2,2,4), dtype=cp.uint32) + dst = cp.zeros((2,2,4), dtype=cp.uint32) + + # run and check + sdfg(src=src, dst=dst) + cp.testing.assert_array_equal(dst, src) \ No newline at end of file From 6ceb4e8f912c059bf1ea0897e2fab5a31c8a9b07 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 2 Jun 2025 16:20:42 +0200 Subject: [PATCH 15/94] provisional memory copy solution --- .../out_of_kernel_memcpy_test.py | 2 - .../new_cuda_codegen/copy_strategies.py | 86 +++++++++++++++++-- .../new_cuda_codegen/experimental_cuda.py | 45 +++++----- 3 files changed, 102 insertions(+), 31 deletions(-) diff --git a/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py b/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py index 1f156f110d..787b1d8b87 100644 --- a/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py +++ b/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py @@ -4,7 +4,6 @@ import pytest from dace.codegen import common -from IPython.display import Code """ NOTE: @@ -226,7 +225,6 @@ def test_2d_oofkmemcpy_two_strided_fail(): sdfg(src=src, dst=dst) # ---------- Higher-Dimensional (>2D) Memory Copy Tests -------- - @pytest.mark.gpu def test_3d_oofkmemcpy(): """ diff --git a/dace/codegen/targets/new_cuda_codegen/copy_strategies.py b/dace/codegen/targets/new_cuda_codegen/copy_strategies.py index f89e882647..eb51873a4f 100644 --- a/dace/codegen/targets/new_cuda_codegen/copy_strategies.py +++ b/dace/codegen/targets/new_cuda_codegen/copy_strategies.py @@ -16,7 +16,8 @@ from dace.codegen.targets.cpp import memlet_copy_to_absolute_strides -# TODO: Adapt documentation if src strides is None! +# TODO: Review Documentation once done here. And also, take care of the other +# two strategies below. class CopyContext: """ Stores and derives all information required for memory copy operations on GPUs. 
@@ -161,8 +162,12 @@ def is_contiguous_copy(self) -> bool: """ return (self.src_strides[-1] == 1) and (self.dst_strides[-1] == 1) + def get_memory_location(self) -> Tuple[str, str]: + src_location = 'Device' if self.src_storage == dtypes.StorageType.GPU_Global else 'Host' + dst_location = 'Device' if self.dst_storage == dtypes.StorageType.GPU_Global else 'Host' + + return src_location, dst_location - class CopyStrategy(ABC): @@ -228,20 +233,22 @@ def applicable(self, copy_context: CopyContext) -> bool: def generate_copy(self, copy_context: CopyContext) -> None: """Execute host-device copy with CUDA memory operations""" + # guard + _, _, _, _, memlet = copy_context.edge + if memlet.wcr is not None: + src_location, dst_location = copy_context.get_memory_location() + raise NotImplementedError(f'Accumulate {src_location} to {dst_location} not implemented') + # call corresponding helper function num_dims = copy_context.num_dims - if num_dims == 1: self._generate_1d_copy(copy_context) elif num_dims == 2: self._generate_2d_copy(copy_context) - elif num_dims > 2: + else: + # sanity check + assert num_dims > 2, f"Expected copy shape with more than 2 dimensions, but got {num_dims}." self._generate_nd_copy(copy_context) - else: # num_dims = 0 - raise NotImplementedError( - f"ExternalCudaCopyStrategy does not support memory copies with {num_dims} dimensions " - f"(copy shape: {copy_context.copy_shape}). " - ) def _generate_1d_copy(self, copy_context: CopyContext) -> None: """ @@ -374,3 +381,64 @@ def _generate_nd_copy(self, copy_context: CopyContext) -> None: for d in range(num_dims - 2): callsite_stream.write("}") + +################ TODO, just here out of completeness for now ############# + + +class WithinGPUCopyStrategy(CopyStrategy): + + def applicable(self, copy_context: CopyContext) -> bool: + + from dace.sdfg import scope_contains_scope + from dace.transformation import helpers + + gpu_storage_types = [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared] + cond1 = copy_context.src_storage in gpu_storage_types and copy_context.dst_storage in gpu_storage_types + + state_id = copy_context.state_id + cfg = copy_context.cfg + src_node = copy_context.src_node + dst_node = copy_context.dst_node + + state_dfg = cfg.state(state_id) + sdict = state_dfg.scope_dict() + schedule_node = copy_context.src_node + if scope_contains_scope(sdict, src_node, dst_node): + schedule_node = dst_node + + state = state_dfg + while (schedule_node is None or not isinstance(schedule_node, nodes.MapEntry) + or schedule_node.map.schedule == dtypes.ScheduleType.Sequential): + ret = helpers.get_parent_map(state, schedule_node) + if ret is None: + schedule_node = None + break + schedule_node, state = ret + + if schedule_node is None: + inner_schedule = dtypes.SCOPEDEFAULT_SCHEDULE[None] + else: + inner_schedule = schedule_node.map.schedule + + # Collaborative load + cond2 = inner_schedule == dtypes.ScheduleType.GPU_Device + + return cond1 and cond2 + + def generate_copy(self, copy_context: CopyContext) -> None: + raise NotImplementedError(f'WithinGPUCopy not yet implemented in ExperimentalCUDACodeGen') + + +class FallBackGPUCopyStrategy(CopyStrategy): + + def applicable(self, copy_context: CopyContext)-> bool: + return True + + def generate_copy(self, copy_context: CopyContext): + callsite_stream, cfg, state_id, src_node, dst_node = copy_context.get_write_context() + sdfg = copy_context.sdfg + dfg = copy_context.dfg + edge = copy_context.edge + cpu_codegen = copy_context.codegen._cpu_codegen + 
cpu_codegen.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream) + \ No newline at end of file diff --git a/dace/codegen/targets/new_cuda_codegen/experimental_cuda.py b/dace/codegen/targets/new_cuda_codegen/experimental_cuda.py index 11cc955d63..c064d2f29a 100644 --- a/dace/codegen/targets/new_cuda_codegen/experimental_cuda.py +++ b/dace/codegen/targets/new_cuda_codegen/experimental_cuda.py @@ -42,6 +42,8 @@ # 3. Berkay: Warning if sync property in maps is used # 4. Berkay: Warning/Error that GPU_device must be used before other GPU schedule types +# Ask yakup: +# How would you seperate it into several files? @@ -1231,9 +1233,8 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap - ####################################################################### - # Copy-pasted, might be changed in future - +####################################################################### +# Copy-pasted, might be changed in future def get_generated_codeobjects(self): @@ -1441,20 +1442,31 @@ def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView edge: Tuple[nodes.Node, str, nodes.Node, str, Memlet], function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - from dace.codegen.targets.new_cuda_codegen.copy_strategies import CopyContext, OutOfKernelCopyStrategy - + from dace.codegen.targets.new_cuda_codegen.copy_strategies import ( + CopyContext, CopyStrategy, OutOfKernelCopyStrategy, + WithinGPUCopyStrategy, FallBackGPUCopyStrategy) + context = CopyContext(self, self._cuda_stream_manager, state_id, src_node, dst_node, edge, - sdfg, cfg, dfg, callsite_stream) - - strategy = OutOfKernelCopyStrategy() + sdfg, cfg, dfg, callsite_stream) + + # Order matters: fallback must come last + strategies: List[CopyStrategy] = [ + OutOfKernelCopyStrategy(), + WithinGPUCopyStrategy(), + FallBackGPUCopyStrategy() + ] + + for strategy in strategies: + if strategy.applicable(context): + strategy.generate_copy(context) + return + + raise RuntimeError("No applicable GPU memory copy strategy found (this should not happen).") + - if strategy.applicable(context): - strategy.generate_copy(context) - else: - self._cpu_codegen.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream) ######################################################################### -# helper functions from old CUDACodeGen +# helper classes and functions def symbolic_to_cpp(arr): """ Converts an array of symbolic variables (or one) to C++ strings. """ @@ -1478,9 +1490,7 @@ def product(iterable): """ return functools.reduce(sympy.Mul, iterable, 1) -######################################################################### # Functions I had to redefine locally to not modify other files and ensure backwards compatibility - def ptr(name: str, desc: dace.data.Data, sdfg: SDFG = None, framecode=None) -> str: """ Returns a string that points to the data based on its name and descriptor. @@ -1518,11 +1528,6 @@ def ptr(name: str, desc: dace.data.Data, sdfg: SDFG = None, framecode=None) -> s return name -######################################################################### -# helper class - - - class KernelSpec: """ A helper class to encapsulate information required for working with kernels. 
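[Editor's note] The copy_memory change in the patch above dispatches over a fixed-order
list of copy strategies: the first strategy whose applicable() check passes emits the
copy, and the fallback strategy matches unconditionally, so it must come last. The
minimal, self-contained sketch below only illustrates that dispatch pattern; every class
and function name in it is a placeholder invented for this note and is not part of the
DaCe API.

    from abc import ABC, abstractmethod
    from typing import List


    class CopyStrategy(ABC):
        """Interface: decide applicability first, then emit the copy."""

        @abstractmethod
        def applicable(self, ctx: dict) -> bool:
            ...

        @abstractmethod
        def generate_copy(self, ctx: dict) -> None:
            ...


    class OutOfKernelCopy(CopyStrategy):
        """Handles copies issued from host code, i.e. outside any kernel."""

        def applicable(self, ctx: dict) -> bool:
            return not ctx.get("inside_kernel", False)

        def generate_copy(self, ctx: dict) -> None:
            print("emit an asynchronous backend memcpy on the edge's GPU stream")


    class FallbackCopy(CopyStrategy):
        """Catch-all strategy; must be registered last."""

        def applicable(self, ctx: dict) -> bool:
            return True

        def generate_copy(self, ctx: dict) -> None:
            print("delegate the copy to the CPU code generator")


    def dispatch_copy(ctx: dict, strategies: List[CopyStrategy]) -> None:
        # Order matters: the first applicable strategy wins; the fallback comes last.
        for strategy in strategies:
            if strategy.applicable(ctx):
                strategy.generate_copy(ctx)
                return
        raise RuntimeError("No applicable copy strategy found (should not happen).")


    dispatch_copy({"inside_kernel": False}, [OutOfKernelCopy(), FallbackCopy()])

In the actual generator, OutOfKernelCopyStrategy and WithinGPUCopyStrategy are tried
before FallBackGPUCopyStrategy, which simply forwards to the CPU code generator's
copy_memory, as the next patch wires up in ExperimentalCUDACodeGen.copy_memory.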
From 52a89c5169ea36981653bb31a93ca62b62d3aadd Mon Sep 17 00:00:00 2001 From: aydogdub Date: Wed, 4 Jun 2025 20:07:23 +0200 Subject: [PATCH 16/94] A naive alternative of a cuda stream scheduler as a pass --- .../passes/gpustream_scheduling.py | 215 ++++++++++++++++++ 1 file changed, 215 insertions(+) create mode 100644 dace/transformation/passes/gpustream_scheduling.py diff --git a/dace/transformation/passes/gpustream_scheduling.py b/dace/transformation/passes/gpustream_scheduling.py new file mode 100644 index 0000000000..76de3e351e --- /dev/null +++ b/dace/transformation/passes/gpustream_scheduling.py @@ -0,0 +1,215 @@ +from typing import Union, Dict, Set + +import dace +from dace import SDFG, properties, SDFGState +from dace import dtypes +from dace.codegen import common +from dace.config import Config +from dace.transformation import pass_pipeline as ppl, transformation +from dace.sdfg import nodes +from dace.sdfg.graph import Edge + + +@properties.make_properties +@transformation.explicit_cf_compatible +class NaiveGPUStreamScheduler(ppl.Pass): + """ + Assigns GPU streams to relevant nodes based on connected components. + Also, it adds synchronization tasklets where required. + + Strategy: + - "Relevant nodes" in connected components within a state are assigned the same stream. + - Each state (except for nested states) starts fresh with stream 0. + - States in nested SDFGs inherit the parent component's stream. + - Only nodes that are either ("relevant nodes"): + * in GPU memory (AccessNodes in GPU memory), + * GPU scheduled (e.g., maps/kernels or library nodes), + * or directly connected to such nodes, + are assigned a stream. + - GPU stream IDs wrap around based on the max_concurrent_streams config. + + Example: + A state with K1->K2, K3->K4->K5, K6 becomes: + K1,K2 → stream0 + K3,K4,K5 → stream1 + K6 → stream2 + (assuming no limit on the number of CUDA streams) + + NOTE: These are backend streams (CUDA/HIP), not DaCe streams. + """ + + # max configured number of concurrent streams + max_concurrent_streams = int(Config.get('compiler', 'cuda', 'max_concurrent_streams')) + + # needed to call correct backend synchronization functions and in correct language + backend: str = common.get_gpu_backend() + language = 'cu' if backend == 'cuda' else 'cpp' + + # This is expected to be set by the calling target codegenerator. + gpu_stream_access_template: str = "" + + def apply_pass(self, sdfg: SDFG, _) -> Dict[nodes.Node, Union[int, str]]: + """ + Assigns GPU streams and adds synchronization tasklets. + """ + + assigned_nodes = self._assign_streams_to_sdfg(sdfg) + + num_assigned_streams = max(assigned_nodes.values(), default=0) + + # If all use 0 stream or max_concurrent_stream is -1 (only default stream) + # then assign to all nodes the nullptr. + if num_assigned_streams == 0: # note: self.max_concurrent_streams == -1 implies num_assigned_streams == 0 + for k in assigned_nodes.keys(): + assigned_nodes[k] = "nullptr" + + self._add_sync_tasklet(sdfg, assigned_nodes) + + return assigned_nodes + + def _assign_streams_to_sdfg(self, sdfg: SDFG, assigned_nodes=None, visited=None) -> Dict: + """ + Traverse all SDFG states and assign streams to connected components. + Each state (exluding nested states) restarts stream assignment from 0. 
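+        Returns the dictionary mapping each relevant node to its assigned stream id.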
+ """ + if assigned_nodes is None: + assigned_nodes = dict() + if visited is None: + visited = set() + + for state in sdfg.states(): + self._assign_streams_to_state_recursively(sdfg, state, assigned_nodes, visited, 0) + + return assigned_nodes + + def _assign_streams_to_state_recursively(self, sdfg: SDFG, state: SDFGState, assigned_nodes: Dict, visited: Set, gpu_stream:int): + """ + Processes connected components in a state, assigning each to a different GPU stream, + but only if they contain GPU-related nodes (otherwise, stream assignment is skipped). + + Nested SDFGs inherit the GPU stream of their parent state/component. + """ + for source_node in state.source_nodes(): + if source_node in visited: + continue # Skip already processed components + + nodes_assigned_before = len(assigned_nodes) + + # Process all nodes in this connected component + for edge in state.dfs_edges(source_node): + + # get both ends of the edge + src = edge.src + dst = edge.dst + + # both are visited, potentially again + visited.add(src) + visited.add(dst) + + # Either they are gpu nodes are directly connected to them, + # so they get assigned to the current gpu_stream + if self._is_gpu_node(src, sdfg) or self._is_gpu_node(dst, sdfg): + assigned_nodes[src] = gpu_stream + assigned_nodes[dst] = gpu_stream + + # Recursively process nested SDFG states with same stream + if isinstance(src, nodes.NestedSDFG): + for nested_state in src.sdfg.states(): + self._assign_streams_to_state_recursively(src.sdfg, nested_state, assigned_nodes, visited, gpu_stream) + + if isinstance(dst, nodes.NestedSDFG): + for nested_state in dst.sdfg.states(): + self._assign_streams_to_state_recursively(dst.sdfg, nested_state, assigned_nodes, visited, gpu_stream) + + + # Move to next stream if we assigned any nodes in this component + if len(assigned_nodes) > nodes_assigned_before: + gpu_stream = self._next_stream(gpu_stream) + + def _is_gpu_node(self, node: nodes.Node, sdfg: SDFG) -> bool: + """ + Determine if a node is a gpu node. + + This includes GPU-scheduled library nodes, kernels (maps), and GPU global memory + access nodes. + + Args: + node: Node to check + sdfg: SDFG for context + + Returns: + True if node is a gpu node + """ + # GPU global memory access nodes + if (isinstance(node, nodes.AccessNode) and + node.desc(sdfg).storage == dtypes.StorageType.GPU_Global): + return True + + # GPU-scheduled map entry/exit nodes (kernels) + if (isinstance(node, (nodes.EntryNode, nodes.ExitNode)) and + node.schedule in dtypes.GPU_SCHEDULES): + return True + + # GPU-scheduled library nodes + if (isinstance(node, nodes.LibraryNode) and + node.schedule in dtypes.GPU_SCHEDULES): + return True + + return False + + def _next_stream(self, gpu_stream: int) -> int: + """ + Returns the next CUDA stream index based on the configured concurrency policy. + + - If max_concurrent_streams == 0: unlimited streams → increment stream index + - If max_concurrent_streams == -1: default → always return 0 + - Else: wrap around within the allowed number of streams + """ + if self.max_concurrent_streams == 0: + return gpu_stream + 1 + elif self.max_concurrent_streams == -1: + return 0 + else: + return (gpu_stream + 1) % self.max_concurrent_streams + + def _add_sync_tasklet(self, sdfg: SDFG, assigned_nodes: dict): + """ + Adds a synchronization tasklet for each sink node in a connected component, + but only for top-level states (not inside nested SDFGs). 
+ + Specifically: + - If a sink node is an AccessNode and has been assigned a GPU stream, + a tasklet is inserted after it to call stream synchronization. + - This ensures proper synchronization. + """ + for state in sdfg.states(): + for snode in state.sink_nodes(): + + if isinstance(snode, nodes.AccessNode) and snode in assigned_nodes.keys(): + + # get correct stream access expr + stream = assigned_nodes[snode] + if stream == "nullptr": + gpu_stream_access_expr = "nullptr" + else: + gpu_stream_access_expr = self.gpu_stream_access_template.format(gpu_stream=stream) + # Add tasklet and connect it to the sink node + tasklet = state.add_tasklet( + name=f"sync_{stream}", inputs=set(), outputs=set(), + code=f"DACE_GPU_CHECK({self.backend}StreamSynchronize({gpu_stream_access_expr}));\n", + language=dtypes.Language.CPP + ) + + state.add_edge(snode, None, tasklet, None, dace.Memlet()) + else: + continue + + def set_gpu_stream_access_template(self, expr_template: str): + """ + Sets the stream access expression template. The string should include + a `{gpu_stream}` placeholder. This function is expected to be called from a + gpu code generator. + """ + if "{gpu_stream}" not in expr_template: + raise ValueError("gpu_stream_access_template must include '{gpu_stream}' placeholder.") + self.gpu_stream_access_template = expr_template \ No newline at end of file From 9f26992d927ffbf9dbf18096f80fcbfb065c738a Mon Sep 17 00:00:00 2001 From: aydogdub Date: Wed, 4 Jun 2025 20:09:44 +0200 Subject: [PATCH 17/94] adapt such that the naive GPU scheduler pass works and can be used by the ExperimentalCUDACodegen as intended --- dace/codegen/targets/cpp.py | 2 +- .../new_cuda_codegen/copy_strategies.py | 6 +- .../new_cuda_codegen/experimental_cuda.py | 138 ++++++++++++++---- 3 files changed, 111 insertions(+), 35 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 5c4d04c0a7..be1136f3b6 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -919,7 +919,7 @@ def unparse_tasklet(sdfg, cfg, state_id, dfg, node, function_stream, callsite_st # set the stream to a local variable. max_streams = int(Config.get("compiler", "cuda", "max_concurrent_streams")) if not is_devicelevel_gpu(sdfg, state_dfg, node) and (hasattr(node, "_cuda_stream") - or connected_to_gpu_memory(node, state_dfg, sdfg)): + and connected_to_gpu_memory(node, state_dfg, sdfg)): if max_streams >= 0: callsite_stream.write( 'int __dace_current_stream_id = %d;\n%sStream_t __dace_current_stream = __state->gpu_context->streams[__dace_current_stream_id];' diff --git a/dace/codegen/targets/new_cuda_codegen/copy_strategies.py b/dace/codegen/targets/new_cuda_codegen/copy_strategies.py index eb51873a4f..8b7bb32e8e 100644 --- a/dace/codegen/targets/new_cuda_codegen/copy_strategies.py +++ b/dace/codegen/targets/new_cuda_codegen/copy_strategies.py @@ -4,7 +4,7 @@ from dace import symbolic from dace import Memlet, dtypes from dace.dtypes import StorageType -from dace.codegen.targets.new_cuda_codegen.experimental_cuda import ExperimentalCUDACodeGen, CUDAStreamManager, product +from dace.codegen.targets.new_cuda_codegen.experimental_cuda import ExperimentalCUDACodeGen, GPUStreamManager, product @@ -30,7 +30,7 @@ class CopyContext: what values are needed for code generation and why. This improves readability, simplifies copy emission logic, and makes future extensions easier. 
""" - def __init__(self, codegen: ExperimentalCUDACodeGen, cuda_stream_manager: CUDAStreamManager, state_id: int, + def __init__(self, codegen: ExperimentalCUDACodeGen, gpu_stream_manager: GPUStreamManager, state_id: int, src_node: Node, dst_node: Node, edge: Tuple[Node, str, Node, str, Memlet], sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, callsite_stream: CodeIOStream): @@ -51,7 +51,7 @@ def __init__(self, codegen: ExperimentalCUDACodeGen, cuda_stream_manager: CUDASt # Additional information frequently needed self.backend = codegen.backend self.state_dfg = cfg.state(state_id) - self.cudastream = cuda_stream_manager.get_stream_edge(src_node, dst_node) + self.cudastream = gpu_stream_manager.get_stream_edge(src_node, dst_node) self.src_storage = self.get_storage_type(src_node) self.dst_storage = self.get_storage_type(dst_node) diff --git a/dace/codegen/targets/new_cuda_codegen/experimental_cuda.py b/dace/codegen/targets/new_cuda_codegen/experimental_cuda.py index c064d2f29a..e23a6c2741 100644 --- a/dace/codegen/targets/new_cuda_codegen/experimental_cuda.py +++ b/dace/codegen/targets/new_cuda_codegen/experimental_cuda.py @@ -29,6 +29,8 @@ from dace.sdfg.state import ControlFlowRegion, StateSubgraphView from dace.transformation import helpers from dace.transformation.passes import analysis as ap +from dace.transformation.passes.gpustream_scheduling import NaiveGPUStreamScheduler + if TYPE_CHECKING: @@ -36,16 +38,32 @@ from dace.codegen.targets.cpu import CPUCodeGen +# TODO's easy: +# 1. Handle memory pools release +# 2. Handle sync properties +# 3. Warning/Error that GPU_deive must be used before other GPU schedules +# 4. Emit sync + +# TODO's harder: +# 2. Include constant expressions + -# TODO's: -# 2. Berkay: Include constant expresssions -# 3. Berkay: Warning if sync property in maps is used -# 4. Berkay: Warning/Error that GPU_device must be used before other GPU schedule types # Ask yakup: -# How would you seperate it into several files? +# 1. Show copy_strategy and ask about WithinGPU +# 2. How would you seperate it into several files? +# 3. Should I implement scope also using Strategy? Allocation is Dispatcher style + +# 4. How to handle "_compute_pool_release"? It also uses "getattr()" +# issues I have with it -> I messed this up because it was not obvious where deallocation happens +# I could leave it be for the most part and just implement the release (currently not needed, no test needs it) +# Or I could again add a helper class. We might then just have quite a number of helper classes + +# 5. Cudastream: Should I maybe document (once I have nothing todo/am tired) what exactly will be needed to do there? +# 6. shared memory, how is it handled now, how should it be handled in future? +# cudastream: as a pass, tasklet to synchronize @registry.autoregister_params(name='experimental_cuda') class ExperimentalCUDACodeGen(TargetCodeGenerator): @@ -126,14 +144,22 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): ################## New variables ########################## - self._current_kernel_spec: Optional[KernelSpec] = None - self._cuda_stream_manager: CUDAStreamManager = CUDAStreamManager(sdfg) + self._current_kernel_spec: Optional[KernelSpec] = None + self._gpu_stream_manager: Optional[GPUStreamManager] = None def preprocess(self, sdfg: SDFG) -> None: + """ + Preprocess the SDFG to prepare it for GPU code generation. This includes: + - Handling GPU<->GPU strided copies. 
+ - Assigning backend GPU streams (e.g., CUDA streams) and creating the GPUStreamManager. + - Handling memory pool management + """ + + #------------------------- Hanlde GPU<->GPU strided copies -------------------------- # Find GPU<->GPU strided copies that cannot be represented by a single copy command from dace.transformation.dataflow import CopyToMap @@ -174,12 +200,28 @@ def preprocess(self, sdfg: SDFG) -> None: except ValueError: # If transformation doesn't match, continue normally continue - + + #------------------------- GPU Stream related Logic -------------------------- + + # Register GPU context in state struct + self._frame.statestruct.append('dace::cuda::Context *gpu_context;') + + # Define backend stream access expression (e.g., CUDA stream handle) + gpu_stream_access_template = "__state->gpu_context->streams[{gpu_stream}]" + + # Initialize and configure GPU stream scheduling pass + gpu_stream_pass = NaiveGPUStreamScheduler() + gpu_stream_pass.set_gpu_stream_access_template(gpu_stream_access_template) + assigned_streams = gpu_stream_pass.apply_pass(sdfg, None) + + # Initialize runtime GPU stream manager + self._gpu_stream_manager = GPUStreamManager(sdfg, assigned_streams, gpu_stream_access_template) + + #------------------------- Memory Pool related Logic -------------------------- + # Find points where memory should be released to the memory pool self._compute_pool_release(sdfg) - # Write GPU context to state structure - self._frame.statestruct.append('dace::cuda::Context *gpu_context;') def _compute_pool_release(self, top_sdfg: SDFG): @@ -784,7 +826,8 @@ def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: gdims = ', '.join(symbolic_to_cpp(grid_dims)) bdims = ', '.join(symbolic_to_cpp(block_dims)) - + # cuda/hip stream the kernel belongs to + gpu_stream = self._gpu_stream_manager.get_stream_node(scope_entry) # ----------------- Kernel Launch Function Declaration ----------------------- self._localcode.write( @@ -836,7 +879,7 @@ def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: gdims=gdims, bdims=bdims, dynsmem='0', - stream='__state->gpu_context->streams[0]', + stream=gpu_stream, backend=self.backend ), cfg, state_id, scope_entry @@ -1231,8 +1274,6 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap raise NotImplementedError(f'Deallocation not implemented for storage type: {nodedesc.storage.name}') - - ####################################################################### # Copy-pasted, might be changed in future @@ -1381,8 +1422,8 @@ def get_generated_codeobjects(self): other_globalcode=self._globalcode.getvalue(), localcode=self._localcode.getvalue(), file_header=fileheader.getvalue(), - nstreams=max(1, self._cuda_stream_manager.cuda_streams), - nevents=max(1, self._cuda_stream_manager.cuda_events), + nstreams=self._gpu_stream_manager.num_gpu_streams, + nevents=self._gpu_stream_manager.num_gpu_events, backend=self.backend, backend_header=backend_header, pool_header=pool_header, @@ -1442,11 +1483,14 @@ def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView edge: Tuple[nodes.Node, str, nodes.Node, str, Memlet], function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + # NOTE: + # There should be additional cudastream handling implemented- I can make this a TODO + from dace.codegen.targets.new_cuda_codegen.copy_strategies import ( CopyContext, CopyStrategy, OutOfKernelCopyStrategy, WithinGPUCopyStrategy, FallBackGPUCopyStrategy) - context = 
CopyContext(self, self._cuda_stream_manager, state_id, src_node, dst_node, edge, + context = CopyContext(self, self._gpu_stream_manager, state_id, src_node, dst_node, edge, sdfg, cfg, dfg, callsite_stream) # Order matters: fallback must come last @@ -1465,6 +1509,12 @@ def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView + + + + + + ######################################################################### # helper classes and functions @@ -2025,28 +2075,54 @@ def open(self, condition: str = None): -class CUDAStreamManager: +class GPUStreamManager: + """ + Manages GPU backend streams (e.g., CUDA or HIP streams) for nodes in an SDFG. + Assumes that the initialization inputs come from the NaiveGPUScheduler pass. - def __init__(self, sdfg: SDFG): + NOTE: "Stream" here refers to backend GPU streams, not DaCe data streams. + """ - self.cuda_streams = 0 - self.cuda_events = 0 - + def __init__(self, sdfg: SDFG, assigned_streams: Dict[nodes.Node, Union[int, str]], stream_access_template: str): + self.sdfg = sdfg + self.assigned_streams = assigned_streams + self.stream_access_template = stream_access_template + + # Placeholder for future support of backend events (e.g., CUDA events) + self.num_gpu_events = 0 - def get_stream_node(self, node: nodes.Node) -> Any: + # Determine the number of streams used (stream IDs start from 0) + # Only count integer stream IDs (ignore string values like "nullptr") + int_stream_ids = [v for v in assigned_streams.values() if isinstance(v, int)] + self.num_gpu_streams = max(int_stream_ids, default=0) + + def get_stream_node(self, node: nodes.Node) -> str: """ - Returns the CUDA stream assigned to the given node. - Currently just uses the default (0) cudastream. + Returns the GPU stream access expression for a given node. + + If the node has an assigned stream not equal the default "nullptr", returns + the formatted stream expression. Otherwise, returns "nullptr". """ - return '__state->gpu_context->streams[0]' + if node in self.assigned_streams and self.assigned_streams[node] != "nullptr": + return self.stream_access_template.format(gpu_stream=self.assigned_streams[node]) + return "nullptr" - def get_stream_edge(self, src_node: nodes.Node, dst_node: nodes.Node) -> Any: + def get_stream_edge(self, src_node: nodes.Node, dst_node: nodes.Node) -> str: """ - Returns the CUDA stream assigned to the given edge. - Currently just uses the default (0) cudastream. + Returns the stream access expression for an edge based on either the + source or destination node. If one of the nodes has an assigned stream not equal + to the default 'nullptr', that stream is returned (should be symmetric + when using the NaiveGPUStreamScheduler pass). Otherwise, returns 'nullptr'. 
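+        Copy strategies use this expression as the stream argument of their
+        asynchronous memcpy calls (see CopyContext.cudastream).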
""" - return '__state->gpu_context->streams[0]' - + if src_node in self.assigned_streams and self.assigned_streams[src_node] != "nullptr": + stream_id = self.assigned_streams[src_node] + return self.stream_access_template.format(gpu_stream=stream_id) + elif dst_node in self.assigned_streams and self.assigned_streams[dst_node] != "nullptr": + stream_id = self.assigned_streams[dst_node] + return self.stream_access_template.format(gpu_stream=stream_id) + else: + return "nullptr" + From 5a0f479fd31b1d07f36465ca3d0dc8666fe26f17 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Wed, 4 Jun 2025 20:12:27 +0200 Subject: [PATCH 18/94] from now on, write important notes here to help future developers and yakup --- berkay_workpace/reports/important_notes.txt | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 berkay_workpace/reports/important_notes.txt diff --git a/berkay_workpace/reports/important_notes.txt b/berkay_workpace/reports/important_notes.txt new file mode 100644 index 0000000000..f03bdb40a3 --- /dev/null +++ b/berkay_workpace/reports/important_notes.txt @@ -0,0 +1,6 @@ +1. CudaEvents are a bit a mix in the codegen. I left it as an attribute in the GPUStreamManager class, but I do not work with CudaEvents. + I left it there because it may be implemented in future and more importantly, the GPU code template (in get_generated_codeobject()) + depends on it. Instead of removing it, I decided to let it be and just say 0 CudaEvents are created and used. + Generally: The CudaStreamManager assumes that the NaiveGPUScheduler pass was called before. + Also, the CudaStreamManager should define the functions "get_stream_edge" (and maybe "get_stream_node"), since the the copystrategies might + depend on it \ No newline at end of file From af8faf1ef88a3d87ebb6e67aba4c87110e3c76e0 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Wed, 4 Jun 2025 21:18:29 +0200 Subject: [PATCH 19/94] Looking at the effects of NaiveGPUStreamScheduler --- berkay_workpace/scratch/cudastreamPass.ipynb | 631 +++++++++++++++++++ 1 file changed, 631 insertions(+) create mode 100644 berkay_workpace/scratch/cudastreamPass.ipynb diff --git a/berkay_workpace/scratch/cudastreamPass.ipynb b/berkay_workpace/scratch/cudastreamPass.ipynb new file mode 100644 index 0000000000..0f505df94e --- /dev/null +++ b/berkay_workpace/scratch/cudastreamPass.ipynb @@ -0,0 +1,631 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "f4d111db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import dace\n", + "from IPython.display import Code\n", + "from dace.transformation import pass_pipeline\n" + ] + }, + { + "cell_type": "markdown", + "id": "7bdf4ea6", + "metadata": {}, + "source": [ + "Here you can choose any of the 3 following programs to see how the sdfg assigns streams and how it adds synchronization tasklets if required.\n", + "You can, if you wish, also change e.g. the StorageType of one input- as long as you don't choose a strategy where the GPU is not used (e.g. a direct CPU\n", + "to CPU copy), a synchronization tasklet should be added. \n", + "\n", + "Note: test1 is a special case - where we have only one connected component. I thought it would be cool if we just use the default nullptr in this case instead of \n", + "creating a stream." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3dfa8ad3", + "metadata": {}, + "outputs": [], + "source": [ + "@dace.program\n", + "def test1(A: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", + " B: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global\n", + " ):\n", + " A[:] = B[:]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ad689f3d", + "metadata": {}, + "outputs": [], + "source": [ + "@dace.program\n", + "def test2(A: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", + " B: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", + " C: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", + " D: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global\n", + " ):\n", + " \n", + " for i in dace.map[0:10] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " A[i] = B[i]\n", + " \n", + " for j in dace.map[0:10] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " C[j] = D[j]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "104de517", + "metadata": {}, + "outputs": [], + "source": [ + "@dace.program\n", + "def test3(A: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", + " B: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", + " C: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", + " D: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global\n", + " ):\n", + " \n", + " A[:] = B[:]\n", + " \n", + " for i in dace.map[0:3] @ dace.dtypes.ScheduleType.Sequential:\n", + " for j in dace.map[0:10] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " C[j] = D[j]" + ] + }, + { + "cell_type": "markdown", + "id": "3ce0f3af", + "metadata": {}, + "source": [ + "Choose which program you want to select for generating the sdfg below. It will give you the sdfg, without any snychronization tasklets.\n", + "The old codegen, would figure out where synchronization has to occur. We will make this explicit, as you wanted :)." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5ba1505c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (test3)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Choose\n", + "# sdfg = test1.to_sdfg()\n", + "# sdfg = test2.to_sdfg()\n", + "sdfg = test3.to_sdfg()\n", + "sdfg" + ] + }, + { + "cell_type": "markdown", + "id": "16c0f318", + "metadata": {}, + "source": [ + "Now we apply the pass to see the change:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c9152955", + "metadata": {}, + "outputs": [], + "source": [ + "# import the pass\n", + "from dace.transformation.passes.gpustream_scheduling import NaiveGPUStreamScheduler\n", + "\n", + "# Define backend stream access expression, which is used as below. \n", + "# (I do this explicitly such that any change in the access expression can be detected easier in future)\n", + "gpu_stream_access_template = \"__state->gpu_context->streams[{gpu_stream}]\" \n", + "\n", + "# Initialize and configure GPU stream scheduling pass\n", + "gpu_stream_pass = NaiveGPUStreamScheduler()\n", + "gpu_stream_pass.set_gpu_stream_access_template(gpu_stream_access_template)\n", + "assigned_streams = gpu_stream_pass.apply_pass(sdfg, None)\n" + ] + }, + { + "cell_type": "markdown", + "id": "415675f7", + "metadata": {}, + "source": [ + "Look at which nodes get assigned to which streams - as expected, right?" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "964ac157", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{AccessNode (B): 0,\n", + " AccessNode (A): 0,\n", + " AccessNode (D): 1,\n", + " MapEntry (test3_10[i=0:3]): 1,\n", + " MapEntry (test3_10_4_11[j=0:10]): 1,\n", + " Tasklet (assign_12_12): 1,\n", + " MapExit (test3_10_4_11[j=0:10]): 1,\n", + " MapExit (test3_10[i=0:3]): 1,\n", + " AccessNode (C): 1}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "assigned_streams" + ] + }, + { + "cell_type": "markdown", + "id": "69b5a1c0", + "metadata": {}, + "source": [ + "Look at the extended sdfg, now the synchronization is explicit and not the job of the codegen to figure out and implement." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f0cbcd1f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (test3)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdfg " + ] + }, + { + "cell_type": "markdown", + "id": "804d8436", + "metadata": {}, + "source": [ + "And you can also inspect the corresponding code. Just ensure that you are using the experimental codegen:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "60d817de", + "metadata": {}, + "outputs": [], + "source": [ + "from dace.config import Config\n", + "\n", + "assert Config.get('compiler', 'cuda', 'implementation') == \"experimental\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "bf7c6836", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/berkay/master-thesis/dace/dace/codegen/targets/new_cuda_codegen/experimental_cuda.py:1728: UserWarning: No `gpu_block_size` property specified on map \"test3_10_4_11\". Falling back to the configuration entry `compiler.cuda.default_block_size`: 32,1,1. You can either specify the block size to use with the gpu_block_size property, or by adding nested `GPU_ThreadBlock` maps, which map work to individual threads. For more information, see https://spcldace.readthedocs.io/en/latest/optimization/gpu.html\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n",
+       "#include <dace/dace.h>\n",
+       "#include "../../include/hash.h"\n",
+       "\n",
+       "struct test3_state_t {\n",
+       "    dace::cuda::Context *gpu_context;\n",
+       "};\n",
+       "\n",
+       "DACE_EXPORTED void __dace_runkernel_test3_10_4_11_0_0_6(test3_state_t *__state, dace::uint * __restrict__ C, dace::uint * __restrict__ D);\n",
+       "void __program_test3_internal(test3_state_t*__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B, dace::uint * __restrict__ C, dace::uint * __restrict__ D)\n",
+       "{\n",
+       "\n",
+       "    {\n",
+       "\n",
+       "        DACE_GPU_CHECK(cudaMemcpyAsync(A, B, 10 * sizeof(dace::uint), cudaMemcpyDeviceToDevice, __state->gpu_context->streams[0]));\n",
+       "        {\n",
+       "\n",
+       "            ///////////////////\n",
+       "            DACE_GPU_CHECK(cudaStreamSynchronize(__state->gpu_context->streams[0]));\n",
+       "            ///////////////////\n",
+       "\n",
+       "        }\n",
+       "        {\n",
+       "            for (auto i = 0; i < 3; i += 1) {\n",
+       "                __dace_runkernel_test3_10_4_11_0_0_6(__state, C, D);\n",
+       "            }\n",
+       "        }\n",
+       "        {\n",
+       "\n",
+       "            ///////////////////\n",
+       "            DACE_GPU_CHECK(cudaStreamSynchronize(__state->gpu_context->streams[1]));\n",
+       "            ///////////////////\n",
+       "\n",
+       "        }\n",
+       "\n",
+       "    }\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED void __program_test3(test3_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B, dace::uint * __restrict__ C, dace::uint * __restrict__ D)\n",
+       "{\n",
+       "    __program_test3_internal(__state, A, B, C, D);\n",
+       "}\n",
+       "DACE_EXPORTED int __dace_init_experimental_cuda(test3_state_t *__state);\n",
+       "DACE_EXPORTED int __dace_exit_experimental_cuda(test3_state_t *__state);\n",
+       "\n",
+       "DACE_EXPORTED test3_state_t *__dace_init_test3()\n",
+       "{\n",
+       "    int __result = 0;\n",
+       "    test3_state_t *__state = new test3_state_t;\n",
+       "\n",
+       "\n",
+       "    __result |= __dace_init_experimental_cuda(__state);\n",
+       "\n",
+       "    if (__result) {\n",
+       "        delete __state;\n",
+       "        return nullptr;\n",
+       "    }\n",
+       "    return __state;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED int __dace_exit_test3(test3_state_t *__state)\n",
+       "{\n",
+       "    int __err = 0;\n",
+       "\n",
+       "    int __err_experimental_cuda = __dace_exit_experimental_cuda(__state);\n",
+       "    if (__err_experimental_cuda) {\n",
+       "        __err = __err_experimental_cuda;\n",
+       "    }\n",
+       "    delete __state;\n",
+       "    return __err;\n",
+       "}\n",
+       "
\n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "\\PY{c+cm}{/* DaCe AUTO\\PYZhy{}GENERATED FILE. DO NOT MODIFY */}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}dace/dace.h\\PYZgt{}}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}../../include/hash.h\\PYZdq{}}\n", + "\n", + "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{test3\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}test3\\PYZus{}10\\PYZus{}4\\PYZus{}11\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{p}{(}\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{D}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}program\\PYZus{}test3\\PYZus{}internal}\\PY{p}{(}\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{D}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMemcpyAsync}\\PY{p}{(}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{10}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{k}{sizeof}\\PY{p}{(}\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaMemcpyDeviceToDevice}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamSynchronize}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k}{auto}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ 
}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{3}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}test3\\PYZus{}10\\PYZus{}4\\PYZus{}11\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{D}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamSynchronize}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}test3}\\PY{p}{(}\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{D}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}test3\\PYZus{}internal}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{D}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}test3}\\PY{p}{(}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{p}{;}\n", + 
"\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{nullptr}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}test3}\\PY{p}{(}\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\\end{Verbatim}\n" + ], + "text/plain": [ + "/* DaCe AUTO-GENERATED FILE. 
DO NOT MODIFY */\n", + "#include \n", + "#include \"../../include/hash.h\"\n", + "\n", + "struct test3_state_t {\n", + " dace::cuda::Context *gpu_context;\n", + "};\n", + "\n", + "DACE_EXPORTED void __dace_runkernel_test3_10_4_11_0_0_6(test3_state_t *__state, dace::uint * __restrict__ C, dace::uint * __restrict__ D);\n", + "void __program_test3_internal(test3_state_t*__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B, dace::uint * __restrict__ C, dace::uint * __restrict__ D)\n", + "{\n", + "\n", + " {\n", + "\n", + " DACE_GPU_CHECK(cudaMemcpyAsync(A, B, 10 * sizeof(dace::uint), cudaMemcpyDeviceToDevice, __state->gpu_context->streams[0]));\n", + " {\n", + "\n", + " ///////////////////\n", + " DACE_GPU_CHECK(cudaStreamSynchronize(__state->gpu_context->streams[0]));\n", + " ///////////////////\n", + "\n", + " }\n", + " {\n", + " for (auto i = 0; i < 3; i += 1) {\n", + " __dace_runkernel_test3_10_4_11_0_0_6(__state, C, D);\n", + " }\n", + " }\n", + " {\n", + "\n", + " ///////////////////\n", + " DACE_GPU_CHECK(cudaStreamSynchronize(__state->gpu_context->streams[1]));\n", + " ///////////////////\n", + "\n", + " }\n", + "\n", + " }\n", + "}\n", + "\n", + "DACE_EXPORTED void __program_test3(test3_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B, dace::uint * __restrict__ C, dace::uint * __restrict__ D)\n", + "{\n", + " __program_test3_internal(__state, A, B, C, D);\n", + "}\n", + "DACE_EXPORTED int __dace_init_experimental_cuda(test3_state_t *__state);\n", + "DACE_EXPORTED int __dace_exit_experimental_cuda(test3_state_t *__state);\n", + "\n", + "DACE_EXPORTED test3_state_t *__dace_init_test3()\n", + "{\n", + " int __result = 0;\n", + " test3_state_t *__state = new test3_state_t;\n", + "\n", + "\n", + " __result |= __dace_init_experimental_cuda(__state);\n", + "\n", + " if (__result) {\n", + " delete __state;\n", + " return nullptr;\n", + " }\n", + " return __state;\n", + "}\n", + "\n", + "DACE_EXPORTED int __dace_exit_test3(test3_state_t *__state)\n", + "{\n", + " int __err = 0;\n", + "\n", + " int __err_experimental_cuda = __dace_exit_experimental_cuda(__state);\n", + " if (__err_experimental_cuda) {\n", + " __err = __err_experimental_cuda;\n", + " }\n", + " delete __state;\n", + " return __err;\n", + "}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Code(sdfg.generate_code()[0].clean_code, language='cpp')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a029d5a5", + "metadata": {}, + "outputs": [], + "source": [ + "Code(sdfg.generate_code()[1].clean_code, language='cpp')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dace_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 8b5020a9c21af14497ed0c945bfd72543a19c768 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Thu, 5 Jun 2025 20:13:15 +0200 Subject: [PATCH 20/94] Major refactoring. Create several files, implement Strategy Pattern for Scopes, move helper functions into seperate file. 
--- dace/codegen/targets/__init__.py | 2 +- .../experimental_cuda.py | 869 ++++-------------- .../copy_strategies.py | 4 +- .../gpu_stream_manager.py | 50 + .../experimental_cuda_helpers/gpu_utils.py | 28 + .../scope_strategies.py | 574 ++++++++++++ 6 files changed, 815 insertions(+), 712 deletions(-) rename dace/codegen/targets/{new_cuda_codegen => }/experimental_cuda.py (68%) rename dace/codegen/targets/{new_cuda_codegen => experimental_cuda_helpers}/copy_strategies.py (99%) create mode 100644 dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py create mode 100644 dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py create mode 100644 dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py diff --git a/dace/codegen/targets/__init__.py b/dace/codegen/targets/__init__.py index 9ac7561c10..a0c2065524 100644 --- a/dace/codegen/targets/__init__.py +++ b/dace/codegen/targets/__init__.py @@ -9,4 +9,4 @@ from .mlir.mlir import MLIRCodeGen from .sve.codegen import SVECodeGen from .snitch import SnitchCodeGen -from .new_cuda_codegen.experimental_cuda import ExperimentalCUDACodeGen \ No newline at end of file +from .experimental_cuda import ExperimentalCUDACodeGen \ No newline at end of file diff --git a/dace/codegen/targets/new_cuda_codegen/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py similarity index 68% rename from dace/codegen/targets/new_cuda_codegen/experimental_cuda.py rename to dace/codegen/targets/experimental_cuda.py index e23a6c2741..81a8a84f13 100644 --- a/dace/codegen/targets/new_cuda_codegen/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -1,38 +1,44 @@ -import ctypes -import functools +# Standard library imports import warnings -from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union, Any +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union +# Third-party imports import networkx as nx import sympy -from six import StringIO +# DaCe core imports import dace from dace import data as dt, Memlet -from dace import dtypes, registry -from dace import subsets, symbolic -from dace.codegen import common, cppunparse +from dace import dtypes, registry, symbolic +from dace.config import Config +from dace.sdfg import SDFG, ScopeSubgraphView, SDFGState, nodes +from dace.sdfg import utils as sdutil +from dace.sdfg.graph import MultiConnectorEdge +from dace.sdfg.state import ControlFlowRegion, StateSubgraphView + +# DaCe codegen imports +from dace.codegen import common from dace.codegen.codeobject import CodeObject from dace.codegen.dispatcher import DefinedType, TargetDispatcher from dace.codegen.prettycode import CodeIOStream -from dace.codegen.targets import cpp from dace.codegen.common import update_persistent_desc -from dace.codegen.targets.cpp import (codeblock_to_cpp, cpp_array_expr, memlet_copy_to_absolute_strides, sym2cpp, - synchronize_streams, unparse_cr, mangle_dace_state_struct_name) +from dace.codegen.targets import cpp +from dace.codegen.targets.cpp import ( + codeblock_to_cpp, + memlet_copy_to_absolute_strides, + mangle_dace_state_struct_name +) from dace.codegen.targets.target import IllegalCopy, TargetCodeGenerator, make_absolute -from dace.config import Config -from dace.frontend import operations -from dace.sdfg import (SDFG, ScopeSubgraphView, SDFGState, has_dynamic_map_inputs, is_array_stream_view, - is_devicelevel_gpu, nodes, scope_contains_scope) -from dace.sdfg import utils as sdutil -from dace.sdfg.graph import MultiConnectorEdge -from dace.sdfg.state import 
ControlFlowRegion, StateSubgraphView -from dace.transformation import helpers + +# DaCe transformation imports from dace.transformation.passes import analysis as ap from dace.transformation.passes.gpustream_scheduling import NaiveGPUStreamScheduler +# Experimental CUDA helper imports +from dace.codegen.targets.experimental_cuda_helpers.gpu_stream_manager import GPUStreamManager +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import symbolic_to_cpp, product - +# Type checking imports (conditional) if TYPE_CHECKING: from dace.codegen.targets.framecode import DaCeCodeGenerator from dace.codegen.targets.cpu import CPUCodeGen @@ -43,40 +49,27 @@ # 2. Handle sync properties # 3. Warning/Error that GPU_deive must be used before other GPU schedules # 4. Emit sync +# 5. compute_release() # TODO's harder: # 2. Include constant expressions -# Ask yakup: -# 1. Show copy_strategy and ask about WithinGPU -# 2. How would you seperate it into several files? -# 3. Should I implement scope also using Strategy? Allocation is Dispatcher style - -# 4. How to handle "_compute_pool_release"? It also uses "getattr()" -# issues I have with it -> I messed this up because it was not obvious where deallocation happens -# I could leave it be for the most part and just implement the release (currently not needed, no test needs it) -# Or I could again add a helper class. We might then just have quite a number of helper classes - -# 5. Cudastream: Should I maybe document (once I have nothing todo/am tired) what exactly will be needed to do there? -# 6. shared memory, how is it handled now, how should it be handled in future? - - -# cudastream: as a pass, tasklet to synchronize - @registry.autoregister_params(name='experimental_cuda') class ExperimentalCUDACodeGen(TargetCodeGenerator): """ Experimental CUDA code generator.""" target_name = 'experimental_cuda' title = 'CUDA' - ######################## Initilization and Preprocessing related start ######################################################### + + ########################################################################### + # Initialization & Preprocessing def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._frame: DaCeCodeGenerator = frame_codegen # creates the frame code, orchestrates the code generation for targets - self._dispatcher: TargetDispatcher= frame_codegen.dispatcher # responsible for dispatching code generation to the appropriate target + self._dispatcher: TargetDispatcher = frame_codegen.dispatcher # responsible for dispatching code generation to the appropriate target ExperimentalCUDACodeGen._in_kernel_code = False @@ -148,9 +141,6 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._current_kernel_spec: Optional[KernelSpec] = None self._gpu_stream_manager: Optional[GPUStreamManager] = None - - - def preprocess(self, sdfg: SDFG) -> None: """ Preprocess the SDFG to prepare it for GPU code generation. 
This includes: @@ -222,8 +212,6 @@ def preprocess(self, sdfg: SDFG) -> None: # Find points where memory should be released to the memory pool self._compute_pool_release(sdfg) - - def _compute_pool_release(self, top_sdfg: SDFG): """ Computes positions in the code generator where a memory pool array is no longer used and @@ -313,6 +301,9 @@ def _compute_pool_release(self, top_sdfg: SDFG): self.pool_release[(sdfg, arr)] = (sink, set()) + ########################################################################### + # Determine wheter initializer and finalizer should be called + @property def has_initializer(self) -> bool: return True @@ -321,469 +312,88 @@ def has_finalizer(self) -> bool: return True + ########################################################################### + # Scope generation + def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + # Import strategies here to avoid circular dependencies + from dace.codegen.targets.experimental_cuda_helpers.scope_strategies import ( + ScopeGenerationStrategy, + KernelScopeGenerator, + ThreadBlockScopeGenerator, + WarpScopeGenerator + ) + + + #--------------- Start of Kernel Function Code Generation -------------------- - def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, - function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - - # Are we generating host (launch) code or device (kernel) code? if not ExperimentalCUDACodeGen._in_kernel_code: - # Prepare and cache kernel metadata (name, grid dims, arguments, etc.) + # Prepare and cache kernel metadata (name, dimensions, arguments, etc.) self._current_kernel_spec = KernelSpec( cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id ) - - - self._generate_kernel_wrapper(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) - #--------------- Generate Kernel Function ---------------- + # Generate wrapper function + self._generate_kernel_wrapper(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + # Enter kernel context and recursively generate device code ExperimentalCUDACodeGen._in_kernel_code = True kernel_stream = CodeIOStream() + kernel_function_stream = self._globalcode - kernel_name = self._current_kernel_spec.kernel_name - kernel_args = self._current_kernel_spec.args_typed - block_dims = self._current_kernel_spec.block_dims - node = dfg_scope.source_nodes()[0] - - # Conditionally add __launch_bounds__ for block size optimization. 
- launch_bounds = '' - if node.gpu_launch_bounds != '-1': - if node.gpu_launch_bounds == "0": - if not any(symbolic.issymbolic(b) for b in block_dims): - launch_bounds = f'__launch_bounds__({product(block_dims)})' - else: - launch_bounds = f'__launch_bounds__({node.gpu_launch_bounds})' - - - # Emit kernel function signature - kernel_stream.write( - f'__global__ void {launch_bounds} {kernel_name}({", ".join(kernel_args)}) ', - cfg, state_id, node - ) - - # generate kernel scope - self._generate_kernel_scope( - sdfg, cfg, dfg_scope, state_id, self._globalcode, kernel_stream - ) - - self._localcode.write(kernel_stream.getvalue() + '\n') - ExperimentalCUDACodeGen._in_kernel_code = False - # -------------------------------------------------------------- - - # Generate the actual launch call (host-side) - self._generate_kernel_launch(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) - - - else: - # Nested scope: already inside a GPU kernel - node = dfg_scope.source_nodes()[0] - schedule_type = node.map.schedule.name - - if schedule_type == dace.ScheduleType.GPU_Device: - raise NotImplementedError( - "Dynamic parallelism (nested GPU_Device schedules) is not supported." - ) - - gen = getattr(self, f'_generate_{schedule_type}_scope', None) - if gen: - gen(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + kernel_scope_generator = KernelScopeGenerator(codegen=self) + if kernel_scope_generator.applicable(sdfg, cfg, dfg_scope, state_id, kernel_function_stream, kernel_stream): + kernel_scope_generator.generate(sdfg, cfg, dfg_scope, state_id, kernel_function_stream, kernel_stream) else: - raise NotImplementedError( - f"Scope generation for schedule type '{schedule_type}' is not implemented in ExperimentalCUDACodeGen. " - "Please check for supported schedule types or implement the corresponding generator." + raise ValueError( + "Invalid kernel configuration: This strategy is only applicable if the " + "outermost GPU schedule is of type GPU_Device (most likely cause)." 
) - -####################### helper functions to generate_scope ###################################### - - - def _generate_kernel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, - function_stream: CodeIOStream, kernel_stream: CodeIOStream) -> None: - - - - with KernelScopeManager(cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, - function_stream=function_stream, callsite_stream=kernel_stream, comment="Kernel scope",) as scopeManager: - - - # ----------------- Retrieve kernel configuration ----------------------- - kernel_spec = self._current_kernel_spec - kernel_entry_node = kernel_spec._kernel_entry_node # = dfg_scope.source_nodes()[0] - kernel_map = kernel_spec.kernel_map - has_tbmap = kernel_spec.has_tbmap - kernel_block_dims = self._current_kernel_spec.block_dims - - - # ----------------- Kernel/Map Range Preprocessing ----------------------- - - reversed_kernel_range = kernel_map.range[::-1] # also reverse it - kernel_range = subsets.Range(reversed_kernel_range) - kernel_dimensions = len(kernel_range) - kernel_dim_sizes = kernel_range.size() - - - # ----------------- Set up symbolic index expressions ----------------------- - - symbolic_indices = [ symbolic.symbol(f'__SYM_IDX{dim}', nonnegative=True, integer=True) for dim in range(kernel_dimensions)] - symbolic_index_bounds = [ idx + block_dim - 1 for idx, block_dim in zip(symbolic_indices, kernel_block_dims)] - symbolic_coordinates = kernel_range.coord_at(symbolic_indices) - - - # ----------------- Generate Thread or Block index Definitions ----------------------- - - - thread_id_ctype = kernel_spec.gpu_index_ctype # Data type of CUDA thread/block indices - - - # In case there is no ThreadBlock map used in a submap, the map variables will - # be mapped to thread IDs instead of block IDs - for dim in range(kernel_dimensions): - - var_name = kernel_map.params[-dim - 1] # also reverse it here! 
- - # Compute index expressions for up to 3 dimensions (x, y, z) - if dim < 3: - if has_tbmap: - index_expr = f'blockIdx.{_get_cuda_dim(dim)}' - else: - index_expr = f'(blockIdx.{_get_cuda_dim(dim)} * {symbolic_to_cpp(kernel_block_dims[dim])} + threadIdx.{_get_cuda_dim(dim)})' - - # Delinearize third dimension if more than 3D (used in 3D+ mapping) - if dim == 2 and kernel_dimensions > 3: - tail_prod = product(kernel_dim_sizes[3:]) - index_expr = f"({index_expr} / ({symbolic_to_cpp(tail_prod)}))" - - else: # Handle dimensions beyond the third (delinearize and modulo) - if has_tbmap: - index_expr = f'blockIdx.z' - else: - index_expr = f'(blockIdx.z * {symbolic_to_cpp(kernel_block_dims[2])} + threadIdx.z)' - - tail_prod = product(kernel_dim_sizes[dim + 1:]) - index_expr = (f"({index_expr} / ({symbolic_to_cpp(tail_prod)})) % ({symbolic_to_cpp(kernel_dim_sizes[dim])})") - - - # Define thread/Block index - var_def = symbolic_to_cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', index_expr) - kernel_stream.write(f'{thread_id_ctype} {var_name} = {var_def};', cfg, state_id, kernel_entry_node) - self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, thread_id_ctype) - - - # ----------------- Guard Conditions for Block Execution ----------------------- - - if not has_tbmap: - minels = kernel_range.min_element() - maxels = kernel_range.max_element() - - for dim, (var_name, start, end) in enumerate(zip(kernel_map.params[::-1], minels, maxels)): - condition = '' - - # Optimize conditions if they are always true - if dim >= 3 or (symbolic_indices[dim] >= start) != True: - condition += f'{var_name} >= {symbolic_to_cpp(start)}' - - if (dim >= 3 or ((symbolic_index_bounds[dim] < end) != False - and ((symbolic_index_bounds[dim] % kernel_block_dims[dim]) != 0) == True) or (kernel_block_dims[dim] > end) == True): - - if len(condition) > 0: - condition += ' && ' - condition += f'{var_name} < {symbolic_to_cpp(end + 1)}' - - if len(condition) > 0: - scopeManager.open(condition=condition) - - - # ----------------- Dispatch Subgraph code generation ----------------------- - - self._dispatcher.dispatch_subgraph(sdfg, cfg, dfg_scope, state_id, function_stream, - kernel_stream, skip_entry_node=True) - - - - - def _generate_GPU_ThreadBlock_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, - state_id: int, function_stream: CodeIOStream, kernel_stream: CodeIOStream) -> None: - - - # NOTE: not my code, but my insights. Approval for commenting this needed - with KernelScopeManager(cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, - function_stream=function_stream, callsite_stream=kernel_stream, comment="ThreadBlock Scope",) as scopeManager: - - node = dfg_scope.source_nodes()[0] - scope_map = node.map - - - # ----------------- Map Range Preprocessing ----------------------- - - # Reverse range for better performance (e.g. 
memory coalescing) - reversed_scope_range = scope_map.range[::-1] - map_range = subsets.Range(reversed_scope_range) - map_dimensions = len(map_range) - map_dim_sizes = map_range.size() - - kernel_block_dims = self._current_kernel_spec.block_dims - - - # ----------------- Symbolic Index Expressions ----------------------- - - symbolic_indices = [ symbolic.symbol(f'__SYM_IDX{dim}', nonnegative=True, integer=True) for dim in range(map_dimensions)] - symbolic_index_bounds = [idx + (block_dim * rng[2]) - 1 for idx, block_dim, rng in zip(symbolic_indices, kernel_block_dims, map_range)] - symbolic_coordinates = map_range.coord_at(symbolic_indices) - - - # ----------------- Generate Index Variable Definitions ----------------------- - - # Get the block's index dace data type - block_id_ctype = self._current_kernel_spec.gpu_index_ctype - - for dim in range(map_dimensions): - var_name = scope_map.params[-dim - 1] # also reverse it here! - - if dim < 3: - # First three dimensions: direct mapping or partial delinearization - if dim == 2 and map_dimensions > 3: - tail_prod = product(map_dim_sizes[3:]) - base_expr = f"(threadIdx.z / ({symbolic_to_cpp(tail_prod)}))" - else: - base_expr = f"threadIdx.{_get_cuda_dim(dim)}" - else: - # Dimensions beyond the third: full delinearization - tail_prod = product(map_dim_sizes[dim + 1:]) - base_expr = (f"(threadIdx.z / ({symbolic_to_cpp(tail_prod)})) % "f"({symbolic_to_cpp(map_dim_sizes[dim])})") - - - var_def = symbolic_to_cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', base_expr) - kernel_stream.write(f'{block_id_ctype} {var_name} = {var_def};', cfg, state_id, node) - self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, block_id_ctype) - - - # ----------------- Guard Conditions for Block Execution ----------------------- - - # Generate conditions for this block's execution using min and max - # element, e.g. 
skipping out-of-bounds threads in trailing block - minels = map_range.min_element() - maxels = map_range.max_element() - for dim, (var_name, start, end) in enumerate(zip(scope_map.params[::-1], minels, maxels)): - - # Optimize conditions if they are always true - ############################################# - - condition = '' - - # Block range start - if dim >= 3 or (symbolic_indices[dim] >= start) != True: - condition += f'{var_name} >= {symbolic_to_cpp(start)}' - - # Special case: block size is exactly the range of the map (0:b) - if dim >= 3: - skipcond = False - else: - skipcond = symbolic_index_bounds[dim].subs({symbolic_indices[dim]: start}) == end - - # Block range end - if dim >= 3 or (not skipcond and (symbolic_index_bounds[dim] < end) != True): - if len(condition) > 0: - condition += ' && ' - condition += f'{var_name} < {symbolic_to_cpp(end + 1)}' - - # Emit condition in code if any - if len(condition) > 0: - scopeManager.open(condition=condition) - - - # ----------------- Dispatch Subgraph code generation ----------------------- - - self._dispatcher.dispatch_subgraph(sdfg, cfg, dfg_scope, state_id, function_stream, - kernel_stream, skip_entry_node=True) - - - - - def _generate_GPU_Warp_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, - function_stream: CodeIOStream, kernel_stream: CodeIOStream) -> None: - - - with KernelScopeManager(cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, - function_stream=function_stream, callsite_stream=kernel_stream, comment="WarpLevel Scope") as scopeManager: - - - # Get kernel specifications - kernel_spec = self._current_kernel_spec - block_dims = kernel_spec.block_dims - warpSize = kernel_spec.warpSize - - state_dfg = cfg.state(state_id) - node = dfg_scope.source_nodes()[0] - scope_map = node.map - - map_range = subsets.Range(scope_map.range[::-1]) # Reversed for potential better performance - warp_dim = len(map_range) - - # The following sizes and bounds are be symbolic - num_threads_in_block = product(block_dims) - warp_dim_bounds = [max_elem + 1 for max_elem in map_range.max_element()] - num_warps = product(warp_dim_bounds) - - - # The C type used to define the (flat) threadId and warpId variables - ids_ctype = kernel_spec.gpu_index_ctype - - # ----------------- Guard checks ----------------------- - - - # handles checks either at compile time or runtime (i.e. 
checks in the generated code) - self._handle_GPU_Warp_scope_guards(state_dfg, node, map_range, warp_dim, num_threads_in_block, num_warps, - kernel_stream, scopeManager) - - - - # ----------------- Define (flat) Thread ID within Block ----------------------- - - flattened_terms = [] - - for i, dim_size in enumerate(block_dims): - - if dim_size == 1: - continue - - dim = _get_cuda_dim(i) - stride = [f"{block_dims[j]}" for j in range(i) if block_dims[j] > 1] - idx_expr = " * ".join(stride + [f"threadIdx.{_get_cuda_dim(i)}"]) if stride else f"threadIdx.{dim}" - flattened_terms.append(idx_expr) - - - joined_terms = " + ".join(flattened_terms) - flat_thread_idx_expr = f"({joined_terms})" if len(flattened_terms) > 1 else joined_terms - - threadID_name = 'ThreadId_%s_%d_%d_%d' % (scope_map.label, cfg.cfg_id, state_dfg.block_id, state_dfg.node_id(node)) - - kernel_stream.write(f"{ids_ctype} {threadID_name} = ({flat_thread_idx_expr}) / {warpSize};", cfg, state_id, node) - self._dispatcher.defined_vars.add(threadID_name, DefinedType.Scalar, ids_ctype) - - - - # ----------------- Compute Map indices (= Warp indices) ----------------------- - - for i in range(warp_dim): - var_name = scope_map.params[-i - 1] # reverse order - previous_sizes = warp_dim_bounds[:i] - - if len(previous_sizes) > 0: - divisor = product(previous_sizes) - expr = f"({threadID_name} / {divisor}) % {warp_dim_bounds[i]}" - else: - expr = f"{threadID_name} % {warp_dim_bounds[i]}" - - kernel_stream.write(f"{ids_ctype} {var_name} = {expr};", cfg, state_id, node) - self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, ids_ctype) - + # Append generated kernel code to localcode + self._localcode.write(kernel_stream.getvalue() + '\n') + # Exit kernel context + ExperimentalCUDACodeGen._in_kernel_code = False - # ----------------- Guard Conditions for Warp Execution ----------------------- + # Generate kernel launch + self._generate_kernel_launch(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + return - if num_warps * warpSize != num_threads_in_block: - condition = f'{threadID_name} < {num_warps}' - scopeManager.open(condition) - warp_range = [(start, end + 1, stride) for start, end, stride in map_range.ranges] + #--------------- Nested GPU Scope -------------------- - for dim, (var_name, (start, _, stride)) in enumerate(zip(scope_map.params[::-1], warp_range)): - - condition_terms = [] - - if start != 0: - condition_terms.append(f"{var_name} >= {start}") - - if stride != 1: - expr = var_name if start == 0 else f"({var_name} - {start})" - condition_terms.append(f'{expr} % {stride} == 0' ) - - if condition_terms: - condition = " && ".join(condition_terms) - scopeManager.open(condition) + supported_strategies: List[ScopeGenerationStrategy] = [ + ThreadBlockScopeGenerator(codegen=self), + WarpScopeGenerator(codegen=self) + ] + for strategy in supported_strategies: + if strategy.applicable(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream): + strategy.generate(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + return - # ----------------- Dispatch Subgraph code generation ----------------------- + #--------------- Unsupported Cases -------------------- + # Note: We are inside a nested GPU scope at this point. 
+ node = dfg_scope.source_nodes()[0] + schedule_type = node.map.schedule - self._dispatcher.dispatch_subgraph( - sdfg, cfg, dfg_scope, state_id, function_stream, - kernel_stream, skip_entry_node=True + if schedule_type == dace.ScheduleType.GPU_Device: + raise NotImplementedError( + "Dynamic parallelism (nested GPU_Device schedules) is not supported." ) - - - - def _handle_GPU_Warp_scope_guards(self, state_dfg: SDFGState, node: nodes.MapEntry, map_range: subsets.Range, - warp_dim: int, num_threads_in_block, num_warps, kernel_stream: CodeIOStream, - scopeManager: 'KernelScopeManager'): + raise NotImplementedError( + f"Scope generation for schedule type '{schedule_type}' is not implemented in ExperimentalCUDACodeGen. " + "Please check for supported schedule types or implement the corresponding strategy." + ) - #TODO: Move them to sdfg validation as well if possible - - # Get warpSize from the kernel specification - warpSize = self._current_kernel_spec.warpSize - - parent_map, _ = helpers.get_parent_map(state_dfg, node) - if parent_map.schedule != dtypes.ScheduleType.GPU_ThreadBlock: - raise ValueError("GPU_Warp map must be nested within a GPU_ThreadBlock map.") - - if warp_dim > 3: - raise NotImplementedError("GPU_Warp maps are limited to 3 dimensions.") - - - # Guard against invalid thread/block configurations. - # - For concrete (compile-time) values, raise Python errors early. - # - For symbolic values, insert runtime CUDA checks (guards) into the generated kernel. - # These will emit meaningful error messages and abort execution if violated. - if isinstance(num_threads_in_block, symbolic.symbol): - condition = ( - f"{num_threads_in_block} % {warpSize} != 0 || " - f"{num_threads_in_block} > 1024 || " - f"{num_warps} * {warpSize} > {num_threads_in_block}" - ) - kernel_stream.write(f"""\ - if ({condition}) {{ - printf("CUDA error:\\n" - "1. Block must be a multiple of {warpSize} threads (DaCe requirement for GPU_Warp scheduling).\\n" - "2. Block size must not exceed 1024 threads (CUDA hardware limit).\\n" - "3. 
Number of warps x {warpSize} must fit in the block (otherwise logic is unclear).\\n"); - asm("trap;"); - }} - """) - - else: - if isinstance(num_warps, symbolic.symbol): - condition = f"{num_warps} * {warpSize} > {num_threads_in_block}" - scopeManager.open(condition=condition) - - elif num_warps * warpSize > num_threads_in_block: - raise ValueError(f"Invalid configuration: {num_warps} warps x {warpSize} threads exceed " - f"{num_threads_in_block} threads in the block.") - - if num_threads_in_block % warpSize != 0: - raise ValueError(f"Block must be a multiple of {warpSize} threads for GPU_Warp scheduling " - f"(got {num_threads_in_block}).") - - if num_threads_in_block > 1024: - raise ValueError("CUDA does not support more than 1024 threads per block (hardware limit).") - - - for min_element in map_range.min_element(): - if isinstance(min_element, symbolic.symbol): - kernel_stream.write(f'if ({min_element} < 0) {{\n' - f' printf("Runtime error: Warp ID symbol {min_element} must be non-negative.\\n");\n' - f' asm("trap;");\n' - f'}}\n') - elif min_element < 0: - raise ValueError(f"Warp ID value {min_element} must be non-negative.") - - - def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -803,9 +413,6 @@ def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope callsite_stream.write( '__dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(kernel_wrapper_args)), cfg, state_id, scope_entry) - - - def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -889,15 +496,74 @@ def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: self._localcode.write(f'DACE_KERNEL_LAUNCH_CHECK(__err, "{kernel_name}", {gdims}, {bdims});') self._localcode.write('}') + ########################################################################### + # Generation of Memory Copy Logic + + def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + src_node: Union[nodes.Tasklet, nodes.AccessNode], dst_node: Union[nodes.CodeNode, nodes.AccessNode], + edge: Tuple[nodes.Node, str, nodes.Node, str, Memlet], + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + + from dace.codegen.targets.experimental_cuda_helpers.copy_strategies import ( + CopyContext, + CopyStrategy, + OutOfKernelCopyStrategy, + WithinGPUCopyStrategy, + FallBackGPUCopyStrategy, + ) + context = CopyContext(self, self._gpu_stream_manager, state_id, src_node, dst_node, edge, + sdfg, cfg, dfg, callsite_stream) - + # Order matters: fallback must come last + strategies: List[CopyStrategy] = [ + OutOfKernelCopyStrategy(), + WithinGPUCopyStrategy(), + FallBackGPUCopyStrategy() + ] + + for strategy in strategies: + if strategy.applicable(context): + strategy.generate_copy(context) + return + + raise RuntimeError("No applicable GPU memory copy strategy found (this should not happen).") + + ############################################################################# + # Predicates for Dispatcher -################################# NESTED SDFG handling ############################################ -# testing phase + def state_dispatch_predicate(self, sdfg, state): + """ + Determines whether a state should be handled by this + code generator (`ExperimentalCUDACodeGen`). 
+ Returns True if the generator is currently generating kernel code. + """ + return ExperimentalCUDACodeGen._in_kernel_code + + def node_dispatch_predicate(self, sdfg, state, node): + """ + Determines whether a node should be handled by this + code generator (`ExperimentalCUDACodeGen`). + + Returns True if: + - The node has a GPU schedule handled by this backend, or + - The generator is currently generating kernel code. + """ + schedule = getattr(node, 'schedule', None) + + if schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: + return True + + if ExperimentalCUDACodeGen._in_kernel_code: + return True + + return False + ############################################################################# + # Nested SDFG related, testing phase + def generate_state(self, sdfg: SDFG, cfg: ControlFlowRegion, @@ -928,34 +594,6 @@ def _emit_sync(self, codestream: CodeIOStream): if Config.get_bool('compiler', 'cuda', 'syncdebug'): codestream.write('''DACE_GPU_CHECK({backend}GetLastError()); DACE_GPU_CHECK({backend}DeviceSynchronize());'''.format(backend=self.backend)) - - def state_dispatch_predicate(self, sdfg, state): - """ - Determines whether a state should be handled by this - code generator (`ExperimentalCUDACodeGen`). - - Returns True if the generator is currently generating kernel code. - """ - return ExperimentalCUDACodeGen._in_kernel_code - - def node_dispatch_predicate(self, sdfg, state, node): - """ - Determines whether a node should be handled by this - code generator (`ExperimentalCUDACodeGen`). - - Returns True if: - - The node has a GPU schedule handled by this backend, or - - The generator is currently generating kernel code. - """ - schedule = getattr(node, 'schedule', None) - - if schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: - return True - - if ExperimentalCUDACodeGen._in_kernel_code: - return True - - return False def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -1000,9 +638,8 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub self._toplevel_schedule = old_schedule -####################################################################### - # Rather Minor "actual" changes, but much nicer to extend and maintain - + ####################################################################### + # Array Declaration, Allocation and Deallocation def declare_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, @@ -1040,7 +677,6 @@ def declare_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphVi declaration_stream.write(f'{array_ctype} {dataname};\n', cfg, state_id, node) self._dispatcher.declared_arrays.add(dataname, DefinedType.Pointer, array_ctype) - def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream) -> None: @@ -1076,7 +712,6 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV else: raise NotImplementedError(f'CUDA: Unimplemented storage type {nodedesc.storage}') - def _prepare_GPU_Global_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, 
declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): @@ -1124,7 +759,6 @@ def _prepare_GPU_Global_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta cfg, state_id, node ) - def _prepare_CPU_Pinned_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): @@ -1162,7 +796,6 @@ def _prepare_CPU_Pinned_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta cfg, state_id, node ) - def _prepare_GPU_Shared_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): @@ -1197,7 +830,6 @@ def _prepare_GPU_Shared_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta cfg, state_id, node ) - def _prepare_Register_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): @@ -1223,7 +855,6 @@ def _prepare_Register_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: State self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) - def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -1273,10 +904,6 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap else: raise NotImplementedError(f'Deallocation not implemented for storage type: {nodedesc.storage.name}') - -####################################################################### -# Copy-pasted, might be changed in future - def get_generated_codeobjects(self): # My comment: first part creates the header and stores it in a object property @@ -1431,6 +1058,9 @@ def get_generated_codeobjects(self): return [self._codeobject] + ####################################################################### + # Compilation Related + @staticmethod def cmake_options(): options = [] @@ -1468,6 +1098,9 @@ def cmake_options(): return options + ####################################################################### + # Callback to CPU codegen + def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_dfg: StateSubgraphView, state_id: int, src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge[Memlet], function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -1478,39 +1111,6 @@ def process_out_memlets(self, *args, **kwargs): # Call CPU implementation with this code generator as callback self._cpu_codegen.process_out_memlets(*args, codegen=self, **kwargs) - def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, - src_node: Union[nodes.Tasklet, nodes.AccessNode], dst_node: Union[nodes.CodeNode, nodes.AccessNode], - edge: Tuple[nodes.Node, str, nodes.Node, str, Memlet], - function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - - # NOTE: - # There should be additional cudastream handling implemented- I can make this a TODO - - from dace.codegen.targets.new_cuda_codegen.copy_strategies import ( - CopyContext, CopyStrategy, OutOfKernelCopyStrategy, - WithinGPUCopyStrategy, FallBackGPUCopyStrategy) - - context = CopyContext(self, 
self._gpu_stream_manager, state_id, src_node, dst_node, edge, - sdfg, cfg, dfg, callsite_stream) - - # Order matters: fallback must come last - strategies: List[CopyStrategy] = [ - OutOfKernelCopyStrategy(), - WithinGPUCopyStrategy(), - FallBackGPUCopyStrategy() - ] - - for strategy in strategies: - if strategy.applicable(context): - strategy.generate_copy(context) - return - - raise RuntimeError("No applicable GPU memory copy strategy found (this should not happen).") - - - - - @@ -1518,29 +1118,8 @@ def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView ######################################################################### # helper classes and functions -def symbolic_to_cpp(arr): - """ Converts an array of symbolic variables (or one) to C++ strings. """ - if not isinstance(arr, list): - return cppunparse.pyexpr2cpp(symbolic.symstr(arr, cpp_mode=True)) - return [cppunparse.pyexpr2cpp(symbolic.symstr(d, cpp_mode=True)) for d in arr] - -def _get_cuda_dim(idx): - """ Converts 0 to x, 1 to y, 2 to z, or raises an exception. """ - if idx < 0 or idx > 2: - raise ValueError('idx must be between 0 and 2, got %d' % idx) - return ('x', 'y', 'z')[idx] - -def product(iterable): - """ - Computes the symbolic product of elements in the iterable using sympy.Mul. - - This is equivalent to: ```functools.reduce(sympy.Mul, iterable, 1)```. - - Purpose: This function is used to improve readability of the codeGen. - """ - return functools.reduce(sympy.Mul, iterable, 1) - -# Functions I had to redefine locally to not modify other files and ensure backwards compatibility +# NOTE: I had to redefine this function locally to not modify other files +# and ensure backwards compatibility with the old cudacodegen def ptr(name: str, desc: dace.data.Data, sdfg: SDFG = None, framecode=None) -> str: """ Returns a string that points to the data based on its name and descriptor. @@ -1578,6 +1157,9 @@ def ptr(name: str, desc: dace.data.Data, sdfg: SDFG = None, framecode=None) -> s return name +# This one is closely linked to the ExperimentalCUDACodeGen. In fact, +# it only exists to not have to much attributes and methods in the ExperimentalCUDACodeGen +# and to group Kernel specific methods & information. Thus, KernelSpec should remain in this file class KernelSpec: """ A helper class to encapsulate information required for working with kernels. @@ -1925,7 +1507,6 @@ def _get_maps_affecting_launch_dims(self, graph: ScopeSubgraphView) -> List[Tupl - @property def kernel_name(self) -> list[str]: """Returns the kernel name.""" @@ -1996,133 +1577,3 @@ def gpu_index_ctype(self) -> str: """ return self._gpu_index_ctype - - -class KernelScopeManager: - """ - A helper class to manage opening and closing brackets in a structured way using the 'with' statement. - This class simplifies the process of correctly opening and closing brackets. It also supports an optional - debug mode to include comments in the generated code, which can help with debugging and understanding - the code structure. - """ - - def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, - cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, - function_stream: CodeIOStream, callsite_stream: CodeIOStream, comment: str = None, - debug: bool = True): - """ - Initializes the KernelScopeManager. - - :param cudaCodeGen: The ExperimentalCUDACodeGen instance for potential future use. - :param sdfg: The SDFG instance for context. - :param cfg: The ControlFlowRegion instance for context. 
- :param dfg_scope: The ScopeSubgraphView instance for context. - :param state_id: The ID of the current state for context. - :param function_stream: The CodeIOStream for function-level code. - :param callsite_stream: The CodeIOStream for callsite-level code. - :param comment: A descriptive comment explaining the purpose of the code block being opened. Default is None. - :param debug: Whether to include debug comments in the output. Defaults to False. - """ - self.cudaCodeGen = cudaCodeGen - self.sdfg = sdfg - self.cfg = cfg - self.dfg_scope = dfg_scope - self.state_id = state_id - self.function_stream = function_stream - self.callsite_stream = callsite_stream - self.comment = comment - self.debug = debug - self._opened = 0 - - self.entry_node = self.dfg_scope.source_nodes()[0] - self.exit_node = self.dfg_scope.sink_nodes()[0] - - def __enter__(self): - """ - Writes the opening bracket to the stream and allocates arrays in scope. - """ - self.open() - self.cudaCodeGen._frame.allocate_arrays_in_scope( - self.sdfg, self.cfg, self.entry_node, self.function_stream, self.callsite_stream - ) - return self - - def __exit__(self, exc_type, exc_value, traceback): - """ - Deallocates arrays in scope and writes the closing brackets to the stream. - """ - self.cudaCodeGen._frame.deallocate_arrays_in_scope( - self.sdfg, self.cfg, self.entry_node, self.function_stream, self.callsite_stream - ) - for i in range(self._opened): - line = "}" - if self.debug: - line += f" // {self.comment} (close {i + 1})" - self.callsite_stream.write(line, self.cfg, self.state_id, self.exit_node) - - def open(self, condition: str = None): - """ - Opens a bracket. If a condition is given, emits 'if (condition) {', otherwise just '{'. - Tracks the number of open brackets for closing later. - - :param condition: Optional condition for the opening bracket. - """ - line = f"if ({condition}) {{" if condition else "{" - if self.debug: - line += f" // {self.comment} (open {self._opened + 1})" - self.callsite_stream.write(line, self.cfg, self.state_id, self.entry_node) - self._opened += 1 - - - -class GPUStreamManager: - """ - Manages GPU backend streams (e.g., CUDA or HIP streams) for nodes in an SDFG. - Assumes that the initialization inputs come from the NaiveGPUScheduler pass. - - NOTE: "Stream" here refers to backend GPU streams, not DaCe data streams. - """ - - def __init__(self, sdfg: SDFG, assigned_streams: Dict[nodes.Node, Union[int, str]], stream_access_template: str): - self.sdfg = sdfg - self.assigned_streams = assigned_streams - self.stream_access_template = stream_access_template - - # Placeholder for future support of backend events (e.g., CUDA events) - self.num_gpu_events = 0 - - # Determine the number of streams used (stream IDs start from 0) - # Only count integer stream IDs (ignore string values like "nullptr") - int_stream_ids = [v for v in assigned_streams.values() if isinstance(v, int)] - self.num_gpu_streams = max(int_stream_ids, default=0) - - def get_stream_node(self, node: nodes.Node) -> str: - """ - Returns the GPU stream access expression for a given node. - - If the node has an assigned stream not equal the default "nullptr", returns - the formatted stream expression. Otherwise, returns "nullptr". 
- """ - if node in self.assigned_streams and self.assigned_streams[node] != "nullptr": - return self.stream_access_template.format(gpu_stream=self.assigned_streams[node]) - return "nullptr" - - def get_stream_edge(self, src_node: nodes.Node, dst_node: nodes.Node) -> str: - """ - Returns the stream access expression for an edge based on either the - source or destination node. If one of the nodes has an assigned stream not equal - to the default 'nullptr', that stream is returned (should be symmetric - when using the NaiveGPUStreamScheduler pass). Otherwise, returns 'nullptr'. - """ - if src_node in self.assigned_streams and self.assigned_streams[src_node] != "nullptr": - stream_id = self.assigned_streams[src_node] - return self.stream_access_template.format(gpu_stream=stream_id) - elif dst_node in self.assigned_streams and self.assigned_streams[dst_node] != "nullptr": - stream_id = self.assigned_streams[dst_node] - return self.stream_access_template.format(gpu_stream=stream_id) - else: - return "nullptr" - - - - diff --git a/dace/codegen/targets/new_cuda_codegen/copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py similarity index 99% rename from dace/codegen/targets/new_cuda_codegen/copy_strategies.py rename to dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py index 8b7bb32e8e..a1371547d5 100644 --- a/dace/codegen/targets/new_cuda_codegen/copy_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py @@ -4,8 +4,8 @@ from dace import symbolic from dace import Memlet, dtypes from dace.dtypes import StorageType -from dace.codegen.targets.new_cuda_codegen.experimental_cuda import ExperimentalCUDACodeGen, GPUStreamManager, product - +from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen, GPUStreamManager +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import product from dace.codegen.prettycode import CodeIOStream diff --git a/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py b/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py new file mode 100644 index 0000000000..f2fa05e9a9 --- /dev/null +++ b/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py @@ -0,0 +1,50 @@ +from typing import Dict, Union +from dace import SDFG, nodes + +class GPUStreamManager: + """ + Manages GPU backend streams (e.g., CUDA or HIP streams) for nodes in an SDFG. + Assumes that the initialization inputs come from the NaiveGPUScheduler pass. + + NOTE: "Stream" here refers to backend GPU streams, not DaCe data streams. + """ + + def __init__(self, sdfg: SDFG, assigned_streams: Dict[nodes.Node, Union[int, str]], stream_access_template: str): + self.sdfg = sdfg + self.assigned_streams = assigned_streams + self.stream_access_template = stream_access_template + + # Placeholder for future support of backend events (e.g., CUDA events) + self.num_gpu_events = 0 + + # Determine the number of streams used (stream IDs start from 0) + # Only count integer stream IDs (ignore string values like "nullptr") + int_stream_ids = [v for v in assigned_streams.values() if isinstance(v, int)] + self.num_gpu_streams = max(int_stream_ids, default=0) + + def get_stream_node(self, node: nodes.Node) -> str: + """ + Returns the GPU stream access expression for a given node. + + If the node has an assigned stream not equal the default "nullptr", returns + the formatted stream expression. Otherwise, returns "nullptr". 
+ """ + if node in self.assigned_streams and self.assigned_streams[node] != "nullptr": + return self.stream_access_template.format(gpu_stream=self.assigned_streams[node]) + return "nullptr" + + def get_stream_edge(self, src_node: nodes.Node, dst_node: nodes.Node) -> str: + """ + Returns the stream access expression for an edge based on either the + source or destination node. If one of the nodes has an assigned stream not equal + to the default 'nullptr', that stream is returned (should be symmetric + when using the NaiveGPUStreamScheduler pass). Otherwise, returns 'nullptr'. + """ + if src_node in self.assigned_streams and self.assigned_streams[src_node] != "nullptr": + stream_id = self.assigned_streams[src_node] + return self.stream_access_template.format(gpu_stream=stream_id) + elif dst_node in self.assigned_streams and self.assigned_streams[dst_node] != "nullptr": + stream_id = self.assigned_streams[dst_node] + return self.stream_access_template.format(gpu_stream=stream_id) + else: + return "nullptr" \ No newline at end of file diff --git a/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py new file mode 100644 index 0000000000..3a4f3dcde1 --- /dev/null +++ b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py @@ -0,0 +1,28 @@ +import functools +import sympy + +from dace import symbolic +from dace.codegen import cppunparse + + +def symbolic_to_cpp(arr): + """ Converts an array of symbolic variables (or one) to C++ strings. """ + if not isinstance(arr, list): + return cppunparse.pyexpr2cpp(symbolic.symstr(arr, cpp_mode=True)) + return [cppunparse.pyexpr2cpp(symbolic.symstr(d, cpp_mode=True)) for d in arr] + +def get_cuda_dim(idx): + """ Converts 0 to x, 1 to y, 2 to z, or raises an exception. """ + if idx < 0 or idx > 2: + raise ValueError(f'idx must be between 0 and 2, got {idx}') + return ('x', 'y', 'z')[idx] + +def product(iterable): + """ + Computes the symbolic product of elements in the iterable using sympy.Mul. + + This is equivalent to: ```functools.reduce(sympy.Mul, iterable, 1)```. + + Purpose: This function is used to improve readability of the codeGen. 
+ """ + return functools.reduce(sympy.Mul, iterable, 1) \ No newline at end of file diff --git a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py new file mode 100644 index 0000000000..5b886d61a0 --- /dev/null +++ b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py @@ -0,0 +1,574 @@ +# Standard library imports +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Dict, Type + +# DaCe core imports +import dace +from dace import dtypes, subsets, symbolic + +# DaCe SDFG imports +from dace.sdfg import SDFG, ScopeSubgraphView, nodes, SDFGState +from dace.sdfg.state import ControlFlowRegion + +# DaCe codegen imports +from dace.codegen.prettycode import CodeIOStream +from dace.codegen.targets.framecode import DaCeCodeGenerator +from dace.codegen.dispatcher import DefinedType, TargetDispatcher + +# DaCe transformation imports +from dace.transformation import helpers + +# Experimental CUDA imports +from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen, KernelSpec +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import ( + symbolic_to_cpp, + get_cuda_dim, + product +) + + +#---------------------------------------------------------------------------------- +# GPU Scope Generation Strategies +#---------------------------------------------------------------------------------- + +class ScopeGenerationStrategy(ABC): + """Base strategy for generating GPU scope code""" + + def __init__(self, codegen: ExperimentalCUDACodeGen): + self.codegen: ExperimentalCUDACodeGen = codegen + self._dispatcher: TargetDispatcher = codegen._dispatcher + self._current_kernel_spec: KernelSpec = codegen._current_kernel_spec + + @abstractmethod + def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: + raise NotImplementedError('Abstract class') + + @abstractmethod + def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + raise NotImplementedError('Abstract class') + + +class KernelScopeGenerator(ScopeGenerationStrategy): + + def __init__(self, codegen: ExperimentalCUDACodeGen): + super().__init__(codegen) + + def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: + + node = dfg_scope.source_nodes()[0] + schedule_type = node.map.schedule + + # This strategy starts kernel code generation and is only valid if + # the outermost (first) GPU schedule is of type GPU_Device. 
+ applicable = schedule_type == dtypes.ScheduleType.GPU_Device + return applicable + + def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream): + + + # Generate kernel function signature + self._generate_kernel_signature(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + + # Generate kernel body + with ScopeManager(frame_codegen=self.codegen._frame, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, + function_stream=function_stream, callsite_stream=callsite_stream, comment="Kernel scope") as scope_manager: + + + # ----------------- Retrieve kernel configuration ----------------------- + kernel_spec = self._current_kernel_spec + kernel_entry_node = kernel_spec._kernel_entry_node # = dfg_scope.source_nodes()[0] + kernel_map = kernel_spec.kernel_map + has_tbmap = kernel_spec.has_tbmap + kernel_block_dims = self._current_kernel_spec.block_dims + + + # ----------------- Kernel/Map Range Preprocessing ----------------------- + + reversed_kernel_range = kernel_map.range[::-1] # also reverse it + kernel_range = subsets.Range(reversed_kernel_range) + kernel_dimensions = len(kernel_range) + kernel_dim_sizes = kernel_range.size() + + + # ----------------- Set up symbolic index expressions ----------------------- + + symbolic_indices = [ symbolic.symbol(f'__SYM_IDX{dim}', nonnegative=True, integer=True) for dim in range(kernel_dimensions)] + symbolic_index_bounds = [ idx + block_dim - 1 for idx, block_dim in zip(symbolic_indices, kernel_block_dims)] + symbolic_coordinates = kernel_range.coord_at(symbolic_indices) + + + # ----------------- Generate Thread or Block index Definitions ----------------------- + + + thread_id_ctype = kernel_spec.gpu_index_ctype # Data type of CUDA thread/block indices + + + # In case there is no ThreadBlock map used in a submap, the map variables will + # be mapped to thread IDs instead of block IDs + for dim in range(kernel_dimensions): + + var_name = kernel_map.params[-dim - 1] # also reverse it here! 
+ + # Compute index expressions for up to 3 dimensions (x, y, z) + if dim < 3: + if has_tbmap: + index_expr = f'blockIdx.{get_cuda_dim(dim)}' + else: + index_expr = f'(blockIdx.{get_cuda_dim(dim)} * {symbolic_to_cpp(kernel_block_dims[dim])} + threadIdx.{get_cuda_dim(dim)})' + + # Delinearize third dimension if more than 3D (used in 3D+ mapping) + if dim == 2 and kernel_dimensions > 3: + tail_prod = product(kernel_dim_sizes[3:]) + index_expr = f"({index_expr} / ({symbolic_to_cpp(tail_prod)}))" + + else: # Handle dimensions beyond the third (delinearize and modulo) + if has_tbmap: + index_expr = f'blockIdx.z' + else: + index_expr = f'(blockIdx.z * {symbolic_to_cpp(kernel_block_dims[2])} + threadIdx.z)' + + tail_prod = product(kernel_dim_sizes[dim + 1:]) + index_expr = (f"({index_expr} / ({symbolic_to_cpp(tail_prod)})) % ({symbolic_to_cpp(kernel_dim_sizes[dim])})") + + + # Define thread/Block index + var_def = symbolic_to_cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', index_expr) + callsite_stream.write(f'{thread_id_ctype} {var_name} = {var_def};', cfg, state_id, kernel_entry_node) + self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, thread_id_ctype) + + + # ----------------- Guard Conditions for Block Execution ----------------------- + + if not has_tbmap: + minels = kernel_range.min_element() + maxels = kernel_range.max_element() + + for dim, (var_name, start, end) in enumerate(zip(kernel_map.params[::-1], minels, maxels)): + condition = '' + + # Optimize conditions if they are always true + if dim >= 3 or (symbolic_indices[dim] >= start) != True: + condition += f'{var_name} >= {symbolic_to_cpp(start)}' + + if (dim >= 3 or ((symbolic_index_bounds[dim] < end) != False + and ((symbolic_index_bounds[dim] % kernel_block_dims[dim]) != 0) == True) or (kernel_block_dims[dim] > end) == True): + + if len(condition) > 0: + condition += ' && ' + condition += f'{var_name} < {symbolic_to_cpp(end + 1)}' + + if len(condition) > 0: + scope_manager.open(condition=condition) + + + # ----------------- Dispatch Subgraph code generation ----------------------- + + self._dispatcher.dispatch_subgraph(sdfg, cfg, dfg_scope, state_id, function_stream, + callsite_stream, skip_entry_node=True) + + def _generate_kernel_signature(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream): + + kernel_name = self._current_kernel_spec.kernel_name + kernel_args = self._current_kernel_spec.args_typed + block_dims = self._current_kernel_spec.block_dims + node = dfg_scope.source_nodes()[0] + + # Conditionally add __launch_bounds__ for block size optimization. 
+ launch_bounds = '' + if node.gpu_launch_bounds != '-1': + if node.gpu_launch_bounds == "0": + if not any(symbolic.issymbolic(b) for b in block_dims): + launch_bounds = f'__launch_bounds__({product(block_dims)})' + else: + launch_bounds = f'__launch_bounds__({node.gpu_launch_bounds})' + + + # Emit kernel function signature + callsite_stream.write( + f'__global__ void {launch_bounds} {kernel_name}({", ".join(kernel_args)}) ', + cfg, state_id, node + ) + + +class ThreadBlockScopeGenerator(ScopeGenerationStrategy): + + def __init__(self, codegen: ExperimentalCUDACodeGen): + super().__init__(codegen) + + def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: + + node = dfg_scope.source_nodes()[0] + applicable = node.map.schedule == dtypes.ScheduleType.GPU_ThreadBlock + + return applicable + + def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream): + + # NOTE: not my code, but my insights. Approval for commenting this needed + with ScopeManager(frame_codegen=self.codegen._frame, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, + function_stream=function_stream, callsite_stream=callsite_stream, comment="ThreadBlock Scope") as scope_manager: + + node = dfg_scope.source_nodes()[0] + scope_map = node.map + + + # ----------------- Map Range Preprocessing ----------------------- + + # Reverse range for better performance (e.g. memory coalescing) + reversed_scope_range = scope_map.range[::-1] + map_range = subsets.Range(reversed_scope_range) + map_dimensions = len(map_range) + map_dim_sizes = map_range.size() + + kernel_block_dims = self._current_kernel_spec.block_dims + + + # ----------------- Symbolic Index Expressions ----------------------- + + symbolic_indices = [ symbolic.symbol(f'__SYM_IDX{dim}', nonnegative=True, integer=True) for dim in range(map_dimensions)] + symbolic_index_bounds = [idx + (block_dim * rng[2]) - 1 for idx, block_dim, rng in zip(symbolic_indices, kernel_block_dims, map_range)] + symbolic_coordinates = map_range.coord_at(symbolic_indices) + + + # ----------------- Generate Index Variable Definitions ----------------------- + + # Get the block's index dace data type + block_id_ctype = self._current_kernel_spec.gpu_index_ctype + + for dim in range(map_dimensions): + var_name = scope_map.params[-dim - 1] # also reverse it here! + + if dim < 3: + # First three dimensions: direct mapping or partial delinearization + if dim == 2 and map_dimensions > 3: + tail_prod = product(map_dim_sizes[3:]) + base_expr = f"(threadIdx.z / ({symbolic_to_cpp(tail_prod)}))" + else: + base_expr = f"threadIdx.{get_cuda_dim(dim)}" + else: + # Dimensions beyond the third: full delinearization + tail_prod = product(map_dim_sizes[dim + 1:]) + base_expr = (f"(threadIdx.z / ({symbolic_to_cpp(tail_prod)})) % "f"({symbolic_to_cpp(map_dim_sizes[dim])})") + + + var_def = symbolic_to_cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', base_expr) + callsite_stream.write(f'{block_id_ctype} {var_name} = {var_def};', cfg, state_id, node) + self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, block_id_ctype) + + + # ----------------- Guard Conditions for Block Execution ----------------------- + + # Generate conditions for this block's execution using min and max + # element, e.g. 
skipping out-of-bounds threads in trailing block + minels = map_range.min_element() + maxels = map_range.max_element() + for dim, (var_name, start, end) in enumerate(zip(scope_map.params[::-1], minels, maxels)): + + # Optimize conditions if they are always true + ############################################# + + condition = '' + + # Block range start + if dim >= 3 or (symbolic_indices[dim] >= start) != True: + condition += f'{var_name} >= {symbolic_to_cpp(start)}' + + # Special case: block size is exactly the range of the map (0:b) + if dim >= 3: + skipcond = False + else: + skipcond = symbolic_index_bounds[dim].subs({symbolic_indices[dim]: start}) == end + + # Block range end + if dim >= 3 or (not skipcond and (symbolic_index_bounds[dim] < end) != True): + if len(condition) > 0: + condition += ' && ' + condition += f'{var_name} < {symbolic_to_cpp(end + 1)}' + + # Emit condition in code if any + if len(condition) > 0: + scope_manager.open(condition=condition) + + + # ----------------- Dispatch Subgraph code generation ----------------------- + + self._dispatcher.dispatch_subgraph(sdfg, cfg, dfg_scope, state_id, function_stream, + callsite_stream, skip_entry_node=True) + + +class WarpScopeGenerator(ScopeGenerationStrategy): + + def __init__(self, codegen: ExperimentalCUDACodeGen): + super().__init__(codegen) + + def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: + + node = dfg_scope.source_nodes()[0] + applicable = node.map.schedule == dtypes.ScheduleType.GPU_Warp + + return applicable + + def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream): + + with ScopeManager(frame_codegen=self.codegen._frame, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, + function_stream=function_stream, callsite_stream=callsite_stream, comment="WarpLevel Scope") as scope_manager: + + + # Get kernel specifications + kernel_spec = self._current_kernel_spec + block_dims = kernel_spec.block_dims + warpSize = kernel_spec.warpSize + + state_dfg = cfg.state(state_id) + node = dfg_scope.source_nodes()[0] + scope_map = node.map + + map_range = subsets.Range(scope_map.range[::-1]) # Reversed for potential better performance + warp_dim = len(map_range) + + # The following sizes and bounds are be symbolic + num_threads_in_block = product(block_dims) + warp_dim_bounds = [max_elem + 1 for max_elem in map_range.max_element()] + num_warps = product(warp_dim_bounds) + + + # The C type used to define the (flat) threadId and warpId variables + ids_ctype = kernel_spec.gpu_index_ctype + + # ----------------- Guard checks ----------------------- + + + # handles checks either at compile time or runtime (i.e. 
checks in the generated code) + self._handle_GPU_Warp_scope_guards(state_dfg, node, map_range, warp_dim, num_threads_in_block, num_warps, + callsite_stream, scope_manager) + + + + # ----------------- Define (flat) Thread ID within Block ----------------------- + + flattened_terms = [] + + for i, dim_size in enumerate(block_dims): + + if dim_size == 1: + continue + + dim = get_cuda_dim(i) + stride = [f"{block_dims[j]}" for j in range(i) if block_dims[j] > 1] + idx_expr = " * ".join(stride + [f"threadIdx.{get_cuda_dim(i)}"]) if stride else f"threadIdx.{dim}" + flattened_terms.append(idx_expr) + + + joined_terms = " + ".join(flattened_terms) + flat_thread_idx_expr = f"({joined_terms})" if len(flattened_terms) > 1 else joined_terms + + threadID_name = 'ThreadId_%s_%d_%d_%d' % (scope_map.label, cfg.cfg_id, state_dfg.block_id, state_dfg.node_id(node)) + + callsite_stream.write(f"{ids_ctype} {threadID_name} = ({flat_thread_idx_expr}) / {warpSize};", cfg, state_id, node) + self._dispatcher.defined_vars.add(threadID_name, DefinedType.Scalar, ids_ctype) + + + + # ----------------- Compute Map indices (= Warp indices) ----------------------- + + for i in range(warp_dim): + var_name = scope_map.params[-i - 1] # reverse order + previous_sizes = warp_dim_bounds[:i] + + if len(previous_sizes) > 0: + divisor = product(previous_sizes) + expr = f"({threadID_name} / {divisor}) % {warp_dim_bounds[i]}" + else: + expr = f"{threadID_name} % {warp_dim_bounds[i]}" + + callsite_stream.write(f"{ids_ctype} {var_name} = {expr};", cfg, state_id, node) + self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, ids_ctype) + + + + # ----------------- Guard Conditions for Warp Execution ----------------------- + + + if num_warps * warpSize != num_threads_in_block: + condition = f'{threadID_name} < {num_warps}' + scope_manager.open(condition) + + warp_range = [(start, end + 1, stride) for start, end, stride in map_range.ranges] + + for dim, (var_name, (start, _, stride)) in enumerate(zip(scope_map.params[::-1], warp_range)): + + condition_terms = [] + + if start != 0: + condition_terms.append(f"{var_name} >= {start}") + + if stride != 1: + expr = var_name if start == 0 else f"({var_name} - {start})" + condition_terms.append(f'{expr} % {stride} == 0' ) + + if condition_terms: + condition = " && ".join(condition_terms) + scope_manager.open(condition) + + + # ----------------- Dispatch Subgraph code generation ----------------------- + + + self._dispatcher.dispatch_subgraph( + sdfg, cfg, dfg_scope, state_id, function_stream, + callsite_stream, skip_entry_node=True + ) + + def _handle_GPU_Warp_scope_guards(self, state_dfg: SDFGState, node: nodes.MapEntry, map_range: subsets.Range, + warp_dim: int, num_threads_in_block, num_warps, kernel_stream: CodeIOStream, + scope_manager: 'ScopeManager'): + + #TODO: Move them to sdfg validation as well if possible + + # Get warpSize from the kernel specification + warpSize = self._current_kernel_spec.warpSize + + parent_map, _ = helpers.get_parent_map(state_dfg, node) + if parent_map.schedule != dtypes.ScheduleType.GPU_ThreadBlock: + raise ValueError("GPU_Warp map must be nested within a GPU_ThreadBlock map.") + + if warp_dim > 3: + raise NotImplementedError("GPU_Warp maps are limited to 3 dimensions.") + + + # Guard against invalid thread/block configurations. + # - For concrete (compile-time) values, raise Python errors early. + # - For symbolic values, insert runtime CUDA checks (guards) into the generated kernel. 
+        # These will emit meaningful error messages and abort execution if violated.
+        if isinstance(num_threads_in_block, symbolic.symbol):
+            condition = (
+                f"{num_threads_in_block} % {warpSize} != 0 || "
+                f"{num_threads_in_block} > 1024 || "
+                f"{num_warps} * {warpSize} > {num_threads_in_block}"
+            )
+            kernel_stream.write(f"""\
+                if ({condition}) {{
+                    printf("CUDA error:\\n"
+                           "1. Block must be a multiple of {warpSize} threads (DaCe requirement for GPU_Warp scheduling).\\n"
+                           "2. Block size must not exceed 1024 threads (CUDA hardware limit).\\n"
+                           "3. Number of warps x {warpSize} must fit in the block (otherwise logic is unclear).\\n");
+                    asm("trap;");
+                }}
+                """)
+
+        else:
+            if isinstance(num_warps, symbolic.symbol):
+                condition = f"{num_warps} * {warpSize} > {num_threads_in_block}"
+                scope_manager.open(condition=condition)
+
+            elif num_warps * warpSize > num_threads_in_block:
+                raise ValueError(f"Invalid configuration: {num_warps} warps x {warpSize} threads exceed "
+                                 f"{num_threads_in_block} threads in the block.")
+
+            if num_threads_in_block % warpSize != 0:
+                raise ValueError(f"Block must be a multiple of {warpSize} threads for GPU_Warp scheduling "
+                                 f"(got {num_threads_in_block}).")
+
+            if num_threads_in_block > 1024:
+                raise ValueError("CUDA does not support more than 1024 threads per block (hardware limit).")
+
+
+        for min_element in map_range.min_element():
+            if isinstance(min_element, symbolic.symbol):
+                kernel_stream.write(f'if ({min_element} < 0) {{\n'
+                                    f'    printf("Runtime error: Warp ID symbol {min_element} must be non-negative.\\n");\n'
+                                    f'    asm("trap;");\n'
+                                    f'}}\n')
+            elif min_element < 0:
+                raise ValueError(f"Warp ID value {min_element} must be non-negative.")
+
+
+#----------------------------------------------------------------------------------
+# Scope Manager, handling brackets and allocation/deallocation of arrays in Scopes
+#----------------------------------------------------------------------------------
+
+class ScopeManager:
+    """
+    A helper class that manages opening and closing brackets in a structured way using the 'with' statement,
+    ensuring that scopes are opened and closed correctly. It also supports an optional debug mode to include
+    comments in the generated code, which can help with debugging and understanding the code structure.
+    """
+
+    def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG,
+                 cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int,
+                 function_stream: CodeIOStream, callsite_stream: CodeIOStream, comment: str = None,
+                 debug: bool = False):
+        """
+        Initializes the ScopeManager.
+
+        :param frame_codegen: The frame code generator, used for allocation and deallocation of arrays in scopes.
+        :param sdfg: The SDFG instance for context.
+        :param cfg: The ControlFlowRegion instance for context.
+        :param dfg_scope: The ScopeSubgraphView instance for context.
+        :param state_id: The ID of the current state for context.
+        :param function_stream: The CodeIOStream for function-level code.
+        :param callsite_stream: The CodeIOStream for callsite-level code.
+        :param comment: A descriptive comment explaining the purpose of the code block being opened. Default is None.
+        :param debug: Whether to include debug comments in the output. Defaults to False. 
+ """ + self.frame_codegen = frame_codegen + self.sdfg = sdfg + self.cfg = cfg + self.dfg_scope = dfg_scope + self.state_id = state_id + self.function_stream = function_stream + self.callsite_stream = callsite_stream + self.comment = comment + self.debug = debug + self._opened = 0 + + self.entry_node = self.dfg_scope.source_nodes()[0] + self.exit_node = self.dfg_scope.sink_nodes()[0] + + def __enter__(self): + """ + Writes the opening bracket to the stream and allocates arrays in scope. + """ + self.open() + self.frame_codegen.allocate_arrays_in_scope( + self.sdfg, self.cfg, self.entry_node, self.function_stream, self.callsite_stream + ) + return self + + def __exit__(self, exc_type, exc_value, traceback): + """ + Deallocates arrays in scope and writes the closing brackets to the stream. + """ + self.frame_codegen.deallocate_arrays_in_scope( + self.sdfg, self.cfg, self.entry_node, self.function_stream, self.callsite_stream + ) + for i in range(self._opened): + line = "}" + if self.debug: + line += f" // {self.comment} (close {i + 1})" + self.callsite_stream.write(line, self.cfg, self.state_id, self.exit_node) + + def open(self, condition: str = None): + """ + Opens a bracket. If a condition is given, emits 'if (condition) {', otherwise just '{'. + Tracks the number of open brackets for closing later. + + :param condition: Optional condition for the opening bracket. + """ + line = f"if ({condition}) {{" if condition else "{" + if self.debug: + line += f" // {self.comment} (open {self._opened + 1})" + self.callsite_stream.write(line, self.cfg, self.state_id, self.entry_node) + self._opened += 1 + + From 597039186d307c803b9e4e6175d72f32361fab92 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Thu, 12 Jun 2025 17:39:50 +0200 Subject: [PATCH 21/94] Synchronization Insertion pass, almost done + notebook examples --- berkay_workpace/scratch/constantArgs.ipynb | 1057 +++++++++++++++++ berkay_workpace/scratch/cudastreamPass.ipynb | 346 +----- berkay_workpace/scratch/notes.md | 17 - berkay_workpace/scratch/playfield.py | 38 - .../scalarMultiplication1.ipynb | 695 +++++++++++ .../scalarMultiplication2.ipynb | 259 ++++ .../scratch/smemPassAndCopy/simpleCopy.ipynb | 830 +++++++++++++ .../scratch/smemPassAndCopy/smth.sdfg | 889 ++++++++++++++ berkay_workpace/scratch/testbed.ipynb | 727 ++---------- .../passes/shared_memory_synchronization.py | 394 ++++++ 10 files changed, 4221 insertions(+), 1031 deletions(-) create mode 100644 berkay_workpace/scratch/constantArgs.ipynb delete mode 100644 berkay_workpace/scratch/notes.md delete mode 100644 berkay_workpace/scratch/playfield.py create mode 100644 berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication1.ipynb create mode 100644 berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication2.ipynb create mode 100644 berkay_workpace/scratch/smemPassAndCopy/simpleCopy.ipynb create mode 100644 berkay_workpace/scratch/smemPassAndCopy/smth.sdfg create mode 100644 dace/transformation/passes/shared_memory_synchronization.py diff --git a/berkay_workpace/scratch/constantArgs.ipynb b/berkay_workpace/scratch/constantArgs.ipynb new file mode 100644 index 0000000000..ceac5ff6d0 --- /dev/null +++ b/berkay_workpace/scratch/constantArgs.ipynb @@ -0,0 +1,1057 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "88ef6b75", + "metadata": {}, + "source": [ + "# CUDA Codegen Testing Playground\n", + "\n", + "A \"playfield\" for exploring whether the **experimental CUDA codegen** correctly identifies **constant (`const`) arguments** or whether its behavior is at least \n", 
+    "similar to the **legacy codegen**.\n",
+    "\n",
+    "**Why does this matter?** Using `const` in CUDA can lead (and usually does lead) to **better performance** by enabling compiler optimizations. \n",
+    "\n",
+    "This notebook helps verify that constant arguments are being properly recognized."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "20625e0d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# import all modules needed at once\n",
+    "import dace\n",
+    "import cupy as cp\n",
+    "import numpy as np\n",
+    "from IPython.display import Code\n",
+    "from typing import Optional\n",
+    "\n",
+    "from dace import SDFG, properties\n",
+    "from dace.config import Config\n",
+    "from dace.transformation import pass_pipeline as ppl, transformation\n",
+    "from dace.sdfg import nodes\n",
+    "from dace import dtypes\n",
+    "from dace.transformation.passes.gpustream_scheduling import NaiveGPUStreamScheduler"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "48b4b2ee",
+   "metadata": {},
+   "source": [
+    "Check which codegen we are currently using (legacy or experimental):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "cf68a501",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'experimental'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "usedCodegen = Config.get('compiler', 'cuda', 'implementation')\n",
+    "usedCodegen"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "b9d10c4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "N = dace.symbol('N')\n",
+    "\n",
+    "@dace.program\n",
+    "def vector_copy_dyn_sizes(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global):\n",
+    "    for i in dace.map[0:N] @ dace.dtypes.ScheduleType.GPU_Device:\n",
+    "        A[i] = B[i]\n",
+    "\n",
+    "sdfg = vector_copy_dyn_sizes.to_sdfg()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "3f759a90",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/berkay/master-thesis/dace/dace/codegen/targets/experimental_cuda.py:1323: UserWarning: No `gpu_block_size` property specified on map \"vector_copy_dyn_sizes_5\". Falling back to the configuration entry `compiler.cuda.default_block_size`: 32,1,1. You can either specify the block size to use with the gpu_block_size property, or by adding nested `GPU_ThreadBlock` maps, which map work to individual threads. For more information, see https://spcldace.readthedocs.io/en/latest/optimization/gpu.html\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "
/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n",
+       "#include <dace/dace.h>\n",
+       "#include "../../include/hash.h"\n",
+       "\n",
+       "struct vector_copy_dyn_sizes_state_t {\n",
+       "    dace::cuda::Context *gpu_context;\n",
+       "};\n",
+       "\n",
+       "DACE_EXPORTED void __dace_runkernel_vector_copy_dyn_sizes_5_0_0_0(vector_copy_dyn_sizes_state_t *__state, double * __restrict__ A, const double * __restrict__ B, int N);\n",
+       "void __program_vector_copy_dyn_sizes_internal(vector_copy_dyn_sizes_state_t*__state, double * __restrict__ A, double * __restrict__ B, int N)\n",
+       "{\n",
+       "\n",
+       "    {\n",
+       "\n",
+       "        __dace_runkernel_vector_copy_dyn_sizes_5_0_0_0(__state, A, B, N);\n",
+       "        {\n",
+       "\n",
+       "            ///////////////////\n",
+       "            DACE_GPU_CHECK(cudaStreamSynchronize(nullptr));\n",
+       "            ///////////////////\n",
+       "\n",
+       "        }\n",
+       "\n",
+       "    }\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED void __program_vector_copy_dyn_sizes(vector_copy_dyn_sizes_state_t *__state, double * __restrict__ A, double * __restrict__ B, int N)\n",
+       "{\n",
+       "    __program_vector_copy_dyn_sizes_internal(__state, A, B, N);\n",
+       "}\n",
+       "DACE_EXPORTED int __dace_init_experimental_cuda(vector_copy_dyn_sizes_state_t *__state, int N);\n",
+       "DACE_EXPORTED int __dace_exit_experimental_cuda(vector_copy_dyn_sizes_state_t *__state);\n",
+       "\n",
+       "DACE_EXPORTED vector_copy_dyn_sizes_state_t *__dace_init_vector_copy_dyn_sizes(int N)\n",
+       "{\n",
+       "    int __result = 0;\n",
+       "    vector_copy_dyn_sizes_state_t *__state = new vector_copy_dyn_sizes_state_t;\n",
+       "\n",
+       "\n",
+       "    __result |= __dace_init_experimental_cuda(__state, N);\n",
+       "\n",
+       "    if (__result) {\n",
+       "        delete __state;\n",
+       "        return nullptr;\n",
+       "    }\n",
+       "    return __state;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED int __dace_exit_vector_copy_dyn_sizes(vector_copy_dyn_sizes_state_t *__state)\n",
+       "{\n",
+       "    int __err = 0;\n",
+       "\n",
+       "    int __err_experimental_cuda = __dace_exit_experimental_cuda(__state);\n",
+       "    if (__err_experimental_cuda) {\n",
+       "        __err = __err_experimental_cuda;\n",
+       "    }\n",
+       "    delete __state;\n",
+       "    return __err;\n",
+       "}\n",
+       "
\n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "\\PY{c+cm}{/* DaCe AUTO\\PYZhy{}GENERATED FILE. DO NOT MODIFY */}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}dace/dace.h\\PYZgt{}}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}../../include/hash.h\\PYZdq{}}\n", + "\n", + "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}program\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}internal}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamSynchronize}\\PY{p}{(}\\PY{k}{nullptr}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ 
}\\PY{n}{N}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}internal}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{p}{;}\n", + "\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{nullptr}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ 
}\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\\end{Verbatim}\n" + ], + "text/plain": [ + "/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n", + "#include \n", + "#include \"../../include/hash.h\"\n", + "\n", + "struct vector_copy_dyn_sizes_state_t {\n", + " dace::cuda::Context *gpu_context;\n", + "};\n", + "\n", + "DACE_EXPORTED void __dace_runkernel_vector_copy_dyn_sizes_5_0_0_0(vector_copy_dyn_sizes_state_t *__state, double * __restrict__ A, const double * __restrict__ B, int N);\n", + "void __program_vector_copy_dyn_sizes_internal(vector_copy_dyn_sizes_state_t*__state, double * __restrict__ A, double * __restrict__ B, int N)\n", + "{\n", + "\n", + " {\n", + "\n", + " __dace_runkernel_vector_copy_dyn_sizes_5_0_0_0(__state, A, B, N);\n", + " {\n", + "\n", + " ///////////////////\n", + " DACE_GPU_CHECK(cudaStreamSynchronize(nullptr));\n", + " ///////////////////\n", + "\n", + " }\n", + "\n", + " }\n", + "}\n", + "\n", + "DACE_EXPORTED void __program_vector_copy_dyn_sizes(vector_copy_dyn_sizes_state_t *__state, double * __restrict__ A, double * __restrict__ B, int N)\n", + "{\n", + " __program_vector_copy_dyn_sizes_internal(__state, A, B, N);\n", + "}\n", + "DACE_EXPORTED int __dace_init_experimental_cuda(vector_copy_dyn_sizes_state_t *__state, int N);\n", + "DACE_EXPORTED int __dace_exit_experimental_cuda(vector_copy_dyn_sizes_state_t *__state);\n", + "\n", + "DACE_EXPORTED vector_copy_dyn_sizes_state_t *__dace_init_vector_copy_dyn_sizes(int N)\n", + "{\n", + " int __result = 0;\n", + " vector_copy_dyn_sizes_state_t *__state = new vector_copy_dyn_sizes_state_t;\n", + "\n", + "\n", + " __result |= __dace_init_experimental_cuda(__state, N);\n", + "\n", + " if (__result) {\n", + " delete __state;\n", + " return nullptr;\n", + " }\n", + " return __state;\n", + "}\n", + "\n", + "DACE_EXPORTED int __dace_exit_vector_copy_dyn_sizes(vector_copy_dyn_sizes_state_t *__state)\n", + "{\n", + " int __err = 0;\n", + "\n", + " int __err_experimental_cuda = __dace_exit_experimental_cuda(__state);\n", + " if (__err_experimental_cuda) {\n", + " __err = __err_experimental_cuda;\n", + " }\n", + " delete __state;\n", + " return __err;\n", + "}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Code(sdfg.generate_code()[0].clean_code, language='cpp')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "31580e6d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
#include <cuda_runtime.h>\n",
+       "#include <dace/dace.h>\n",
+       "\n",
+       "\n",
+       "struct vector_copy_dyn_sizes_state_t {\n",
+       "    dace::cuda::Context *gpu_context;\n",
+       "};\n",
+       "\n",
+       "\n",
+       "\n",
+       "DACE_EXPORTED int __dace_init_experimental_cuda(vector_copy_dyn_sizes_state_t *__state, int N);\n",
+       "DACE_EXPORTED int __dace_exit_experimental_cuda(vector_copy_dyn_sizes_state_t *__state);\n",
+       "\n",
+       "\n",
+       "\n",
+       "int __dace_init_experimental_cuda(vector_copy_dyn_sizes_state_t *__state, int N) {\n",
+       "    int count;\n",
+       "\n",
+       "    // Check that we are able to run cuda code\n",
+       "    if (cudaGetDeviceCount(&count) != cudaSuccess)\n",
+       "    {\n",
+       "        printf("ERROR: GPU drivers are not configured or cuda-capable device "\n",
+       "               "not found\\n");\n",
+       "        return 1;\n",
+       "    }\n",
+       "    if (count == 0)\n",
+       "    {\n",
+       "        printf("ERROR: No cuda-capable devices found\\n");\n",
+       "        return 2;\n",
+       "    }\n",
+       "\n",
+       "    // Initialize cuda before we run the application\n",
+       "    float *dev_X;\n",
+       "    DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n",
+       "    DACE_GPU_CHECK(cudaFree(dev_X));\n",
+       "\n",
+       "    \n",
+       "\n",
+       "    __state->gpu_context = new dace::cuda::Context(0, 0);\n",
+       "\n",
+       "    // Create cuda streams and events\n",
+       "    for(int i = 0; i < 0; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n",
+       "        __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n",
+       "    }\n",
+       "    for(int i = 0; i < 0; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n",
+       "    }\n",
+       "\n",
+       "    \n",
+       "\n",
+       "    return 0;\n",
+       "}\n",
+       "\n",
+       "int __dace_exit_experimental_cuda(vector_copy_dyn_sizes_state_t *__state) {\n",
+       "    \n",
+       "\n",
+       "    // Synchronize and check for CUDA errors\n",
+       "    int __err = static_cast<int>(__state->gpu_context->lasterror);\n",
+       "    if (__err == 0)\n",
+       "        __err = static_cast<int>(cudaDeviceSynchronize());\n",
+       "\n",
+       "    // Destroy cuda streams and events\n",
+       "    for(int i = 0; i < 0; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n",
+       "    }\n",
+       "    for(int i = 0; i < 0; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n",
+       "    }\n",
+       "\n",
+       "    delete __state->gpu_context;\n",
+       "    return __err;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED bool __dace_gpu_set_stream(vector_copy_dyn_sizes_state_t *__state, int streamid, gpuStream_t stream)\n",
+       "{\n",
+       "    if (streamid < 0 || streamid >= 0)\n",
+       "        return false;\n",
+       "\n",
+       "    __state->gpu_context->streams[streamid] = stream;\n",
+       "\n",
+       "    return true;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED void __dace_gpu_set_all_streams(vector_copy_dyn_sizes_state_t *__state, gpuStream_t stream)\n",
+       "{\n",
+       "    for (int i = 0; i < 0; ++i)\n",
+       "        __state->gpu_context->streams[i] = stream;\n",
+       "}\n",
+       "\n",
+       "__global__ void __launch_bounds__(32) vector_copy_dyn_sizes_5_0_0_0(double * __restrict__ A, const double * __restrict__ B, int N)\n",
+       "{\n",
+       "    int i = (blockIdx.x * 32 + threadIdx.x);\n",
+       "    if (i < N) {\n",
+       "        {\n",
+       "            double __inp = B[i];\n",
+       "            double __out;\n",
+       "\n",
+       "            ///////////////////\n",
+       "            // Tasklet code (assign_6_12)\n",
+       "            __out = __inp;\n",
+       "            ///////////////////\n",
+       "\n",
+       "            A[i] = __out;\n",
+       "        }\n",
+       "    }\n",
+       "}\n",
+       "\n",
+       "\n",
+       "DACE_EXPORTED void __dace_runkernel_vector_copy_dyn_sizes_5_0_0_0(vector_copy_dyn_sizes_state_t *__state, double * __restrict__ A, const double * __restrict__ B, int N);\n",
+       "void __dace_runkernel_vector_copy_dyn_sizes_5_0_0_0(vector_copy_dyn_sizes_state_t *__state, double * __restrict__ A, const double * __restrict__ B, int N)\n",
+       "{\n",
+       "\n",
+       "\n",
+       "    if (((int_ceil(int_ceil(N, 1), 32)) <= 0)) {\n",
+       "\n",
+       "        return;\n",
+       "    }\n",
+       "\n",
+       "    void  *vector_copy_dyn_sizes_5_0_0_0_args[] = { (void *)&A, (void *)&B, (void *)&N };\n",
+       "    gpuError_t __err = cudaLaunchKernel( (void*)vector_copy_dyn_sizes_5_0_0_0, dim3(int_ceil(int_ceil(N, 1), 32), 1, 1), dim3(32, 1, 1), vector_copy_dyn_sizes_5_0_0_0_args, 0, nullptr\n",
+       "    );\n",
+       "\n",
+       "    DACE_KERNEL_LAUNCH_CHECK(__err, "vector_copy_dyn_sizes_5_0_0_0", int_ceil(int_ceil(N, 1), 32), 1, 1, 32, 1, 1);\n",
+       "}\n",
+       "
\n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}cuda\\PYZus{}runtime.h\\PYZgt{}}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}dace/dace.h\\PYZgt{}}\n", + "\n", + "\n", + "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", + "\n", + "\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\n", + "\n", + "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{count}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Check that we are able to run cuda code}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{cudaGetDeviceCount}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{count}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{!}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaSuccess}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{ERROR: GPU drivers are not configured or cuda\\PYZhy{}capable device }\\PY{l+s}{\\PYZdq{}}\n", + "\\PY{+w}{ }\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{not found}\\PY{l+s+se}{\\PYZbs{}n}\\PY{l+s}{\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{count}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{ERROR: No cuda\\PYZhy{}capable devices found}\\PY{l+s+se}{\\PYZbs{}n}\\PY{l+s}{\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Initialize cuda before we run the application}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{dev\\PYZus{}X}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMalloc}\\PY{p}{(}\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{dev\\PYZus{}X}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ 
}\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaFree}\\PY{p}{(}\\PY{n}{dev\\PYZus{}X}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{p}{(}\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Create cuda streams and events}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaStreamNonBlocking}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{;}\\PY{+w}{ }\\PY{c+c1}{// Allow for externals to modify streams}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaEventDisableTiming}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Synchronize and check for CUDA errors}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{k+kt}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{lasterror}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ 
}\\PY{l+m+mi}{0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{k+kt}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{cudaDeviceSynchronize}\\PY{p}{(}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{// Destroy cuda streams and events}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{bool}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}stream}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{streamid}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{|}\\PY{+w}{ }\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZgt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{false}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{streamid}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{true}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}all\\PYZus{}streams}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ 
}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{\\PYZus{}\\PYZus{}global\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}launch\\PYZus{}bounds\\PYZus{}\\PYZus{}}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{blockIdx}\\PY{p}{.}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\\PY{+w}{ }\\PY{c+c1}{// Tasklet code (assign\\PYZus{}6\\PYZus{}12)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{A}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ 
}\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\n", + "\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{N}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZus{}args}\\PY{p}{[}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{N}\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{gpuError\\PYZus{}t}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaLaunchKernel}\\PY{p}{(}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{o}{*}\\PY{p}{)}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{N}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZus{}args}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{nullptr}\n", + "\\PY{+w}{ }\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}KERNEL\\PYZus{}LAUNCH\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{,}\\PY{+w}{ }\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{l+s}{\\PYZdq{}}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{N}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\\end{Verbatim}\n" + ], + "text/plain": [ + "\n", + "#include \n", + "#include \n", + "\n", + "\n", + "struct vector_copy_dyn_sizes_state_t {\n", + " dace::cuda::Context *gpu_context;\n", + "};\n", + "\n", + "\n", + "\n", + "DACE_EXPORTED int __dace_init_experimental_cuda(vector_copy_dyn_sizes_state_t *__state, int N);\n", + "DACE_EXPORTED int 
__dace_exit_experimental_cuda(vector_copy_dyn_sizes_state_t *__state);\n", + "\n", + "\n", + "\n", + "int __dace_init_experimental_cuda(vector_copy_dyn_sizes_state_t *__state, int N) {\n", + " int count;\n", + "\n", + " // Check that we are able to run cuda code\n", + " if (cudaGetDeviceCount(&count) != cudaSuccess)\n", + " {\n", + " printf(\"ERROR: GPU drivers are not configured or cuda-capable device \"\n", + " \"not found\\n\");\n", + " return 1;\n", + " }\n", + " if (count == 0)\n", + " {\n", + " printf(\"ERROR: No cuda-capable devices found\\n\");\n", + " return 2;\n", + " }\n", + "\n", + " // Initialize cuda before we run the application\n", + " float *dev_X;\n", + " DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n", + " DACE_GPU_CHECK(cudaFree(dev_X));\n", + "\n", + " \n", + "\n", + " __state->gpu_context = new dace::cuda::Context(0, 0);\n", + "\n", + " // Create cuda streams and events\n", + " for(int i = 0; i < 0; ++i) {\n", + " DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n", + " __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n", + " }\n", + " for(int i = 0; i < 0; ++i) {\n", + " DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n", + " }\n", + "\n", + " \n", + "\n", + " return 0;\n", + "}\n", + "\n", + "int __dace_exit_experimental_cuda(vector_copy_dyn_sizes_state_t *__state) {\n", + " \n", + "\n", + " // Synchronize and check for CUDA errors\n", + " int __err = static_cast(__state->gpu_context->lasterror);\n", + " if (__err == 0)\n", + " __err = static_cast(cudaDeviceSynchronize());\n", + "\n", + " // Destroy cuda streams and events\n", + " for(int i = 0; i < 0; ++i) {\n", + " DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n", + " }\n", + " for(int i = 0; i < 0; ++i) {\n", + " DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n", + " }\n", + "\n", + " delete __state->gpu_context;\n", + " return __err;\n", + "}\n", + "\n", + "DACE_EXPORTED bool __dace_gpu_set_stream(vector_copy_dyn_sizes_state_t *__state, int streamid, gpuStream_t stream)\n", + "{\n", + " if (streamid < 0 || streamid >= 0)\n", + " return false;\n", + "\n", + " __state->gpu_context->streams[streamid] = stream;\n", + "\n", + " return true;\n", + "}\n", + "\n", + "DACE_EXPORTED void __dace_gpu_set_all_streams(vector_copy_dyn_sizes_state_t *__state, gpuStream_t stream)\n", + "{\n", + " for (int i = 0; i < 0; ++i)\n", + " __state->gpu_context->streams[i] = stream;\n", + "}\n", + "\n", + "__global__ void __launch_bounds__(32) vector_copy_dyn_sizes_5_0_0_0(double * __restrict__ A, const double * __restrict__ B, int N)\n", + "{\n", + " int i = (blockIdx.x * 32 + threadIdx.x);\n", + " if (i < N) {\n", + " {\n", + " double __inp = B[i];\n", + " double __out;\n", + "\n", + " ///////////////////\n", + " // Tasklet code (assign_6_12)\n", + " __out = __inp;\n", + " ///////////////////\n", + "\n", + " A[i] = __out;\n", + " }\n", + " }\n", + "}\n", + "\n", + "\n", + "DACE_EXPORTED void __dace_runkernel_vector_copy_dyn_sizes_5_0_0_0(vector_copy_dyn_sizes_state_t *__state, double * __restrict__ A, const double * __restrict__ B, int N);\n", + "void __dace_runkernel_vector_copy_dyn_sizes_5_0_0_0(vector_copy_dyn_sizes_state_t *__state, double * __restrict__ A, const double * __restrict__ B, int N)\n", + "{\n", + "\n", + "\n", + " if (((int_ceil(int_ceil(N, 1), 32)) <= 0)) {\n", + "\n", + " 
return;\n", + " }\n", + "\n", + " void *vector_copy_dyn_sizes_5_0_0_0_args[] = { (void *)&A, (void *)&B, (void *)&N };\n", + " gpuError_t __err = cudaLaunchKernel( (void*)vector_copy_dyn_sizes_5_0_0_0, dim3(int_ceil(int_ceil(N, 1), 32), 1, 1), dim3(32, 1, 1), vector_copy_dyn_sizes_5_0_0_0_args, 0, nullptr\n", + " );\n", + "\n", + " DACE_KERNEL_LAUNCH_CHECK(__err, \"vector_copy_dyn_sizes_5_0_0_0\", int_ceil(int_ceil(N, 1), 32), 1, 1, 32, 1, 1);\n", + "}\n" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Code(sdfg.generate_code()[1].clean_code, language='cpp')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b1be294d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
#include <cstdlib>\n",
+       "#include "../include/vector_copy_dyn_sizes.h"\n",
+       "\n",
+       "int main(int argc, char **argv) {\n",
+       "    vector_copy_dyn_sizesHandle_t handle;\n",
+       "    int err;\n",
+       "    int N = 42;\n",
+       "    double * __restrict__ A = (double*) calloc(N, sizeof(double));\n",
+       "    double * __restrict__ B = (double*) calloc(N, sizeof(double));\n",
+       "\n",
+       "\n",
+       "    handle = __dace_init_vector_copy_dyn_sizes(N);\n",
+       "    __program_vector_copy_dyn_sizes(handle, A, B, N);\n",
+       "    err = __dace_exit_vector_copy_dyn_sizes(handle);\n",
+       "\n",
+       "    free(A);\n",
+       "    free(B);\n",
+       "\n",
+       "\n",
+       "    return err;\n",
+       "}\n",
+       "
\n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}cstdlib\\PYZgt{}}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}../include/vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes.h\\PYZdq{}}\n", + "\n", + "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{main}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{argc}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{char}\\PY{+w}{ }\\PY{o}{*}\\PY{o}{*}\\PY{n}{argv}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizesHandle\\PYZus{}t}\\PY{+w}{ }\\PY{n}{handle}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{err}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{42}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{double}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{calloc}\\PY{p}{(}\\PY{n}{N}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{sizeof}\\PY{p}{(}\\PY{k+kt}{double}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{double}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{calloc}\\PY{p}{(}\\PY{n}{N}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{sizeof}\\PY{p}{(}\\PY{k+kt}{double}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\n", + "\\PY{+w}{ }\\PY{n}{handle}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes}\\PY{p}{(}\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes}\\PY{p}{(}\\PY{n}{handle}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes}\\PY{p}{(}\\PY{n}{handle}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{free}\\PY{p}{(}\\PY{n}{A}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{free}\\PY{p}{(}\\PY{n}{B}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{err}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\\end{Verbatim}\n" + ], + "text/plain": [ + "#include \n", + "#include \"../include/vector_copy_dyn_sizes.h\"\n", + "\n", + "int main(int argc, char **argv) {\n", + " vector_copy_dyn_sizesHandle_t handle;\n", + " int err;\n", + " int N = 42;\n", + " double * __restrict__ A = (double*) calloc(N, sizeof(double));\n", + " double * __restrict__ B = (double*) calloc(N, sizeof(double));\n", + "\n", + "\n", + " handle = __dace_init_vector_copy_dyn_sizes(N);\n", + " __program_vector_copy_dyn_sizes(handle, A, B, N);\n", + " err = __dace_exit_vector_copy_dyn_sizes(handle);\n", + "\n", + " free(A);\n", + " free(B);\n", + "\n", + "\n", + " return err;\n", + "}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Code(sdfg.generate_code()[3].clean_code, language='cpp')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33d74b5c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dace_env", + 
"language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/berkay_workpace/scratch/cudastreamPass.ipynb b/berkay_workpace/scratch/cudastreamPass.ipynb index 0f505df94e..c48362fff1 100644 --- a/berkay_workpace/scratch/cudastreamPass.ipynb +++ b/berkay_workpace/scratch/cudastreamPass.ipynb @@ -114,15 +114,15 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" @@ -228,15 +228,15 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" @@ -267,7 +267,19 @@ "execution_count": 9, "id": "60d817de", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mAssertionError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mdace\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mconfig\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Config\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m Config.get(\u001b[33m'\u001b[39m\u001b[33mcompiler\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mcuda\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mimplementation\u001b[39m\u001b[33m'\u001b[39m) == \u001b[33m\"\u001b[39m\u001b[33mexperimental\u001b[39m\u001b[33m\"\u001b[39m\n", + "\u001b[31mAssertionError\u001b[39m: " + ] + } + ], "source": [ "from dace.config import Config\n", "\n", @@ -276,322 +288,10 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "bf7c6836", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/berkay/master-thesis/dace/dace/codegen/targets/new_cuda_codegen/experimental_cuda.py:1728: UserWarning: No `gpu_block_size` property specified on map \"test3_10_4_11\". Falling back to the configuration entry `compiler.cuda.default_block_size`: 32,1,1. You can either specify the block size to use with the gpu_block_size property, or by adding nested `GPU_ThreadBlock` maps, which map work to individual threads. For more information, see https://spcldace.readthedocs.io/en/latest/optimization/gpu.html\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/html": [ - "
/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n",
-       "#include <dace/dace.h>\n",
-       "#include "../../include/hash.h"\n",
-       "\n",
-       "struct test3_state_t {\n",
-       "    dace::cuda::Context *gpu_context;\n",
-       "};\n",
-       "\n",
-       "DACE_EXPORTED void __dace_runkernel_test3_10_4_11_0_0_6(test3_state_t *__state, dace::uint * __restrict__ C, dace::uint * __restrict__ D);\n",
-       "void __program_test3_internal(test3_state_t*__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B, dace::uint * __restrict__ C, dace::uint * __restrict__ D)\n",
-       "{\n",
-       "\n",
-       "    {\n",
-       "\n",
-       "        DACE_GPU_CHECK(cudaMemcpyAsync(A, B, 10 * sizeof(dace::uint), cudaMemcpyDeviceToDevice, __state->gpu_context->streams[0]));\n",
-       "        {\n",
-       "\n",
-       "            ///////////////////\n",
-       "            DACE_GPU_CHECK(cudaStreamSynchronize(__state->gpu_context->streams[0]));\n",
-       "            ///////////////////\n",
-       "\n",
-       "        }\n",
-       "        {\n",
-       "            for (auto i = 0; i < 3; i += 1) {\n",
-       "                __dace_runkernel_test3_10_4_11_0_0_6(__state, C, D);\n",
-       "            }\n",
-       "        }\n",
-       "        {\n",
-       "\n",
-       "            ///////////////////\n",
-       "            DACE_GPU_CHECK(cudaStreamSynchronize(__state->gpu_context->streams[1]));\n",
-       "            ///////////////////\n",
-       "\n",
-       "        }\n",
-       "\n",
-       "    }\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED void __program_test3(test3_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B, dace::uint * __restrict__ C, dace::uint * __restrict__ D)\n",
-       "{\n",
-       "    __program_test3_internal(__state, A, B, C, D);\n",
-       "}\n",
-       "DACE_EXPORTED int __dace_init_experimental_cuda(test3_state_t *__state);\n",
-       "DACE_EXPORTED int __dace_exit_experimental_cuda(test3_state_t *__state);\n",
-       "\n",
-       "DACE_EXPORTED test3_state_t *__dace_init_test3()\n",
-       "{\n",
-       "    int __result = 0;\n",
-       "    test3_state_t *__state = new test3_state_t;\n",
-       "\n",
-       "\n",
-       "    __result |= __dace_init_experimental_cuda(__state);\n",
-       "\n",
-       "    if (__result) {\n",
-       "        delete __state;\n",
-       "        return nullptr;\n",
-       "    }\n",
-       "    return __state;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED int __dace_exit_test3(test3_state_t *__state)\n",
-       "{\n",
-       "    int __err = 0;\n",
-       "\n",
-       "    int __err_experimental_cuda = __dace_exit_experimental_cuda(__state);\n",
-       "    if (__err_experimental_cuda) {\n",
-       "        __err = __err_experimental_cuda;\n",
-       "    }\n",
-       "    delete __state;\n",
-       "    return __err;\n",
-       "}\n",
-       "
\n" - ], - "text/latex": [ - "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", - "\\PY{c+cm}{/* DaCe AUTO\\PYZhy{}GENERATED FILE. DO NOT MODIFY */}\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}dace/dace.h\\PYZgt{}}\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}../../include/hash.h\\PYZdq{}}\n", - "\n", - "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{test3\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}test3\\PYZus{}10\\PYZus{}4\\PYZus{}11\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{p}{(}\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{D}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}program\\PYZus{}test3\\PYZus{}internal}\\PY{p}{(}\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{D}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMemcpyAsync}\\PY{p}{(}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{10}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{k}{sizeof}\\PY{p}{(}\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaMemcpyDeviceToDevice}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamSynchronize}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k}{auto}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ 
}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{3}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}test3\\PYZus{}10\\PYZus{}4\\PYZus{}11\\PYZus{}0\\PYZus{}0\\PYZus{}6}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{D}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamSynchronize}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}test3}\\PY{p}{(}\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{D}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}test3\\PYZus{}internal}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{D}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}test3}\\PY{p}{(}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{p}{;}\n", - 
"\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{nullptr}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}test3}\\PY{p}{(}\\PY{n}{test3\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\\end{Verbatim}\n" - ], - "text/plain": [ - "/* DaCe AUTO-GENERATED FILE. 
DO NOT MODIFY */\n", - "#include \n", - "#include \"../../include/hash.h\"\n", - "\n", - "struct test3_state_t {\n", - " dace::cuda::Context *gpu_context;\n", - "};\n", - "\n", - "DACE_EXPORTED void __dace_runkernel_test3_10_4_11_0_0_6(test3_state_t *__state, dace::uint * __restrict__ C, dace::uint * __restrict__ D);\n", - "void __program_test3_internal(test3_state_t*__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B, dace::uint * __restrict__ C, dace::uint * __restrict__ D)\n", - "{\n", - "\n", - " {\n", - "\n", - " DACE_GPU_CHECK(cudaMemcpyAsync(A, B, 10 * sizeof(dace::uint), cudaMemcpyDeviceToDevice, __state->gpu_context->streams[0]));\n", - " {\n", - "\n", - " ///////////////////\n", - " DACE_GPU_CHECK(cudaStreamSynchronize(__state->gpu_context->streams[0]));\n", - " ///////////////////\n", - "\n", - " }\n", - " {\n", - " for (auto i = 0; i < 3; i += 1) {\n", - " __dace_runkernel_test3_10_4_11_0_0_6(__state, C, D);\n", - " }\n", - " }\n", - " {\n", - "\n", - " ///////////////////\n", - " DACE_GPU_CHECK(cudaStreamSynchronize(__state->gpu_context->streams[1]));\n", - " ///////////////////\n", - "\n", - " }\n", - "\n", - " }\n", - "}\n", - "\n", - "DACE_EXPORTED void __program_test3(test3_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B, dace::uint * __restrict__ C, dace::uint * __restrict__ D)\n", - "{\n", - " __program_test3_internal(__state, A, B, C, D);\n", - "}\n", - "DACE_EXPORTED int __dace_init_experimental_cuda(test3_state_t *__state);\n", - "DACE_EXPORTED int __dace_exit_experimental_cuda(test3_state_t *__state);\n", - "\n", - "DACE_EXPORTED test3_state_t *__dace_init_test3()\n", - "{\n", - " int __result = 0;\n", - " test3_state_t *__state = new test3_state_t;\n", - "\n", - "\n", - " __result |= __dace_init_experimental_cuda(__state);\n", - "\n", - " if (__result) {\n", - " delete __state;\n", - " return nullptr;\n", - " }\n", - " return __state;\n", - "}\n", - "\n", - "DACE_EXPORTED int __dace_exit_test3(test3_state_t *__state)\n", - "{\n", - " int __err = 0;\n", - "\n", - " int __err_experimental_cuda = __dace_exit_experimental_cuda(__state);\n", - " if (__err_experimental_cuda) {\n", - " __err = __err_experimental_cuda;\n", - " }\n", - " delete __state;\n", - " return __err;\n", - "}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "Code(sdfg.generate_code()[0].clean_code, language='cpp')" ] diff --git a/berkay_workpace/scratch/notes.md b/berkay_workpace/scratch/notes.md deleted file mode 100644 index fd34688b2c..0000000000 --- a/berkay_workpace/scratch/notes.md +++ /dev/null @@ -1,17 +0,0 @@ -# Preprocess notes - -**1. Application of "CopyToMap" transformation for certain cases of GPU<->GPU copy that cannot be done using a single copy command:** - -So I left it there because I did not in which kind of situation this will be needed, i.e. -a concrete example/situation. It seems to be interwined with the _emit_copy() function, where similar checks -are also performed (i.e. nobody cleaned this, the checks there actually make no sense after preprocessing). - - -**2. _compute_pool_release() Function:** - -I left it because it looks useful - helps to free memory. But it seems like the actual freeing is not performed anymore -in my code, so I maybe should remove it as well? Kind of happens if - -**3. _compute_cudastreams()** - -Also remove it for now? I mean, stream allocation/deallocation are not handled anyways. 
\ No newline at end of file diff --git a/berkay_workpace/scratch/playfield.py b/berkay_workpace/scratch/playfield.py deleted file mode 100644 index c8e44a7675..0000000000 --- a/berkay_workpace/scratch/playfield.py +++ /dev/null @@ -1,38 +0,0 @@ -import dace -import random -import cupy as cp -from dace.frontend.python.interface import inline - - -from dace import registry -from dace.sdfg.scope import ScopeSubgraphView -from dace.codegen.prettycode import CodeIOStream -from dace.codegen.targets.target import TargetCodeGenerator -from dace.codegen.targets.framecode import DaCeCodeGenerator -from dace.codegen.targets.cpp import sym2cpp -from IPython.display import Code -from dace.config import Config - - -bs = 512 -ns = 1024 -BS = dace.symbol('BS') -NS = dace.symbol('NS') - -START = dace.symbol('START') -WS = dace.symbol('WS') -STRIDE = dace.symbol('STRIDE') - -start = 2 -stride = 3 -ws = 16 -@dace.program -def symbolic_warp_map(A: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global): - """ - Focus is in the use of symbolic variables in the MAP. - """ - A[:] = B[:] - -sdfg = symbolic_warp_map.to_sdfg() - -Code(sdfg.generate_code()[0].clean_code, language='cpp') \ No newline at end of file diff --git a/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication1.ipynb b/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication1.ipynb new file mode 100644 index 0000000000..f2710aa7b0 --- /dev/null +++ b/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication1.ipynb @@ -0,0 +1,695 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "15cd9104", + "metadata": {}, + "source": [ + "# Scalar Multiplication 1\n", + "\n", + "In this notebook, we will explore how the **`DefaultSharedMemorySync` pass** inserts `__syncthreads()` tasklets. We will specifically observe its behavior when **reusing shared memory** during a scalar multiplication. Our example involves multiplying a scalar by a long vector; we will import a consecutive subset of the vector into shared memory, perform the multiplication, and then restore it.\n", + "Only one threadblock is used and it gets each consecutive chunc using a **sequential map**. **Scalar Multiplication 2** does the same but uses a **for loop** instead. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1e6f5b43", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# needed modules, nothing interesting :)\n", + "import dace\n", + "from IPython.display import Code\n", + "from dace.transformation import pass_pipeline\n", + "from dace.transformation.auto import auto_optimize\n", + "from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync" + ] + }, + { + "cell_type": "markdown", + "id": "a370147d", + "metadata": {}, + "source": [ + "### Inspiration\n", + "\n", + "Here are some example SDFGs using the Python frontend that perform scalar multiplication. These served as inspiration to implement the same operation—this time using shared memory instead of a temporary local variable.\n", + "\n", + "Why not use shared memory in the Python frontend? Because we want more control over the program and prefer to focus on the concept itself, rather than the capabilities provided by the Python frontend.\n", + "\n", + "Note that we have several similar examples. 
They differ in where the sequential map is placed within the nested map. A sequential map **outside** the kernel (i.e., outside GPU schedules) does **not** require synchronization after the sequential iteration, as we simply launch the kernel again and do not reuse shared memory.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "66ef7e5f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (scalarMultiplication3)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@dace.program\n", + "def scalarMultiplication1(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):\n", + " for k in dace.map[0:4] @ dace.dtypes.ScheduleType.Sequential:\n", + " for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", + " tmp = A[k * 32 + j]\n", + " A[k * 32 + j] = scalar * tmp\n", + "\n", + "@dace.program\n", + "def scalarMultiplication2(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):\n", + " for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " for k in dace.map[0:4] @ dace.dtypes.ScheduleType.Sequential:\n", + " for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", + " tmp = A[k * 32 + j]\n", + " A[k * 32 + j] = scalar * tmp\n", + "\n", + "@dace.program\n", + "def scalarMultiplication3(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):\n", + " for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", + " for k in dace.map[0:4] @ dace.dtypes.ScheduleType.Sequential:\n", + " tmp = A[k * 32 + j]\n", + " A[k * 32 + j] = scalar * tmp\n", + "\n", + "\n", + "# Choose the sdfg you want so inspect below\n", + "sdfg_inspiration = scalarMultiplication3.to_sdfg()\n", + "sdfg_inspiration" + ] + }, + { + "cell_type": "markdown", + "id": "c6d4c63a", + "metadata": {}, + "source": [ + "Tipp: collapse the functions and only focus one at a time below. They are quite similar, only difference is where the sequential map occurs.\n", + "Select it and the observe whether the post-synchronization happens if required and whether it is omitted if unnecessary." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9abdaf19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (scalarMultiplication2_smem)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Here we should have NO post synchronization, since seq map is OUTSIDE of the kernel. \n", + "def scalarMultiplication1_smem():\n", + " # Create SDFG and state\n", + " sdfg = dace.SDFG(\"scalarMultiplication1_smem\")\n", + " state = sdfg.add_state(\"main\")\n", + "\n", + " # Add arrays\n", + " sdfg.add_array(\"A\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", + " sdfg.add_scalar(\"scalar\", dace.uint32)\n", + " sdfg.add_array(\"S\", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True, lifetime=dace.dtypes.AllocationLifetime.Scope)\n", + "\n", + " # Add access nodes\n", + " a_acc = state.add_read(\"A\")\n", + " a_store = state.add_write(\"A\")\n", + " scalar_acc = state.add_access(\"scalar\")\n", + " s_acc= state.add_access(\"S\")\n", + "\n", + " # Sequential map (outermost) \n", + " seq_map_entry, seq_map_exit = state.add_map(\n", + " \"seq_map\",\n", + " dict(k=\"0:4\"),\n", + " schedule=dace.dtypes.ScheduleType.Sequential,\n", + " )\n", + "\n", + "\n", + " # GPU Device map\n", + " gpu_map_entry, gpu_map_exit = state.add_map(\n", + " \"gpu_map\",\n", + " dict(i=\"0:32:32\"),\n", + " schedule=dace.dtypes.ScheduleType.GPU_Device,\n", + " )\n", + "\n", + " # GPU TB map\n", + " tb_map_entry, tb_map_exit = state.add_map(\n", + " \"tb\",\n", + " dict(j=\"0:32\"),\n", + " schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock,\n", + " )\n", + "\n", + " # Add tasklets for A -> S -> B\n", + " tasklet1 = state.add_tasklet(\n", + " \"addMult\",\n", + " inputs={\"__inp_A\", \"__inp_scalar\"},\n", + " outputs={\"__out\"},\n", + " code=\"__out = __inp_A * __inp_scalar;\",\n", + " language=dace.dtypes.Language.CPP\n", + " )\n", + "\n", + " tasklet2 = state.add_tasklet(\n", + " \"store_to_global\",\n", + " inputs={\"__inp\"},\n", + " outputs={\"__out\"},\n", + " code=\"__out = __inp;\",\n", + " language=dace.dtypes.Language.CPP\n", + " )\n", + "\n", + " # Edges\n", + "\n", + " # A and scalar to first map\n", + " state.add_edge(a_acc, None, seq_map_entry, None, dace.Memlet(\"A[0:128]\"))\n", + " state.add_edge(scalar_acc, None, seq_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", + "\n", + " # Add both down to last map, the threadblock map\n", + " state.add_edge(seq_map_entry, None, gpu_map_entry, None, dace.Memlet(\"A[32 * k: 32 * (k+1)]\"))\n", + " state.add_edge(seq_map_entry, None, gpu_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", + "\n", + " state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet(\"A[32 * k: 32 * (k+1)]\"))\n", + " state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", + "\n", + " # connect to tasklets\n", + " state.add_edge(tb_map_entry, None, tasklet1, \"__inp_A\", dace.Memlet(\"A[j + 32* k]\"))\n", + " state.add_edge(tb_map_entry, None, tasklet1, \"__inp_scalar\", dace.Memlet(\"scalar[0]\"))\n", + "\n", + " state.add_edge(tasklet1, \"__out\", s_acc, None, dace.Memlet(\"S[j]\"))\n", + "\n", + " state.add_edge(s_acc, None, tasklet2, \"__inp\", dace.Memlet(\"S[j]\"))\n", + "\n", + " # connect to all map exit nodes and then back to A to store back\n", + " state.add_edge(tasklet2, \"__out\", tb_map_exit, None, dace.Memlet(\"A[j + 32* k]\"))\n", + " state.add_edge(tb_map_exit, None, gpu_map_exit, None, dace.Memlet(\"A[32 * k: 32 * (k+1)]\"))\n", + " state.add_edge(gpu_map_exit, None, seq_map_exit, None, 
dace.Memlet(\"A[32 * k: 32 * (k+1)]\"))\n", + " state.add_edge(seq_map_exit, None, a_store, None, dace.Memlet(\"A[0:128]\"))\n", + " \n", + " \n", + " sdfg.fill_scope_connectors()\n", + " return sdfg\n", + "\n", + "\n", + "# Here we should have post synchronization\n", + "def scalarMultiplication2_smem():\n", + " # Create SDFG and state\n", + " sdfg = dace.SDFG(\"scalarMultiplication2_smem\")\n", + " state = sdfg.add_state(\"main\")\n", + "\n", + " # Add arrays\n", + " sdfg.add_array(\"A\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", + " sdfg.add_scalar(\"scalar\", dace.uint32)\n", + " sdfg.add_array(\"S\", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True, lifetime=dace.dtypes.AllocationLifetime.Scope)\n", + "\n", + " # Add access nodes\n", + " a_acc = state.add_read(\"A\")\n", + " a_store = state.add_write(\"A\")\n", + " scalar_acc = state.add_access(\"scalar\")\n", + " s_acc= state.add_access(\"S\")\n", + "\n", + " # Sequential map (outermost) \n", + " seq_map_entry, seq_map_exit = state.add_map(\n", + " \"seq_map\",\n", + " dict(k=\"0:4\"),\n", + " schedule=dace.dtypes.ScheduleType.Sequential,\n", + " )\n", + "\n", + "\n", + " # GPU Device map\n", + " gpu_map_entry, gpu_map_exit = state.add_map(\n", + " \"gpu_map\",\n", + " dict(i=\"0:32:32\"),\n", + " schedule=dace.dtypes.ScheduleType.GPU_Device,\n", + " )\n", + "\n", + " # GPU TB map\n", + " tb_map_entry, tb_map_exit = state.add_map(\n", + " \"tb\",\n", + " dict(j=\"0:32\"),\n", + " schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock,\n", + " )\n", + "\n", + " # Add tasklets for A -> S -> B\n", + " tasklet1 = state.add_tasklet(\n", + " \"addMult\",\n", + " inputs={\"__inp_A\", \"__inp_scalar\"},\n", + " outputs={\"__out\"},\n", + " code=\"__out = __inp_A * __inp_scalar;\",\n", + " language=dace.dtypes.Language.CPP\n", + " )\n", + "\n", + " tasklet2 = state.add_tasklet(\n", + " \"store_to_global\",\n", + " inputs={\"__inp\"},\n", + " outputs={\"__out\"},\n", + " code=\"__out = __inp;\",\n", + " language=dace.dtypes.Language.CPP\n", + " )\n", + "\n", + " # Edges\n", + "\n", + " # A and scalar to first map\n", + " state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet(\"A[0:128]\"))\n", + " state.add_edge(scalar_acc, None, gpu_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", + "\n", + " # Add both down to last map, the threadblock map\n", + " state.add_edge(gpu_map_entry, None, seq_map_entry, None, dace.Memlet(\"A[0:128]\"))\n", + " state.add_edge(gpu_map_entry, None, seq_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", + "\n", + " state.add_edge(seq_map_entry, None, tb_map_entry, None, dace.Memlet(\"A[32 * k: 32 * (k+1)]\"))\n", + " state.add_edge(seq_map_entry, None, tb_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", + "\n", + " # connect to tasklets\n", + " state.add_edge(tb_map_entry, None, tasklet1, \"__inp_A\", dace.Memlet(\"A[j + 32* k]\"))\n", + " state.add_edge(tb_map_entry, None, tasklet1, \"__inp_scalar\", dace.Memlet(\"scalar[0]\"))\n", + "\n", + " state.add_edge(tasklet1, \"__out\", s_acc, None, dace.Memlet(\"S[j]\"))\n", + "\n", + " state.add_edge(s_acc, None, tasklet2, \"__inp\", dace.Memlet(\"S[j]\"))\n", + "\n", + " # connect to all map exit nodes and then back to A to store back\n", + " state.add_edge(tasklet2, \"__out\", tb_map_exit, None, dace.Memlet(\"A[j + 32* k]\"))\n", + " state.add_edge(tb_map_exit, None, seq_map_exit, None, dace.Memlet(\"A[32 * k: 32 * (k+1)]\"))\n", + " state.add_edge(seq_map_exit, None, gpu_map_exit, None, 
dace.Memlet(\"A[0:128]\"))\n", + " state.add_edge(gpu_map_exit, None, a_store, None, dace.Memlet(\"A[0:128]\"))\n", + " \n", + " \n", + " sdfg.fill_scope_connectors()\n", + " return sdfg\n", + "\n", + "\n", + "# As before, Here we should have post synchronization\n", + "def scalarMultiplication3_smem():\n", + " # Create SDFG and state\n", + " sdfg = dace.SDFG(\"scalarMultiplication3_smem\")\n", + " state = sdfg.add_state(\"main\")\n", + "\n", + " # Add arrays\n", + " sdfg.add_array(\"A\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", + " sdfg.add_scalar(\"scalar\", dace.uint32)\n", + " sdfg.add_array(\"S\", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True, lifetime=dace.dtypes.AllocationLifetime.Scope)\n", + "\n", + " # Add access nodes\n", + " a_acc = state.add_read(\"A\")\n", + " a_store = state.add_write(\"A\")\n", + " scalar_acc = state.add_access(\"scalar\")\n", + " s_acc= state.add_access(\"S\")\n", + "\n", + " # Sequential map (outermost) \n", + " seq_map_entry, seq_map_exit = state.add_map(\n", + " \"seq_map\",\n", + " dict(k=\"0:4\"),\n", + " schedule=dace.dtypes.ScheduleType.Sequential,\n", + " )\n", + "\n", + "\n", + " # GPU Device map\n", + " gpu_map_entry, gpu_map_exit = state.add_map(\n", + " \"gpu_map\",\n", + " dict(i=\"0:32:32\"),\n", + " schedule=dace.dtypes.ScheduleType.GPU_Device,\n", + " )\n", + "\n", + " # GPU TB map\n", + " tb_map_entry, tb_map_exit = state.add_map(\n", + " \"tb\",\n", + " dict(j=\"0:32\"),\n", + " schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock,\n", + " )\n", + "\n", + " # Add tasklets for A -> S -> B\n", + " tasklet1 = state.add_tasklet(\n", + " \"addMult\",\n", + " inputs={\"__inp_A\", \"__inp_scalar\"},\n", + " outputs={\"__out\"},\n", + " code=\"__out = __inp_A * __inp_scalar;\",\n", + " language=dace.dtypes.Language.CPP\n", + " )\n", + "\n", + " tasklet2 = state.add_tasklet(\n", + " \"store_to_global\",\n", + " inputs={\"__inp\"},\n", + " outputs={\"__out\"},\n", + " code=\"__out = __inp;\",\n", + " language=dace.dtypes.Language.CPP\n", + " )\n", + "\n", + " # Edges\n", + "\n", + " # A and scalar to first map\n", + " state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet(\"A[0:128]\"))\n", + " state.add_edge(scalar_acc, None, gpu_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", + "\n", + " # Add both down to last map, the threadblock map\n", + " state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet(\"A[0:128]\"))\n", + " state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", + "\n", + " state.add_edge(tb_map_entry, None, seq_map_entry, None, dace.Memlet(\"A[j: j + 4]\")) # weird, but it is like this in the inspiration\n", + " state.add_edge(tb_map_entry, None, seq_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", + "\n", + " # connect to tasklets\n", + " state.add_edge(seq_map_entry, None, tasklet1, \"__inp_A\", dace.Memlet(\"A[j + 32* k]\"))\n", + " state.add_edge(seq_map_entry, None, tasklet1, \"__inp_scalar\", dace.Memlet(\"scalar[0]\"))\n", + "\n", + " state.add_edge(tasklet1, \"__out\", s_acc, None, dace.Memlet(\"S[j]\"))\n", + "\n", + " state.add_edge(s_acc, None, tasklet2, \"__inp\", dace.Memlet(\"S[j]\"))\n", + "\n", + " # connect to all map exit nodes and then back to A to store back\n", + " state.add_edge(tasklet2, \"__out\", seq_map_exit, None, dace.Memlet(\"A[j + 32* k]\"))\n", + " state.add_edge(seq_map_exit, None, tb_map_exit, None, dace.Memlet(\"A[j: j + 4]\"))\n", + " state.add_edge(tb_map_exit, None, gpu_map_exit, None, 
dace.Memlet(\"A[0:128]\"))\n", + " state.add_edge(gpu_map_exit, None, a_store, None, dace.Memlet(\"A[0:128]\"))\n", + " \n", + " \n", + " sdfg.fill_scope_connectors()\n", + " return sdfg\n", + "\n", + "\n", + "# choose which of the three versions should be applied to the pass\n", + "sdfg = scalarMultiplication2_smem()\n", + "sdfg" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6c8921a7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (scalarMultiplication2_smem)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# insert synchronization barriers\n", + "DefaultSharedMemorySync().apply_pass(sdfg, None)\n", + "sdfg" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e218c98a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (scalarMultiplication2)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@dace.program\n", + "def scalarMultiplication2(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):\n", + " for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " for k in dace.map[0:4] @ dace.dtypes.ScheduleType.Sequential:\n", + " for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", + " tmp = A[k * 32 + j]\n", + " A[k * 32 + j] = scalar * tmp\n", + "\n", + "scalarMultiplication2.to_sdfg()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d64b05ff", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (scalarMultiplication3)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@dace.program\n", + "def scalarMultiplication3(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):\n", + " for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", + " for k in dace.map[0:4] @ dace.dtypes.ScheduleType.Sequential:\n", + " tmp = A[k * 32 + j]\n", + " A[k * 32 + j] = scalar * tmp\n", + "\n", + "scalarMultiplication3.to_sdfg()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "63b2ee66", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (scalarMultiplication4)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# To next file\n", + "@dace.program\n", + "def scalarMultiplication4(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):\n", + " for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", + " for k in range(4):\n", + " tmp = A[k * 32 + j]\n", + " A[k * 32 + j] = scalar * tmp\n", + " \n", + "\n", + "sdfg = scalarMultiplication4.to_sdfg()\n", + "sdfg" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a70dc41e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[SDFGState (assign_8_16), SDFGState (block), SDFGState (block_0)]\n", + "0\n", + "1\n", + "1\n" + ] + } + ], + "source": [ + "state = sdfg.states()[0]\n", + "nodes = state.nodes()\n", + "nsdfg = [node for node in nodes if isinstance(node, dace.nodes.NestedSDFG)][0]\n", + "for_loop = nsdfg.sdfg.nodes()[0]\n", + "\n", + "nodes = for_loop.nodes()\n", + "print(nodes)\n", + "for n in nodes:\n", + " print(for_loop.out_degree(n))\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ba14acfe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nstate = sdfg.states()[0]\\nnodes = state.nodes()\\nnsdfg = [node for node in nodes if isinstance(node, dace.nodes.NestedSDFG)][0]\\nfor_loop = nsdfg.sdfg.nodes()[0]\\nprint(nsdfg.sdfg.nodes())\\nprint(nsdfg.sdfg.states())\\nprint(for_loop.nodes())\\nprint(isinstance(for_loop, dace.sdfg.state.LoopRegion))\\nprint()\\nprint()\\nprint(sdfg.nodes())\\nprint(sdfg.states())\\n'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"\n", + "state = sdfg.states()[0]\n", + "nodes = state.nodes()\n", + "nsdfg = [node for node in nodes if isinstance(node, dace.nodes.NestedSDFG)][0]\n", + "for_loop = nsdfg.sdfg.nodes()[0]\n", + "print(nsdfg.sdfg.nodes())\n", + "print(nsdfg.sdfg.states())\n", + "print(for_loop.nodes())\n", + "print(isinstance(for_loop, dace.sdfg.state.LoopRegion))\n", + "print()\n", + "print()\n", + "print(sdfg.nodes())\n", + "print(sdfg.states())\n", + "\"\"\"" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dace_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication2.ipynb b/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication2.ipynb new file mode 100644 index 0000000000..a9490815e9 --- /dev/null +++ b/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication2.ipynb @@ -0,0 +1,259 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "012177f0", + "metadata": {}, + "source": [ + "# Scalar Multiplication 2\n", + "\n", + "This notebook is quite similar to **Scalar Multiplication 1**, but instead of reusing shared memory due to a sequential scheduled map, we reuse shared memory since it is in the body of a **for loop**.\n", + "This notebook is shorter and does not explain everything all over again in detail." 
+ ] + }, + { + "cell_type": "markdown", + "id": "1fb5f12b", + "metadata": {}, + "source": [ + "Needed imports:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e607a9c6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import dace\n", + "from dace.sdfg.state import LoopRegion\n", + "from IPython.display import Code\n", + "from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync" + ] + }, + { + "cell_type": "markdown", + "id": "c12a68d3", + "metadata": {}, + "source": [ + "## Insipration\n", + "\n", + "As in **Scalar Multiplication 1**, the frontend DaCe program that was used as an inspiration. I omit the different positions where the for loop can be, I just assume it is the innermost \"scope\"." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2769e30c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (scalarMultiplication)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# To next file\n", + "@dace.program\n", + "def scalarMultiplication(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):\n", + " for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", + " for k in range(4):\n", + " tmp = A[k * 32 + j]\n", + " A[k * 32 + j] = scalar * tmp\n", + " \n", + "\n", + "sdfg = scalarMultiplication.to_sdfg(save= True)\n", + "sdfg" + ] + }, + { + "cell_type": "markdown", + "id": "21bc45e6", + "metadata": {}, + "source": [ + "The sdfg we use with by using shared memory instead of a temporary local variable:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f0609dff", + "metadata": {}, + "outputs": [], + "source": [ + "def scalarMultiplication_smem():\n", + " sdfg = dace.SDFG(\"scalarMultiplication_smem\")\n", + " state = sdfg.add_state(\"main\")\n", + "\n", + " # Arrays and access nodes\n", + " sdfg.add_array(\"A\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", + " sdfg.add_scalar(\"scalar\", dace.uint32)\n", + " a_acc = state.add_read(\"A\")\n", + " a_store = state.add_write(\"A\")\n", + " scalar_acc = state.add_access(\"scalar\")\n", + "\n", + " # Device and thread-block maps\n", + " gpu_map_entry, gpu_map_exit = state.add_map(\n", + " \"gpu_map\", dict(i=\"0:32:32\"), schedule=dace.dtypes.ScheduleType.GPU_Device\n", + " )\n", + " tb_map_entry, tb_map_exit = state.add_map(\n", + " \"tb\", dict(j=\"0:32\"), schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock\n", + " )\n", + "\n", + " # Nested SDFG setup\n", + " inner_sdfg = dace.SDFG('nested_sdfg')\n", + " nested = state.add_nested_sdfg(inner_sdfg, sdfg, inputs={'__inp_A', '__inp_scalar'}, outputs={'tmp_ret'})\n", + "\n", + " loopreg = LoopRegion(\"loop\", \"k < 4\", \"k\", \"k = 0\", \"k = (k + 1)\", False, inner_sdfg)\n", + " inner_sdfg.add_node(loopreg)\n", + " inner_state = loopreg.add_state(\"use_smem\")\n", + "\n", + " # Shared memory and result\n", + " inner_sdfg.add_array(\"S\", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True)\n", + " inner_sdfg.add_scalar(\"tmp_ret\", dace.uint32)\n", + " s_acc = inner_state.add_access(\"S\")\n", + " ret = inner_state.add_write(\"tmp_ret\")\n", + "\n", + " # Tasklets\n", + " tasklet1 = inner_state.add_tasklet(\n", + " \"assign_to_smem\", inputs={}, outputs={\"__out1\"},\n", + " code=\"__out1 = __inp_A[j + 32 * k]\",\n", + " language=dace.dtypes.Language.CPP\n", + " )\n", + " tasklet2 = inner_state.add_tasklet(\n", + " \"addMult\", inputs={\"__inp2\"}, outputs={\"__out2\"},\n", + " code=\"__out2 = __inp2 * __inp_scalar;\",\n", + " language=dace.dtypes.Language.CPP\n", + " )\n", + "\n", + " # Main SDFG edges\n", + " state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet(\"A[0:128]\"))\n", + " state.add_edge(scalar_acc, None, gpu_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", + " state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet(\"A[0:128]\"))\n", + " state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", + " state.add_edge(tb_map_entry, None, nested, \"__inp_A\", dace.Memlet(\"A[j : j + 97 : 32]\"))\n", + " state.add_edge(tb_map_entry, None, nested, \"__inp_scalar\", dace.Memlet(\"scalar[0]\"))\n", + " state.add_edge(nested, \"tmp_ret\", tb_map_exit, 
None, dace.Memlet(\"A[j : j + 97 : 32]\"))\n", + " state.add_edge(tb_map_exit, None, gpu_map_exit, None, dace.Memlet(\"A[0:128]\"))\n", + " state.add_edge(gpu_map_exit, None, a_store, None, dace.Memlet(\"A[0:128]\"))\n", + "\n", + " # Inner SDFG edges\n", + " inner_state.add_edge(tasklet1, \"__out1\", s_acc, None, dace.Memlet(\"S[j]\"))\n", + " inner_state.add_edge(s_acc, None, tasklet2, \"__inp2\", dace.Memlet(\"S[j]\"))\n", + " inner_state.add_edge(tasklet2, \"__out2\", ret, None, dace.Memlet(\"S[j]\"))\n", + "\n", + " sdfg.fill_scope_connectors()\n", + " return sdfg\n", + "\n", + "\n", + "sdfg = scalarMultiplication_smem()\n", + "#sdfg\n" + ] + }, + { + "cell_type": "markdown", + "id": "0e7e27ca", + "metadata": {}, + "source": [ + "Instead of having a synchronization tasklet, I use a synchronization state in the case there are several \"sink states\" inside a loopregion (if this is possible)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3fac943d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (scalarMultiplication_smem)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# insert synchronization barriers\n", + "DefaultSharedMemorySync().apply_pass(sdfg, None)\n", + "sdfg" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dace_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/berkay_workpace/scratch/smemPassAndCopy/simpleCopy.ipynb b/berkay_workpace/scratch/smemPassAndCopy/simpleCopy.ipynb new file mode 100644 index 0000000000..18c3aa8836 --- /dev/null +++ b/berkay_workpace/scratch/smemPassAndCopy/simpleCopy.ipynb @@ -0,0 +1,830 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a819effe", + "metadata": {}, + "source": [ + "# Simple Memory Copy\n", + "\n", + "This example demonstrates a basic memory copy operation where **shared memory** is used as an intermediate buffer. \n", + "It serves as the simplest possible scenario to test whether the `DefaultSharedMemorySync()` pass correctly inserts synchronization.\n", + "\n", + "The goal is to observe shared memory behavior in a minimal setting.\n" + ] + }, + { + "cell_type": "markdown", + "id": "df0fbf69", + "metadata": {}, + "source": [ + "First, we import needed modules at the beginning:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7f52766", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import dace\n", + "from IPython.display import Code\n", + "from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync" + ] + }, + { + "cell_type": "markdown", + "id": "4215bbff", + "metadata": {}, + "source": [ + "## Insipration\n", + "\n", + "Below is the sdfg which I used for inspiration. The goal is to replace 'k' with a shared memory array later." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3f225145", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (simpleCopy)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@dace.program\n", + "def simpleCopy(A: dace.float64[32] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[32] @ dace.dtypes.StorageType.GPU_Global, C: dace.float64[32] @ dace.dtypes.StorageType.GPU_Global):\n", + " for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", + " k = A[j]\n", + " B[j] = k\n", + "\n", + "simpleCopy.to_sdfg()\n" + ] + }, + { + "cell_type": "markdown", + "id": "f6382749", + "metadata": {}, + "source": [ + "A DaCe program built using the sdfg API, corresponding to a simple memory copy using shared memory as a buffer:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e7b22e0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (simpleCopy_smem)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def simpleCopy_smem():\n", + " # Create SDFG and state\n", + " sdfg = dace.SDFG(\"simpleCopy_smem\")\n", + " state = sdfg.add_state(\"main\")\n", + "\n", + " # Add arrays\n", + " sdfg.add_array(\"A\", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", + " sdfg.add_array(\"B\", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", + " sdfg.add_array(\"S\", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True, lifetime=dace.dtypes.AllocationLifetime.Scope)\n", + "\n", + " # Add access nodes\n", + " a_acc = state.add_access(\"A\")\n", + " b_acc = state.add_access(\"B\")\n", + " s_acc= state.add_access(\"S\")\n", + "\n", + " # GPU Device map\n", + " gpu_map_entry, gpu_map_exit = state.add_map(\n", + " \"gpu_map\",\n", + " dict(i=\"0:32:32\"),\n", + " schedule=dace.dtypes.ScheduleType.GPU_Device,\n", + " )\n", + "\n", + " # GPU TB map\n", + " tb_map_entry, tb_map_exit = state.add_map(\n", + " \"tb\",\n", + " dict(j=\"0:32\"),\n", + " schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock,\n", + " )\n", + "\n", + " # Add tasklets for A -> S -> B\n", + " tasklet1 = state.add_tasklet(\n", + " \"copy_to_shared\",\n", + " inputs={\"__inp\"},\n", + " outputs={\"__out\"},\n", + " code=\"__out = __inp;\",\n", + " language=dace.dtypes.Language.CPP\n", + " )\n", + "\n", + " tasklet2 = state.add_tasklet(\n", + " \"copy_to_global\",\n", + " inputs={\"__inp\"},\n", + " outputs={\"__out\"},\n", + " code=\"__out = __inp;\",\n", + " language=dace.dtypes.Language.CPP\n", + " )\n", + "\n", + "\n", + " # Edges\n", + " state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet(\"A[0:32]\"))\n", + " state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet(\"A[0:32]\"))\n", + " state.add_edge(tb_map_entry, None, tasklet1, \"__inp\", dace.Memlet(\"A[j]\"))\n", + " state.add_edge(tasklet1, \"__out\", s_acc, None, dace.Memlet(\"S[j]\"))\n", + " state.add_edge(s_acc, None, tasklet2, \"__inp\", dace.Memlet(\"S[j]\"))\n", + " state.add_edge(tasklet2, \"__out\", tb_map_exit, None, dace.Memlet(\"B[j]\"))\n", + " state.add_edge(tb_map_exit, None, gpu_map_exit, None, dace.Memlet(\"B[0:32]\"))\n", + " state.add_edge(gpu_map_exit, None, b_acc, None, dace.Memlet(\"B[0:32]\"))\n", + "\n", + " sdfg.fill_scope_connectors()\n", + " return sdfg\n", + "\n", + "sdfg = simpleCopy_smem()\n", + "sdfg" + ] + }, + { + "cell_type": "markdown", + "id": "ab08683d", + "metadata": {}, + "source": [ + "## Adding Synchronization Barriers\n", + "\n", + "A simple pass is used to add synchronization tasklets correct. We observe, that the synchronization tasklet is inserted after \n", + "the shared memory access and between an assignment tasklet, ensuring that the threads wait until all data is in shared memory before\n", + "using it. (Note, that in this case, synchronization would not be necessary since each thread access the same position in shared memory\n", + "it writes to. But we only care about the correct insertion after a shared memory accessNode is used)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "15d8af45", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (simpleCopy_smem)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DefaultSharedMemorySync().apply_pass(sdfg, None)\n", + "sdfg" + ] + }, + { + "cell_type": "markdown", + "id": "93950b7b", + "metadata": {}, + "source": [ + "The generated code where the \"__syncthreads();\" tasklet is correctly placed:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2318db8f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
#include <cuda_runtime.h>\n",
+       "#include <dace/dace.h>\n",
+       "\n",
+       "\n",
+       "struct simpleCopy_smem_state_t {\n",
+       "    dace::cuda::Context *gpu_context;\n",
+       "};\n",
+       "\n",
+       "\n",
+       "\n",
+       "DACE_EXPORTED int __dace_init_cuda(simpleCopy_smem_state_t *__state);\n",
+       "DACE_EXPORTED int __dace_exit_cuda(simpleCopy_smem_state_t *__state);\n",
+       "\n",
+       "\n",
+       "\n",
+       "int __dace_init_cuda(simpleCopy_smem_state_t *__state) {\n",
+       "    int count;\n",
+       "\n",
+       "    // Check that we are able to run cuda code\n",
+       "    if (cudaGetDeviceCount(&count) != cudaSuccess)\n",
+       "    {\n",
+       "        printf("ERROR: GPU drivers are not configured or cuda-capable device "\n",
+       "               "not found\\n");\n",
+       "        return 1;\n",
+       "    }\n",
+       "    if (count == 0)\n",
+       "    {\n",
+       "        printf("ERROR: No cuda-capable devices found\\n");\n",
+       "        return 2;\n",
+       "    }\n",
+       "\n",
+       "    // Initialize cuda before we run the application\n",
+       "    float *dev_X;\n",
+       "    DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n",
+       "    DACE_GPU_CHECK(cudaFree(dev_X));\n",
+       "\n",
+       "    \n",
+       "\n",
+       "    __state->gpu_context = new dace::cuda::Context(1, 1);\n",
+       "\n",
+       "    // Create cuda streams and events\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n",
+       "        __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n",
+       "    }\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n",
+       "    }\n",
+       "\n",
+       "    \n",
+       "\n",
+       "    return 0;\n",
+       "}\n",
+       "\n",
+       "int __dace_exit_cuda(simpleCopy_smem_state_t *__state) {\n",
+       "    \n",
+       "\n",
+       "    // Synchronize and check for CUDA errors\n",
+       "    int __err = static_cast<int>(__state->gpu_context->lasterror);\n",
+       "    if (__err == 0)\n",
+       "        __err = static_cast<int>(cudaDeviceSynchronize());\n",
+       "\n",
+       "    // Destroy cuda streams and events\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n",
+       "    }\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n",
+       "    }\n",
+       "\n",
+       "    delete __state->gpu_context;\n",
+       "    return __err;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED bool __dace_gpu_set_stream(simpleCopy_smem_state_t *__state, int streamid, gpuStream_t stream)\n",
+       "{\n",
+       "    if (streamid < 0 || streamid >= 1)\n",
+       "        return false;\n",
+       "\n",
+       "    __state->gpu_context->streams[streamid] = stream;\n",
+       "\n",
+       "    return true;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED void __dace_gpu_set_all_streams(simpleCopy_smem_state_t *__state, gpuStream_t stream)\n",
+       "{\n",
+       "    for (int i = 0; i < 1; ++i)\n",
+       "        __state->gpu_context->streams[i] = stream;\n",
+       "}\n",
+       "\n",
+       "__global__ void __launch_bounds__(32) gpu_map_0_0_3(const dace::uint * __restrict__ A, dace::uint * __restrict__ B) {\n",
+       "    {\n",
+       "        int i = (32 * blockIdx.x);\n",
+       "        {\n",
+       "            {\n",
+       "                __shared__ dace::uint S[32];\n",
+       "                int j = threadIdx.x;\n",
+       "                {\n",
+       "                    {\n",
+       "                        dace::uint __inp = A[j];\n",
+       "                        dace::uint __out;\n",
+       "\n",
+       "                        ///////////////////\n",
+       "                        __out = __inp;\n",
+       "                        ///////////////////\n",
+       "\n",
+       "                        S[j] = __out;\n",
+       "                    }\n",
+       "                    {\n",
+       "\n",
+       "                        ///////////////////\n",
+       "                        __syncthreads();\n",
+       "                        ///////////////////\n",
+       "\n",
+       "                    }\n",
+       "                    {\n",
+       "                        dace::uint __inp = S[j];\n",
+       "                        dace::uint __out;\n",
+       "\n",
+       "                        ///////////////////\n",
+       "                        __out = __inp;\n",
+       "                        ///////////////////\n",
+       "\n",
+       "                        B[j] = __out;\n",
+       "                    }\n",
+       "                }\n",
+       "            }\n",
+       "        }\n",
+       "    }\n",
+       "}\n",
+       "\n",
+       "\n",
+       "DACE_EXPORTED void __dace_runkernel_gpu_map_0_0_3(simpleCopy_smem_state_t *__state, const dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n",
+       "void __dace_runkernel_gpu_map_0_0_3(simpleCopy_smem_state_t *__state, const dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n",
+       "{\n",
+       "\n",
+       "    void  *gpu_map_0_0_3_args[] = { (void *)&A, (void *)&B };\n",
+       "    gpuError_t __err = cudaLaunchKernel((void*)gpu_map_0_0_3, dim3(1, 1, 1), dim3(32, 1, 1), gpu_map_0_0_3_args, 0, __state->gpu_context->streams[0]);\n",
+       "    DACE_KERNEL_LAUNCH_CHECK(__err, "gpu_map_0_0_3", 1, 1, 1, 32, 1, 1);\n",
+       "}\n",
+       "
\n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "\\PY{n}{\\PYZsh{}include}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{n}{cuda\\PYZus{}runtime}\\PY{p}{.}\\PY{n}{h}\\PY{o}{\\PYZgt{}}\n", + "\\PY{n}{\\PYZsh{}include}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{n}{dace}\\PY{o}{/}\\PY{n}{dace}\\PY{p}{.}\\PY{n}{h}\\PY{o}{\\PYZgt{}}\n", + "\n", + "\n", + "\\PY{n}{struct}\\PY{+w}{ }\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", + "\\PY{err}{\\PYZcb{}}\\PY{p}{;}\n", + "\n", + "\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\n", + "\n", + "\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n+nf}{count}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Check}\\PY{+w}{ }\\PY{n}{that}\\PY{+w}{ }\\PY{n}{we}\\PY{+w}{ }\\PY{k}{are}\\PY{+w}{ }\\PY{n}{able}\\PY{+w}{ }\\PY{k}{to}\\PY{+w}{ }\\PY{n}{run}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{code}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{cudaGetDeviceCount}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n+nf}{count}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{!=}\\PY{+w}{ }\\PY{n}{cudaSuccess}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s+ss}{\\PYZdq{}ERROR: GPU drivers are not configured or cuda\\PYZhy{}capable device \\PYZdq{}}\n", + "\\PY{+w}{ }\\PY{l+s+ss}{\\PYZdq{}not found\\PYZbs{}n\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n+nf}{count}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s+ss}{\\PYZdq{}ERROR: No cuda\\PYZhy{}capable devices found\\PYZbs{}n\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Initialize}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{k}{before}\\PY{+w}{ }\\PY{n}{we}\\PY{+w}{ }\\PY{n}{run}\\PY{+w}{ }\\PY{n}{the}\\PY{+w}{ }\\PY{n}{application}\n", + "\\PY{+w}{ }\\PY{n+nc}{float}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{dev\\PYZus{}X}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMalloc}\\PY{p}{(}\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{dev\\PYZus{}X}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ 
}\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaFree}\\PY{p}{(}\\PY{n}{dev\\PYZus{}X}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{Context}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Create}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{streams}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{n}{events}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaStreamNonBlocking}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Allow}\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{n}{externals}\\PY{+w}{ }\\PY{k}{to}\\PY{+w}{ }\\PY{k}{modify}\\PY{+w}{ }\\PY{n}{streams}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaEventDisableTiming}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + "\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Synchronize}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{k}{check}\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{n}{CUDA}\\PY{+w}{ }\\PY{n}{errors}\n", + "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ 
}\\PY{n}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{n+nc}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{lasterror}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{n+nc}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{cudaDeviceSynchronize}\\PY{p}{(}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Destroy}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{streams}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{n}{events}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", + "\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{bool}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}stream}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{streamid}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", + "\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{|}\\PY{+w}{ }\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZgt{}=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{false}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{streamid}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{true}\\PY{p}{;}\n", + 
"\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}all\\PYZus{}streams}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", + "\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", + "\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{\\PYZus{}\\PYZus{}global\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}launch\\PYZus{}bounds\\PYZus{}\\PYZus{}}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n}{const}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{blockIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}shared\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{S}\\PY{o}{[}\\PY{n}{32}\\PY{o}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{j}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{A}\\PY{o}{[}\\PY{n}{j}\\PY{o}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{S}\\PY{o}{[}\\PY{n}{j}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ 
}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}syncthreads}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{S}\\PY{o}{[}\\PY{n}{j}\\PY{o}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{B}\\PY{o}{[}\\PY{n}{j}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{err}{\\PYZcb{}}\n", + "\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{const}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{const}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", + "\\PY{err}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3\\PYZus{}args}\\PY{err}{[}\\PY{err}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ 
}\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{B}\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{gpuError\\PYZus{}t}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaLaunchKernel}\\PY{p}{(}\\PY{p}{(}\\PY{n}{void}\\PY{o}{*}\\PY{p}{)}\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3\\PYZus{}args}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{0}\\PY{o}{]}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}KERNEL\\PYZus{}LAUNCH\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{,}\\PY{+w}{ }\\PY{l+s+ss}{\\PYZdq{}gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3\\PYZdq{}}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{err}{\\PYZcb{}}\n", + "\\end{Verbatim}\n" + ], + "text/plain": [ + "\n", + "#include \n", + "#include \n", + "\n", + "\n", + "struct simpleCopy_smem_state_t {\n", + " dace::cuda::Context *gpu_context;\n", + "};\n", + "\n", + "\n", + "\n", + "DACE_EXPORTED int __dace_init_cuda(simpleCopy_smem_state_t *__state);\n", + "DACE_EXPORTED int __dace_exit_cuda(simpleCopy_smem_state_t *__state);\n", + "\n", + "\n", + "\n", + "int __dace_init_cuda(simpleCopy_smem_state_t *__state) {\n", + " int count;\n", + "\n", + " // Check that we are able to run cuda code\n", + " if (cudaGetDeviceCount(&count) != cudaSuccess)\n", + " {\n", + " printf(\"ERROR: GPU drivers are not configured or cuda-capable device \"\n", + " \"not found\\n\");\n", + " return 1;\n", + " }\n", + " if (count == 0)\n", + " {\n", + " printf(\"ERROR: No cuda-capable devices found\\n\");\n", + " return 2;\n", + " }\n", + "\n", + " // Initialize cuda before we run the application\n", + " float *dev_X;\n", + " DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n", + " DACE_GPU_CHECK(cudaFree(dev_X));\n", + "\n", + " \n", + "\n", + " __state->gpu_context = new dace::cuda::Context(1, 1);\n", + "\n", + " // Create cuda streams and events\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n", + " __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n", + " }\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n", + " }\n", + "\n", + " \n", + "\n", + " return 0;\n", + "}\n", + "\n", + "int __dace_exit_cuda(simpleCopy_smem_state_t *__state) {\n", + " \n", + "\n", + " // Synchronize and check for CUDA errors\n", + " int __err = static_cast(__state->gpu_context->lasterror);\n", + " if (__err == 0)\n", + " __err = static_cast(cudaDeviceSynchronize());\n", + "\n", + " // Destroy cuda streams and events\n", + " for(int i = 0; i < 1; ++i) {\n", + " 
DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n", + " }\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n", + " }\n", + "\n", + " delete __state->gpu_context;\n", + " return __err;\n", + "}\n", + "\n", + "DACE_EXPORTED bool __dace_gpu_set_stream(simpleCopy_smem_state_t *__state, int streamid, gpuStream_t stream)\n", + "{\n", + " if (streamid < 0 || streamid >= 1)\n", + " return false;\n", + "\n", + " __state->gpu_context->streams[streamid] = stream;\n", + "\n", + " return true;\n", + "}\n", + "\n", + "DACE_EXPORTED void __dace_gpu_set_all_streams(simpleCopy_smem_state_t *__state, gpuStream_t stream)\n", + "{\n", + " for (int i = 0; i < 1; ++i)\n", + " __state->gpu_context->streams[i] = stream;\n", + "}\n", + "\n", + "__global__ void __launch_bounds__(32) gpu_map_0_0_3(const dace::uint * __restrict__ A, dace::uint * __restrict__ B) {\n", + " {\n", + " int i = (32 * blockIdx.x);\n", + " {\n", + " {\n", + " __shared__ dace::uint S[32];\n", + " int j = threadIdx.x;\n", + " {\n", + " {\n", + " dace::uint __inp = A[j];\n", + " dace::uint __out;\n", + "\n", + " ///////////////////\n", + " __out = __inp;\n", + " ///////////////////\n", + "\n", + " S[j] = __out;\n", + " }\n", + " {\n", + "\n", + " ///////////////////\n", + " __syncthreads();\n", + " ///////////////////\n", + "\n", + " }\n", + " {\n", + " dace::uint __inp = S[j];\n", + " dace::uint __out;\n", + "\n", + " ///////////////////\n", + " __out = __inp;\n", + " ///////////////////\n", + "\n", + " B[j] = __out;\n", + " }\n", + " }\n", + " }\n", + " }\n", + " }\n", + "}\n", + "\n", + "\n", + "DACE_EXPORTED void __dace_runkernel_gpu_map_0_0_3(simpleCopy_smem_state_t *__state, const dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n", + "void __dace_runkernel_gpu_map_0_0_3(simpleCopy_smem_state_t *__state, const dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", + "{\n", + "\n", + " void *gpu_map_0_0_3_args[] = { (void *)&A, (void *)&B };\n", + " gpuError_t __err = cudaLaunchKernel((void*)gpu_map_0_0_3, dim3(1, 1, 1), dim3(32, 1, 1), gpu_map_0_0_3_args, 0, __state->gpu_context->streams[0]);\n", + " DACE_KERNEL_LAUNCH_CHECK(__err, \"gpu_map_0_0_3\", 1, 1, 1, 32, 1, 1);\n", + "}\n" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Code(sdfg.generate_code()[1].clean_code)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "61a16931", + "metadata": {}, + "outputs": [], + "source": [ + "#Code(sdfg.generate_code()[0].clean_code)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5c5e3be", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dace_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/berkay_workpace/scratch/smemPassAndCopy/smth.sdfg b/berkay_workpace/scratch/smemPassAndCopy/smth.sdfg new file mode 100644 index 0000000000..690f41c3e1 --- /dev/null +++ b/berkay_workpace/scratch/smemPassAndCopy/smth.sdfg @@ -0,0 +1,889 @@ +{ + "type": "SDFG", + "attributes": { + "name": "scalarMultiplication_smem", + "_arrays": { + "A": { + "type": "Array", + 
"attributes": { + "strides": [ + "1" + ], + "total_size": "128", + "offset": [ + "0" + ], + "dtype": "uint32", + "shape": [ + "128" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "scalar": { + "type": "Scalar", + "attributes": { + "dtype": "uint32", + "shape": [ + "1" + ], + "debuginfo": null + } + }, + "S": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "32", + "offset": [ + "0" + ], + "optional": false, + "dtype": "uint32", + "shape": [ + "32" + ], + "transient": true, + "storage": "GPU_Shared", + "debuginfo": null + } + } + }, + "global_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "init_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "exit_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "guid": "1de96299-3c2c-4b5d-8c80-7082c0faab46", + "hash": "8d8dc97005b7cab6a2083f6e6f012a016ba2fcc237e10c1c9dc078b01c858426" + }, + "nodes": [ + { + "type": "SDFGState", + "label": "main", + "id": 0, + "collapsed": false, + "scope_dict": { + "-1": [ + 0, + 1, + 2, + 3, + 4 + ], + "4": [ + 5, + 6 + ], + "6": [ + 7, + 8 + ] + }, + "nodes": [ + { + "type": "AccessNode", + "label": "A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 12, + "end_line": 12, + "start_column": 0, + "end_column": 0, + "filename": "/tmp/ipykernel_42346/245386100.py" + }, + "data": "A", + "guid": "567b339f-84b2-48c5-844d-91bfcf819213" + }, + "id": 0, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 13, + "end_line": 13, + "start_column": 0, + "end_column": 0, + "filename": "/tmp/ipykernel_42346/245386100.py" + }, + "data": "A", + "guid": "e060d3ab-17ce-4c46-acec-ad04423b3731" + }, + "id": 1, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "scalar", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 14, + "end_line": 14, + "start_column": 0, + "end_column": 0, + "filename": "/tmp/ipykernel_42346/245386100.py" + }, + "data": "scalar", + "guid": "99dcf8e3-36bb-4d98-81fd-a5d9d47c0e0c" + }, + "id": 2, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "S", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 15, + "end_line": 15, + "start_column": 0, + "end_column": 0, + "filename": "/tmp/ipykernel_42346/245386100.py" + }, + "data": "S", + "guid": "24d7b12b-47d5-4e65-8248-bffd691c2d1b" + }, + "id": 3, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "MapEntry", + "label": "gpu_map[i=0:32:32]", + "attributes": { + "label": "gpu_map", + "params": [ + "i" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "31", + "step": "32", + "tile": "1" + } + ] + }, + "schedule": "GPU_Device", + "debuginfo": { + "type": "DebugInfo", + "start_line": 19, + "end_line": 19, + "start_column": 0, + "end_column": 0, + "filename": "/tmp/ipykernel_42346/245386100.py" + }, + "guid": "6314a764-e35d-41b6-a1db-c8669a43f5cf" + }, + "id": 4, + "scope_entry": null, + "scope_exit": "5" + }, + { + "type": "MapExit", + "label": "gpu_map[i=0:32:32]", + "attributes": { + "guid": "0edbf588-c052-419a-a4c0-d11f82565f31" + }, + "id": 5, + "scope_entry": "4", + "scope_exit": "5" + }, + { + "type": "MapEntry", + "label": "tb[j=0:32]", + "attributes": { + "label": "tb", + "params": [ + "j" + ], + "range": { + "type": "Range", + "ranges": [ + { + 
"start": "0", + "end": "31", + "step": "1", + "tile": "1" + } + ] + }, + "schedule": "GPU_ThreadBlock", + "debuginfo": { + "type": "DebugInfo", + "start_line": 26, + "end_line": 26, + "start_column": 0, + "end_column": 0, + "filename": "/tmp/ipykernel_42346/245386100.py" + }, + "guid": "18e02f87-d308-45e7-b339-0d06be66dcc0" + }, + "id": 6, + "scope_entry": "4", + "scope_exit": "7" + }, + { + "type": "MapExit", + "label": "tb[j=0:32]", + "attributes": { + "guid": "5654767c-5745-4950-894c-247cf425d9b5" + }, + "id": 7, + "scope_entry": "6", + "scope_exit": "7" + }, + { + "type": "NestedSDFG", + "label": "nested_sdfg", + "attributes": { + "sdfg": { + "type": "SDFG", + "attributes": { + "name": "nested_sdfg", + "global_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "init_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "exit_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "guid": "cb48e622-77c7-458e-864a-f143c84ea239" + }, + "nodes": [ + { + "type": "LoopRegion", + "attributes": { + "update_statement": { + "string_data": "k = (k + 1)", + "language": "Python" + }, + "init_statement": { + "string_data": "k = 0", + "language": "Python" + }, + "loop_condition": { + "string_data": "(k < 4)", + "language": "Python" + }, + "loop_variable": "k", + "guid": "6d0e8bde-999c-4d8d-8681-a386238445a4" + }, + "nodes": [ + { + "type": "SDFGState", + "label": "use_smem", + "id": 0, + "collapsed": false, + "scope_dict": { + "-1": [ + 0, + 1, + 2 + ] + }, + "nodes": [ + { + "type": "Tasklet", + "label": "assign_to_smem", + "attributes": { + "code": { + "string_data": "__out = __inp_A", + "language": "CPP" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 46, + "end_line": 46, + "start_column": 0, + "end_column": 0, + "filename": "/tmp/ipykernel_42346/245386100.py" + }, + "label": "assign_to_smem", + "out_connectors": { + "__out": null + }, + "guid": "a63d73a7-a907-4bd0-9ae4-4d3105f15a89" + }, + "id": 0, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "Tasklet", + "label": "addMult", + "attributes": { + "code": { + "string_data": "__out = __inp * __inp_scalar;", + "language": "CPP" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 54, + "end_line": 54, + "start_column": 0, + "end_column": 0, + "filename": "/tmp/ipykernel_42346/245386100.py" + }, + "label": "addMult", + "in_connectors": { + "__inp": null + }, + "out_connectors": { + "__out": null + }, + "guid": "e1208512-68d1-4567-bd8e-3c2579ca61a9" + }, + "id": 1, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "S", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 15, + "end_line": 15, + "start_column": 0, + "end_column": 0, + "filename": "/tmp/ipykernel_42346/245386100.py" + }, + "data": "S", + "guid": "24d7b12b-47d5-4e65-8248-bffd691c2d1b" + }, + "id": 2, + "scope_entry": null, + "scope_exit": null + } + ], + "edges": [ + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "data": "S", + "debuginfo": null, + "guid": "f7d31c47-4858-48de-8388-7fb8ab968d6d", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "0", + "dst": "2", + 
"dst_connector": null, + "src_connector": "__out" + } + ], + "attributes": { + "guid": "d125a509-4ec4-497c-ae79-b34c09433ff1" + } + } + ], + "edges": [], + "collapsed": false, + "label": "loop", + "id": 0, + "cfg_list_id": 2, + "start_block": null + } + ], + "edges": [], + "collapsed": false, + "label": "", + "id": null, + "cfg_list_id": 1, + "start_block": null + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 35, + "end_line": 35, + "start_column": 0, + "end_column": 0, + "filename": "/tmp/ipykernel_42346/245386100.py" + }, + "label": "nested_sdfg", + "in_connectors": { + "__inp_A": null, + "__inp_scalar": null + }, + "out_connectors": { + "__out": null + }, + "guid": "9e71c20d-8658-4641-b85d-51880b2a5a5c" + }, + "id": 8, + "scope_entry": "6", + "scope_exit": "7" + } + ], + "edges": [ + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "128", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "127", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "d05b830d-33b1-4236-b9d9-a6cfeba66c74", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "127", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": null, + "num_accesses": "128" + } + } + }, + "src": "0", + "dst": "4", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "scalar", + "debuginfo": null, + "guid": "80d32777-6d6f-40e7-b1b8-7552559022d6", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": null, + "num_accesses": "1" + } + } + }, + "src": "2", + "dst": "4", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "128", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "127", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "85bc8b8a-66a7-48f9-84c3-8be68c9b8d40", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "127", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": null, + "num_accesses": "128" + } + } + }, + "src": "4", + "dst": "6", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "scalar", + "debuginfo": null, + "guid": "efa679e1-5e45-48b6-a849-3cea0746876e", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": null, + "num_accesses": "1" + } + } + }, + "src": "4", + "dst": "6", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "128", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "127", + "step": "1", + "tile": "1" + } + ] 
+ }, + "data": "A", + "debuginfo": null, + "guid": "76a3c00f-cbf8-4c04-ad75-947b01bacc8b", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "127", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": null, + "num_accesses": "128" + } + } + }, + "src": "7", + "dst": "5", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "128", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "127", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "f25b3395-879c-4afd-a757-523b740a2f73", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "127", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": null, + "num_accesses": "128" + } + } + }, + "src": "5", + "dst": "1", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "4", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "j", + "end": "j + 96", + "step": "32", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "5d8bea57-d75a-4cf5-adc6-2bfe07f8daa9", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "j", + "end": "j + 96", + "step": "32", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": null, + "num_accesses": "4" + } + } + }, + "src": "6", + "dst": "8", + "dst_connector": "__inp_A", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "scalar", + "debuginfo": null, + "guid": "b90cc3c4-73b5-4cb6-9941-50264e15babf", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": null, + "num_accesses": "1" + } + } + }, + "src": "6", + "dst": "8", + "dst_connector": "__inp_scalar", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "4", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "j", + "end": "j + 96", + "step": "32", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "4e3b5f0f-ef6d-43b8-9588-9666518af1b9", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "j", + "end": "j + 96", + "step": "32", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": null, + "num_accesses": "4" + } + } + }, + "src": "8", + "dst": "7", + "dst_connector": null, + "src_connector": "__out" + } + ], + "attributes": { + "guid": "ff16ec78-9e21-4834-8e40-572509836663" + } + } + ], + "edges": [], + "collapsed": false, + "label": "", + "id": null, + "cfg_list_id": 0, + "start_block": null, + "dace_version": "1.0.0" +} \ No newline at end of file diff --git a/berkay_workpace/scratch/testbed.ipynb b/berkay_workpace/scratch/testbed.ipynb index 65eecf1343..02ee911dd0 100644 --- a/berkay_workpace/scratch/testbed.ipynb +++ b/berkay_workpace/scratch/testbed.ipynb @@ -17,705 +17,126 @@ }, "metadata": {}, "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "experimental\n" - ] } ], 
"source": [ "import dace\n", "import cupy as cp\n", - "\n", + "import numpy as np\n", "from IPython.display import Code\n", - "from dace.config import Config\n", + "from typing import Optional\n", "\n", - "\n", - "print(Config.get('compiler', 'cuda', 'implementation'))\n" + "from dace import SDFG, properties\n", + "from dace.config import Config\n", + "from dace.transformation import pass_pipeline as ppl, transformation\n", + "from dace.sdfg import nodes\n", + "from dace import dtypes\n", + "from dace.transformation.passes.gpustream_scheduling import NaiveGPUStreamScheduler\n" ] }, { "cell_type": "code", "execution_count": 2, - "id": "58226f37", - "metadata": {}, - "outputs": [], - "source": [ - "bs = 512\n", - "ns = 1024\n", - "BS = dace.symbol('BS')\n", - "NS = dace.symbol('NS')\n", - "\n", - "START = dace.symbol('START')\n", - "WS = dace.symbol('WS')\n", - "STRIDE = dace.symbol('STRIDE')\n", - "\n", - "start = 2\n", - "stride = 3\n", - "ws = 16\n", - "@dace.program\n", - "def symbolic_warp_map(A: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global):\n", - " \"\"\"\n", - " Focus is in the use of symbolic variables in the MAP.\n", - " \"\"\"\n", - " A[:] = B[:]\n", - "\n", - "sdfg = symbolic_warp_map.to_sdfg()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "a755b788", + "id": "e66c2551", "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "not_in_kernel_code: True\n", - "is_between_access_nodes: True\n", - "involves_gpu_or_pinned: True\n", - "\n", - "\n", - "copy_context.src_storage: StorageType.GPU_Global\n", - "copy_context.dst_storage: StorageType.GPU_Global\n", - "is_not_cpu_to_cpu: True\n" - ] - }, { "data": { "text/html": [ - "
/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n",
-       "#include <dace/dace.h>\n",
-       "#include "../../include/hash.h"\n",
-       "\n",
-       "struct symbolic_warp_map_state_t {\n",
-       "    dace::cuda::Context *gpu_context;\n",
-       "};\n",
-       "\n",
-       "void __program_symbolic_warp_map_internal(symbolic_warp_map_state_t*__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B, int NS)\n",
-       "{\n",
-       "\n",
-       "    {\n",
-       "\n",
-       "        DACE_GPU_CHECK(cudaMemcpyAsync(A, B, NS * sizeof(dace::uint), cudaMemcpyDeviceToDevice, __state->gpu_context->streams[0]));\n",
-       "\n",
-       "    }\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED void __program_symbolic_warp_map(symbolic_warp_map_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B, int NS)\n",
-       "{\n",
-       "    __program_symbolic_warp_map_internal(__state, A, B, NS);\n",
-       "}\n",
-       "DACE_EXPORTED int __dace_init_experimental_cuda(symbolic_warp_map_state_t *__state, int NS);\n",
-       "DACE_EXPORTED int __dace_exit_experimental_cuda(symbolic_warp_map_state_t *__state);\n",
-       "\n",
-       "DACE_EXPORTED symbolic_warp_map_state_t *__dace_init_symbolic_warp_map(int NS)\n",
-       "{\n",
-       "    int __result = 0;\n",
-       "    symbolic_warp_map_state_t *__state = new symbolic_warp_map_state_t;\n",
-       "\n",
-       "\n",
-       "    __result |= __dace_init_experimental_cuda(__state, NS);\n",
-       "\n",
-       "    if (__result) {\n",
-       "        delete __state;\n",
-       "        return nullptr;\n",
-       "    }\n",
-       "    return __state;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED int __dace_exit_symbolic_warp_map(symbolic_warp_map_state_t *__state)\n",
-       "{\n",
-       "    int __err = 0;\n",
-       "\n",
-       "    int __err_experimental_cuda = __dace_exit_experimental_cuda(__state);\n",
-       "    if (__err_experimental_cuda) {\n",
-       "        __err = __err_experimental_cuda;\n",
-       "    }\n",
-       "    delete __state;\n",
-       "    return __err;\n",
-       "}\n",
-       "
\n" - ], - "text/latex": [ - "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", - "\\PY{c+cm}{/* DaCe AUTO\\PYZhy{}GENERATED FILE. DO NOT MODIFY */}\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}dace/dace.h\\PYZgt{}}\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}../../include/hash.h\\PYZdq{}}\n", - "\n", - "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", - "\n", - "\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}program\\PYZus{}symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}internal}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{NS}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", "\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMemcpyAsync}\\PY{p}{(}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{NS}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{k}{sizeof}\\PY{p}{(}\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaMemcpyDeviceToDevice}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}symbolic\\PYZus{}warp\\PYZus{}map}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{NS}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}internal}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{NS}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{NS}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ 
}\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}symbolic\\PYZus{}warp\\PYZus{}map}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{NS}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{p}{;}\n", - "\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{NS}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{nullptr}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}symbolic\\PYZus{}warp\\PYZus{}map}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\\end{Verbatim}\n" + "
\n", + "
\n", + "
\n", + "\n", + "" ], "text/plain": [ - "/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n", - "#include \n", - "#include \"../../include/hash.h\"\n", - "\n", - "struct symbolic_warp_map_state_t {\n", - " dace::cuda::Context *gpu_context;\n", - "};\n", - "\n", - "void __program_symbolic_warp_map_internal(symbolic_warp_map_state_t*__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B, int NS)\n", - "{\n", - "\n", - " {\n", - "\n", - " DACE_GPU_CHECK(cudaMemcpyAsync(A, B, NS * sizeof(dace::uint), cudaMemcpyDeviceToDevice, __state->gpu_context->streams[0]));\n", - "\n", - " }\n", - "}\n", - "\n", - "DACE_EXPORTED void __program_symbolic_warp_map(symbolic_warp_map_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B, int NS)\n", - "{\n", - " __program_symbolic_warp_map_internal(__state, A, B, NS);\n", - "}\n", - "DACE_EXPORTED int __dace_init_experimental_cuda(symbolic_warp_map_state_t *__state, int NS);\n", - "DACE_EXPORTED int __dace_exit_experimental_cuda(symbolic_warp_map_state_t *__state);\n", - "\n", - "DACE_EXPORTED symbolic_warp_map_state_t *__dace_init_symbolic_warp_map(int NS)\n", - "{\n", - " int __result = 0;\n", - " symbolic_warp_map_state_t *__state = new symbolic_warp_map_state_t;\n", - "\n", - "\n", - " __result |= __dace_init_experimental_cuda(__state, NS);\n", - "\n", - " if (__result) {\n", - " delete __state;\n", - " return nullptr;\n", - " }\n", - " return __state;\n", - "}\n", - "\n", - "DACE_EXPORTED int __dace_exit_symbolic_warp_map(symbolic_warp_map_state_t *__state)\n", - "{\n", - " int __err = 0;\n", - "\n", - " int __err_experimental_cuda = __dace_exit_experimental_cuda(__state);\n", - " if (__err_experimental_cuda) {\n", - " __err = __err_experimental_cuda;\n", - " }\n", - " delete __state;\n", - " return __err;\n", - "}" + "SDFG (test2)" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "Code(sdfg.generate_code()[0].clean_code, language='cpp')" + "@dace.program\n", + "def test1(A: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", + " B: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", + " C: dace.uint32[10],\n", + " D: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global\n", + " ):\n", + " A[:] = B[:]\n", + " C[:] = D[:]\n", + "\n", + "\n", + "@dace.program\n", + "def test2(A: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", + " B: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", + " C: dace.uint32[10],\n", + " D: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global\n", + " ):\n", + " \n", + " for i in dace.map[0:10] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " A[i] = B[i]\n", + " \n", + " for j in dace.map[0:10] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " C[j] = D[j]\n", + "\n", + "sdfg = test2.to_sdfg()\n", + "sdfg" ] }, { "cell_type": "code", - "execution_count": 4, - "id": "9426fb29", + "execution_count": 3, + "id": "0aaef92c", "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "not_in_kernel_code: True\n", - "is_between_access_nodes: True\n", - "involves_gpu_or_pinned: True\n", - "\n", - "\n", - "copy_context.src_storage: StorageType.GPU_Global\n", - "copy_context.dst_storage: StorageType.GPU_Global\n", - "is_not_cpu_to_cpu: True\n" - ] - }, { "data": { "text/html": [ - "
#include <cuda_runtime.h>\n",
-       "#include <dace/dace.h>\n",
-       "\n",
-       "\n",
-       "struct symbolic_warp_map_state_t {\n",
-       "    dace::cuda::Context *gpu_context;\n",
-       "};\n",
-       "\n",
-       "\n",
-       "\n",
-       "DACE_EXPORTED int __dace_init_experimental_cuda(symbolic_warp_map_state_t *__state, int NS);\n",
-       "DACE_EXPORTED int __dace_exit_experimental_cuda(symbolic_warp_map_state_t *__state);\n",
        "\n",
-       "\n",
-       "\n",
-       "int __dace_init_experimental_cuda(symbolic_warp_map_state_t *__state, int NS) {\n",
-       "    int count;\n",
-       "\n",
-       "    // Check that we are able to run cuda code\n",
-       "    if (cudaGetDeviceCount(&count) != cudaSuccess)\n",
-       "    {\n",
-       "        printf("ERROR: GPU drivers are not configured or cuda-capable device "\n",
-       "               "not found\\n");\n",
-       "        return 1;\n",
-       "    }\n",
-       "    if (count == 0)\n",
-       "    {\n",
-       "        printf("ERROR: No cuda-capable devices found\\n");\n",
-       "        return 2;\n",
-       "    }\n",
-       "\n",
-       "    // Initialize cuda before we run the application\n",
-       "    float *dev_X;\n",
-       "    DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n",
-       "    DACE_GPU_CHECK(cudaFree(dev_X));\n",
-       "\n",
-       "    \n",
-       "\n",
-       "    __state->gpu_context = new dace::cuda::Context(1, 1);\n",
-       "\n",
-       "    // Create cuda streams and events\n",
-       "    for(int i = 0; i < 1; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n",
-       "        __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n",
-       "    }\n",
-       "    for(int i = 0; i < 1; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n",
-       "    }\n",
-       "\n",
-       "    \n",
-       "\n",
-       "    return 0;\n",
-       "}\n",
-       "\n",
-       "int __dace_exit_experimental_cuda(symbolic_warp_map_state_t *__state) {\n",
-       "    \n",
-       "\n",
-       "    // Synchronize and check for CUDA errors\n",
-       "    int __err = static_cast<int>(__state->gpu_context->lasterror);\n",
-       "    if (__err == 0)\n",
-       "        __err = static_cast<int>(cudaDeviceSynchronize());\n",
-       "\n",
-       "    // Destroy cuda streams and events\n",
-       "    for(int i = 0; i < 1; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n",
-       "    }\n",
-       "    for(int i = 0; i < 1; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n",
-       "    }\n",
-       "\n",
-       "    delete __state->gpu_context;\n",
-       "    return __err;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED bool __dace_gpu_set_stream(symbolic_warp_map_state_t *__state, int streamid, gpuStream_t stream)\n",
-       "{\n",
-       "    if (streamid < 0 || streamid >= 1)\n",
-       "        return false;\n",
-       "\n",
-       "    __state->gpu_context->streams[streamid] = stream;\n",
-       "\n",
-       "    return true;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED void __dace_gpu_set_all_streams(symbolic_warp_map_state_t *__state, gpuStream_t stream)\n",
-       "{\n",
-       "    for (int i = 0; i < 1; ++i)\n",
-       "        __state->gpu_context->streams[i] = stream;\n",
-       "}\n",
-       "
\n" - ], - "text/latex": [ - "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}cuda\\PYZus{}runtime.h\\PYZgt{}}\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}dace/dace.h\\PYZgt{}}\n", - "\n", - "\n", - "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", - "\n", - "\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{NS}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\n", - "\n", - "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{NS}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{count}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Check that we are able to run cuda code}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{cudaGetDeviceCount}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{count}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{!}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaSuccess}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{ERROR: GPU drivers are not configured or cuda\\PYZhy{}capable device }\\PY{l+s}{\\PYZdq{}}\n", - "\\PY{+w}{ }\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{not found}\\PY{l+s+se}{\\PYZbs{}n}\\PY{l+s}{\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{count}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{ERROR: No cuda\\PYZhy{}capable devices found}\\PY{l+s+se}{\\PYZbs{}n}\\PY{l+s}{\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Initialize cuda before we run the application}\n", - "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{dev\\PYZus{}X}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMalloc}\\PY{p}{(}\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{dev\\PYZus{}X}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaFree}\\PY{p}{(}\\PY{n}{dev\\PYZus{}X}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ 
}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Create cuda streams and events}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaStreamNonBlocking}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{;}\\PY{+w}{ }\\PY{c+c1}{// Allow for externals to modify streams}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaEventDisableTiming}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Synchronize and check for CUDA errors}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{k+kt}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{lasterror}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ 
}\\PY{k}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{k+kt}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{cudaDeviceSynchronize}\\PY{p}{(}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Destroy cuda streams and events}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{bool}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}stream}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{streamid}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{|}\\PY{+w}{ }\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZgt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{false}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{streamid}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{true}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}all\\PYZus{}streams}\\PY{p}{(}\\PY{n}{symbolic\\PYZus{}warp\\PYZus{}map\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ 
}\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\\end{Verbatim}\n" + "
\n", + "
\n", + "
\n", + "\n", + "" ], "text/plain": [ - "\n", - "#include \n", - "#include \n", - "\n", - "\n", - "struct symbolic_warp_map_state_t {\n", - " dace::cuda::Context *gpu_context;\n", - "};\n", - "\n", - "\n", - "\n", - "DACE_EXPORTED int __dace_init_experimental_cuda(symbolic_warp_map_state_t *__state, int NS);\n", - "DACE_EXPORTED int __dace_exit_experimental_cuda(symbolic_warp_map_state_t *__state);\n", - "\n", - "\n", - "\n", - "int __dace_init_experimental_cuda(symbolic_warp_map_state_t *__state, int NS) {\n", - " int count;\n", - "\n", - " // Check that we are able to run cuda code\n", - " if (cudaGetDeviceCount(&count) != cudaSuccess)\n", - " {\n", - " printf(\"ERROR: GPU drivers are not configured or cuda-capable device \"\n", - " \"not found\\n\");\n", - " return 1;\n", - " }\n", - " if (count == 0)\n", - " {\n", - " printf(\"ERROR: No cuda-capable devices found\\n\");\n", - " return 2;\n", - " }\n", - "\n", - " // Initialize cuda before we run the application\n", - " float *dev_X;\n", - " DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n", - " DACE_GPU_CHECK(cudaFree(dev_X));\n", - "\n", - " \n", - "\n", - " __state->gpu_context = new dace::cuda::Context(1, 1);\n", - "\n", - " // Create cuda streams and events\n", - " for(int i = 0; i < 1; ++i) {\n", - " DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n", - " __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n", - " }\n", - " for(int i = 0; i < 1; ++i) {\n", - " DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n", - " }\n", - "\n", - " \n", - "\n", - " return 0;\n", - "}\n", - "\n", - "int __dace_exit_experimental_cuda(symbolic_warp_map_state_t *__state) {\n", - " \n", - "\n", - " // Synchronize and check for CUDA errors\n", - " int __err = static_cast(__state->gpu_context->lasterror);\n", - " if (__err == 0)\n", - " __err = static_cast(cudaDeviceSynchronize());\n", - "\n", - " // Destroy cuda streams and events\n", - " for(int i = 0; i < 1; ++i) {\n", - " DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n", - " }\n", - " for(int i = 0; i < 1; ++i) {\n", - " DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n", - " }\n", - "\n", - " delete __state->gpu_context;\n", - " return __err;\n", - "}\n", - "\n", - "DACE_EXPORTED bool __dace_gpu_set_stream(symbolic_warp_map_state_t *__state, int streamid, gpuStream_t stream)\n", - "{\n", - " if (streamid < 0 || streamid >= 1)\n", - " return false;\n", - "\n", - " __state->gpu_context->streams[streamid] = stream;\n", - "\n", - " return true;\n", - "}\n", - "\n", - "DACE_EXPORTED void __dace_gpu_set_all_streams(symbolic_warp_map_state_t *__state, gpuStream_t stream)\n", - "{\n", - " for (int i = 0; i < 1; ++i)\n", - " __state->gpu_context->streams[i] = stream;\n", - "}\n", - "\n" + "SDFG (test2)" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "Code(sdfg.generate_code()[1].clean_code, language='cpp')" + "\n", + "gpu_stream_access_template = \"__state->gpu_context->streams[{gpu_stream}]\" \n", + "\n", + "# Initialize and configure GPU stream scheduling pass\n", + "gpu_stream_pass = NaiveGPUStreamScheduler()\n", + "gpu_stream_pass.set_gpu_stream_access_template(gpu_stream_access_template)\n", + "assigned_streams = gpu_stream_pass.apply_pass(sdfg, None)\n", + "sdfg" ] } ], diff --git 
a/dace/transformation/passes/shared_memory_synchronization.py b/dace/transformation/passes/shared_memory_synchronization.py new file mode 100644 index 0000000000..609076f8d8 --- /dev/null +++ b/dace/transformation/passes/shared_memory_synchronization.py @@ -0,0 +1,394 @@ +from typing import Union, Dict, Set + +import dace +from dace import SDFG, properties, SDFGState +from dace import dtypes +from dace.codegen import common +from dace.config import Config +from dace.transformation import pass_pipeline as ppl, transformation +from dace.sdfg import nodes, InterstateEdge +from dace.sdfg.graph import Edge + +from dace.sdfg.state import LoopRegion, ControlFlowBlock +from dace.sdfg.nodes import AccessNode, Map, MapEntry, MapExit + + +@properties.make_properties +@transformation.explicit_cf_compatible +class DefaultSharedMemorySync(ppl.Pass): + """ + A DaCe transformation pass that automatically inserts GPU synchronization barriers + (__syncthreads()) for shared memory access patterns. + + This pass ensures proper synchronization in two scenarios: + 1. Pre-synchronization: Before consuming shared memory data (AccessNode -> CodeNode/MapEntry) + 2. Post-synchronization: After shared memory reuse in sequential loops/maps within GPU kernels + + The pass traverses the SDFG hierarchy and identifies shared memory access patterns + that require synchronization to prevent race conditions in GPU code. + + NOTE: This implementation handles commonly observed patterns. Unsupported cases + raise NotImplementedError with context for extending the implementation once comming across + another constellation which was not observed in the used common examples. + """ + + + def __init__(self): + """Initialize the synchronization pass.""" + + # Track which scopes (sequential maps and Loops) have already been + # synchronized to avoid duplicate barriers + self._synchronized_scopes: Set[Union[MapExit, LoopRegion]] = set() + + # Map from MapExit nodes to their containing states for post-synchronization + self._map_exit_to_state: Dict[MapExit, SDFGState] = dict() + + # Keep track of processed nested sdfgs + self._processed_nsdfg = set() + + + + def apply_pass(self, sdfg: SDFG, _) -> None: + """ + Apply the synchronization pass to the entire SDFG. + + Args: + sdfg: The SDFG to process (expected to be top-level) + _: Unused pass pipeline argument + """ + # Start processing from the top-level with empty scope stack + # The scope stack tracks nested execution contexts (maps, loops) + enclosing_scopes = [] + self._process_sdfg(sdfg, enclosing_scopes) + + + + def _process_sdfg(self, sdfg: SDFG, enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: + """ + Recursively traverse all nodes in an SDFG, handling different node types. + + Args: + sdfg: The SDFG to traverse + enclosing_scopes: Stack of enclosing execution scopes (maps, loops) of sdfg + """ + for node in sdfg.nodes(): + + + if isinstance(node, LoopRegion): + self._process_loop_region(sdfg, node, enclosing_scopes) + + elif isinstance(node, SDFGState): + self._process_state(sdfg, node, enclosing_scopes) + + else: + raise NotImplementedError( + f"{self.__class__.__name__}: Unsupported node type '{type(node).__name__}' " + f"encountered during SDFG traversal. Please extend the implementation to handle this case." + ) + + def _process_loop_region(self, sdfg: SDFG, loop_region: LoopRegion, + enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: + """ + Process a loop region by adding it to the scope stack and traversing its contents. 
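+
+        Sketch of the bookkeeping (hypothetical nesting; ``L`` and ``M`` are illustrative names):
+
+            state S inside LoopRegion L, which in turn sits under a GPU_Device map M
+            -> the states of L are processed with a scope stack containing both L and exit(M),
+               so later analysis can tell that a sequential iteration context encloses any
+               shared-memory access found inside L.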
+ + Args: + sdfg: The containing SDFG + loop_region: The loop region to process + enclosing_scopes: Current scope stack which wraps around state + """ + # Create a new scope stack with this loop region added + nested_scopes = enclosing_scopes.copy() + nested_scopes.insert(0, loop_region) # Not append! :) careful + + # Process all states within the loop region + for node in loop_region.nodes(): + if isinstance(node, SDFGState): + self._process_state(sdfg, node, nested_scopes) + else: + raise NotImplementedError( + f"{self.__class__.__name__}: Unexpected node type '{type(node).__name__}' " + f"found inside LoopRegion. SDFGState nodes were expected. Extend if you think" + "the node type is also valid" + ) + + + def _process_state(self, sdfg: SDFG, state: SDFGState, + enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: + """ + Process a single SDFG state, analyzing edges for shared memory access patterns. + + Args: + sdfg: The containing SDFG + state: The state to process + enclosing_scopes: Current scope stack which wrapp around state (NOT of each individual node) + """ + # Track destination nodes that already have synchronization tasklets + # This prevents creating duplicate barriers for the same consumer + nodes_with_sync: Dict[nodes.Node, nodes.Tasklet] = {} + + # Analyze each edge in the state for shared memory access patterns + for edge in state.edges(): + source_node, dest_node = edge.src, edge.dst + + # Skip edges that don't involve shared memory reads + # (either source is not shared memory, or it's a memory-to-memory copy) + if not self._is_shared_memory_access_node(sdfg, source_node) or isinstance(dest_node, nodes.AccessNode): + continue + + + # Handle different types of shared memory consumers + if isinstance(dest_node, (nodes.CodeNode, nodes.MapEntry)): + # Direct consumption by computation or map entry + self._insert_pre_synchronization_barrier(source_node, dest_node, state, nodes_with_sync) + + elif isinstance(dest_node, nodes.NestedSDFG): + # Consumption by nested SDFG - synchronize and recurse + # NOTE: For nesting, we append all scopes which wrap around the nestedSDFG + self._insert_pre_synchronization_barrier(source_node, dest_node, state, nodes_with_sync) + nested_scopes = self._build_nested_scope_stack(state, dest_node, enclosing_scopes) + self._process_sdfg(dest_node.sdfg, nested_scopes) + self._processed_nsdfg.add(dest_node) + else: + raise NotImplementedError( + f"{self.__class__.__name__}: Unsupported destination node type '{type(dest_node).__name__}' " + f"for shared memory access. Currently supported: CodeNode, MapEntry, AccessNode, NestedSDFG." + ) + + # Check if post-synchronization is needed and apply shared + self._handle_shared_memory_post_synchronization(state, source_node, enclosing_scopes) + + + # It may be the case that nestedSDFG were not recursed previously. Process them in that case + for node in state.nodes(): + + # Guards + if not isinstance(node, nodes.NestedSDFG): + continue + if node in self._processed_nsdfg: + continue + + # not yet processed NestedSDFG + nested_scopes = self._build_nested_scope_stack(state, node, enclosing_scopes) + self._process_sdfg(node.sdfg, nested_scopes) + self._processed_nsdfg.add(node) + + + def _is_shared_memory_access_node(self, sdfg: SDFG, node: nodes.Node) -> bool: + """ + Check if a node represents a GPU shared memory access. 
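+
+        Minimal example of a node that satisfies this check (sketch; the array name and
+        shape are illustrative, not taken from a concrete SDFG):
+
+            sdfg.add_transient('S', [32], dace.uint32,
+                               storage=dace.dtypes.StorageType.GPU_Shared)
+            s_node = state.add_access('S')
+            # self._is_shared_memory_access_node(sdfg, s_node) -> True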
+ + Args: + sdfg: The containing SDFG + node: The node to check + + Returns: + True if the node is an AccessNode with GPU_Shared storage + """ + return ( + isinstance(node, nodes.AccessNode) + and node.desc(sdfg).storage == dtypes.StorageType.GPU_Shared + ) + + + + def _insert_pre_synchronization_barrier(self, source_node: nodes.Node, dest_node: nodes.Node, + state: SDFGState, nodes_with_sync: Dict[nodes.Node, nodes.Tasklet]) -> None: + """ + Insert a __syncthreads() barrier before shared memory consumption. + Reuses existing barriers when multiple shared memory sources feed the same destination. + + Args: + source_node: The shared memory AccessNode + dest_node: The consuming node + state: The containing state + nodes_with_sync: Map tracking existing synchronization tasklets + """ + if dest_node in nodes_with_sync: + # Reuse existing synchronization barrier for this destination + existing_barrier = nodes_with_sync[dest_node] + state.add_edge(source_node, None, existing_barrier, None, dace.Memlet()) + else: + # Create a new synchronization barrier + sync_barrier = state.add_tasklet( + name="pre_sync_barrier", + inputs=set(), + outputs=set(), + code="__syncthreads();\n", + language=dtypes.Language.CPP + ) + + # Connect: shared_memory -> sync_barrier -> consumer + state.add_edge(source_node, None, sync_barrier, None, dace.Memlet()) + state.add_edge(sync_barrier, None, dest_node, None, dace.Memlet()) + nodes_with_sync[dest_node] = sync_barrier + + def _build_nested_scope_stack(self, state: SDFGState, nested_sdfg_node: nodes.NestedSDFG, + enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> list[Union[MapExit, LoopRegion]]: + """ + Copy the 'enclosing_scopes' stack and extend it with all maps in 'state' that enclose 'nested_sdfg_node'. + It is assumed that the 'enclosing_scopes' stack contains all maps and loops that wrap around 'state', but + not individual nodes within 'state'. + + Args: + state: The state containing the nested SDFG + nested_sdfg_node: The NestedSDFG node + enclosing_scopes: Current scope stack + + Returns: + Updated scope stack including maps enclosing the nested SDFG + """ + scope_dict = state.scope_dict() + updated_scopes = enclosing_scopes.copy() + + # Walk up the scope hierarchy, adding all enclosing maps + current_map = scope_dict[nested_sdfg_node] + while current_map is not None: + + # Add MapExit node to scope, since it is only needed + # for post synchronization anyways + map_exit = state.exit_node(current_map) + updated_scopes.append(map_exit) + + # add the current state in which the map_exit is contained, + # needed for potential post synchronization barriers + self._map_exit_to_state[map_exit] = state + + # move up in the nested map hierarchy + current_map = scope_dict[current_map] + + return updated_scopes + + + def _handle_shared_memory_post_synchronization(self, state: SDFGState, shared_mem_node: nodes.Node, + enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: + """ + Handle post-synchronization for shared memory reuse in sequential execution contexts. + + When shared memory is reused across iterations in a for loop or sequential map within + a GPU kernel, we need post-synchronization barriers to prevent race conditions. 
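+
+        Example of such a reuse pattern (sketch, modeled after the scalarMultiplication
+        scratch notebooks; ``S`` is a GPU_Shared transient):
+
+            for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:
+                for k in dace.map[0:4] @ dace.dtypes.ScheduleType.Sequential:
+                    for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:
+                        S[j] = A[k * 32 + j]           # S is refilled in every k iteration
+                        A[k * 32 + j] = scalar * S[j]
+                    # a barrier is required here so that no thread refills S for iteration
+                    # k+1 while another thread still reads S from iteration k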
+ + Args: + state: The state containing the shared memory access + shared_mem_node: The shared memory AccessNode + enclosing_scopes: Current scope stack + """ + scope_dict = state.scope_dict() + complete_scope_stack = enclosing_scopes.copy() + + # Build complete scope stack including maps inside the current state + # enclosing the shared memory node. Analogous as in _build_nested_scope_stack() + current_map = scope_dict[shared_mem_node] + while current_map is not None: + + map_exit = state.exit_node(current_map) + complete_scope_stack.append(map_exit) + self._map_exit_to_state[map_exit] = state + current_map = scope_dict[current_map] + + # Analyze scope stack to find synchronization requirements + inside_gpu_kernel = False + innermost_sequential_scope = None + + # Process scopes from outermost to innermost + while complete_scope_stack: + scope = complete_scope_stack.pop(0) + + if isinstance(scope, MapExit): + schedule = scope.schedule + if schedule == dtypes.ScheduleType.Sequential and innermost_sequential_scope is None: + innermost_sequential_scope = scope + elif schedule == dtypes.ScheduleType.GPU_Device: + inside_gpu_kernel = True + break + elif isinstance(scope, LoopRegion) and innermost_sequential_scope is None: + innermost_sequential_scope = scope + + # Validate that shared memory is used within GPU kernel context + if not inside_gpu_kernel: + raise ValueError( + "Shared memory usage detected outside GPU kernel context. " + "GPU shared memory is only valid within GPU_Device scheduled maps." + ) + + # No post synchronization needed if there's no sequential iteration context + if innermost_sequential_scope is None: + return + + + # Apply appropriate post-synchronization based on scope type + if isinstance(innermost_sequential_scope, MapExit): + self._add_post_sync_for_sequential_map(innermost_sequential_scope) + elif isinstance(innermost_sequential_scope, LoopRegion): + self._add_post_sync_for_loop_region(innermost_sequential_scope) + + + # TODO: Avoid synchronization if only one iteration + def _add_post_sync_for_sequential_map(self, seq_map_exit: MapExit) -> None: + """ + Add post-synchronization barrier after a sequential map that may reuse shared memory. + + Args: + seq_map_exit: The MapExit node of the sequential map + """ + # Avoid duplicate synchronization + if seq_map_exit in self._synchronized_scopes: + return + + # Find the state containing this map + containing_state = self._map_exit_to_state[seq_map_exit] + + # Create post-synchronization barrier + post_sync_barrier = containing_state.add_tasklet( + name="post_sync_barrier", + inputs=set(), + outputs=set(), + code="__syncthreads();\n", + language=dtypes.Language.CPP + ) + + # Insert barrier before the map exit and all other predecessors + incoming_edges = containing_state.in_edges(seq_map_exit) + for edge in incoming_edges: + + predecessor = edge.src + containing_state.add_edge(predecessor, None, post_sync_barrier, None, dace.Memlet()) + containing_state.add_edge(post_sync_barrier, None, seq_map_exit, None, dace.Memlet()) + + + # Mark as synchronized + self._synchronized_scopes.add(seq_map_exit) + + def _add_post_sync_for_loop_region(self, loop_region: LoopRegion) -> None: + """ + Add post-synchronization barrier for a loop region that reuses shared memory arrays. 
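+
+        Sketch of the inserted structure (illustrative):
+
+            LoopRegion:  ...body... -> sink block(s) -> [sync_state: "__syncthreads();"]
+
+        i.e. a state holding only the barrier tasklet becomes the single sink of the loop
+        body, so every iteration ends with a barrier before shared memory is overwritten.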
+ + Args: + loop_region: The LoopRegion that needs post-synchronization + """ + + sink_blocks: list[ControlFlowBlock] = [] + for block in loop_region.nodes(): + + if loop_region.out_degree(block) == 0: + sink_blocks.append(block) + + # No sync needed + if len(sink_blocks) < 0: + return + + # Add new state which synchronizates all sink nodes of the loop + syn_block = loop_region.add_state("sync_state") + syn_block.add_tasklet( + name="post_sync_barrier", + inputs=set(), + outputs=set(), + code="__syncthreads();\n", + language=dtypes.Language.CPP + ) + + + for block in sink_blocks: + loop_region.add_edge(block, syn_block, InterstateEdge()) + + # Mark as synchronized + self._synchronized_scopes.add(loop_region) \ No newline at end of file From 50f54317b6c65aa543f65c5976eec8829c05361d Mon Sep 17 00:00:00 2001 From: aydogdub Date: Fri, 13 Jun 2025 13:30:18 +0200 Subject: [PATCH 22/94] finish open work on DefaultSharedMemorySync Pass (skipping sequential iterations that iterate only once) and provide two options for post synchronization when dealing with loopRegions. --- .../passes/shared_memory_synchronization.py | 82 ++++++++++++++++++- 1 file changed, 78 insertions(+), 4 deletions(-) diff --git a/dace/transformation/passes/shared_memory_synchronization.py b/dace/transformation/passes/shared_memory_synchronization.py index 609076f8d8..5ea32894bb 100644 --- a/dace/transformation/passes/shared_memory_synchronization.py +++ b/dace/transformation/passes/shared_memory_synchronization.py @@ -1,5 +1,8 @@ from typing import Union, Dict, Set +import functools +import sympy + import dace from dace import SDFG, properties, SDFGState from dace import dtypes @@ -12,6 +15,7 @@ from dace.sdfg.state import LoopRegion, ControlFlowBlock from dace.sdfg.nodes import AccessNode, Map, MapEntry, MapExit +from dace.transformation.passes import analysis as ap @properties.make_properties @transformation.explicit_cf_compatible @@ -296,11 +300,29 @@ def _handle_shared_memory_post_synchronization(self, state: SDFGState, shared_me if isinstance(scope, MapExit): schedule = scope.schedule if schedule == dtypes.ScheduleType.Sequential and innermost_sequential_scope is None: + + # Special: Skip if there is only one iteration + size_per_dim = scope.map.range.size() + number_total_iterations = functools.reduce(sympy.Mul, size_per_dim, 1) + if number_total_iterations.is_number and number_total_iterations <= 1: + continue + innermost_sequential_scope = scope + elif schedule == dtypes.ScheduleType.GPU_Device: inside_gpu_kernel = True break elif isinstance(scope, LoopRegion) and innermost_sequential_scope is None: + + # Special: Skip if there is only one iteration + start = ap.get_init_assignment(scope) + end = ap.get_loop_end(scope) + stride = ap.get_loop_stride(scope) + nr_iter = (end - start) / stride + + if nr_iter.is_number and nr_iter <= 1: + continue + innermost_sequential_scope = scope # Validate that shared memory is used within GPU kernel context @@ -319,10 +341,12 @@ def _handle_shared_memory_post_synchronization(self, state: SDFGState, shared_me if isinstance(innermost_sequential_scope, MapExit): self._add_post_sync_for_sequential_map(innermost_sequential_scope) elif isinstance(innermost_sequential_scope, LoopRegion): - self._add_post_sync_for_loop_region(innermost_sequential_scope) + # two options, see docstrings + self._add_post_sync_tasklets_for_loop_region(innermost_sequential_scope) + # _add_post_sync_state_for_loop_region(innermost_sequential_scope) - # TODO: Avoid synchronization if only one iteration + def 
_add_post_sync_for_sequential_map(self, seq_map_exit: MapExit) -> None: """ Add post-synchronization barrier after a sequential map that may reuse shared memory. @@ -333,7 +357,7 @@ def _add_post_sync_for_sequential_map(self, seq_map_exit: MapExit) -> None: # Avoid duplicate synchronization if seq_map_exit in self._synchronized_scopes: return - + # Find the state containing this map containing_state = self._map_exit_to_state[seq_map_exit] @@ -358,9 +382,11 @@ def _add_post_sync_for_sequential_map(self, seq_map_exit: MapExit) -> None: # Mark as synchronized self._synchronized_scopes.add(seq_map_exit) - def _add_post_sync_for_loop_region(self, loop_region: LoopRegion) -> None: + def _add_post_sync_state_for_loop_region(self, loop_region: LoopRegion) -> None: """ Add post-synchronization barrier for a loop region that reuses shared memory arrays. + It adds a new state, which contains only a synchronization tasklet that connects + to all sink blocks of the loop region. Args: loop_region: The LoopRegion that needs post-synchronization @@ -390,5 +416,53 @@ def _add_post_sync_for_loop_region(self, loop_region: LoopRegion) -> None: for block in sink_blocks: loop_region.add_edge(block, syn_block, InterstateEdge()) + # Mark as synchronized + self._synchronized_scopes.add(loop_region) + + + def _add_post_sync_tasklets_for_loop_region(self, loop_region: LoopRegion) -> None: + """ + Add post-synchronization barrier for a loop region that reuses shared memory arrays. + Determines all sink blocks in the LoopRegion, and then, for each sink block, adds a new synchronization + tasklet that connects to all sink nodes within that sink block. + + Args: + loop_region: The LoopRegion that needs post-synchronization + """ + + sink_blocks: list[SDFGState] = [] + for block in loop_region.nodes(): + + if not isinstance(block, SDFGState): + raise NotImplementedError(f"Block {block} is expected to be an SDFG state. But it is of type {type(block)}. " + "Extend use case if this should be valid." 
+                                          )
+
+            if loop_region.out_degree(block) == 0:
+                sink_blocks.append(block)
+
+        # No sink blocks, so no synchronization is needed
+        if len(sink_blocks) == 0:
+            return
+
+
+        # For each sink block, synchronize at the end
+        for block in sink_blocks:
+
+            sink_nodes: list[nodes.Node] = block.sink_nodes()
+
+            # All sink nodes in the same block (= state) get the same sync tasklet
+            post_sync_barrier = block.add_tasklet(
+                name="post_sync_barrier",
+                inputs=set(),
+                outputs=set(),
+                code="__syncthreads();\n",
+                language=dtypes.Language.CPP
+            )
+
+            for snode in sink_nodes:
+                block.add_edge(snode, None, post_sync_barrier, None, dace.Memlet())
+
+
+        # Mark as synchronized
+        self._synchronized_scopes.add(loop_region)
\ No newline at end of file

From 2a560bcb9ea78c3e123055566f845fc59fb9cd0f Mon Sep 17 00:00:00 2001
From: aydogdub
Date: Fri, 13 Jun 2025 14:21:41 +0200
Subject: [PATCH 23/94] scratch notebooks where I visually checked the passes,
 deleted unnecessary fragments

---
 .../scalarMultiplication1.ipynb               | 236 +----
 .../scalarMultiplication2.ipynb               |  22 +-
 .../scratch/smemPassAndCopy/smth.sdfg         | 889 ------------------
 3 files changed, 32 insertions(+), 1115 deletions(-)
 delete mode 100644 berkay_workpace/scratch/smemPassAndCopy/smth.sdfg

diff --git a/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication1.ipynb b/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication1.ipynb
index f2710aa7b0..8c38188786 100644
--- a/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication1.ipynb
+++ b/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication1.ipynb
@@ -64,15 +64,15 @@
    "text/html": [
     "<div class=\"sdfv\">\n",
     "<div id=\"contents_
\n", - "
\n", + "
\n", "
\n", "\n", "" @@ -128,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "9abdaf19", "metadata": {}, "outputs": [ @@ -137,15 +137,15 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" @@ -429,6 +429,14 @@ "sdfg" ] }, + { + "cell_type": "markdown", + "id": "2cddaa1c", + "metadata": {}, + "source": [ + "Observe how the pass inserts the synchronization barriers correctly:" + ] + }, { "cell_type": "code", "execution_count": 4, @@ -440,15 +448,15 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" @@ -467,208 +475,6 @@ "DefaultSharedMemorySync().apply_pass(sdfg, None)\n", "sdfg" ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "e218c98a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (scalarMultiplication2)" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "@dace.program\n", - "def scalarMultiplication2(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):\n", - " for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " for k in dace.map[0:4] @ dace.dtypes.ScheduleType.Sequential:\n", - " for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", - " tmp = A[k * 32 + j]\n", - " A[k * 32 + j] = scalar * tmp\n", - "\n", - "scalarMultiplication2.to_sdfg()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "d64b05ff", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (scalarMultiplication3)" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "@dace.program\n", - "def scalarMultiplication3(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):\n", - " for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", - " for k in dace.map[0:4] @ dace.dtypes.ScheduleType.Sequential:\n", - " tmp = A[k * 32 + j]\n", - " A[k * 32 + j] = scalar * tmp\n", - "\n", - "scalarMultiplication3.to_sdfg()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "63b2ee66", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (scalarMultiplication4)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# To next file\n", - "@dace.program\n", - "def scalarMultiplication4(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):\n", - " for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", - " for k in range(4):\n", - " tmp = A[k * 32 + j]\n", - " A[k * 32 + j] = scalar * tmp\n", - " \n", - "\n", - "sdfg = scalarMultiplication4.to_sdfg()\n", - "sdfg" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "a70dc41e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[SDFGState (assign_8_16), SDFGState (block), SDFGState (block_0)]\n", - "0\n", - "1\n", - "1\n" - ] - } - ], - "source": [ - "state = sdfg.states()[0]\n", - "nodes = state.nodes()\n", - "nsdfg = [node for node in nodes if isinstance(node, dace.nodes.NestedSDFG)][0]\n", - "for_loop = nsdfg.sdfg.nodes()[0]\n", - "\n", - "nodes = for_loop.nodes()\n", - "print(nodes)\n", - "for n in nodes:\n", - " print(for_loop.out_degree(n))\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "ba14acfe", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\nstate = sdfg.states()[0]\\nnodes = state.nodes()\\nnsdfg = [node for node in nodes if isinstance(node, dace.nodes.NestedSDFG)][0]\\nfor_loop = nsdfg.sdfg.nodes()[0]\\nprint(nsdfg.sdfg.nodes())\\nprint(nsdfg.sdfg.states())\\nprint(for_loop.nodes())\\nprint(isinstance(for_loop, dace.sdfg.state.LoopRegion))\\nprint()\\nprint()\\nprint(sdfg.nodes())\\nprint(sdfg.states())\\n'" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\"\"\"\n", - "state = sdfg.states()[0]\n", - "nodes = state.nodes()\n", - "nsdfg = [node for node in nodes if isinstance(node, dace.nodes.NestedSDFG)][0]\n", - "for_loop = nsdfg.sdfg.nodes()[0]\n", - "print(nsdfg.sdfg.nodes())\n", - "print(nsdfg.sdfg.states())\n", - "print(for_loop.nodes())\n", - "print(isinstance(for_loop, dace.sdfg.state.LoopRegion))\n", - "print()\n", - "print()\n", - "print(sdfg.nodes())\n", - "print(sdfg.states())\n", - "\"\"\"" - ] } ], "metadata": { diff --git a/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication2.ipynb b/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication2.ipynb index a9490815e9..8a3f8154cb 100644 --- a/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication2.ipynb +++ b/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication2.ipynb @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "2769e30c", "metadata": {}, "outputs": [ @@ -66,15 +66,15 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" @@ -99,7 +99,7 @@ " A[k * 32 + j] = scalar * tmp\n", " \n", "\n", - "sdfg = scalarMultiplication.to_sdfg(save= True)\n", + "sdfg = scalarMultiplication.to_sdfg()\n", "sdfg" ] }, @@ -192,7 +192,7 @@ "id": "0e7e27ca", "metadata": {}, "source": [ - "Instead of having a synchronization tasklet, I use a synchronization state in the case there are several \"sink states\" inside a loopregion (if this is possible)" + "Observe how the synchronization tasklets are inserted using the DefaultSharedMemorySync pass:" ] }, { @@ -206,15 +206,15 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" diff --git a/berkay_workpace/scratch/smemPassAndCopy/smth.sdfg b/berkay_workpace/scratch/smemPassAndCopy/smth.sdfg deleted file mode 100644 index 690f41c3e1..0000000000 --- a/berkay_workpace/scratch/smemPassAndCopy/smth.sdfg +++ /dev/null @@ -1,889 +0,0 @@ -{ - "type": "SDFG", - "attributes": { - "name": "scalarMultiplication_smem", - "_arrays": { - "A": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "128", - "offset": [ - "0" - ], - "dtype": "uint32", - "shape": [ - "128" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "scalar": { - "type": "Scalar", - "attributes": { - "dtype": "uint32", - "shape": [ - "1" - ], - "debuginfo": null - } - }, - "S": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "32", - "offset": [ - "0" - ], - "optional": false, - "dtype": "uint32", - "shape": [ - "32" - ], - "transient": true, - "storage": "GPU_Shared", - "debuginfo": null - } - } - }, - "global_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "init_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "exit_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "guid": "1de96299-3c2c-4b5d-8c80-7082c0faab46", - "hash": "8d8dc97005b7cab6a2083f6e6f012a016ba2fcc237e10c1c9dc078b01c858426" - }, - "nodes": [ - { - "type": "SDFGState", - "label": "main", - "id": 0, - "collapsed": false, - "scope_dict": { - "-1": [ - 0, - 1, - 2, - 3, - 4 - ], - "4": [ - 5, - 6 - ], - "6": [ - 7, - 8 - ] - }, - "nodes": [ - { - "type": "AccessNode", - "label": "A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 12, - "end_line": 12, - "start_column": 0, - "end_column": 0, - "filename": "/tmp/ipykernel_42346/245386100.py" - }, - "data": "A", - "guid": "567b339f-84b2-48c5-844d-91bfcf819213" - }, - "id": 0, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 13, - "end_line": 13, - "start_column": 0, - "end_column": 0, - "filename": "/tmp/ipykernel_42346/245386100.py" - }, - "data": "A", - "guid": "e060d3ab-17ce-4c46-acec-ad04423b3731" - }, - "id": 1, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "scalar", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 14, - "end_line": 14, - "start_column": 0, - "end_column": 0, - "filename": "/tmp/ipykernel_42346/245386100.py" - }, - "data": "scalar", - "guid": "99dcf8e3-36bb-4d98-81fd-a5d9d47c0e0c" - }, - "id": 2, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "S", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 15, - "end_line": 15, - "start_column": 0, - "end_column": 0, - "filename": "/tmp/ipykernel_42346/245386100.py" - }, - "data": "S", - "guid": "24d7b12b-47d5-4e65-8248-bffd691c2d1b" - }, - "id": 3, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "MapEntry", - "label": "gpu_map[i=0:32:32]", - "attributes": { - "label": "gpu_map", - "params": [ - "i" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "31", - "step": "32", - "tile": "1" - } - ] - }, - "schedule": "GPU_Device", - "debuginfo": { - "type": "DebugInfo", - "start_line": 19, - "end_line": 19, - "start_column": 0, - "end_column": 0, - "filename": "/tmp/ipykernel_42346/245386100.py" - }, - "guid": "6314a764-e35d-41b6-a1db-c8669a43f5cf" - }, - "id": 4, - 
"scope_entry": null, - "scope_exit": "5" - }, - { - "type": "MapExit", - "label": "gpu_map[i=0:32:32]", - "attributes": { - "guid": "0edbf588-c052-419a-a4c0-d11f82565f31" - }, - "id": 5, - "scope_entry": "4", - "scope_exit": "5" - }, - { - "type": "MapEntry", - "label": "tb[j=0:32]", - "attributes": { - "label": "tb", - "params": [ - "j" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "31", - "step": "1", - "tile": "1" - } - ] - }, - "schedule": "GPU_ThreadBlock", - "debuginfo": { - "type": "DebugInfo", - "start_line": 26, - "end_line": 26, - "start_column": 0, - "end_column": 0, - "filename": "/tmp/ipykernel_42346/245386100.py" - }, - "guid": "18e02f87-d308-45e7-b339-0d06be66dcc0" - }, - "id": 6, - "scope_entry": "4", - "scope_exit": "7" - }, - { - "type": "MapExit", - "label": "tb[j=0:32]", - "attributes": { - "guid": "5654767c-5745-4950-894c-247cf425d9b5" - }, - "id": 7, - "scope_entry": "6", - "scope_exit": "7" - }, - { - "type": "NestedSDFG", - "label": "nested_sdfg", - "attributes": { - "sdfg": { - "type": "SDFG", - "attributes": { - "name": "nested_sdfg", - "global_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "init_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "exit_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "guid": "cb48e622-77c7-458e-864a-f143c84ea239" - }, - "nodes": [ - { - "type": "LoopRegion", - "attributes": { - "update_statement": { - "string_data": "k = (k + 1)", - "language": "Python" - }, - "init_statement": { - "string_data": "k = 0", - "language": "Python" - }, - "loop_condition": { - "string_data": "(k < 4)", - "language": "Python" - }, - "loop_variable": "k", - "guid": "6d0e8bde-999c-4d8d-8681-a386238445a4" - }, - "nodes": [ - { - "type": "SDFGState", - "label": "use_smem", - "id": 0, - "collapsed": false, - "scope_dict": { - "-1": [ - 0, - 1, - 2 - ] - }, - "nodes": [ - { - "type": "Tasklet", - "label": "assign_to_smem", - "attributes": { - "code": { - "string_data": "__out = __inp_A", - "language": "CPP" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 46, - "end_line": 46, - "start_column": 0, - "end_column": 0, - "filename": "/tmp/ipykernel_42346/245386100.py" - }, - "label": "assign_to_smem", - "out_connectors": { - "__out": null - }, - "guid": "a63d73a7-a907-4bd0-9ae4-4d3105f15a89" - }, - "id": 0, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "Tasklet", - "label": "addMult", - "attributes": { - "code": { - "string_data": "__out = __inp * __inp_scalar;", - "language": "CPP" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 54, - "end_line": 54, - "start_column": 0, - "end_column": 0, - "filename": "/tmp/ipykernel_42346/245386100.py" - }, - "label": "addMult", - "in_connectors": { - "__inp": null - }, - "out_connectors": { - "__out": null - }, - "guid": "e1208512-68d1-4567-bd8e-3c2579ca61a9" - }, - "id": 1, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "S", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 15, - "end_line": 15, - "start_column": 0, - "end_column": 0, - "filename": "/tmp/ipykernel_42346/245386100.py" - }, - "data": "S", - "guid": "24d7b12b-47d5-4e65-8248-bffd691c2d1b" - }, - "id": 2, - "scope_entry": null, - "scope_exit": null - } - ], - "edges": [ - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - 
"start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "data": "S", - "debuginfo": null, - "guid": "f7d31c47-4858-48de-8388-7fb8ab968d6d", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "0", - "dst": "2", - "dst_connector": null, - "src_connector": "__out" - } - ], - "attributes": { - "guid": "d125a509-4ec4-497c-ae79-b34c09433ff1" - } - } - ], - "edges": [], - "collapsed": false, - "label": "loop", - "id": 0, - "cfg_list_id": 2, - "start_block": null - } - ], - "edges": [], - "collapsed": false, - "label": "", - "id": null, - "cfg_list_id": 1, - "start_block": null - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 35, - "end_line": 35, - "start_column": 0, - "end_column": 0, - "filename": "/tmp/ipykernel_42346/245386100.py" - }, - "label": "nested_sdfg", - "in_connectors": { - "__inp_A": null, - "__inp_scalar": null - }, - "out_connectors": { - "__out": null - }, - "guid": "9e71c20d-8658-4641-b85d-51880b2a5a5c" - }, - "id": 8, - "scope_entry": "6", - "scope_exit": "7" - } - ], - "edges": [ - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "128", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "127", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "d05b830d-33b1-4236-b9d9-a6cfeba66c74", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "127", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": null, - "num_accesses": "128" - } - } - }, - "src": "0", - "dst": "4", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "scalar", - "debuginfo": null, - "guid": "80d32777-6d6f-40e7-b1b8-7552559022d6", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": null, - "num_accesses": "1" - } - } - }, - "src": "2", - "dst": "4", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "128", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "127", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "85bc8b8a-66a7-48f9-84c3-8be68c9b8d40", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "127", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": null, - "num_accesses": "128" - } - } - }, - "src": "4", - "dst": "6", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "scalar", - "debuginfo": null, - "guid": "efa679e1-5e45-48b6-a849-3cea0746876e", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - 
"dst_subset": null, - "is_data_src": null, - "num_accesses": "1" - } - } - }, - "src": "4", - "dst": "6", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "128", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "127", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "76a3c00f-cbf8-4c04-ad75-947b01bacc8b", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "127", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": null, - "num_accesses": "128" - } - } - }, - "src": "7", - "dst": "5", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "128", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "127", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "f25b3395-879c-4afd-a757-523b740a2f73", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "127", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": null, - "num_accesses": "128" - } - } - }, - "src": "5", - "dst": "1", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "4", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "j", - "end": "j + 96", - "step": "32", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "5d8bea57-d75a-4cf5-adc6-2bfe07f8daa9", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "j", - "end": "j + 96", - "step": "32", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": null, - "num_accesses": "4" - } - } - }, - "src": "6", - "dst": "8", - "dst_connector": "__inp_A", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "scalar", - "debuginfo": null, - "guid": "b90cc3c4-73b5-4cb6-9941-50264e15babf", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": null, - "num_accesses": "1" - } - } - }, - "src": "6", - "dst": "8", - "dst_connector": "__inp_scalar", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "4", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "j", - "end": "j + 96", - "step": "32", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "4e3b5f0f-ef6d-43b8-9588-9666518af1b9", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "j", - "end": "j + 96", - "step": "32", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": null, - "num_accesses": "4" - } - } - }, - "src": "8", - "dst": "7", - "dst_connector": null, - "src_connector": "__out" - } - ], - "attributes": { - "guid": "ff16ec78-9e21-4834-8e40-572509836663" - } - } - ], - "edges": [], - "collapsed": false, - "label": "", - "id": null, - "cfg_list_id": 0, - "start_block": null, - "dace_version": "1.0.0" -} \ No newline 
at end of file From 30796904e46dfcffd4ec84534bf7c8568c6d1ab8 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Fri, 13 Jun 2025 15:53:42 +0200 Subject: [PATCH 24/94] const arrays utils alpha impl --- dace/sdfg/utils.py | 76 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index da9699d332..d0653ef33a 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -2109,3 +2109,79 @@ def set_nested_sdfg_parent_references(sdfg: SDFG): if isinstance(node, NestedSDFG): node.sdfg.parent_sdfg = sdfg set_nested_sdfg_parent_references(node.sdfg) + + +def get_used_data(cfg: ControlFlowRegion | SDFGState) -> Set[str]: + """ + Returns a set of all data names that are used in the given control flow region or state. + Data is considered used if there is an access node + + :param cfg: The control flow region or state to check. + :return: A set of used data names. + """ + used_data = set() + for state in cfg.all_states() if not isinstance(cfg, SDFGState) else [cfg]: + for node in state.nodes(): + if isinstance(node, nd.AccessNode): + used_data.add(node.data) + + cfgs_to_check = {cfg.nodes()} if not isinstance(cfg, SDFGState) else {} + + while cfgs_to_check: + node = cfgs_to_check.pop() + if not isinstance(node, SDFGState): + cfgs_to_check.add(node.nodes()) + + for out_edge in cfg.out_edges(node): + assert isinstance(out_edge, InterstateEdge) + edge = out_edge.data + interstate_used_data = edge.used_arrays(arrays=cfg.sdfg.arrays, union_lhs_symbols=True) + used_data.update(interstate_used_data) + + if isinstance(node, ConditionalBlock): + for branch_code in node.branches: + pass + + if isinstance(node, LoopRegion): + pass + + return used_data + +def get_constant_data(cfg: ControlFlowRegion | SDFGState) -> Set[str]: + """ + Returns a set of all constant data names in the given control flow region or state. + Data is considered constant if there is any incoming edge to an access node of the data. + Due to the semantics of SDFG, if a nested SDFG writes to the data container it needs to be + visible in the parent graph as well, so the function does not need to be recursive. + + :param cfg: The control flow region or state to check. + :return: A set of constant data names. 
+ """ + + data_written_to = set() + sdfg = cfg.sdfg + # Write accesses to scalars can happen through access nodes and interstate edges (assignments) + # Write accesses to arrays can only happen through access nodes + for state in cfg.all_states() if not isinstance(cfg, SDFGState) else [cfg]: + for node in state.nodes(): + if isinstance(node, nd.AccessNode): + if state.in_degree(node) > 0: + data_written_to.add(node.data) + + cfgs_to_check = {cfg.nodes()} if not isinstance(cfg, SDFGState) else {} + + while cfgs_to_check: + node = cfgs_to_check.pop() + if not isinstance(node, SDFGState): + cfgs_to_check.add(node.nodes()) + + for out_edge in cfg.out_edges(node): + assert isinstance(out_edge, InterstateEdge) + edge = out_edge.data + written_scalars = [arr_name for arr_name in edge.assignments if arr_name in sdfg.arrays] + if written_scalars: + data_written_to.update(written_scalars) + + all_accessed_data = set() + constants = all_accessed_data - data_written_to + return constants \ No newline at end of file From 54193dd48ff449b057a27709e3eac7fbfb5123e6 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Fri, 13 Jun 2025 21:27:28 +0200 Subject: [PATCH 25/94] filling the strategy provisionally, might need change in future --- .../copy_strategies.py | 196 ++++++++++++++---- 1 file changed, 155 insertions(+), 41 deletions(-) diff --git a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py index a1371547d5..4b59854fa8 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py @@ -4,8 +4,8 @@ from dace import symbolic from dace import Memlet, dtypes from dace.dtypes import StorageType -from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen, GPUStreamManager -from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import product +from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen, GPUStreamManager, KernelSpec +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import product, symbolic_to_cpp from dace.codegen.prettycode import CodeIOStream @@ -13,7 +13,7 @@ from dace.sdfg.nodes import Node from dace.sdfg.state import ControlFlowRegion, StateSubgraphView -from dace.codegen.targets.cpp import memlet_copy_to_absolute_strides +from dace.codegen.targets.cpp import memlet_copy_to_absolute_strides, unparse_cr # TODO: Review Documentation once done here. And also, take care of the other @@ -108,12 +108,14 @@ def get_copy_call_parameters(self) -> Tuple[str, str, str, str, str, str, any]: src_location = 'Device' if self.src_storage == dtypes.StorageType.GPU_Global else 'Host' dst_location = 'Device' if self.dst_storage == dtypes.StorageType.GPU_Global else 'Host' - # Use the destination data type - ctype = self.dst_node.desc(self.sdfg).ctype - - # NOTE: I implicitly assume it is the same dtype as of the src. - assert ctype == self.src_node.desc(self.sdfg).dtype.ctype, \ - "Source and destination data types must match for the memory copy." 
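# --------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): intended call
# pattern for the `get_used_data` / `get_constant_data` helpers added to
# dace/sdfg/utils.py above. Data is considered "constant" if it is accessed
# but never written, so it can later be emitted as `const` in generated code.
# Note that this alpha implementation still contains placeholders:
# `all_accessed_data` is never populated, and `cfgs_to_check = {cfg.nodes()}`
# would need to become `set(cfg.nodes())`, since a list is not hashable.
# The small program below is a hypothetical example, not taken from the
# repository.
import dace
from dace.sdfg import utils as sdutil

@dace.program
def read_only_example(A: dace.float64[32], B: dace.float64[32]):
    B[:] = A + 1.0  # A is only read, B is written

sdfg = read_only_example.to_sdfg()
used = sdutil.get_used_data(sdfg)            # expected: {"A", "B"} (+ transients)
constant = sdutil.get_constant_data(sdfg)    # intended result once completed: {"A"}
# --------------------------------------------------------------------------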
+ # Should be symmetric + ctype_src = self.src_node.desc(self.sdfg).dtype.ctype + ctype_dst = self.dst_node.desc(self.sdfg).dtype.ctype + ctype = ctype_dst + assert ctype_src == ctype_dst, ( + f"Source and destination data types must match for the memory copy: " + f"{ctype_src} != {ctype_dst}" + ) return self.backend, self.src_expr, self.dst_expr, src_location, dst_location, self.cudastream, ctype @@ -382,51 +384,163 @@ def _generate_nd_copy(self, copy_context: CopyContext) -> None: callsite_stream.write("}") -################ TODO, just here out of completeness for now ############# - +################ TODO, Might need to modified further ############# +# Below: Does collaborative copy class WithinGPUCopyStrategy(CopyStrategy): def applicable(self, copy_context: CopyContext) -> bool: + """ + Checks if the copy is eligible for a collaborative GPU-to-GPU copy. + Conditions: + 1. The copy is between GPU memory types (shared or global). + 2. The innermost non-sequential map is scheduled on GPU_Device. + """ from dace.sdfg import scope_contains_scope - from dace.transformation import helpers - - gpu_storage_types = [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared] - cond1 = copy_context.src_storage in gpu_storage_types and copy_context.dst_storage in gpu_storage_types + from dace.transformation import helpers + + # --- Condition 1: GPU to GPU memory transfer --- + gpu_storages = {dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared} + if not (copy_context.src_storage in gpu_storages and + copy_context.dst_storage in gpu_storages): + return False - state_id = copy_context.state_id - cfg = copy_context.cfg - src_node = copy_context.src_node - dst_node = copy_context.dst_node - - state_dfg = cfg.state(state_id) - sdict = state_dfg.scope_dict() - schedule_node = copy_context.src_node - if scope_contains_scope(sdict, src_node, dst_node): - schedule_node = dst_node - - state = state_dfg - while (schedule_node is None or not isinstance(schedule_node, nodes.MapEntry) - or schedule_node.map.schedule == dtypes.ScheduleType.Sequential): - ret = helpers.get_parent_map(state, schedule_node) - if ret is None: - schedule_node = None + # --- Condition 2: Inside a GPU_Device map scope --- + state = copy_context.state_dfg + scope_dict = state.scope_dict() + + # Determine which node (src or dst) is in the deeper scope + src, dst = copy_context.src_node, copy_context.dst_node + deeper_scope_node = dst if scope_contains_scope(scope_dict, src, dst) else src + + # Determine the schedule type of the innermost non-sequential map. + # If no such map exists, use the default schedule. 
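# --------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): the copies this
# strategy targets are memlets between GPU_Global and GPU_Shared containers
# that sit inside a GPU_Device map, e.g. staging a tile of a global array
# into shared memory before a GPU_ThreadBlock map consumes it. Following the
# `dace::{src}To{dst}{N}D` naming scheme and the
# `<ctype, block_dims, copy_size, synchronized>` template ordering used in
# generate_copy() below, a 1D global-to-shared copy of 32 doubles with a
# 32x1x1 thread block would be lowered to a runtime call along the lines of
#
#     dace::GlobalToShared1D<double, 32, 1, 1, 32, true>::Copy(src_ptr, 1, dst_ptr, 1);
#
# The exact template and argument list are dictated by the runtime copy
# headers and by KernelSpec.block_dims; the line above only shows the shape
# of the emitted code, not its precise signature.
# --------------------------------------------------------------------------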
+ current_node = deeper_scope_node + while (current_node is None or current_node.map.schedule == dtypes.ScheduleType.Sequential): + + parent = helpers.get_parent_map(state, current_node) + if parent is None: + current_node = None break - schedule_node, state = ret + current_node, state = parent - if schedule_node is None: - inner_schedule = dtypes.SCOPEDEFAULT_SCHEDULE[None] + if current_node is None: + schedule_type = dtypes.SCOPEDEFAULT_SCHEDULE[None] else: - inner_schedule = schedule_node.map.schedule + schedule_type = current_node.map.schedule - # Collaborative load - cond2 = inner_schedule == dtypes.ScheduleType.GPU_Device - - return cond1 and cond2 + return schedule_type == dtypes.ScheduleType.GPU_Device + def generate_copy(self, copy_context: CopyContext) -> None: - raise NotImplementedError(f'WithinGPUCopy not yet implemented in ExperimentalCUDACodeGen') + + from dace.frontend import operations + + + # Get required copy information + copy_shape, src_strides, dst_strides = copy_context.get_transfer_layout() + src_expr, dst_expr = copy_context.src_expr, copy_context.dst_expr + + sdfg = copy_context.sdfg + dtype = copy_context.src_node.desc(sdfg).dtype + ctype = dtype.ctype + + # Get copy function name (defined in runtime library) + num_dims = copy_context.num_dims + src_storage_name = self._get_storagename(copy_context.src_storage) + dst_storage_name = self._get_storagename(copy_context.dst_storage) + + function_name = f"dace::{src_storage_name}To{dst_storage_name}{num_dims}D" + + # Check for write-conflict resolution (WCR), it affects function call + accum = '' + custom_reduction = [] + _, _, _, _, memlet = copy_context.edge + wcr = memlet.wcr + + if wcr is not None: + reduction_type = operations.detect_reduction_type(wcr) + + if reduction_type != dtypes.ReductionType.Custom: + # Use predefined reduction + reduction_type_str = str(reduction_type).split('.')[-1] # e.g., "Sum" + reduction_template = f"<{reduction_type_str}>" + else: + custom_reduction = [unparse_cr(sdfg, wcr, dtype)] + reduction_template = "" + + accum = f"::template Accum{reduction_template}" + + # Dispatch to the correct backend copy template based on copy characteristics + + # get always used stuff + callsite_stream, cfg, state_id, src_node, dst_node = copy_context.get_write_context() + + # Retrieve kernel specs from the ExperimentalCUDACodegen instance (held in a dedicated class) + # Only there block_dims is stored, which is needed in this case + kernel_specifications: KernelSpec = copy_context.codegen._current_kernel_spec + block_dims = ', '.join(kernel_specifications.block_dims) + + # was called "is_async" previously. It determines whether a "__syncthreads()" is called at the + # end of the copy. 
In ExperimentalCUDACodegen, a pass is responsible to insert such sync barriers, + # so it is synchronized and we do not need "implicit" synchronization + synchronized = True + + if any(symbolic.issymbolic(s, copy_context.sdfg.constants) for s in copy_shape): + args_list = ( + [src_expr] + + src_strides + + [dst_expr] + + custom_reduction + + dst_strides + + copy_shape + ) + args = ", ".join(args_list) + callsite_stream.write(f"{function_name}Dynamic<{ctype}, {block_dims}, {synchronized}>{accum}({args});", + cfg, state_id, [src_node, dst_node]) + + + elif function_name == "dace::SharedToGlobal1D": + # special case: use a new template struct that provides functions for copy and reduction + copy_size = ', '.join(copy_shape) + accum = accum or '::Copy' + args_list = ( + [src_expr] + + src_strides + + [dst_expr] + + dst_strides + + custom_reduction + ) + args = ", ".join(args_list) + callsite_stream.write(f"{function_name}<{ctype}, {block_dims}, {copy_size}, {synchronized}>{accum}({args});", + cfg, state_id, [src_node, dst_node]) + + else: + copy_size = ', '.join(copy_shape) + accum = accum or '::Copy' + args_list = ( + [src_expr] + + src_strides + + [dst_expr] + + custom_reduction + ) + args = ", ".join(args_list) + callsite_stream.write(f"{function_name}<{ctype}, {block_dims}, {copy_size}, {dst_strides}, {synchronized}>{accum}({args});", + cfg, state_id, [src_node, dst_node]) + + + + + def _get_storagename(self, storage: dtypes.StorageType): + """ + Returns a string containing the name of the storage location. + + Example: dtypes.StorageType.GPU_Shared will return "Shared". + """ + storage_name = str(storage) + return storage_name[storage_name.rindex('_') + 1:] + class FallBackGPUCopyStrategy(CopyStrategy): From aa50223ed43d5831c5cf6c1c7a798778757795bc Mon Sep 17 00:00:00 2001 From: aydogdub Date: Sun, 15 Jun 2025 16:44:06 +0200 Subject: [PATCH 26/94] Extend synchronization pass and set scratch + configs. --- .../scalarMultiplication1.ipynb | 40 +- .../scalarMultiplication2.ipynb | 49 ++- .../scratch/smemPassAndCopy/simpleCopy.ipynb | 356 ++++++++++-------- dace/config_schema.yml | 1 - .../passes/shared_memory_synchronization.py | 56 ++- 5 files changed, 301 insertions(+), 201 deletions(-) diff --git a/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication1.ipynb b/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication1.ipynb index 8c38188786..c6a566ca36 100644 --- a/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication1.ipynb +++ b/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication1.ipynb @@ -64,15 +64,15 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" @@ -128,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "9abdaf19", "metadata": {}, "outputs": [ @@ -137,15 +137,15 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" @@ -154,7 +154,7 @@ "SDFG (scalarMultiplication2_smem)" ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -439,7 +439,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "6c8921a7", "metadata": {}, "outputs": [ @@ -448,15 +448,15 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" @@ -465,7 +465,7 @@ "SDFG (scalarMultiplication2_smem)" ] }, - "execution_count": 4, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -475,6 +475,14 @@ "DefaultSharedMemorySync().apply_pass(sdfg, None)\n", "sdfg" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "149b48c5", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication2.ipynb b/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication2.ipynb index 8a3f8154cb..c86a07a8ab 100644 --- a/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication2.ipynb +++ b/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication2.ipynb @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "2769e30c", "metadata": {}, "outputs": [ @@ -66,15 +66,15 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" @@ -116,7 +116,34 @@ "execution_count": 3, "id": "f0609dff", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (scalarMultiplication_smem)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def scalarMultiplication_smem():\n", " sdfg = dace.SDFG(\"scalarMultiplication_smem\")\n", @@ -184,7 +211,7 @@ "\n", "\n", "sdfg = scalarMultiplication_smem()\n", - "#sdfg\n" + "sdfg\n" ] }, { @@ -206,15 +233,15 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" diff --git a/berkay_workpace/scratch/smemPassAndCopy/simpleCopy.ipynb b/berkay_workpace/scratch/smemPassAndCopy/simpleCopy.ipynb index 18c3aa8836..90f60b0a69 100644 --- a/berkay_workpace/scratch/smemPassAndCopy/simpleCopy.ipynb +++ b/berkay_workpace/scratch/smemPassAndCopy/simpleCopy.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "e7f52766", "metadata": {}, "outputs": [ @@ -67,15 +67,15 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" @@ -119,15 +119,15 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" @@ -230,15 +230,15 @@ "text/html": [ "\n", "
\n", - "
\n", + "
\n", "
\n", "\n", "" @@ -358,12 +358,12 @@ "\n", "\n", "\n", - "DACE_EXPORTED int __dace_init_cuda(simpleCopy_smem_state_t *__state);\n", - "DACE_EXPORTED int __dace_exit_cuda(simpleCopy_smem_state_t *__state);\n", + "DACE_EXPORTED int __dace_init_experimental_cuda(simpleCopy_smem_state_t *__state);\n", + "DACE_EXPORTED int __dace_exit_experimental_cuda(simpleCopy_smem_state_t *__state);\n", "\n", "\n", "\n", - "int __dace_init_cuda(simpleCopy_smem_state_t *__state) {\n", + "int __dace_init_experimental_cuda(simpleCopy_smem_state_t *__state) {\n", " int count;\n", "\n", " // Check that we are able to run cuda code\n", @@ -386,14 +386,14 @@ "\n", " \n", "\n", - " __state->gpu_context = new dace::cuda::Context(1, 1);\n", + " __state->gpu_context = new dace::cuda::Context(0, 0);\n", "\n", " // Create cuda streams and events\n", - " for(int i = 0; i < 1; ++i) {\n", + " for(int i = 0; i < 0; ++i) {\n", " DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n", " __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n", " }\n", - " for(int i = 0; i < 1; ++i) {\n", + " for(int i = 0; i < 0; ++i) {\n", " DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n", " }\n", "\n", @@ -402,7 +402,7 @@ " return 0;\n", "}\n", "\n", - "int __dace_exit_cuda(simpleCopy_smem_state_t *__state) {\n", + "int __dace_exit_experimental_cuda(simpleCopy_smem_state_t *__state) {\n", " \n", "\n", " // Synchronize and check for CUDA errors\n", @@ -411,10 +411,10 @@ " __err = static_cast<int>(cudaDeviceSynchronize());\n", "\n", " // Destroy cuda streams and events\n", - " for(int i = 0; i < 1; ++i) {\n", + " for(int i = 0; i < 0; ++i) {\n", " DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n", " }\n", - " for(int i = 0; i < 1; ++i) {\n", + " for(int i = 0; i < 0; ++i) {\n", " DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n", " }\n", "\n", @@ -424,7 +424,7 @@ "\n", "DACE_EXPORTED bool __dace_gpu_set_stream(simpleCopy_smem_state_t *__state, int streamid, gpuStream_t stream)\n", "{\n", - " if (streamid < 0 || streamid >= 1)\n", + " if (streamid < 0 || streamid >= 0)\n", " return false;\n", "\n", " __state->gpu_context->streams[streamid] = stream;\n", @@ -434,58 +434,70 @@ "\n", "DACE_EXPORTED void __dace_gpu_set_all_streams(simpleCopy_smem_state_t *__state, gpuStream_t stream)\n", "{\n", - " for (int i = 0; i < 1; ++i)\n", + " for (int i = 0; i < 0; ++i)\n", " __state->gpu_context->streams[i] = stream;\n", "}\n", "\n", - "__global__ void __launch_bounds__(32) gpu_map_0_0_3(const dace::uint * __restrict__ A, dace::uint * __restrict__ B) {\n", + "__global__ void __launch_bounds__(32) gpu_map_0_0_3(dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", + "{\n", + " int i = (32 * blockIdx.x);\n", " {\n", - " int i = (32 * blockIdx.x);\n", + " __shared__ dace::uint S[32];\n", + " int j = threadIdx.x;\n", " {\n", - " {\n", - " __shared__ dace::uint S[32];\n", - " int j = threadIdx.x;\n", - " {\n", - " {\n", - " dace::uint __inp = A[j];\n", - " dace::uint __out;\n", - "\n", - " ///////////////////\n", - " __out = __inp;\n", - " ///////////////////\n", - "\n", - " S[j] = __out;\n", - " }\n", - " {\n", - "\n", - " ///////////////////\n", - " __syncthreads();\n", - " ///////////////////\n", - "\n", - " }\n", - " {\n", - " dace::uint __inp = S[j];\n", - " dace::uint __out;\n", - "\n", - " ///////////////////\n", - " 
__out = __inp;\n", - " ///////////////////\n", - "\n", - " B[j] = __out;\n", - " }\n", - " }\n", - " }\n", + " dace::uint __inp = A[j];\n", + " dace::uint __out;\n", + "\n", + " ///////////////////\n", + " __out = __inp;\n", + " ///////////////////\n", + "\n", + " S[j] = __out;\n", + " }\n", + " {\n", + "\n", + " ///////////////////\n", + " __syncthreads();\n", + " ///////////////////\n", + "\n", + " }\n", + " {\n", + "\n", + " ///////////////////\n", + " __syncthreads();\n", + " ///////////////////\n", + "\n", + " }\n", + " {\n", + "\n", + " ///////////////////\n", + " __syncthreads();\n", + " ///////////////////\n", + "\n", + " }\n", + " {\n", + " dace::uint __inp = S[j];\n", + " dace::uint __out;\n", + "\n", + " ///////////////////\n", + " __out = __inp;\n", + " ///////////////////\n", + "\n", + " B[j] = __out;\n", " }\n", " }\n", "}\n", "\n", "\n", - "DACE_EXPORTED void __dace_runkernel_gpu_map_0_0_3(simpleCopy_smem_state_t *__state, const dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n", - "void __dace_runkernel_gpu_map_0_0_3(simpleCopy_smem_state_t *__state, const dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", + "DACE_EXPORTED void __dace_runkernel_gpu_map_0_0_3(simpleCopy_smem_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n", + "void __dace_runkernel_gpu_map_0_0_3(simpleCopy_smem_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", "{\n", "\n", + "\n", " void *gpu_map_0_0_3_args[] = { (void *)&A, (void *)&B };\n", - " gpuError_t __err = cudaLaunchKernel((void*)gpu_map_0_0_3, dim3(1, 1, 1), dim3(32, 1, 1), gpu_map_0_0_3_args, 0, __state->gpu_context->streams[0]);\n", + " gpuError_t __err = cudaLaunchKernel( (void*)gpu_map_0_0_3, dim3(1, 1, 1), dim3(32, 1, 1), gpu_map_0_0_3_args, 0, nullptr\n", + " );\n", + "\n", " DACE_KERNEL_LAUNCH_CHECK(__err, "gpu_map_0_0_3", 1, 1, 1, 32, 1, 1);\n", "}\n", "\n" @@ -502,12 +514,12 @@ "\n", "\n", "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", "\n", "\n", "\n", - "\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ 
}\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n+nf}{count}\\PY{p}{;}\n", "\n", "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Check}\\PY{+w}{ }\\PY{n}{that}\\PY{+w}{ }\\PY{n}{we}\\PY{+w}{ }\\PY{k}{are}\\PY{+w}{ }\\PY{n}{able}\\PY{+w}{ }\\PY{k}{to}\\PY{+w}{ }\\PY{n}{run}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{code}\n", @@ -530,14 +542,14 @@ "\n", "\\PY{+w}{ }\n", "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{Context}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{Context}\\PY{p}{(}\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\\PY{p}{;}\n", "\n", "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Create}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{streams}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{n}{events}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaStreamNonBlocking}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Allow}\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{n}{externals}\\PY{+w}{ }\\PY{k}{to}\\PY{+w}{ }\\PY{k}{modify}\\PY{+w}{ }\\PY{n}{streams}\n", "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ 
}\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaEventDisableTiming}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", "\n", @@ -546,7 +558,7 @@ "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", "\\PY{err}{\\PYZcb{}}\n", "\n", - "\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", "\\PY{+w}{ }\n", "\n", "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Synchronize}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{k}{check}\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{n}{CUDA}\\PY{+w}{ }\\PY{n}{errors}\n", @@ -555,10 +567,10 @@ "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{n+nc}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{cudaDeviceSynchronize}\\PY{p}{(}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", "\n", "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Destroy}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{streams}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{n}{events}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", "\\PY{+w}{ 
}\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", "\n", @@ -568,7 +580,7 @@ "\n", "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{bool}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}stream}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{streamid}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", "\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{|}\\PY{+w}{ }\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZgt{}=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{|}\\PY{+w}{ }\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZgt{}=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{false}\\PY{p}{;}\n", "\n", "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{streamid}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", @@ -578,58 +590,70 @@ "\n", "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}all\\PYZus{}streams}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", "\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\n", "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", "\\PY{err}{\\PYZcb{}}\n", "\n", - "\\PY{n}{\\PYZus{}\\PYZus{}global\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}launch\\PYZus{}bounds\\PYZus{}\\PYZus{}}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n}{const}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\\PY{+w}{ 
}\\PY{err}{\\PYZob{}}\n", + "\\PY{n}{\\PYZus{}\\PYZus{}global\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}launch\\PYZus{}bounds\\PYZus{}\\PYZus{}}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", + "\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{blockIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{p}{;}\n", "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{blockIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}shared\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{S}\\PY{o}{[}\\PY{n}{32}\\PY{o}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{j}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{A}\\PY{o}{[}\\PY{n}{j}\\PY{o}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{S}\\PY{o}{[}\\PY{n}{j}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}syncthreads}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}shared\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{S}\\PY{o}{[}\\PY{n}{32}\\PY{o}{]}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{j}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{;}\n", - "\\PY{+w}{ 
}\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{A}\\PY{o}{[}\\PY{n}{j}\\PY{o}{]}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{S}\\PY{o}{[}\\PY{n}{j}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}syncthreads}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{S}\\PY{o}{[}\\PY{n}{j}\\PY{o}{]}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{B}\\PY{o}{[}\\PY{n}{j}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}syncthreads}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ 
}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}syncthreads}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{S}\\PY{o}{[}\\PY{n}{j}\\PY{o}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{B}\\PY{o}{[}\\PY{n}{j}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", "\\PY{err}{\\PYZcb{}}\n", "\n", "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{const}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{const}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ 
}\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", "\\PY{err}{\\PYZob{}}\n", "\n", + "\n", "\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3\\PYZus{}args}\\PY{err}{[}\\PY{err}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{B}\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{gpuError\\PYZus{}t}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaLaunchKernel}\\PY{p}{(}\\PY{p}{(}\\PY{n}{void}\\PY{o}{*}\\PY{p}{)}\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3\\PYZus{}args}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{0}\\PY{o}{]}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{gpuError\\PYZus{}t}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaLaunchKernel}\\PY{p}{(}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{o}{*}\\PY{p}{)}\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3\\PYZus{}args}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{nullptr}\n", + "\\PY{+w}{ }\\PY{p}{)}\\PY{p}{;}\n", + "\n", "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}KERNEL\\PYZus{}LAUNCH\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{,}\\PY{+w}{ }\\PY{l+s+ss}{\\PYZdq{}gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3\\PYZdq{}}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", "\\PY{err}{\\PYZcb{}}\n", "\\end{Verbatim}\n" @@ -646,12 +670,12 @@ "\n", "\n", "\n", - "DACE_EXPORTED int __dace_init_cuda(simpleCopy_smem_state_t *__state);\n", - "DACE_EXPORTED int 
__dace_exit_cuda(simpleCopy_smem_state_t *__state);\n", + "DACE_EXPORTED int __dace_init_experimental_cuda(simpleCopy_smem_state_t *__state);\n", + "DACE_EXPORTED int __dace_exit_experimental_cuda(simpleCopy_smem_state_t *__state);\n", "\n", "\n", "\n", - "int __dace_init_cuda(simpleCopy_smem_state_t *__state) {\n", + "int __dace_init_experimental_cuda(simpleCopy_smem_state_t *__state) {\n", " int count;\n", "\n", " // Check that we are able to run cuda code\n", @@ -674,14 +698,14 @@ "\n", " \n", "\n", - " __state->gpu_context = new dace::cuda::Context(1, 1);\n", + " __state->gpu_context = new dace::cuda::Context(0, 0);\n", "\n", " // Create cuda streams and events\n", - " for(int i = 0; i < 1; ++i) {\n", + " for(int i = 0; i < 0; ++i) {\n", " DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n", " __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n", " }\n", - " for(int i = 0; i < 1; ++i) {\n", + " for(int i = 0; i < 0; ++i) {\n", " DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n", " }\n", "\n", @@ -690,7 +714,7 @@ " return 0;\n", "}\n", "\n", - "int __dace_exit_cuda(simpleCopy_smem_state_t *__state) {\n", + "int __dace_exit_experimental_cuda(simpleCopy_smem_state_t *__state) {\n", " \n", "\n", " // Synchronize and check for CUDA errors\n", @@ -699,10 +723,10 @@ " __err = static_cast(cudaDeviceSynchronize());\n", "\n", " // Destroy cuda streams and events\n", - " for(int i = 0; i < 1; ++i) {\n", + " for(int i = 0; i < 0; ++i) {\n", " DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n", " }\n", - " for(int i = 0; i < 1; ++i) {\n", + " for(int i = 0; i < 0; ++i) {\n", " DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n", " }\n", "\n", @@ -712,7 +736,7 @@ "\n", "DACE_EXPORTED bool __dace_gpu_set_stream(simpleCopy_smem_state_t *__state, int streamid, gpuStream_t stream)\n", "{\n", - " if (streamid < 0 || streamid >= 1)\n", + " if (streamid < 0 || streamid >= 0)\n", " return false;\n", "\n", " __state->gpu_context->streams[streamid] = stream;\n", @@ -722,58 +746,70 @@ "\n", "DACE_EXPORTED void __dace_gpu_set_all_streams(simpleCopy_smem_state_t *__state, gpuStream_t stream)\n", "{\n", - " for (int i = 0; i < 1; ++i)\n", + " for (int i = 0; i < 0; ++i)\n", " __state->gpu_context->streams[i] = stream;\n", "}\n", "\n", - "__global__ void __launch_bounds__(32) gpu_map_0_0_3(const dace::uint * __restrict__ A, dace::uint * __restrict__ B) {\n", + "__global__ void __launch_bounds__(32) gpu_map_0_0_3(dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", + "{\n", + " int i = (32 * blockIdx.x);\n", " {\n", - " int i = (32 * blockIdx.x);\n", + " __shared__ dace::uint S[32];\n", + " int j = threadIdx.x;\n", + " {\n", + " dace::uint __inp = A[j];\n", + " dace::uint __out;\n", + "\n", + " ///////////////////\n", + " __out = __inp;\n", + " ///////////////////\n", + "\n", + " S[j] = __out;\n", + " }\n", + " {\n", + "\n", + " ///////////////////\n", + " __syncthreads();\n", + " ///////////////////\n", + "\n", + " }\n", + " {\n", + "\n", + " ///////////////////\n", + " __syncthreads();\n", + " ///////////////////\n", + "\n", + " }\n", + " {\n", + "\n", + " ///////////////////\n", + " __syncthreads();\n", + " ///////////////////\n", + "\n", + " }\n", " {\n", - " {\n", - " __shared__ dace::uint S[32];\n", - " int j = threadIdx.x;\n", - " {\n", - " {\n", - " dace::uint __inp = 
A[j];\n", - " dace::uint __out;\n", - "\n", - " ///////////////////\n", - " __out = __inp;\n", - " ///////////////////\n", - "\n", - " S[j] = __out;\n", - " }\n", - " {\n", - "\n", - " ///////////////////\n", - " __syncthreads();\n", - " ///////////////////\n", - "\n", - " }\n", - " {\n", - " dace::uint __inp = S[j];\n", - " dace::uint __out;\n", - "\n", - " ///////////////////\n", - " __out = __inp;\n", - " ///////////////////\n", - "\n", - " B[j] = __out;\n", - " }\n", - " }\n", - " }\n", + " dace::uint __inp = S[j];\n", + " dace::uint __out;\n", + "\n", + " ///////////////////\n", + " __out = __inp;\n", + " ///////////////////\n", + "\n", + " B[j] = __out;\n", " }\n", " }\n", "}\n", "\n", "\n", - "DACE_EXPORTED void __dace_runkernel_gpu_map_0_0_3(simpleCopy_smem_state_t *__state, const dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n", - "void __dace_runkernel_gpu_map_0_0_3(simpleCopy_smem_state_t *__state, const dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", + "DACE_EXPORTED void __dace_runkernel_gpu_map_0_0_3(simpleCopy_smem_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n", + "void __dace_runkernel_gpu_map_0_0_3(simpleCopy_smem_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", "{\n", "\n", + "\n", " void *gpu_map_0_0_3_args[] = { (void *)&A, (void *)&B };\n", - " gpuError_t __err = cudaLaunchKernel((void*)gpu_map_0_0_3, dim3(1, 1, 1), dim3(32, 1, 1), gpu_map_0_0_3_args, 0, __state->gpu_context->streams[0]);\n", + " gpuError_t __err = cudaLaunchKernel( (void*)gpu_map_0_0_3, dim3(1, 1, 1), dim3(32, 1, 1), gpu_map_0_0_3_args, 0, nullptr\n", + " );\n", + "\n", " DACE_KERNEL_LAUNCH_CHECK(__err, \"gpu_map_0_0_3\", 1, 1, 1, 32, 1, 1);\n", "}\n" ] diff --git a/dace/config_schema.yml b/dace/config_schema.yml index 0a8afa3b3c..70ed877dee 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -457,7 +457,6 @@ required: Yakup Koray Budanaz for Berkays master-thesis. enum: [legacy, experimental] default: experimental - gpu_index_type: type: str title: Thread/block/warp index data type diff --git a/dace/transformation/passes/shared_memory_synchronization.py b/dace/transformation/passes/shared_memory_synchronization.py index 5ea32894bb..0b50a0eba3 100644 --- a/dace/transformation/passes/shared_memory_synchronization.py +++ b/dace/transformation/passes/shared_memory_synchronization.py @@ -12,7 +12,7 @@ from dace.sdfg import nodes, InterstateEdge from dace.sdfg.graph import Edge -from dace.sdfg.state import LoopRegion, ControlFlowBlock +from dace.sdfg.state import LoopRegion, ConditionalBlock, ControlFlowBlock from dace.sdfg.nodes import AccessNode, Map, MapEntry, MapExit from dace.transformation.passes import analysis as ap @@ -73,22 +73,34 @@ def _process_sdfg(self, sdfg: SDFG, enclosing_scopes: list[Union[MapExit, LoopRe Args: sdfg: The SDFG to traverse - enclosing_scopes: Stack of enclosing execution scopes (maps, loops) of sdfg + enclosing_scopes: Stack of execution scopes (e.g., maps, loops) enclosing the SDFG as a whole. """ - for node in sdfg.nodes(): + for sdfg_elem in sdfg.nodes(): + self._process_sdfg_element(sdfg, sdfg_elem, enclosing_scopes) + + def _process_sdfg_element(self, sdfg: SDFG, element: any, enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: + """ + Identifies the type of the SDFG element and processes it using the corresponding handler. 
- if isinstance(node, LoopRegion): - self._process_loop_region(sdfg, node, enclosing_scopes) + Args: + sdfg: The current SDFG we are in (innermost if nested) + enclosing_scopes: Stack of enclosing execution scopes (maps, loops) wrapping the current SDFG + """ + if isinstance(element, LoopRegion): + self._process_loop_region(sdfg, element, enclosing_scopes) - elif isinstance(node, SDFGState): - self._process_state(sdfg, node, enclosing_scopes) + elif isinstance(element, SDFGState): + self._process_state(sdfg, element, enclosing_scopes) - else: - raise NotImplementedError( - f"{self.__class__.__name__}: Unsupported node type '{type(node).__name__}' " - f"encountered during SDFG traversal. Please extend the implementation to handle this case." - ) + elif isinstance(element, ConditionalBlock): + self._process_conditionalBlock(sdfg, element, enclosing_scopes) + + else: + raise NotImplementedError( + f"{self.__class__.__name__}: Unsupported node type '{type(element).__name__}' " + f"encountered during SDFG traversal. Please extend the implementation to handle this case." + ) def _process_loop_region(self, sdfg: SDFG, loop_region: LoopRegion, enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: @@ -177,6 +189,24 @@ def _process_state(self, sdfg: SDFG, state: SDFGState, self._processed_nsdfg.add(node) + def _process_conditionalBlock(self, sdfg: SDFG, cond_block: ConditionalBlock, + enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: + """ + Processes a ConditionalBlock by visiting each clause body and its elements. + + Args: + sdfg: The current SDFG context. + cond_block: The ConditionalBlock to process (e.g., if-elif-else structure). + enclosing_scopes: Stack of execution scopes (e.g., maps, loops) enclosing the SDFG as a whole. + """ + clause_bodies: list[ControlFlowBlock] = cond_block.nodes() + + for body in clause_bodies: + for sdfg_elem in body.nodes(): + self._process_sdfg_element(sdfg, sdfg_elem, enclosing_scopes) + + + def _is_shared_memory_access_node(self, sdfg: SDFG, node: nodes.Node) -> bool: """ Check if a node represents a GPU shared memory access. @@ -343,7 +373,7 @@ def _handle_shared_memory_post_synchronization(self, state: SDFGState, shared_me elif isinstance(innermost_sequential_scope, LoopRegion): # two options, see docstrings self._add_post_sync_tasklets_for_loop_region(innermost_sequential_scope) - # _add_post_sync_state_for_loop_region(innermost_sequential_scope) + # self._add_post_sync_state_for_loop_region(innermost_sequential_scope) From 558a815768a3de4dc1af27beef2d8f5dc84f4d81 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Sun, 15 Jun 2025 16:46:15 +0200 Subject: [PATCH 27/94] Fixed mistakes and integrated the synchronization pass to the codegen (now in preprocess). 
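A minimal way to exercise the pass end-to-end, for readers following along (editor's sketch: the small @dace.program and the shared-memory declaration via dace.define_local are hypothetical, while the import path and the apply_pass call match the notebooks and tests in this patch series):

    import dace
    from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync

    @dace.program
    def stage_through_shared(A: dace.float64[256] @ dace.dtypes.StorageType.GPU_Global):
        for i in dace.map[0:256:32] @ dace.dtypes.ScheduleType.GPU_Device:
            # staging buffer in shared memory (hypothetical declaration)
            S = dace.define_local([32], dace.float64, storage=dace.dtypes.StorageType.GPU_Shared)
            for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:
                S[j] = A[i + j]
                A[i + j] = S[j] + 1.0

    sdfg = stage_through_shared.to_sdfg()
    # Inserts __syncthreads() tasklets around the shared-memory uses
    DefaultSharedMemorySync().apply_pass(sdfg, None)
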
Also first tests for checking basic correctness of the pass --- .../smem_tests/default_smem_sync_pass_test.py | 344 ++++++++++++++++++ dace/codegen/targets/experimental_cuda.py | 35 +- .../copy_strategies.py | 21 +- 3 files changed, 384 insertions(+), 16 deletions(-) create mode 100644 berkay_workpace/tests/smem_tests/default_smem_sync_pass_test.py diff --git a/berkay_workpace/tests/smem_tests/default_smem_sync_pass_test.py b/berkay_workpace/tests/smem_tests/default_smem_sync_pass_test.py new file mode 100644 index 0000000000..0f11ff2764 --- /dev/null +++ b/berkay_workpace/tests/smem_tests/default_smem_sync_pass_test.py @@ -0,0 +1,344 @@ +import dace +import dace.sdfg.nodes as nodes +from dace.sdfg.state import LoopRegion +from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync + +import pytest + + +""" +Simple tests checking core functionality of the "DefaultSharedMemorySync" pass. +""" + + +@pytest.mark.gpu +def test_scalar_multiplic(): + """ + Constructs an SDFG that performs scalar multiplication on a vector. + + In this test, a sequential loop is placed inside the GPU kernel, reusing shared memory. + As a result, the 'DefaultSharedMemorySync' pass should insert a "__syncthreads();" + at the end of each iteration to ensure correctness. + + Note: This test is designed to evaluate where the 'DefaultSharedMemorySync' pass places + synchronization tasklets. In this particular example, the inserted synchronizations are + not strictly necessary and could be avoided with more advanced analysis, which is beyond + the scope of this pass. + """ + + #----------------- Build test program/SDFG-------------------- + + # Create SDFG and state + sdfg = dace.SDFG("scalarMultiplication_smem") + state = sdfg.add_state("main") + + # Add arrays + sdfg.add_array("A", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global) + sdfg.add_scalar("scalar", dace.uint32) + sdfg.add_array("S", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True, lifetime=dace.dtypes.AllocationLifetime.Scope) + + # Add access nodes + a_acc = state.add_read("A") + a_store = state.add_write("A") + scalar_acc = state.add_access("scalar") + s_acc= state.add_access("S") + + # Sequential map (outermost) + seq_map_entry, seq_map_exit = state.add_map( + "seq_map", + dict(k="0:4"), + schedule=dace.dtypes.ScheduleType.Sequential, + ) + + # GPU Device map + gpu_map_entry, gpu_map_exit = state.add_map( + "gpu_map", + dict(i="0:32:32"), + schedule=dace.dtypes.ScheduleType.GPU_Device, + ) + + # GPU TB map + tb_map_entry, tb_map_exit = state.add_map( + "tb", + dict(j="0:32"), + schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock, + ) + + # Add tasklets for A -> S -> B + tasklet1 = state.add_tasklet( + "addMult", + inputs={"__inp_A", "__inp_scalar"}, + outputs={"__out"}, + code="__out = __inp_A * __inp_scalar;", + language=dace.dtypes.Language.CPP + ) + + tasklet2 = state.add_tasklet( + "store_to_global", + inputs={"__inp"}, + outputs={"__out"}, + code="__out = __inp;", + language=dace.dtypes.Language.CPP + ) + + # Edges + + # A and scalar to first map + state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet("A[0:128]")) + state.add_edge(scalar_acc, None, gpu_map_entry, None, dace.Memlet("scalar[0]")) + + # Add both down to last map, the threadblock map + state.add_edge(gpu_map_entry, None, seq_map_entry, None, dace.Memlet("A[0:128]")) + state.add_edge(gpu_map_entry, None, seq_map_entry, None, dace.Memlet("scalar[0]")) + state.add_edge(seq_map_entry, None, tb_map_entry, 
None, dace.Memlet("A[32 * k: 32 * (k+1)]")) + state.add_edge(seq_map_entry, None, tb_map_entry, None, dace.Memlet("scalar[0]")) + + # connect to tasklets + state.add_edge(tb_map_entry, None, tasklet1, "__inp_A", dace.Memlet("A[j + 32* k]")) + state.add_edge(tb_map_entry, None, tasklet1, "__inp_scalar", dace.Memlet("scalar[0]")) + state.add_edge(tasklet1, "__out", s_acc, None, dace.Memlet("S[j]")) + state.add_edge(s_acc, None, tasklet2, "__inp", dace.Memlet("S[j]")) + + # connect to all map exit nodes and then back to A to store back + state.add_edge(tasklet2, "__out", tb_map_exit, None, dace.Memlet("A[j + 32* k]")) + state.add_edge(tb_map_exit, None, seq_map_exit, None, dace.Memlet("A[32 * k: 32 * (k+1)]")) + state.add_edge(seq_map_exit, None, gpu_map_exit, None, dace.Memlet("A[0:128]")) + state.add_edge(gpu_map_exit, None, a_store, None, dace.Memlet("A[0:128]")) + + sdfg.fill_scope_connectors() + + + #----------------- Apply pass -------------------- + + DefaultSharedMemorySync().apply_pass(sdfg, None) + + + #----------------- Check correct insertion of sync tasklets -------------------- + + # s_acc has a sync tasklet successor + found = None + for succ in state.successors(s_acc): + if (hasattr(succ, "_label") and succ._label == "pre_sync_barrier" and + isinstance(succ, nodes.Tasklet) and "__syncthreads();" in succ.code.code): + found = succ + break + + assert found is not None, "There should be a synchronization tasklet after the shared memory access" + + # smem is reused in seq map, so we need synchronization after each iteration + found = None + for pred in state.predecessors(seq_map_exit): + if (hasattr(pred, "_label") and pred._label == "post_sync_barrier" and + isinstance(pred, nodes.Tasklet) and "__syncthreads();" in pred.code.code): + found = pred + break + + assert found is not None, "There should be a synchronization tasklet after each iteration of the sequential map" + +@pytest.mark.gpu +def test_scalar_multiplic_special(): + """ + Constructs an SDFG that performs scalar multiplication on a vector. + + Similar to 'test_scalar_multiplic()', but now, since the sequential map + only iterates once, there is no post synchronization required and should be + omitted (although having it would not lead to wrong computations). 
+ + """ + + #----------------- Build test program/SDFG-------------------- + + # Create SDFG and state + sdfg = dace.SDFG("scalarMultiplication_smem") + state = sdfg.add_state("main") + + # Add arrays + sdfg.add_array("A", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global) + sdfg.add_scalar("scalar", dace.uint32) + sdfg.add_array("S", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True, lifetime=dace.dtypes.AllocationLifetime.Scope) + + # Add access nodes + a_acc = state.add_read("A") + a_store = state.add_write("A") + scalar_acc = state.add_access("scalar") + s_acc= state.add_access("S") + + # Sequential map (outermost) + seq_map_entry, seq_map_exit = state.add_map( + "seq_map", + dict(k="0:1"), + schedule=dace.dtypes.ScheduleType.Sequential, + ) + + # GPU Device map + gpu_map_entry, gpu_map_exit = state.add_map( + "gpu_map", + dict(i="0:32:32"), + schedule=dace.dtypes.ScheduleType.GPU_Device, + ) + + # GPU TB map + tb_map_entry, tb_map_exit = state.add_map( + "tb", + dict(j="0:32"), + schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock, + ) + + # Add tasklets for A -> S -> B + tasklet1 = state.add_tasklet( + "addMult", + inputs={"__inp_A", "__inp_scalar"}, + outputs={"__out"}, + code="__out = __inp_A * __inp_scalar;", + language=dace.dtypes.Language.CPP + ) + + tasklet2 = state.add_tasklet( + "store_to_global", + inputs={"__inp"}, + outputs={"__out"}, + code="__out = __inp;", + language=dace.dtypes.Language.CPP + ) + + # Edges + + # A and scalar to first map + state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet("A[0:32]")) + state.add_edge(scalar_acc, None, gpu_map_entry, None, dace.Memlet("scalar[0]")) + + # Add both down to last map, the threadblock map + state.add_edge(gpu_map_entry, None, seq_map_entry, None, dace.Memlet("A[0:32]")) + state.add_edge(gpu_map_entry, None, seq_map_entry, None, dace.Memlet("scalar[0]")) + state.add_edge(seq_map_entry, None, tb_map_entry, None, dace.Memlet("A[32 * k: 32 * (k+1)]")) + state.add_edge(seq_map_entry, None, tb_map_entry, None, dace.Memlet("scalar[0]")) + + # connect to tasklets + state.add_edge(tb_map_entry, None, tasklet1, "__inp_A", dace.Memlet("A[j + 32* k]")) + state.add_edge(tb_map_entry, None, tasklet1, "__inp_scalar", dace.Memlet("scalar[0]")) + state.add_edge(tasklet1, "__out", s_acc, None, dace.Memlet("S[j]")) + state.add_edge(s_acc, None, tasklet2, "__inp", dace.Memlet("S[j]")) + + # connect to all map exit nodes and then back to A to store back + state.add_edge(tasklet2, "__out", tb_map_exit, None, dace.Memlet("A[j + 32* k]")) + state.add_edge(tb_map_exit, None, seq_map_exit, None, dace.Memlet("A[32 * k: 32 * (k+1)]")) + state.add_edge(seq_map_exit, None, gpu_map_exit, None, dace.Memlet("A[0:32]")) + state.add_edge(gpu_map_exit, None, a_store, None, dace.Memlet("A[0:32]")) + + sdfg.fill_scope_connectors() + + + #----------------- Apply pass -------------------- + + DefaultSharedMemorySync().apply_pass(sdfg, None) + + + #----------------- Check correct insertion of sync tasklets -------------------- + + # s_acc has a sync tasklet successor + found = None + for succ in state.successors(s_acc): + if (hasattr(succ, "_label") and succ._label == "pre_sync_barrier" and + isinstance(succ, nodes.Tasklet) and "__syncthreads();" in succ.code.code): + found = succ + break + + assert found is not None, "There should be a synchronization tasklet after the shared memory access" + + # smem is NOT reused in seq map + found = None + for pred in state.predecessors(seq_map_exit): + if (hasattr(pred, 
"_label") and pred._label == "post_sync_barrier" and + isinstance(pred, nodes.Tasklet) and "__syncthreads();" in pred.code.code): + found = pred + break + + assert found is None, "The DefaultSharedMemorySync pass should not have inserted at the end of the sequential map body" + +@pytest.mark.gpu +def test_scalar_multiplic_loopRegion(): + """ + Constructs an SDFG that performs scalar multiplication on a vector. + + Analogous to 'test_scalar_multiplic()', where a for loop instead of a sequential map + is used. + """ + + #----------------- Build test program/SDFG-------------------- + + sdfg = dace.SDFG("scalarMultiplication_smem") + state = sdfg.add_state("main") + + # Arrays and access nodes + sdfg.add_array("A", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global) + sdfg.add_scalar("scalar", dace.uint32) + a_acc = state.add_read("A") + a_store = state.add_write("A") + scalar_acc = state.add_access("scalar") + + # Device and thread-block maps + gpu_map_entry, gpu_map_exit = state.add_map( + "gpu_map", dict(i="0:32:32"), schedule=dace.dtypes.ScheduleType.GPU_Device + ) + tb_map_entry, tb_map_exit = state.add_map( + "tb", dict(j="0:32"), schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock + ) + + # Nested SDFG setup + inner_sdfg = dace.SDFG('nested_sdfg') + nested = state.add_nested_sdfg(inner_sdfg, sdfg, inputs={'__inp_A', '__inp_scalar'}, outputs={'tmp_ret'}) + + loopreg = LoopRegion("loop", "k < 4", "k", "k = 0", "k = (k + 1)", False, inner_sdfg) + inner_sdfg.add_node(loopreg) + inner_state = loopreg.add_state("use_smem") + + # Shared memory and result + inner_sdfg.add_array("S", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True) + inner_sdfg.add_scalar("tmp_ret", dace.uint32) + s_acc = inner_state.add_access("S") + ret = inner_state.add_write("tmp_ret") + + # Tasklets + tasklet1 = inner_state.add_tasklet( + "assign_to_smem", inputs={}, outputs={"__out1"}, + code="__out1 = __inp_A[j + 32 * k]", + language=dace.dtypes.Language.CPP + ) + tasklet2 = inner_state.add_tasklet( + "addMult", inputs={"__inp2"}, outputs={"__out2"}, + code="__out2 = __inp2 * __inp_scalar;", + language=dace.dtypes.Language.CPP + ) + + # Main SDFG edges + state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet("A[0:128]")) + state.add_edge(scalar_acc, None, gpu_map_entry, None, dace.Memlet("scalar[0]")) + state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet("A[0:128]")) + state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet("scalar[0]")) + state.add_edge(tb_map_entry, None, nested, "__inp_A", dace.Memlet("A[j : j + 97 : 32]")) + state.add_edge(tb_map_entry, None, nested, "__inp_scalar", dace.Memlet("scalar[0]")) + state.add_edge(nested, "tmp_ret", tb_map_exit, None, dace.Memlet("A[j : j + 97 : 32]")) + state.add_edge(tb_map_exit, None, gpu_map_exit, None, dace.Memlet("A[0:128]")) + state.add_edge(gpu_map_exit, None, a_store, None, dace.Memlet("A[0:128]")) + + # Inner SDFG edges + inner_state.add_edge(tasklet1, "__out1", s_acc, None, dace.Memlet("S[j]")) + inner_state.add_edge(s_acc, None, tasklet2, "__inp2", dace.Memlet("S[j]")) + inner_state.add_edge(tasklet2, "__out2", ret, None, dace.Memlet("S[j]")) + + sdfg.fill_scope_connectors() + + #----------------- Apply pass -------------------- + + DefaultSharedMemorySync().apply_pass(sdfg, None) + + + #----------------- Check correct insertion of sync tasklets -------------------- + + try: + # there should be only one successor of the ret accessNode, which is a sync tasklet + post_sync_tasklet = 
inner_state.successors(ret)[0] + assert "__syncthreads();" in post_sync_tasklet.code.code, "Post synchronization tasklet is not correctly inserted" + except: + # Any other weird failures + assert False, "Post synchronization tasklet is not correctly inserted" \ No newline at end of file diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 81a8a84f13..9a54f12d7b 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -22,7 +22,6 @@ from dace.codegen.dispatcher import DefinedType, TargetDispatcher from dace.codegen.prettycode import CodeIOStream from dace.codegen.common import update_persistent_desc -from dace.codegen.targets import cpp from dace.codegen.targets.cpp import ( codeblock_to_cpp, memlet_copy_to_absolute_strides, @@ -33,6 +32,7 @@ # DaCe transformation imports from dace.transformation.passes import analysis as ap from dace.transformation.passes.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync # Experimental CUDA helper imports from dace.codegen.targets.experimental_cuda_helpers.gpu_stream_manager import GPUStreamManager @@ -47,13 +47,17 @@ # TODO's easy: # 1. Handle memory pools release # 2. Handle sync properties -# 3. Warning/Error that GPU_deive must be used before other GPU schedules -# 4. Emit sync -# 5. compute_release() +# 3. Emit sync # TODO's harder: -# 2. Include constant expressions +# 1. Include constant expressions +# Question: Getting "const" expressions leads to some issues. +# So it looks like, that I need to do make this visible to lower +# generation as well. + + +# extended todo: get const, like in a general way without a hack in a scope @registry.autoregister_params(name='experimental_cuda') @@ -207,6 +211,10 @@ def preprocess(self, sdfg: SDFG) -> None: # Initialize runtime GPU stream manager self._gpu_stream_manager = GPUStreamManager(sdfg, assigned_streams, gpu_stream_access_template) + #----------------- Shared Memory Synchronization related Logic ----------------- + + DefaultSharedMemorySync().apply_pass(sdfg, None) + #------------------------- Memory Pool related Logic -------------------------- # Find points where memory should be released to the memory pool @@ -409,7 +417,7 @@ def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope function_stream.write('DACE_EXPORTED void __dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(kernel_wrapper_args_typed)), cfg, state_id, scope_entry) - # Calling he function which launches the kernel (C++ code) + # Calling the function which launches the kernel (C++ code) callsite_stream.write( '__dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(kernel_wrapper_args)), cfg, state_id, scope_entry) @@ -1171,6 +1179,7 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro kernel_entry_node = dfg_scope.source_nodes()[0] + kernel_exit_node = dfg_scope.sink_nodes()[0] state: SDFGState = cfg.state(state_id) self._kernel_entry_node: nodes.MapEntry = kernel_entry_node @@ -1187,6 +1196,20 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro arglist = state.scope_subgraph(node).arglist(defined_syms, shared_transients) break self._args: Dict = arglist + + """ + # const args + input_params = set(e.data.data for e in state.in_edges(kernel_entry_node)) + output_params = set(e.data.data for e in state.out_edges(kernel_exit_node)) + toplevel_params = 
set(node.data for node in dfg_scope.nodes() + if isinstance(node, nodes.AccessNode) and sdfg.arrays[node.data].toplevel) + dynamic_inputs = set(e.data.data for e in dace.sdfg.dynamic_map_inputs(state, kernel_entry_node)) + + const_args = input_params - (output_params | toplevel_params | dynamic_inputs) + self._args_typed: list[str] = [('const ' if aname in const_args else '') + adata.as_arg(name=aname) for aname, adata in self._args.items()] + """ + + # args typed correctly and as input self._args_typed: list[str] = [adata.as_arg(name=aname) for aname, adata in self._args.items()] self._args_as_input: list[str] = [ptr(aname, adata, sdfg, cudaCodeGen._frame) for aname, adata in self._args.items()] diff --git a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py index 4b59854fa8..92212112f6 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py @@ -417,7 +417,8 @@ def applicable(self, copy_context: CopyContext) -> bool: # Determine the schedule type of the innermost non-sequential map. # If no such map exists, use the default schedule. current_node = deeper_scope_node - while (current_node is None or current_node.map.schedule == dtypes.ScheduleType.Sequential): + while (current_node is None or not isinstance(current_node, nodes.MapEntry) or + current_node.map.schedule == dtypes.ScheduleType.Sequential): parent = helpers.get_parent_map(state, current_node) if parent is None: @@ -480,12 +481,12 @@ def generate_copy(self, copy_context: CopyContext) -> None: # Retrieve kernel specs from the ExperimentalCUDACodegen instance (held in a dedicated class) # Only there block_dims is stored, which is needed in this case kernel_specifications: KernelSpec = copy_context.codegen._current_kernel_spec - block_dims = ', '.join(kernel_specifications.block_dims) + block_dims = ', '.join(symbolic_to_cpp(kernel_specifications.block_dims)) # was called "is_async" previously. It determines whether a "__syncthreads()" is called at the # end of the copy. 
In ExperimentalCUDACodegen, a pass is responsible to insert such sync barriers, # so it is synchronized and we do not need "implicit" synchronization - synchronized = True + synchronized = "false" if any(symbolic.issymbolic(s, copy_context.sdfg.constants) for s in copy_shape): args_list = ( @@ -496,14 +497,14 @@ def generate_copy(self, copy_context: CopyContext) -> None: + dst_strides + copy_shape ) - args = ", ".join(args_list) + args = ", ".join(symbolic_to_cpp(args_list)) callsite_stream.write(f"{function_name}Dynamic<{ctype}, {block_dims}, {synchronized}>{accum}({args});", cfg, state_id, [src_node, dst_node]) elif function_name == "dace::SharedToGlobal1D": # special case: use a new template struct that provides functions for copy and reduction - copy_size = ', '.join(copy_shape) + copy_size = ', '.join(symbolic_to_cpp(copy_shape)) accum = accum or '::Copy' args_list = ( [src_expr] @@ -512,21 +513,21 @@ def generate_copy(self, copy_context: CopyContext) -> None: + dst_strides + custom_reduction ) - args = ", ".join(args_list) + args = ", ".join(symbolic_to_cpp(args_list)) callsite_stream.write(f"{function_name}<{ctype}, {block_dims}, {copy_size}, {synchronized}>{accum}({args});", cfg, state_id, [src_node, dst_node]) else: - copy_size = ', '.join(copy_shape) - accum = accum or '::Copy' + copy_size = ', '.join(symbolic_to_cpp(copy_shape)) args_list = ( [src_expr] + src_strides + [dst_expr] + custom_reduction ) - args = ", ".join(args_list) - callsite_stream.write(f"{function_name}<{ctype}, {block_dims}, {copy_size}, {dst_strides}, {synchronized}>{accum}({args});", + args = ", ".join(symbolic_to_cpp(args_list)) + dst_strides_unpacked = ", ".join(symbolic_to_cpp(dst_strides)) + callsite_stream.write(f"{function_name}<{ctype}, {block_dims}, {copy_size}, {dst_strides_unpacked}, {synchronized}>{accum}({args});", cfg, state_id, [src_node, dst_node]) From a545ce81b181054f199cb192637835c99bf1ecb9 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 16 Jun 2025 16:05:00 +0200 Subject: [PATCH 28/94] fix collaborative synchronization- not requred anymore --- .../targets/experimental_cuda_helpers/copy_strategies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py index 92212112f6..9b740d547d 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py @@ -486,7 +486,7 @@ def generate_copy(self, copy_context: CopyContext) -> None: # was called "is_async" previously. It determines whether a "__syncthreads()" is called at the # end of the copy. 
In ExperimentalCUDACodegen, a pass is responsible to insert such sync barriers, # so it is synchronized and we do not need "implicit" synchronization - synchronized = "false" + synchronized = "true" if any(symbolic.issymbolic(s, copy_context.sdfg.constants) for s in copy_shape): args_list = ( From b0c5e07deb110410edb020af0978c90a1d7a6274 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 16 Jun 2025 16:06:54 +0200 Subject: [PATCH 29/94] Yakups examples, bad example of issues of legacy codegen and testing two of yakups examples --- .../thesis_related/const_check_fails.ipynb | 179 + .../2d_blocktiled_gemm_with_custom_copy.sdfg | 4165 +++++++++++++++++ .../yakups_examples/generate_sdfgs.ipynb | 656 +++ .../nice_global_to_shared_copy.sdfg | 1278 +++++ .../weird_global_to_global.sdfg | 1404 ++++++ .../weird_shared_to_shared_copy.sdfg | 896 ++++ berkay_workpace/tests/smem_tests/gemm_test.py | 32 + .../smem_tests/special_sync_pass_test.py | 37 + 8 files changed, 8647 insertions(+) create mode 100644 berkay_workpace/scratch/thesis_related/const_check_fails.ipynb create mode 100644 berkay_workpace/scratch/yakups_examples/2d_blocktiled_gemm_with_custom_copy.sdfg create mode 100644 berkay_workpace/scratch/yakups_examples/generate_sdfgs.ipynb create mode 100644 berkay_workpace/scratch/yakups_examples/nice_global_to_shared_copy.sdfg create mode 100644 berkay_workpace/scratch/yakups_examples/weird_global_to_global.sdfg create mode 100644 berkay_workpace/scratch/yakups_examples/weird_shared_to_shared_copy.sdfg create mode 100644 berkay_workpace/tests/smem_tests/gemm_test.py create mode 100644 berkay_workpace/tests/smem_tests/special_sync_pass_test.py diff --git a/berkay_workpace/scratch/thesis_related/const_check_fails.ipynb b/berkay_workpace/scratch/thesis_related/const_check_fails.ipynb new file mode 100644 index 0000000000..80250c2662 --- /dev/null +++ b/berkay_workpace/scratch/thesis_related/const_check_fails.ipynb @@ -0,0 +1,179 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2362eee8", + "metadata": {}, + "source": [ + "# Const Check fail\n", + "\n", + "Here is a sdfg that leads to compilation error using the legacy CUDACodeGen. Below is the sdfg." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "f77627b8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (kernel)" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import dace\n", + "\n", + "sdfg = dace.SDFG.from_file(\"../yakups_examples/weird_global_to_global.sdfg\")\n", + "sdfg" + ] + }, + { + "cell_type": "markdown", + "id": "ae442fd0", + "metadata": {}, + "source": [ + "Ensure you are using the legay CUDACodegen" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce960a1f", + "metadata": {}, + "outputs": [], + "source": [ + "from dace.config import Config\n", + "\n", + "assert Config.get('compiler', 'cuda', 'implementation') == 'legacy', \"Select legacy CUDACodgen in config.yml & restart jupyter notebook\"" + ] + }, + { + "cell_type": "markdown", + "id": "d8e04ba1", + "metadata": {}, + "source": [ + "Now let's try to compile the sdfg and observe how it fails:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "177184e4", + "metadata": {}, + "outputs": [ + { + "ename": "CompilationError", + "evalue": "Compiler failure:\n[ 16%] \u001b[32mBuilding CXX object CMakeFiles/kernel.dir/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cpu/kernel.cpp.o\u001b[0m\nIn file included from /home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/dace.h:14,\n from /home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cpu/kernel.cpp:2:\n/home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/types.h: In constructor ‘dace::half::half(float)’:\n/home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/types.h:101:28: warning: dereferencing type-punned pointer will break strict-aliasing rules [-Wstrict-aliasing]\n 101 | uint32_t x = *((uint32_t*)&f);\n | ~^~~~~~~~~~~~~~\n[ 33%] \u001b[32mBuilding CUDA object CMakeFiles/kernel.dir/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu.o\u001b[0m\n/home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/../../../external/moodycamel/concurrentqueue.h(3599): warning #68-D: integer conversion resulted in a change of sign\nRemark: The warnings can be suppressed with \"-diag-suppress \"\n\n/home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/../../../external/moodycamel/concurrentqueue.h(3607): warning #68-D: integer conversion resulted in a change of sign\n\n/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu(97): error: no instance of function template \"dace::GlobalToGlobal1D\" matches the argument list\n argument types are: (const double *, int, const double *__restrict__)\n\n/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu(99): error: no instance of function template \"dace::GlobalToGlobal1D\" matches the argument list\n argument types are: (const double *, int, const double *__restrict__)\n\n2 errors detected in the compilation of \"/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu\".\ngmake[2]: *** [CMakeFiles/kernel.dir/build.make:94: CMakeFiles/kernel.dir/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu.o] Error 2\ngmake[1]: *** [CMakeFiles/Makefile2:90: CMakeFiles/kernel.dir/all] Error 2\ngmake: *** [Makefile:91: all] Error 2\n", + "output_type": "error", + 
"traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mCalledProcessError\u001b[39m Traceback (most recent call last)", + "\u001b[36mFile \u001b[39m\u001b[32m~/master-thesis/dace/dace/codegen/compiler.py:245\u001b[39m, in \u001b[36mconfigure_and_compile\u001b[39m\u001b[34m(program_folder, program_name, output_stream)\u001b[39m\n\u001b[32m 244\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m245\u001b[39m \u001b[43m_run_liveoutput\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcmake --build . --config \u001b[39;49m\u001b[38;5;132;43;01m%s\u001b[39;49;00m\u001b[33;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[43m%\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mConfig\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mcompiler\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mbuild_type\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 246\u001b[39m \u001b[43m \u001b[49m\u001b[43mshell\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 247\u001b[39m \u001b[43m \u001b[49m\u001b[43mcwd\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbuild_folder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 248\u001b[39m \u001b[43m \u001b[49m\u001b[43moutput_stream\u001b[49m\u001b[43m=\u001b[49m\u001b[43moutput_stream\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 249\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m subprocess.CalledProcessError \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[32m 250\u001b[39m \u001b[38;5;66;03m# If unsuccessful, print results\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/master-thesis/dace/dace/codegen/compiler.py:429\u001b[39m, in \u001b[36m_run_liveoutput\u001b[39m\u001b[34m(command, output_stream, **kwargs)\u001b[39m\n\u001b[32m 428\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m process.returncode != \u001b[32m0\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m429\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m subprocess.CalledProcessError(process.returncode, command, output.getvalue())\n", + "\u001b[31mCalledProcessError\u001b[39m: Command 'cmake --build . 
--config RelWithDebInfo' returned non-zero exit status 2.", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[31mCompilationError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43msdfg\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcompile\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/master-thesis/dace/dace/sdfg/sdfg.py:2396\u001b[39m, in \u001b[36mSDFG.compile\u001b[39m\u001b[34m(self, output_file, validate, return_program_handle)\u001b[39m\n\u001b[32m 2393\u001b[39m sdfg = \u001b[38;5;28mself\u001b[39m\n\u001b[32m 2395\u001b[39m \u001b[38;5;66;03m# Compile the code and get the shared library path\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m2396\u001b[39m shared_library = \u001b[43mcompiler\u001b[49m\u001b[43m.\u001b[49m\u001b[43mconfigure_and_compile\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprogram_folder\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msdfg\u001b[49m\u001b[43m.\u001b[49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2398\u001b[39m \u001b[38;5;66;03m# If provided, save output to path or filename\u001b[39;00m\n\u001b[32m 2399\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m output_file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/master-thesis/dace/dace/codegen/compiler.py:254\u001b[39m, in \u001b[36mconfigure_and_compile\u001b[39m\u001b[34m(program_folder, program_name, output_stream)\u001b[39m\n\u001b[32m 252\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m cgx.CompilationError(\u001b[33m'\u001b[39m\u001b[33mCompiler failure\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m 253\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m254\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m cgx.CompilationError(\u001b[33m'\u001b[39m\u001b[33mCompiler failure:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m'\u001b[39m + ex.output)\n\u001b[32m 256\u001b[39m shared_library_path = os.path.join(build_folder, \u001b[33m\"\u001b[39m\u001b[33mlib\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[33m.\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[33m\"\u001b[39m.format(program_name,\n\u001b[32m 257\u001b[39m Config.get(\u001b[33m'\u001b[39m\u001b[33mcompiler\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mlibrary_extension\u001b[39m\u001b[33m'\u001b[39m)))\n\u001b[32m 259\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m shared_library_path\n", + "\u001b[31mCompilationError\u001b[39m: Compiler failure:\n[ 16%] \u001b[32mBuilding CXX object CMakeFiles/kernel.dir/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cpu/kernel.cpp.o\u001b[0m\nIn file included from /home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/dace.h:14,\n from /home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cpu/kernel.cpp:2:\n/home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/types.h: In constructor ‘dace::half::half(float)’:\n/home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/types.h:101:28: warning: dereferencing type-punned pointer will break strict-aliasing rules [-Wstrict-aliasing]\n 101 | uint32_t x = *((uint32_t*)&f);\n | ~^~~~~~~~~~~~~~\n[ 33%] \u001b[32mBuilding CUDA object 
CMakeFiles/kernel.dir/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu.o\u001b[0m\n/home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/../../../external/moodycamel/concurrentqueue.h(3599): warning #68-D: integer conversion resulted in a change of sign\nRemark: The warnings can be suppressed with \"-diag-suppress \"\n\n/home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/../../../external/moodycamel/concurrentqueue.h(3607): warning #68-D: integer conversion resulted in a change of sign\n\n/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu(97): error: no instance of function template \"dace::GlobalToGlobal1D\" matches the argument list\n argument types are: (const double *, int, const double *__restrict__)\n\n/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu(99): error: no instance of function template \"dace::GlobalToGlobal1D\" matches the argument list\n argument types are: (const double *, int, const double *__restrict__)\n\n2 errors detected in the compilation of \"/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu\".\ngmake[2]: *** [CMakeFiles/kernel.dir/build.make:94: CMakeFiles/kernel.dir/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu.o] Error 2\ngmake[1]: *** [CMakeFiles/Makefile2:90: CMakeFiles/kernel.dir/all] Error 2\ngmake: *** [Makefile:91: all] Error 2\n" + ] + } + ], + "source": [ + "sdfg.compile()" + ] + }, + { + "cell_type": "markdown", + "id": "95ba7c6b", + "metadata": {}, + "source": [ + "### ❓ Why does this fail?\n", + "\n", + "The error is a **compilation failure** when calling the function template `dace::GlobalToGlobal1D`, due to a mismatch in argument types.\n", + "\n", + "The function expects:\n", + "\n", + "```cpp\n", + "(const T* src, int src_xstride, T* dst)\n", + "```\n", + "\n", + "But in the following example:\n", + "\n", + "```cpp\n", + "dace::GlobalToGlobal1D(A + i, 1, A);\n", + "```\n", + "\n", + "both the **source** (`A + i`) and **destination** (`A`) point to the **same array**, just at different locations.\n", + "\n", + "---\n", + "\n", + "### 🧠 Why is this a problem?\n", + "\n", + "1. **Wrong `const` deduction** \n", + " The old code generator mistakenly marks `A` as `const`, even though it is used as the **destination**.\n", + "\n", + "2. **Missing overloads** \n", + " The template should allow for cases where the destination is not `const`. 
Overloading should be used to handle this properly — even if it is probably not \n", + " strictly an error, it is probably good practice\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dace_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/berkay_workpace/scratch/yakups_examples/2d_blocktiled_gemm_with_custom_copy.sdfg b/berkay_workpace/scratch/yakups_examples/2d_blocktiled_gemm_with_custom_copy.sdfg new file mode 100644 index 0000000000..c49fc87dfa --- /dev/null +++ b/berkay_workpace/scratch/yakups_examples/2d_blocktiled_gemm_with_custom_copy.sdfg @@ -0,0 +1,4165 @@ +{ + "type": "SDFG", + "attributes": { + "name": "dace_naive_matmul", + "arg_names": [ + "A", + "B", + "C" + ], + "_arrays": { + "A": { + "type": "Array", + "attributes": { + "strides": [ + "K", + "1" + ], + "total_size": "K*M", + "offset": [ + "0", + "0" + ], + "optional": false, + "dtype": "float32", + "shape": [ + "M", + "K" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "B": { + "type": "Array", + "attributes": { + "strides": [ + "N", + "1" + ], + "total_size": "K*N", + "offset": [ + "0", + "0" + ], + "optional": false, + "dtype": "float32", + "shape": [ + "K", + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "C": { + "type": "Array", + "attributes": { + "strides": [ + "N", + "1" + ], + "total_size": "M*N", + "offset": [ + "0", + "0" + ], + "optional": false, + "dtype": "float32", + "shape": [ + "M", + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "__tmp3": { + "type": "Scalar", + "attributes": { + "dtype": "float32", + "shape": [ + "1" + ], + "transient": true, + "storage": "Register", + "debuginfo": null + } + }, + "__tmp4": { + "type": "Scalar", + "attributes": { + "dtype": "float32", + "shape": [ + "1" + ], + "transient": true, + "storage": "Register", + "debuginfo": null + } + }, + "tmp": { + "type": "Array", + "attributes": { + "strides": [ + "8", + "1" + ], + "total_size": "64", + "offset": [ + "0", + "0" + ], + "alignment": 16, + "optional": false, + "dtype": "float32", + "shape": [ + "8", + "8" + ], + "transient": true, + "storage": "Register", + "debuginfo": null + } + }, + "shrB": { + "type": "Array", + "attributes": { + "strides": [ + "128", + "1" + ], + "total_size": "2048", + "offset": [ + "0", + "0" + ], + "optional": false, + "dtype": "float32", + "shape": [ + "16", + "128" + ], + "transient": true, + "storage": "GPU_Shared", + "debuginfo": null + } + }, + "shrA": { + "type": "Array", + "attributes": { + "strides": [ + "16", + "1" + ], + "total_size": "2048", + "offset": [ + "0", + "0" + ], + "optional": false, + "dtype": "float32", + "shape": [ + "128", + "16" + ], + "transient": true, + "storage": "GPU_Shared", + "debuginfo": null + } + } + }, + "symbols": { + "K": "int32", + "M": "int32", + "N": "int32" + }, + "global_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "init_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "exit_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "orig_sdfg": { + "type": "SDFG", + "attributes": { + "name": "dace_naive_matmul", + "arg_names": [ + "A", + "B", + "C" + ], + "_arrays": { + "A": { + "type": "Array", + 
"attributes": { + "strides": [ + "K", + "1" + ], + "total_size": "K*M", + "offset": [ + "0", + "0" + ], + "optional": false, + "dtype": "float32", + "shape": [ + "M", + "K" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "B": { + "type": "Array", + "attributes": { + "strides": [ + "N", + "1" + ], + "total_size": "K*N", + "offset": [ + "0", + "0" + ], + "optional": false, + "dtype": "float32", + "shape": [ + "K", + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "C": { + "type": "Array", + "attributes": { + "strides": [ + "N", + "1" + ], + "total_size": "M*N", + "offset": [ + "0", + "0" + ], + "optional": false, + "dtype": "float32", + "shape": [ + "M", + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "tmp": { + "type": "Scalar", + "attributes": { + "dtype": "float32", + "shape": [ + "1" + ], + "transient": true, + "storage": "Register", + "debuginfo": null + } + }, + "__tmp3": { + "type": "Scalar", + "attributes": { + "dtype": "float32", + "shape": [ + "1" + ], + "transient": true, + "debuginfo": null + } + }, + "__tmp4": { + "type": "Scalar", + "attributes": { + "dtype": "float32", + "shape": [ + "1" + ], + "transient": true, + "debuginfo": null + } + } + }, + "symbols": { + "K": "int32", + "M": "int32", + "N": "int32" + }, + "global_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "init_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "exit_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 254, + "end_line": 262, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "guid": "b58195d5-3a0f-46d0-ac04-c263d4e3772b" + }, + "nodes": [ + { + "type": "SDFGState", + "label": "MapState", + "id": 0, + "collapsed": false, + "scope_dict": { + "-1": [ + 0, + 2, + 3, + 4 + ], + "0": [ + 1, + 5, + 6, + 8, + 9 + ], + "6": [ + 7, + 10, + 11, + 12, + 13, + 14 + ] + }, + "nodes": [ + { + "type": "MapEntry", + "label": "dace_naive_matmul_258[i=0:M, j=0:N]", + "attributes": { + "label": "dace_naive_matmul_258", + "params": [ + "i", + "j" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "M - 1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "schedule": "GPU_Device", + "debuginfo": { + "type": "DebugInfo", + "start_line": 258, + "end_line": 258, + "start_column": 2, + "end_column": 2, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "in_connectors": { + "IN_A": null, + "IN_B": null + }, + "out_connectors": { + "OUT_A": null, + "OUT_B": null + }, + "guid": "3befdc17-85e0-4f77-8d69-28f32bfd3669" + }, + "id": 0, + "scope_entry": null, + "scope_exit": "1" + }, + { + "type": "MapExit", + "label": "dace_naive_matmul_258[i=0:M, j=0:N]", + "attributes": { + "in_connectors": { + "IN_C": null + }, + "out_connectors": { + "OUT_C": null + }, + "guid": "0778c935-cb9e-40f0-b04a-8c24d0f3fd9f" + }, + "id": 1, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "AccessNode", + "label": "A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 258, + "end_line": 258, + "start_column": 2, + "end_column": 2, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "data": "A", + "guid": "c63b7b81-d6b7-4d02-bb1a-08b16d50d082" + }, + "id": 2, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + 
"label": "B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 258, + "end_line": 258, + "start_column": 2, + "end_column": 2, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "data": "B", + "guid": "0775dbf8-a80f-41bd-94df-85333c04942c" + }, + "id": 3, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "C", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 258, + "end_line": 258, + "start_column": 2, + "end_column": 2, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "data": "C", + "guid": "fd617edc-81aa-4499-bbe1-3f1183629c0f" + }, + "id": 4, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "tmp", + "attributes": { + "setzero": true, + "debuginfo": { + "type": "DebugInfo", + "start_line": 259, + "end_line": 259, + "start_column": 23, + "end_column": 23, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "data": "tmp", + "guid": "156c4b4e-6d95-4233-8a56-ae9a92225d9f" + }, + "id": 5, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "MapEntry", + "label": "dace_naive_matmul_258_2_260[k=0:K]", + "attributes": { + "label": "dace_naive_matmul_258_2_260", + "params": [ + "k" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + } + ] + }, + "schedule": "Sequential", + "debuginfo": { + "type": "DebugInfo", + "start_line": 260, + "end_line": 260, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "in_connectors": { + "IN___tmp1": null, + "IN___tmp_261_18_r_in_from_2_0": null, + "IN___tmp_261_28_r_in_from_2_0": null + }, + "out_connectors": { + "OUT___tmp1": null, + "OUT___tmp_261_18_r_in_from_2_0": null, + "OUT___tmp_261_28_r_in_from_2_0": null + }, + "guid": "dbd98cec-d976-42db-a740-b8c99d25fd8d" + }, + "id": 6, + "scope_entry": "0", + "scope_exit": "7" + }, + { + "type": "MapExit", + "label": "dace_naive_matmul_258_2_260[k=0:K]", + "attributes": { + "in_connectors": { + "IN___tmp1": null + }, + "out_connectors": { + "OUT___tmp1": null + }, + "guid": "fcc83bb0-47e1-4eef-a4a8-3ce4d9596be2" + }, + "id": 7, + "scope_entry": "6", + "scope_exit": "7" + }, + { + "type": "AccessNode", + "label": "tmp", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 260, + "end_line": 260, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "data": "tmp", + "guid": "7d953f40-4e3d-430f-9398-f394902d4cab" + }, + "id": 8, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "Tasklet", + "label": "assign_262_4", + "attributes": { + "code": { + "string_data": "__out = __inp", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 262, + "end_line": 262, + "start_column": 9, + "end_column": 9, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "label": "assign_262_4", + "in_connectors": { + "__inp": null + }, + "out_connectors": { + "__out": null + }, + "guid": "f55f6be2-ce3f-4b4d-b1e2-f349ded7943e" + }, + "id": 9, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "Tasklet", + "label": "_Mult_", + "attributes": { + "code": { + "string_data": "__out = (__in1 * __in2)", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 261, + "end_line": 261, + "start_column": 33, + "end_column": 33, + "filename": 
"/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "label": "_Mult_", + "in_connectors": { + "__in1": null, + "__in2": null + }, + "out_connectors": { + "__out": null + }, + "guid": "73aecb99-070f-4e61-a316-3b2a7b40b929" + }, + "id": 10, + "scope_entry": "6", + "scope_exit": "7" + }, + { + "type": "AccessNode", + "label": "__tmp3", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 261, + "end_line": 261, + "start_column": 33, + "end_column": 33, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "data": "__tmp3", + "guid": "9a838b65-220e-482a-83bb-e0d554444eb4" + }, + "id": 11, + "scope_entry": "6", + "scope_exit": "7" + }, + { + "type": "Tasklet", + "label": "_Add_", + "attributes": { + "code": { + "string_data": "__out = (__in1 + __in2)", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 261, + "end_line": 261, + "start_column": 33, + "end_column": 33, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "label": "_Add_", + "in_connectors": { + "__in1": null, + "__in2": null + }, + "out_connectors": { + "__out": null + }, + "guid": "ae8acca8-acfa-42de-a0e2-dc45be188f59" + }, + "id": 12, + "scope_entry": "6", + "scope_exit": "7" + }, + { + "type": "AccessNode", + "label": "__tmp4", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 261, + "end_line": 261, + "start_column": 33, + "end_column": 33, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "data": "__tmp4", + "guid": "c45f593f-64ec-4deb-9a7a-06c9028a359e" + }, + "id": 13, + "scope_entry": "6", + "scope_exit": "7" + }, + { + "type": "Tasklet", + "label": "assign_261_6", + "attributes": { + "code": { + "string_data": "__out = __inp", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 261, + "end_line": 261, + "start_column": 33, + "end_column": 33, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "label": "assign_261_6", + "in_connectors": { + "__inp": null + }, + "out_connectors": { + "__out": null + }, + "guid": "ee02b28c-b419-4cfe-87a0-5cb6f00e7728" + }, + "id": 14, + "scope_entry": "6", + "scope_exit": "7" + } + ], + "edges": [ + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "955a2c4f-712d-4f48-ba92-25f0630625d8", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "0", + "dst": "5", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "K*M*N", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "M - 1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "440297ed-13f9-441c-8590-54de034648b1", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "M - 1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "K*M*N" + } + } + }, + "src": "2", + "dst": "0", + "dst_connector": "IN_A", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "K*M*N", + "subset": { + "type": "Range", + 
"ranges": [ + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "b5a04c2f-9139-4e91-b3fa-05e31fb685c1", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "K*M*N" + } + } + }, + "src": "3", + "dst": "0", + "dst_connector": "IN_B", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "K", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "tmp", + "debuginfo": null, + "guid": "b475ec44-1676-44fa-a2a0-475d09fd1f58", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "K" + } + } + }, + "src": "5", + "dst": "6", + "dst_connector": "IN___tmp1", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp3", + "debuginfo": null, + "guid": "4d74242e-cc00-4c30-aa79-af5208c734bc", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "11", + "dst": "12", + "dst_connector": "__in2", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "tmp", + "debuginfo": null, + "guid": "9294be17-8966-48f4-af5b-b94952ae2ba6", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "8", + "dst": "9", + "dst_connector": "__inp", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp4", + "debuginfo": null, + "guid": "879548dd-1ea6-4b6d-be90-636186f7af63", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "13", + "dst": "14", + "dst_connector": "__inp", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "K", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "c358fd7e-f83f-499e-baf6-634a74e0eb68", + "src_subset": { + 
"type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "K" + } + } + }, + "src": "0", + "dst": "6", + "dst_connector": "IN___tmp_261_18_r_in_from_2_0", + "src_connector": "OUT_A" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "K", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "a01e117a-7338-40c0-a93d-98a99b5ba416", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "K" + } + } + }, + "src": "0", + "dst": "6", + "dst_connector": "IN___tmp_261_28_r_in_from_2_0", + "src_connector": "OUT_B" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "M*N", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "M - 1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "8d793faf-6c88-41ef-b15f-73d6cc4769ab", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "M - 1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "M*N" + } + } + }, + "src": "1", + "dst": "4", + "dst_connector": null, + "src_connector": "OUT_C" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "K", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "tmp", + "debuginfo": null, + "guid": "a8a897dc-36b0-4558-92e6-337b9c4569eb", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "K" + } + } + }, + "src": "7", + "dst": "8", + "dst_connector": null, + "src_connector": "OUT___tmp1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "tmp", + "debuginfo": null, + "guid": "b4b91eb5-6f11-4107-8eaa-f00684d09166", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "6", + "dst": "12", + "dst_connector": "__in1", + "src_connector": "OUT___tmp1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + }, + { + "start": "k", + "end": "k", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, 
+ "guid": "78e57126-1091-4d19-95ca-1a9baa8b2980", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + }, + { + "start": "k", + "end": "k", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "6", + "dst": "10", + "dst_connector": "__in1", + "src_connector": "OUT___tmp_261_18_r_in_from_2_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "k", + "end": "k", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "90d034c5-b25b-439e-aa00-5b36f40fa3c1", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "k", + "end": "k", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "6", + "dst": "10", + "dst_connector": "__in2", + "src_connector": "OUT___tmp_261_28_r_in_from_2_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp3", + "debuginfo": null, + "guid": "5c44d599-d384-49d0-9e03-023a46d05dd2", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "10", + "dst": "11", + "dst_connector": null, + "src_connector": "__out" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp4", + "debuginfo": null, + "guid": "7d8d1da4-4f02-48b8-980a-33a71b57c26c", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "12", + "dst": "13", + "dst_connector": null, + "src_connector": "__out" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "0fb8b2f9-f4eb-43c6-b28b-778145ad2cc5", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "9", + "dst": "1", + "dst_connector": "IN_C", + "src_connector": "__out" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "tmp", + "debuginfo": null, + "guid": 
"7590ecfd-7127-48af-a7e4-98906d48b08e", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "14", + "dst": "7", + "dst_connector": "IN___tmp1", + "src_connector": "__out" + } + ], + "attributes": { + "executions": "1", + "dynamic_executions": false, + "guid": "21d6aa0d-ccfb-4036-ab01-8654ab4417ed" + } + } + ], + "edges": [], + "collapsed": false, + "label": "", + "id": null, + "cfg_list_id": 0, + "start_block": 0, + "dace_version": "1.0.0" + }, + "transformation_hist": [ + { + "type": "PatternTransformation", + "transformation": "AddThreadBlockMap", + "_subgraph": { + "0": 0 + } + }, + { + "type": "PatternTransformation", + "transformation": "MapTiling", + "prefix": "b", + "tile_sizes": [ + "8", + "32" + ], + "divides_evenly": true, + "tile_trivial": true, + "skew": true, + "_subgraph": { + "0": 0 + } + }, + { + "type": "PatternTransformation", + "transformation": "ChangeThreadBlockMap", + "dim_size_x": "16", + "dim_size_y": "16", + "_subgraph": { + "0": 15, + "1": 0 + } + }, + { + "type": "PatternTransformation", + "transformation": "ThreadCoarsening", + "tile_size_x": "8", + "tile_size_y": "8", + "_subgraph": { + "0": 0, + "1": 15 + } + }, + { + "type": "PatternTransformation", + "transformation": "MapTiling", + "prefix": "d", + "tile_sizes": [ + "8", + "8" + ], + "divides_evenly": true, + "tile_trivial": true, + "skew": true, + "_subgraph": { + "0": 0 + } + }, + { + "type": "PatternTransformation", + "transformation": "BlockTiling", + "_subgraph": { + "0": 17, + "1": 6 + } + }, + { + "type": "PatternTransformation", + "transformation": "ExplicitMemoryMove", + "_subgraph": { + "0": 12, + "1": 14, + "2": 16 + } + }, + { + "type": "ExpandTransformation", + "transformation": "Expansion", + "classpath": "dace.libraries.standard.nodes.code.Expansion", + "_subgraph": { + "0": 20 + } + }, + { + "type": "ExpandTransformation", + "transformation": "Expansion", + "classpath": "dace.libraries.standard.nodes.code.Expansion", + "_subgraph": { + "0": 21 + } + } + ], + "debuginfo": { + "type": "DebugInfo", + "start_line": 254, + "end_line": 262, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "guid": "b58195d5-3a0f-46d0-ac04-c263d4e3772b", + "hash": "cdf5cd16d9a29f92023e8d4b01e385997cc6f91086fcfba358c900713434638b" + }, + "nodes": [ + { + "type": "SDFGState", + "label": "MapState", + "id": 0, + "collapsed": false, + "scope_dict": { + "-1": [ + 2, + 3, + 4, + 12 + ], + "0": [ + 1, + 5 + ], + "5": [ + 6, + 7, + 8, + 9, + 10, + 11 + ], + "12": [ + 13, + 14 + ], + "14": [ + 15, + 16, + 18, + 19 + ], + "16": [ + 0, + 17, + 20, + 21, + 22, + 23 + ] + }, + "nodes": [ + { + "type": "MapEntry", + "label": "ThreadCoarsenedMap[i=0:8, j=0:8]", + "attributes": { + "label": "ThreadCoarsenedMap", + "params": [ + "i", + "j" + ], + "param_types": { + "i": "int32", + "j": "int32" + }, + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + } + ] + }, + "schedule": "Sequential", + "unroll": true, + "debuginfo": { + "type": "DebugInfo", + "start_line": 258, + "end_line": 258, + "start_column": 2, + "end_column": 2, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "in_connectors": { + "IN_A": { + "type": "pointer", + "dtype": "float32" + }, + "IN_B": { 
+ "type": "pointer", + "dtype": "float32" + }, + "IN___tmp1": { + "type": "pointer", + "dtype": "float32" + } + }, + "out_connectors": { + "OUT_A": { + "type": "pointer", + "dtype": "float32" + }, + "OUT_B": { + "type": "pointer", + "dtype": "float32" + }, + "OUT___tmp1": "float32" + }, + "guid": "3befdc17-85e0-4f77-8d69-28f32bfd3669" + }, + "id": 0, + "scope_entry": "16", + "scope_exit": "1" + }, + { + "type": "MapExit", + "label": "ThreadCoarsenedMap[i=0:8, j=0:8]", + "attributes": { + "in_connectors": { + "IN_C": "float32" + }, + "out_connectors": { + "OUT_C": { + "type": "pointer", + "dtype": "float32" + } + }, + "guid": "0778c935-cb9e-40f0-b04a-8c24d0f3fd9f" + }, + "id": 1, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "AccessNode", + "label": "A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 258, + "end_line": 258, + "start_column": 2, + "end_column": 2, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "data": "A", + "guid": "0ed9ab6a-02de-4951-9c25-86a43dc8edae" + }, + "id": 2, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 258, + "end_line": 258, + "start_column": 2, + "end_column": 2, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "data": "B", + "guid": "9a3d8a83-bfc2-4359-9eca-a0a42565bfd3" + }, + "id": 3, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "C", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 258, + "end_line": 258, + "start_column": 2, + "end_column": 2, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "data": "C", + "guid": "fec6b795-a139-4a14-9e63-44d333192daa" + }, + "id": 4, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "MapEntry", + "label": "InnerWorkMapNo0[tk=0:16]", + "attributes": { + "label": "InnerWorkMapNo0", + "params": [ + "tk" + ], + "param_types": { + "tk": "int32" + }, + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "15", + "step": "1", + "tile": "1" + } + ] + }, + "schedule": "Sequential", + "unroll": true, + "debuginfo": { + "type": "DebugInfo", + "start_line": 260, + "end_line": 260, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "in_connectors": { + "IN___tmp1": "float32", + "IN___tmp_261_18_r_in_from_2_0": { + "type": "pointer", + "dtype": "float32" + }, + "IN___tmp_261_28_r_in_from_2_0": { + "type": "pointer", + "dtype": "float32" + } + }, + "out_connectors": { + "OUT___tmp1": "float32", + "OUT___tmp_261_18_r_in_from_2_0": "float32", + "OUT___tmp_261_28_r_in_from_2_0": "float32" + }, + "guid": "dbd98cec-d976-42db-a740-b8c99d25fd8d" + }, + "id": 5, + "scope_entry": "0", + "scope_exit": "6" + }, + { + "type": "MapExit", + "label": "InnerWorkMapNo0[tk=0:16]", + "attributes": { + "in_connectors": { + "IN___tmp1": "float32" + }, + "out_connectors": { + "OUT___tmp1": "float32" + }, + "guid": "fcc83bb0-47e1-4eef-a4a8-3ce4d9596be2" + }, + "id": 6, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "Tasklet", + "label": "_Mult_", + "attributes": { + "code": { + "string_data": "__out = (__in1 * __in2)", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 261, + "end_line": 261, + "start_column": 33, + "end_column": 33, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "label": 
"_Mult_", + "in_connectors": { + "__in1": "float32", + "__in2": "float32" + }, + "out_connectors": { + "__out": "float32" + }, + "guid": "73aecb99-070f-4e61-a316-3b2a7b40b929" + }, + "id": 7, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "AccessNode", + "label": "__tmp3", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 261, + "end_line": 261, + "start_column": 33, + "end_column": 33, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "data": "__tmp3", + "guid": "3ab24690-9afa-4fdf-be28-532806a0d849" + }, + "id": 8, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "Tasklet", + "label": "_Add_", + "attributes": { + "code": { + "string_data": "__out = (__in1 + __in2)", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 261, + "end_line": 261, + "start_column": 33, + "end_column": 33, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "label": "_Add_", + "in_connectors": { + "__in1": "float32", + "__in2": "float32" + }, + "out_connectors": { + "__out": "float32" + }, + "guid": "ae8acca8-acfa-42de-a0e2-dc45be188f59" + }, + "id": 9, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "AccessNode", + "label": "__tmp4", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 261, + "end_line": 261, + "start_column": 33, + "end_column": 33, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "data": "__tmp4", + "guid": "6e664a3e-01db-4577-b456-90aa0c41eaaf" + }, + "id": 10, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "Tasklet", + "label": "assign_261_6", + "attributes": { + "code": { + "string_data": "__out = __inp", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 261, + "end_line": 261, + "start_column": 33, + "end_column": 33, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "label": "assign_261_6", + "in_connectors": { + "__inp": "float32" + }, + "out_connectors": { + "__out": "float32" + }, + "guid": "ee02b28c-b419-4cfe-87a0-5cb6f00e7728" + }, + "id": 11, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "MapEntry", + "label": "KernelEntryMap[b_i=0:M:128, b_j=0:N:128]", + "attributes": { + "label": "KernelEntryMap", + "params": [ + "b_i", + "b_j" + ], + "param_types": { + "b_i": "int32", + "b_j": "int32" + }, + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "M - 1", + "step": "128", + "tile": "1" + }, + { + "start": "0", + "end": "N - 1", + "step": "128", + "tile": "1" + } + ] + }, + "schedule": "GPU_Device", + "debuginfo": null, + "in_connectors": { + "IN_A": { + "type": "pointer", + "dtype": "float32" + }, + "IN_B": { + "type": "pointer", + "dtype": "float32" + } + }, + "out_connectors": { + "OUT_A": { + "type": "pointer", + "dtype": "float32" + }, + "OUT_B": { + "type": "pointer", + "dtype": "float32" + } + }, + "guid": "2e19f5de-2731-48f4-bfb6-f406686efc57" + }, + "id": 12, + "scope_entry": null, + "scope_exit": "13" + }, + { + "type": "MapExit", + "label": "KernelEntryMap[b_i=0:M:128, b_j=0:N:128]", + "attributes": { + "in_connectors": { + "IN_C": { + "type": "pointer", + "dtype": "float32" + } + }, + "out_connectors": { + "OUT_C": { + "type": "pointer", + "dtype": "float32" + } + }, + "guid": "977a6c95-f6b2-405d-81b9-9f47afd3a213" + }, + "id": 13, + "scope_entry": "12", + "scope_exit": "13" + }, + { + "type": "MapEntry", + "label": "ThreadBlockMap[d_i=0:128:8, d_j=0:128:8]", + "attributes": { + 
"label": "ThreadBlockMap", + "params": [ + "d_i", + "d_j" + ], + "param_types": { + "d_i": "int32", + "d_j": "int32" + }, + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "127", + "step": "8", + "tile": "1" + }, + { + "start": "0", + "end": "127", + "step": "8", + "tile": "1" + } + ] + }, + "schedule": "GPU_ThreadBlock", + "debuginfo": null, + "in_connectors": { + "IN_A": { + "type": "pointer", + "dtype": "float32" + }, + "IN_B": { + "type": "pointer", + "dtype": "float32" + } + }, + "out_connectors": { + "OUT_A": { + "type": "pointer", + "dtype": "float32" + }, + "OUT_B": { + "type": "pointer", + "dtype": "float32" + } + }, + "guid": "6afb15dc-ce53-46a0-9605-bd36e62f9a23" + }, + "id": 14, + "scope_entry": "12", + "scope_exit": "15" + }, + { + "type": "MapExit", + "label": "ThreadBlockMap[d_i=0:128:8, d_j=0:128:8]", + "attributes": { + "in_connectors": { + "IN_C": { + "type": "pointer", + "dtype": "float32" + } + }, + "out_connectors": { + "OUT_C": { + "type": "pointer", + "dtype": "float32" + } + }, + "guid": "47fc83c9-18d4-44fb-a814-4f91e6347a1f" + }, + "id": 15, + "scope_entry": "14", + "scope_exit": "15" + }, + { + "type": "MapEntry", + "label": "OuterWorkMapNo0[k=0:K:16]", + "attributes": { + "label": "OuterWorkMapNo0", + "params": [ + "k" + ], + "param_types": { + "k": "int32" + }, + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "K - 1", + "step": "16", + "tile": "1" + } + ] + }, + "schedule": "Sequential", + "debuginfo": null, + "in_connectors": { + "IN_A": { + "type": "pointer", + "dtype": "float32" + }, + "IN_B": { + "type": "pointer", + "dtype": "float32" + }, + "IN___tmp1": { + "type": "pointer", + "dtype": "float32" + } + }, + "out_connectors": { + "OUT_A": { + "type": "pointer", + "dtype": "float32" + }, + "OUT_B": { + "type": "pointer", + "dtype": "float32" + }, + "OUT___tmp1": { + "type": "pointer", + "dtype": "float32" + } + }, + "guid": "58cc03e7-cb61-484b-a116-9e5198e3686d" + }, + "id": 16, + "scope_entry": "14", + "scope_exit": "17" + }, + { + "type": "MapExit", + "label": "OuterWorkMapNo0[k=0:K:16]", + "attributes": { + "in_connectors": { + "IN_C": { + "type": "pointer", + "dtype": "float32" + } + }, + "out_connectors": { + "OUT_C": { + "type": "pointer", + "dtype": "float32" + } + }, + "guid": "d6ccdaf9-3fb9-4801-96b4-796bb619219d" + }, + "id": 17, + "scope_entry": "16", + "scope_exit": "17" + }, + { + "type": "AccessNode", + "label": "tmp", + "attributes": { + "setzero": true, + "debuginfo": { + "type": "DebugInfo", + "start_line": 259, + "end_line": 259, + "start_column": 23, + "end_column": 23, + "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" + }, + "data": "tmp", + "guid": "2cbe0906-ffb6-480f-8918-5d403b2c2349" + }, + "id": 18, + "scope_entry": "14", + "scope_exit": "15" + }, + { + "type": "AccessNode", + "label": "tmp", + "attributes": { + "debuginfo": null, + "data": "tmp", + "guid": "bc41207b-4ce4-439f-9cdc-d375ba9a0580" + }, + "id": 19, + "scope_entry": "14", + "scope_exit": "15" + }, + { + "type": "AccessNode", + "label": "shrB", + "attributes": { + "debuginfo": null, + "data": "shrB", + "guid": "943f436f-6a65-4cad-a365-4f858ffd5dfa" + }, + "id": 20, + "scope_entry": "16", + "scope_exit": "17" + }, + { + "type": "AccessNode", + "label": "shrA", + "attributes": { + "debuginfo": null, + "data": "shrA", + "guid": "21a4393c-7e02-4b7b-b62c-68e902b3bf0e" + }, + "id": 21, + "scope_entry": "16", + "scope_exit": "17" + }, + { + "type": "Tasklet", + "label": "custom_code", + "attributes": { + "code": 
{ + "string_data": "// B[K,N]\n// shrB[16,128]\n// Inner Loop Condition: k <= K - 16 && b_j <= N - 128\nconst int tid = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z;\n// Num Threads: 256, Line Length (max): 128\nif (k <= K - 16 && b_j <= N - 128) {\n// load multiple lines at a time 2\nconst int line_offset = tid % 128;\nconst int line_num = tid / 128;\n#pragma unroll\nfor (int i0 = 0; i0 < 16; i0 += 2) {\nshrB[line_num*128 + line_offset + ((i0) * 128)] = B[(N*(k))+(1*(b_j)) + line_num*N + line_offset + ((i0) * N)];\n}\n} else { \n// load multiple lines at a time 2\nconst int effective_line_len = Min(N - b_j, 128);\nconst int line_offset = tid % effective_line_len;\nconst int line_num = tid / effective_line_len;\nconst int effectivenum_threads = 2 * effective_line_len;\nif (tid < effectivenum_threads){\n#pragma unroll\nfor (int i0 = 0; i0 < Min(K - k, 16); i0 += 2) {\nif(line_offset < effective_line_len && line_num + i0 < Min(K - k, 16)){\nshrB[line_num*128 + line_offset + ((i0) * 128)] = B[(N*(k))+(1*(b_j)) + line_num*N + line_offset + ((i0) * N)];\n}\n}\n}\n}\n", + "language": "CPP" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 0, + "end_line": 0, + "start_column": 0, + "end_column": 0, + "filename": null + }, + "label": "custom_code", + "in_connectors": { + "IN_B": { + "type": "pointer", + "dtype": "float32" + } + }, + "out_connectors": { + "OUT_B": { + "type": "pointer", + "dtype": "float32" + } + }, + "guid": "ee6d7afd-583f-4121-9dcf-7ae5f6f60080" + }, + "id": 22, + "scope_entry": "16", + "scope_exit": "17" + }, + { + "type": "Tasklet", + "label": "custom_code", + "attributes": { + "code": { + "string_data": "// A[M,K]\n// shrA[128,16]\n// Inner Loop Condition: b_i <= M - 128 && k <= K - 16\nconst int tid = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z;\n// Num Threads: 256, Line Length (max): 16\nif (b_i <= M - 128 && k <= K - 16) {\n// load multiple lines at a time 16\nconst int line_offset = tid % 16;\nconst int line_num = tid / 16;\n#pragma unroll\nfor (int i0 = 0; i0 < 128; i0 += 16) {\nshrA[line_num*16 + line_offset + ((i0) * 16)] = A[(K*(b_i))+(1*(k)) + line_num*K + line_offset + ((i0) * K)];\n}\n} else { \n// load multiple lines at a time 16\nconst int effective_line_len = Min(K - k, 16);\nconst int line_offset = tid % effective_line_len;\nconst int line_num = tid / effective_line_len;\nconst int effectivenum_threads = 16 * effective_line_len;\nif (tid < effectivenum_threads){\n#pragma unroll\nfor (int i0 = 0; i0 < Min(M - b_i, 128); i0 += 16) {\nif(line_offset < effective_line_len && line_num + i0 < Min(M - b_i, 128)){\nshrA[line_num*16 + line_offset + ((i0) * 16)] = A[(K*(b_i))+(1*(k)) + line_num*K + line_offset + ((i0) * K)];\n}\n}\n}\n}\n", + "language": "CPP" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 0, + "end_line": 0, + "start_column": 0, + "end_column": 0, + "filename": null + }, + "label": "custom_code", + "in_connectors": { + "IN_A": { + "type": "pointer", + "dtype": "float32" + } + }, + "out_connectors": { + "OUT_A": { + "type": "pointer", + "dtype": "float32" + } + }, + "guid": "87c1e78b-08d7-4ad4-9df4-7c8307dfb4e1" + }, + "id": 23, + "scope_entry": "16", + "scope_exit": "17" + } + ], + "edges": [ + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "717e461d-b7de-4653-8912-6ebfd0b95b49", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + 
"num_accesses": "1" + } + } + }, + "src": "14", + "dst": "18", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "16384*K*ceiling(M/128)*ceiling(N/128)", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "M - 1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "00b2c1ed-40a8-462b-b84b-e498f0e61cb0", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "M - 1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "16384*K*ceiling(M/128)*ceiling(N/128)" + } + } + }, + "src": "2", + "dst": "12", + "dst_connector": "IN_A", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "128", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "d_i", + "end": "d_i + 7", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "15", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shrA", + "debuginfo": null, + "guid": "4e20efeb-25e3-48b2-bd36-62f0e610f7a0", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "d_i", + "end": "d_i + 7", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "15", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "128" + } + } + }, + "src": "21", + "dst": "0", + "dst_connector": "IN_A", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "16384*K*ceiling(M/128)*ceiling(N/128)", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "dfca0807-6b73-41d8-803f-bd9657893bd6", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "16384*K*ceiling(M/128)*ceiling(N/128)" + } + } + }, + "src": "3", + "dst": "12", + "dst_connector": "IN_B", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "128", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "15", + "step": "1", + "tile": "1" + }, + { + "start": "d_j", + "end": "d_j + 7", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shrB", + "debuginfo": null, + "guid": "0a152535-c69d-4240-b0de-5473188c98af", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "15", + "step": "1", + "tile": "1" + }, + { + "start": "d_j", + "end": "d_j + 7", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "128" + } + } + }, + "src": "20", + "dst": "0", + "dst_connector": "IN_B", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "64", + "subset": { + "type": "Range", + "ranges": [ + { + 
"start": "b_i + d_i", + "end": "b_i + d_i + 7", + "step": "1", + "tile": "1" + }, + { + "start": "b_j + d_j", + "end": "b_j + d_j + 7", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "4ac6087d-0a43-4f88-bcfa-7d5188895035", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "b_i + d_i", + "end": "b_i + d_i + 7", + "step": "1", + "tile": "1" + }, + { + "start": "b_j + d_j", + "end": "b_j + d_j + 7", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "64" + } + } + }, + "src": "19", + "dst": "15", + "dst_connector": "IN_C", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "64", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + } + ] + }, + "data": "tmp", + "debuginfo": null, + "guid": "5686066b-de34-4a45-a4d1-be03e4d3ce70", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "64" + } + } + }, + "src": "18", + "dst": "16", + "dst_connector": "IN___tmp1", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp3", + "debuginfo": null, + "guid": "e45b8649-9d7f-4505-8d96-cb03e2c5a2ec", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "8", + "dst": "9", + "dst_connector": "__in2", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp4", + "debuginfo": null, + "guid": "08c7701b-f572-41de-9386-a9bcd94fb418", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "10", + "dst": "11", + "dst_connector": "__inp", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "128", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "d_i", + "end": "d_i + 7", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "15", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shrA", + "debuginfo": null, + "guid": "89f85fcb-cbb5-49ac-b0bd-fcba411c131d", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "d_i", + "end": "d_i + 7", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "15", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "128" + } + } + }, + "src": "23", + "dst": "21", + "dst_connector": null, + "src_connector": "OUT_A" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": 
"Memlet", + "attributes": { + "volume": "16384*K", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "b_i", + "end": "b_i + 127", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "33372399-9833-4551-81fa-2bbd4fc94206", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "b_i", + "end": "b_i + 127", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "16384*K" + } + } + }, + "src": "12", + "dst": "14", + "dst_connector": "IN_A", + "src_connector": "OUT_A" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "64*K", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "b_i + d_i", + "end": "b_i + d_i + 7", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "5752458c-f2d7-412c-86ab-66dee65fb42e", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "b_i + d_i", + "end": "b_i + d_i + 7", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "64*K" + } + } + }, + "src": "14", + "dst": "16", + "dst_connector": "IN_A", + "src_connector": "OUT_A" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "128", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "b_i + d_i", + "end": "b_i + d_i + 7", + "step": "1", + "tile": "1" + }, + { + "start": "k", + "end": "k + 15", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "f677499b-19ac-428a-94d6-edc64403f5cd", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "b_i + d_i", + "end": "b_i + d_i + 7", + "step": "1", + "tile": "1" + }, + { + "start": "k", + "end": "k + 15", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "128" + } + } + }, + "src": "16", + "dst": "23", + "dst_connector": "IN_A", + "src_connector": "OUT_A" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "16", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "d_i + i", + "end": "d_i + i", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "15", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shrA", + "debuginfo": null, + "guid": "22c32597-9371-45d1-8394-e06e6b9e67f7", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "d_i + i", + "end": "d_i + i", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "15", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "16" + } + } + }, + "src": "0", + "dst": "5", + "dst_connector": "IN___tmp_261_18_r_in_from_2_0", + "src_connector": "OUT_A" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "128", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "15", + "step": "1", + "tile": "1" + }, + { + "start": "d_j", + "end": "d_j + 7", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shrB", + "debuginfo": null, + "guid": 
"0b2cf75a-cf1c-4808-99ae-447b3c88ddb8", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "15", + "step": "1", + "tile": "1" + }, + { + "start": "d_j", + "end": "d_j + 7", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "128" + } + } + }, + "src": "22", + "dst": "20", + "dst_connector": null, + "src_connector": "OUT_B" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "16384*K", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + }, + { + "start": "b_j", + "end": "b_j + 127", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "b741ef83-a465-4021-811c-6bd84813fa19", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + }, + { + "start": "b_j", + "end": "b_j + 127", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "16384*K" + } + } + }, + "src": "12", + "dst": "14", + "dst_connector": "IN_B", + "src_connector": "OUT_B" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "64*K", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + }, + { + "start": "b_j + d_j", + "end": "b_j + d_j + 7", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "1f1b7232-f62a-45eb-ad3f-e2cc948b0a85", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "K - 1", + "step": "1", + "tile": "1" + }, + { + "start": "b_j + d_j", + "end": "b_j + d_j + 7", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "64*K" + } + } + }, + "src": "14", + "dst": "16", + "dst_connector": "IN_B", + "src_connector": "OUT_B" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "128", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "k", + "end": "k + 15", + "step": "1", + "tile": "1" + }, + { + "start": "b_j + d_j", + "end": "b_j + d_j + 7", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "2742c5c3-debb-405b-9e4c-c125b0d8c5b3", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "k", + "end": "k + 15", + "step": "1", + "tile": "1" + }, + { + "start": "b_j + d_j", + "end": "b_j + d_j + 7", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "128" + } + } + }, + "src": "16", + "dst": "22", + "dst_connector": "IN_B", + "src_connector": "OUT_B" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "16", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "15", + "step": "1", + "tile": "1" + }, + { + "start": "d_j + j", + "end": "d_j + j", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shrB", + "debuginfo": null, + "guid": "b1cf4e06-8c3a-4f1b-a8c9-1d72c93aad79", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "15", + "step": "1", + "tile": "1" + }, + { + "start": "d_j + j", + "end": "d_j + j", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "16" + } + } + }, + 
"src": "0", + "dst": "5", + "dst_connector": "IN___tmp_261_28_r_in_from_2_0", + "src_connector": "OUT_B" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "M*N", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "M - 1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "28fe3bb5-718f-4fa5-aece-d31643392657", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "M - 1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "M*N" + } + } + }, + "src": "13", + "dst": "4", + "dst_connector": null, + "src_connector": "OUT_C" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "64", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + } + ] + }, + "data": "tmp", + "debuginfo": null, + "guid": "f756aeb2-08bb-4a62-bdc3-7946ac999339", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "64" + } + } + }, + "src": "17", + "dst": "19", + "dst_connector": null, + "src_connector": "OUT_C" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "16384", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "b_i", + "end": "b_i + 127", + "step": "1", + "tile": "1" + }, + { + "start": "b_j", + "end": "b_j + 127", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "fe8d8836-ded8-4ef6-a40a-78bcdad3eeb6", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "b_i", + "end": "b_i + 127", + "step": "1", + "tile": "1" + }, + { + "start": "b_j", + "end": "b_j + 127", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "16384" + } + } + }, + "src": "15", + "dst": "13", + "dst_connector": "IN_C", + "src_connector": "OUT_C" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "64", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + } + ] + }, + "data": "tmp", + "debuginfo": null, + "guid": "11f9a0db-52f5-4d91-ad9e-3f26ca230d90", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "64" + } + } + }, + "src": "1", + "dst": "17", + "dst_connector": "IN_C", + "src_connector": "OUT_C" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "data": 
"tmp", + "debuginfo": null, + "guid": "4f8548fd-6128-45ea-9fc9-f6df35f09325", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "6", + "dst": "1", + "dst_connector": "IN_C", + "src_connector": "OUT___tmp1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "data": "tmp", + "debuginfo": null, + "guid": "0be0a668-1110-4d99-90e6-c11606cc50f6", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "0", + "dst": "5", + "dst_connector": "IN___tmp1", + "src_connector": "OUT___tmp1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "64", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + } + ] + }, + "data": "tmp", + "debuginfo": null, + "guid": "42cadbaf-e4d6-4b6f-aa92-1c652a7b7d0b", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "7", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "64" + } + } + }, + "src": "16", + "dst": "0", + "dst_connector": "IN___tmp1", + "src_connector": "OUT___tmp1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "data": "tmp", + "debuginfo": null, + "guid": "c36f83d1-d066-4dd9-b63a-2616908697bd", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "5", + "dst": "9", + "dst_connector": "__in1", + "src_connector": "OUT___tmp1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "d_i + i", + "end": "d_i + i", + "step": "1", + "tile": "1" + }, + { + "start": "tk", + "end": "tk", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shrA", + "debuginfo": null, + "guid": "b24525e0-08c4-4174-9baf-8d00fba27490", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "d_i + i", + "end": "d_i + i", + "step": "1", + "tile": "1" + }, + { + "start": "tk", + "end": "tk", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "5", + "dst": "7", + "dst_connector": "__in1", + "src_connector": 
"OUT___tmp_261_18_r_in_from_2_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "tk", + "end": "tk", + "step": "1", + "tile": "1" + }, + { + "start": "d_j + j", + "end": "d_j + j", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shrB", + "debuginfo": null, + "guid": "31287127-040d-4c00-aeac-badd43792f80", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "tk", + "end": "tk", + "step": "1", + "tile": "1" + }, + { + "start": "d_j + j", + "end": "d_j + j", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "5", + "dst": "7", + "dst_connector": "__in2", + "src_connector": "OUT___tmp_261_28_r_in_from_2_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp3", + "debuginfo": null, + "guid": "a0276658-2d4c-451c-880c-00e32a770af8", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "7", + "dst": "8", + "dst_connector": null, + "src_connector": "__out" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp4", + "debuginfo": null, + "guid": "d926e555-2206-4946-936f-637170fedb10", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "9", + "dst": "10", + "dst_connector": null, + "src_connector": "__out" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "data": "tmp", + "debuginfo": null, + "guid": "482095c1-5f12-45ec-885b-312e5b96205f", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "11", + "dst": "6", + "dst_connector": "IN___tmp1", + "src_connector": "__out" + } + ], + "attributes": { + "executions": "1", + "dynamic_executions": false, + "guid": "21d6aa0d-ccfb-4036-ab01-8654ab4417ed" + } + } + ], + "edges": [], + "collapsed": false, + "label": "", + "id": null, + "cfg_list_id": 0, + "start_block": 0, + "dace_version": "1.0.0" +} \ No newline at end of file diff --git a/berkay_workpace/scratch/yakups_examples/generate_sdfgs.ipynb b/berkay_workpace/scratch/yakups_examples/generate_sdfgs.ipynb new file mode 100644 index 0000000000..e8ce4f567e --- /dev/null +++ b/berkay_workpace/scratch/yakups_examples/generate_sdfgs.ipynb @@ -0,0 +1,656 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "c60775aa", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import dace\n", + "import cupy as cp\n", + "import numpy as np\n", + "from IPython.display import Code\n", + "from typing import Optional\n", + "\n", + "from dace import SDFG, properties\n", + "from dace.config import Config\n", + "from dace.transformation import pass_pipeline as ppl, transformation\n", + "from dace.sdfg import nodes\n", + "from dace import dtypes\n", + "from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync\n", + "from dace.sdfg.state import LoopRegion, ConditionalBlock" + ] + }, + { + "cell_type": "markdown", + "id": "d4a9dd26", + "metadata": {}, + "source": [ + "## Weird Global To Global Example\n", + "\n", + "This is actually a nice example which does not compile when using the legacy CUDACodeGen due to const checks." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ed5d3795", + "metadata": {}, + "outputs": [], + "source": [ + "sdfg = dace.SDFG.from_file('weird_global_to_global.sdfg')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "537b7cc2", + "metadata": {}, + "outputs": [], + "source": [ + "#sdfg.compile()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c276f872", + "metadata": {}, + "outputs": [], + "source": [ + "#Code(sdfg.generate_code()[1].clean_code)" + ] + }, + { + "cell_type": "markdown", + "id": "958dfcbf", + "metadata": {}, + "source": [ + "## Weir Shared To Shared Example" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a98006e2", + "metadata": {}, + "outputs": [], + "source": [ + "sdfg = dace.SDFG.from_file('weird_shared_to_shared_copy.sdfg')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e819c1b0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/berkay/master-thesis/dace/dace/codegen/targets/cuda.py:1872: UserWarning: No `gpu_block_size` property specified on map \"kernel_101\". Falling back to the configuration entry `compiler.cuda.default_block_size`: 32,1,1. You can either specify the block size to use with the gpu_block_size property, or by adding nested `GPU_ThreadBlock` maps, which map work to individual threads. For more information, see https://spcldace.readthedocs.io/en/latest/optimization/gpu.html\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
#include <cuda_runtime.h>\n",
+       "#include <dace/dace.h>\n",
+       "\n",
+       "\n",
+       "struct kernel_state_t {\n",
+       "    dace::cuda::Context *gpu_context;\n",
+       "};\n",
+       "\n",
+       "\n",
+       "\n",
+       "DACE_EXPORTED int __dace_init_cuda(kernel_state_t *__state, int64_t N);\n",
+       "DACE_EXPORTED int __dace_exit_cuda(kernel_state_t *__state);\n",
+       "\n",
+       "\n",
+       "\n",
+       "int __dace_init_cuda(kernel_state_t *__state, int64_t N) {\n",
+       "    int count;\n",
+       "\n",
+       "    // Check that we are able to run cuda code\n",
+       "    if (cudaGetDeviceCount(&count) != cudaSuccess)\n",
+       "    {\n",
+       "        printf("ERROR: GPU drivers are not configured or cuda-capable device "\n",
+       "               "not found\\n");\n",
+       "        return 1;\n",
+       "    }\n",
+       "    if (count == 0)\n",
+       "    {\n",
+       "        printf("ERROR: No cuda-capable devices found\\n");\n",
+       "        return 2;\n",
+       "    }\n",
+       "\n",
+       "    // Initialize cuda before we run the application\n",
+       "    float *dev_X;\n",
+       "    DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n",
+       "    DACE_GPU_CHECK(cudaFree(dev_X));\n",
+       "\n",
+       "    \n",
+       "\n",
+       "    __state->gpu_context = new dace::cuda::Context(1, 1);\n",
+       "\n",
+       "    // Create cuda streams and events\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n",
+       "        __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n",
+       "    }\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n",
+       "    }\n",
+       "\n",
+       "    \n",
+       "\n",
+       "    return 0;\n",
+       "}\n",
+       "\n",
+       "int __dace_exit_cuda(kernel_state_t *__state) {\n",
+       "    \n",
+       "\n",
+       "    // Synchronize and check for CUDA errors\n",
+       "    int __err = static_cast<int>(__state->gpu_context->lasterror);\n",
+       "    if (__err == 0)\n",
+       "        __err = static_cast<int>(cudaDeviceSynchronize());\n",
+       "\n",
+       "    // Destroy cuda streams and events\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n",
+       "    }\n",
+       "    for(int i = 0; i < 1; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n",
+       "    }\n",
+       "\n",
+       "    delete __state->gpu_context;\n",
+       "    return __err;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED bool __dace_gpu_set_stream(kernel_state_t *__state, int streamid, gpuStream_t stream)\n",
+       "{\n",
+       "    if (streamid < 0 || streamid >= 1)\n",
+       "        return false;\n",
+       "\n",
+       "    __state->gpu_context->streams[streamid] = stream;\n",
+       "\n",
+       "    return true;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED void __dace_gpu_set_all_streams(kernel_state_t *__state, gpuStream_t stream)\n",
+       "{\n",
+       "    for (int i = 0; i < 1; ++i)\n",
+       "        __state->gpu_context->streams[i] = stream;\n",
+       "}\n",
+       "\n",
+       "__global__ void __launch_bounds__(32) kernel_101_0_0_0(const double * __restrict__ A, const double * __restrict__ B, double * __restrict__ C, int N) {\n",
+       "    {\n",
+       "        int i = (blockIdx.x * 32 + threadIdx.x);\n",
+       "        double __tmp1;\n",
+       "        __shared__ double shr_A[32];\n",
+       "        __shared__ double shr_B[32];\n",
+       "        if (i < N) {\n",
+       "            dace::GlobalToShared1D<double, 32, 1, 1, 1, 1, false>(A + i, 1, shr_A + (i % 32));\n",
+       "            dace::GlobalToShared1D<double, 32, 1, 1, 1, 1, false>(B + i, 1, shr_B + (i % 32));\n",
+       "            {\n",
+       "                double __in1 = shr_A[(i % 32)];\n",
+       "                double __in2 = shr_B[(i % 32)];\n",
+       "                double __out;\n",
+       "\n",
+       "                ///////////////////\n",
+       "                // Tasklet code (_Add_)\n",
+       "                __out = (__in1 + __in2);\n",
+       "                ///////////////////\n",
+       "\n",
+       "                __tmp1 = __out;\n",
+       "            }\n",
+       "            {\n",
+       "                double __inp = __tmp1;\n",
+       "                double __out;\n",
+       "\n",
+       "                ///////////////////\n",
+       "                // Tasklet code (assign_102_16)\n",
+       "                __out = __inp;\n",
+       "                ///////////////////\n",
+       "\n",
+       "                C[i] = __out;\n",
+       "            }\n",
+       "        }\n",
+       "    }\n",
+       "}\n",
+       "\n",
+       "\n",
+       "DACE_EXPORTED void __dace_runkernel_kernel_101_0_0_0(kernel_state_t *__state, const double * __restrict__ A, const double * __restrict__ B, double * __restrict__ C, int N);\n",
+       "void __dace_runkernel_kernel_101_0_0_0(kernel_state_t *__state, const double * __restrict__ A, const double * __restrict__ B, double * __restrict__ C, int N)\n",
+       "{\n",
+       "\n",
+       "    if (((int_ceil(int_ceil(N, 1), 32)) <= 0)) {\n",
+       "\n",
+       "        return;\n",
+       "    }\n",
+       "\n",
+       "    void  *kernel_101_0_0_0_args[] = { (void *)&A, (void *)&B, (void *)&C, (void *)&N };\n",
+       "    gpuError_t __err = cudaLaunchKernel((void*)kernel_101_0_0_0, dim3(int_ceil(int_ceil(N, 1), 32), 1, 1), dim3(32, 1, 1), kernel_101_0_0_0_args, 0, __state->gpu_context->streams[0]);\n",
+       "    DACE_KERNEL_LAUNCH_CHECK(__err, "kernel_101_0_0_0", int_ceil(int_ceil(N, 1), 32), 1, 1, 32, 1, 1);\n",
+       "}\n",
+       "
\n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "\\PY{n}{\\PYZsh{}include}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{n}{cuda\\PYZus{}runtime}\\PY{p}{.}\\PY{n}{h}\\PY{o}{\\PYZgt{}}\n", + "\\PY{n}{\\PYZsh{}include}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{n}{dace}\\PY{o}{/}\\PY{n}{dace}\\PY{p}{.}\\PY{n}{h}\\PY{o}{\\PYZgt{}}\n", + "\n", + "\n", + "\\PY{n}{struct}\\PY{+w}{ }\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", + "\\PY{err}{\\PYZcb{}}\\PY{p}{;}\n", + "\n", + "\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{int64\\PYZus{}t}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\n", + "\n", + "\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{int64\\PYZus{}t}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n+nf}{count}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Check}\\PY{+w}{ }\\PY{n}{that}\\PY{+w}{ }\\PY{n}{we}\\PY{+w}{ }\\PY{k}{are}\\PY{+w}{ }\\PY{n}{able}\\PY{+w}{ }\\PY{k}{to}\\PY{+w}{ }\\PY{n}{run}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{code}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{cudaGetDeviceCount}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n+nf}{count}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{!=}\\PY{+w}{ }\\PY{n}{cudaSuccess}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s+ss}{\\PYZdq{}ERROR: GPU drivers are not configured or cuda\\PYZhy{}capable device \\PYZdq{}}\n", + "\\PY{+w}{ }\\PY{l+s+ss}{\\PYZdq{}not found\\PYZbs{}n\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n+nf}{count}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s+ss}{\\PYZdq{}ERROR: No cuda\\PYZhy{}capable devices found\\PYZbs{}n\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Initialize}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{k}{before}\\PY{+w}{ }\\PY{n}{we}\\PY{+w}{ }\\PY{n}{run}\\PY{+w}{ }\\PY{n}{the}\\PY{+w}{ }\\PY{n}{application}\n", + "\\PY{+w}{ }\\PY{n+nc}{float}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{dev\\PYZus{}X}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMalloc}\\PY{p}{(}\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{dev\\PYZus{}X}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ 
}\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaFree}\\PY{p}{(}\\PY{n}{dev\\PYZus{}X}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{Context}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Create}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{streams}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{n}{events}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaStreamNonBlocking}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Allow}\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{n}{externals}\\PY{+w}{ }\\PY{k}{to}\\PY{+w}{ }\\PY{k}{modify}\\PY{+w}{ }\\PY{n}{streams}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaEventDisableTiming}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + "\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Synchronize}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{k}{check}\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{n}{CUDA}\\PY{+w}{ }\\PY{n}{errors}\n", + "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ 
}\\PY{n}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{n+nc}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{lasterror}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{n+nc}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{cudaDeviceSynchronize}\\PY{p}{(}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Destroy}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{streams}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{n}{events}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", + "\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{bool}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}stream}\\PY{p}{(}\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{streamid}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", + "\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{|}\\PY{+w}{ }\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZgt{}=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{false}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{streamid}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{true}\\PY{p}{;}\n", + "\\PY{err}{\\PYZcb{}}\n", + "\n", + 
"\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}all\\PYZus{}streams}\\PY{p}{(}\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", + "\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", + "\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{\\PYZus{}\\PYZus{}global\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}launch\\PYZus{}bounds\\PYZus{}\\PYZus{}}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{kernel\\PYZus{}101\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{const}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{const}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{blockIdx}\\PY{p}{.}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp1}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}shared\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{n}{shr\\PYZus{}A}\\PY{o}{[}\\PY{n}{32}\\PY{o}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}shared\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{n}{shr\\PYZus{}B}\\PY{o}{[}\\PY{n}{32}\\PY{o}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{GlobalToShared1D}\\PY{o}{\\PYZlt{}}\\PY{k}{double}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{false}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{A}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{i}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{shr\\PYZus{}A}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZpc{}}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{GlobalToShared1D}\\PY{o}{\\PYZlt{}}\\PY{k}{double}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ 
}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{false}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{B}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{i}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{shr\\PYZus{}B}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZpc{}}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}in1}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{shr\\PYZus{}A}\\PY{o}{[}\\PY{n}{(i \\PYZpc{} 32)}\\PY{o}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}in2}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{shr\\PYZus{}B}\\PY{o}{[}\\PY{n}{(i \\PYZpc{} 32)}\\PY{o}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Tasklet}\\PY{+w}{ }\\PY{n}{code}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}Add\\PYZus{}}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}in1}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}in2}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp1}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp1}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Tasklet}\\PY{+w}{ }\\PY{n}{code}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{assign\\PYZus{}102\\PYZus{}16}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{C}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{err}{\\PYZcb{}}\n", + "\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}kernel\\PYZus{}101\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{const}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ 
}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{const}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}kernel\\PYZus{}101\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{const}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{const}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\n", + "\\PY{err}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{N}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZlt{}=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{kernel\\PYZus{}101\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZus{}args}\\PY{err}{[}\\PY{err}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{N}\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{gpuError\\PYZus{}t}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaLaunchKernel}\\PY{p}{(}\\PY{p}{(}\\PY{n}{void}\\PY{o}{*}\\PY{p}{)}\\PY{n}{kernel\\PYZus{}101\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{N}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{kernel\\PYZus{}101\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZus{}args}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{0}\\PY{o}{]}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}KERNEL\\PYZus{}LAUNCH\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{,}\\PY{+w}{ 
}\\PY{l+s+ss}{\\PYZdq{}kernel\\PYZus{}101\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZdq{}}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{N}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{err}{\\PYZcb{}}\n", + "\\end{Verbatim}\n" + ], + "text/plain": [ + "\n", + "#include \n", + "#include \n", + "\n", + "\n", + "struct kernel_state_t {\n", + " dace::cuda::Context *gpu_context;\n", + "};\n", + "\n", + "\n", + "\n", + "DACE_EXPORTED int __dace_init_cuda(kernel_state_t *__state, int64_t N);\n", + "DACE_EXPORTED int __dace_exit_cuda(kernel_state_t *__state);\n", + "\n", + "\n", + "\n", + "int __dace_init_cuda(kernel_state_t *__state, int64_t N) {\n", + " int count;\n", + "\n", + " // Check that we are able to run cuda code\n", + " if (cudaGetDeviceCount(&count) != cudaSuccess)\n", + " {\n", + " printf(\"ERROR: GPU drivers are not configured or cuda-capable device \"\n", + " \"not found\\n\");\n", + " return 1;\n", + " }\n", + " if (count == 0)\n", + " {\n", + " printf(\"ERROR: No cuda-capable devices found\\n\");\n", + " return 2;\n", + " }\n", + "\n", + " // Initialize cuda before we run the application\n", + " float *dev_X;\n", + " DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n", + " DACE_GPU_CHECK(cudaFree(dev_X));\n", + "\n", + " \n", + "\n", + " __state->gpu_context = new dace::cuda::Context(1, 1);\n", + "\n", + " // Create cuda streams and events\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n", + " __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n", + " }\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n", + " }\n", + "\n", + " \n", + "\n", + " return 0;\n", + "}\n", + "\n", + "int __dace_exit_cuda(kernel_state_t *__state) {\n", + " \n", + "\n", + " // Synchronize and check for CUDA errors\n", + " int __err = static_cast(__state->gpu_context->lasterror);\n", + " if (__err == 0)\n", + " __err = static_cast(cudaDeviceSynchronize());\n", + "\n", + " // Destroy cuda streams and events\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n", + " }\n", + " for(int i = 0; i < 1; ++i) {\n", + " DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n", + " }\n", + "\n", + " delete __state->gpu_context;\n", + " return __err;\n", + "}\n", + "\n", + "DACE_EXPORTED bool __dace_gpu_set_stream(kernel_state_t *__state, int streamid, gpuStream_t stream)\n", + "{\n", + " if (streamid < 0 || streamid >= 1)\n", + " return false;\n", + "\n", + " __state->gpu_context->streams[streamid] = stream;\n", + "\n", + " return true;\n", + "}\n", + "\n", + "DACE_EXPORTED void __dace_gpu_set_all_streams(kernel_state_t *__state, gpuStream_t stream)\n", + "{\n", + " for (int i = 0; i < 1; ++i)\n", + " __state->gpu_context->streams[i] = stream;\n", + "}\n", + "\n", + "__global__ void __launch_bounds__(32) kernel_101_0_0_0(const double * __restrict__ A, const double * __restrict__ B, double * __restrict__ C, int N) {\n", + " {\n", + " int i = (blockIdx.x * 32 + 
threadIdx.x);\n", + " double __tmp1;\n", + " __shared__ double shr_A[32];\n", + " __shared__ double shr_B[32];\n", + " if (i < N) {\n", + " dace::GlobalToShared1D(A + i, 1, shr_A + (i % 32));\n", + " dace::GlobalToShared1D(B + i, 1, shr_B + (i % 32));\n", + " {\n", + " double __in1 = shr_A[(i % 32)];\n", + " double __in2 = shr_B[(i % 32)];\n", + " double __out;\n", + "\n", + " ///////////////////\n", + " // Tasklet code (_Add_)\n", + " __out = (__in1 + __in2);\n", + " ///////////////////\n", + "\n", + " __tmp1 = __out;\n", + " }\n", + " {\n", + " double __inp = __tmp1;\n", + " double __out;\n", + "\n", + " ///////////////////\n", + " // Tasklet code (assign_102_16)\n", + " __out = __inp;\n", + " ///////////////////\n", + "\n", + " C[i] = __out;\n", + " }\n", + " }\n", + " }\n", + "}\n", + "\n", + "\n", + "DACE_EXPORTED void __dace_runkernel_kernel_101_0_0_0(kernel_state_t *__state, const double * __restrict__ A, const double * __restrict__ B, double * __restrict__ C, int N);\n", + "void __dace_runkernel_kernel_101_0_0_0(kernel_state_t *__state, const double * __restrict__ A, const double * __restrict__ B, double * __restrict__ C, int N)\n", + "{\n", + "\n", + " if (((int_ceil(int_ceil(N, 1), 32)) <= 0)) {\n", + "\n", + " return;\n", + " }\n", + "\n", + " void *kernel_101_0_0_0_args[] = { (void *)&A, (void *)&B, (void *)&C, (void *)&N };\n", + " gpuError_t __err = cudaLaunchKernel((void*)kernel_101_0_0_0, dim3(int_ceil(int_ceil(N, 1), 32), 1, 1), dim3(32, 1, 1), kernel_101_0_0_0_args, 0, __state->gpu_context->streams[0]);\n", + " DACE_KERNEL_LAUNCH_CHECK(__err, \"kernel_101_0_0_0\", int_ceil(int_ceil(N, 1), 32), 1, 1, 32, 1, 1);\n", + "}\n" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Code(sdfg.generate_code()[1].clean_code)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "686e4fc6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dace_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/berkay_workpace/scratch/yakups_examples/nice_global_to_shared_copy.sdfg b/berkay_workpace/scratch/yakups_examples/nice_global_to_shared_copy.sdfg new file mode 100644 index 0000000000..e040a6f997 --- /dev/null +++ b/berkay_workpace/scratch/yakups_examples/nice_global_to_shared_copy.sdfg @@ -0,0 +1,1278 @@ +{ + "type": "SDFG", + "attributes": { + "name": "kernel", + "arg_names": [ + "A", + "B", + "C" + ], + "_arrays": { + "A": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "B": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "C": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "__tmp3": { + 
"type": "Scalar", + "attributes": { + "dtype": "float64", + "shape": [ + "1" + ], + "transient": true, + "debuginfo": null + } + }, + "shr_A": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "256", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "256" + ], + "transient": true, + "storage": "GPU_Shared", + "debuginfo": null + } + }, + "shr_B": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "256", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "256" + ], + "transient": true, + "storage": "GPU_Shared", + "debuginfo": null + } + } + }, + "symbols": { + "N": "int64" + }, + "global_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "init_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "exit_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 20, + "end_line": 29, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "using_explicit_control_flow": true, + "guid": "82bba983-0c4a-4ea9-a197-02e81a8cff11", + "hash": "30a8e228873be667ed26ed0ded89004c1c79a380321118fa9e38753bb6cfa4f8" + }, + "nodes": [ + { + "type": "SDFGState", + "label": "MapState", + "id": 0, + "collapsed": false, + "scope_dict": { + "-1": [ + 0, + 2, + 3, + 4 + ], + "0": [ + 1, + 5, + 12, + 13 + ], + "5": [ + 6, + 7 + ], + "7": [ + 8, + 9, + 10, + 11 + ] + }, + "nodes": [ + { + "type": "MapEntry", + "label": "kernel_26[i=0:N:256]", + "attributes": { + "label": "kernel_26", + "params": [ + "i" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "256", + "tile": "1" + } + ] + }, + "schedule": "GPU_Device", + "debuginfo": { + "type": "DebugInfo", + "start_line": 26, + "end_line": 26, + "start_column": 30, + "end_column": 30, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "in_connectors": { + "IN_A": null, + "IN_B": null + }, + "out_connectors": { + "OUT_A": null, + "OUT_B": null + }, + "guid": "49c7e4c6-3cc7-4ea9-8c5c-87e19b34cd49" + }, + "id": 0, + "scope_entry": null, + "scope_exit": "1" + }, + { + "type": "MapExit", + "label": "kernel_26[i=0:N:256]", + "attributes": { + "in_connectors": { + "IN_C": null + }, + "out_connectors": { + "OUT_C": null + }, + "guid": "633311fa-fef3-47c7-811a-b55dbaf09c9b" + }, + "id": 1, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "AccessNode", + "label": "A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 26, + "end_line": 26, + "start_column": 30, + "end_column": 30, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "data": "A", + "guid": "13961939-8f39-4f3f-b505-83b06e3f89c8" + }, + "id": 2, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 26, + "end_line": 26, + "start_column": 30, + "end_column": 30, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "data": "B", + "guid": "cf20b347-bd6f-46af-b499-59775d1eb039" + }, + "id": 3, + "scope_entry": null, + 
"scope_exit": null + }, + { + "type": "AccessNode", + "label": "C", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 26, + "end_line": 26, + "start_column": 30, + "end_column": 30, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "data": "C", + "guid": "3f602807-15c6-4dbc-a17f-bec5154cea16" + }, + "id": 4, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "MapEntry", + "label": "kernel_26_4_27[k=0]", + "attributes": { + "label": "kernel_26_4_27", + "params": [ + "k" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "schedule": "Sequential", + "debuginfo": { + "type": "DebugInfo", + "start_line": 27, + "end_line": 27, + "start_column": 8, + "end_column": 8, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "in_connectors": { + "IN___tmp_29_37_r_in_from_1_0_in_from_1_0": null, + "IN___tmp_29_58_r_in_from_1_0_in_from_1_0": null + }, + "out_connectors": { + "OUT___tmp_29_37_r_in_from_1_0_in_from_1_0": null, + "OUT___tmp_29_58_r_in_from_1_0_in_from_1_0": null + }, + "guid": "16f44972-423e-4407-9f82-ceba8fbbd9cb" + }, + "id": 5, + "scope_entry": "0", + "scope_exit": "6" + }, + { + "type": "MapExit", + "label": "kernel_26_4_27[k=0]", + "attributes": { + "in_connectors": { + "IN___tmp_29_16_w_out_of_1_1_out_of_1_1": null + }, + "out_connectors": { + "OUT___tmp_29_16_w_out_of_1_1_out_of_1_1": null + }, + "guid": "0881fb62-ec71-49cc-8801-f2c1ec221db3" + }, + "id": 6, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "MapEntry", + "label": "kernel_26_4_27_8_28[j=0:256]", + "attributes": { + "label": "kernel_26_4_27_8_28", + "params": [ + "j" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "schedule": "GPU_ThreadBlock", + "debuginfo": { + "type": "DebugInfo", + "start_line": 28, + "end_line": 28, + "start_column": 12, + "end_column": 12, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "in_connectors": { + "IN___tmp_29_37_r_in_from_1_0": null, + "IN___tmp_29_58_r_in_from_1_0": null + }, + "out_connectors": { + "OUT___tmp_29_37_r_in_from_1_0": null, + "OUT___tmp_29_58_r_in_from_1_0": null + }, + "guid": "851c1aed-dd11-44c0-a861-e38e10854e37" + }, + "id": 7, + "scope_entry": "5", + "scope_exit": "8" + }, + { + "type": "MapExit", + "label": "kernel_26_4_27_8_28[j=0:256]", + "attributes": { + "in_connectors": { + "IN___tmp_29_16_w_out_of_1_1": null + }, + "out_connectors": { + "OUT___tmp_29_16_w_out_of_1_1": null + }, + "guid": "dbf82313-ebe2-4ccd-9130-359733ccfe16" + }, + "id": 8, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "Tasklet", + "label": "_Add_", + "attributes": { + "code": { + "string_data": "__out = (__in1 + __in2)", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 29, + "end_line": 29, + "start_column": 72, + "end_column": 72, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "label": "_Add_", + "in_connectors": { + "__in1": null, + "__in2": null + }, + "out_connectors": { + "__out": null + }, + "guid": "1d6a1f25-c807-4172-a42b-1a33f5ee1e75" + }, + "id": 9, + "scope_entry": "7", + 
"scope_exit": "8" + }, + { + "type": "AccessNode", + "label": "__tmp3", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 29, + "end_line": 29, + "start_column": 72, + "end_column": 72, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "data": "__tmp3", + "guid": "25a5107c-0b90-4bd8-9deb-378f3a3463b2" + }, + "id": 10, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "Tasklet", + "label": "assign_29_16", + "attributes": { + "code": { + "string_data": "__out = __inp", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 29, + "end_line": 29, + "start_column": 30, + "end_column": 30, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "label": "assign_29_16", + "in_connectors": { + "__inp": null + }, + "out_connectors": { + "__out": null + }, + "guid": "34f9f783-0749-47eb-93db-97d53d173644" + }, + "id": 11, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "AccessNode", + "label": "shr_A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 48, + "end_line": 48, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" + }, + "data": "shr_A", + "guid": "3d052d35-1d25-4300-9969-30227e71974d" + }, + "id": 12, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "AccessNode", + "label": "shr_B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 48, + "end_line": 48, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" + }, + "data": "shr_B", + "guid": "d73f5a2f-40d2-411a-bb05-adedfe3e2166" + }, + "id": 13, + "scope_entry": "0", + "scope_exit": "1" + } + ], + "edges": [ + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256*ceiling(N/256)", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "6f946fb5-33ba-4489-8eac-8a96431fd08f", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256*ceiling(N/256)" + } + } + }, + "src": "2", + "dst": "0", + "dst_connector": "IN_A", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256*ceiling(N/256)", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "39f2af82-f36f-4955-a1d0-88813578a913", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256*ceiling(N/256)" + } + } + }, + "src": "3", + "dst": "0", + "dst_connector": "IN_B", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + 
"step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "22ac004d-8470-42c8-a6c5-3e9d8ec80b58", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "12", + "dst": "5", + "dst_connector": "IN___tmp_29_37_r_in_from_1_0_in_from_1_0", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", + "debuginfo": null, + "guid": "325ac1e2-6982-442c-a419-6bc987ac7c89", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "13", + "dst": "5", + "dst_connector": "IN___tmp_29_58_r_in_from_1_0_in_from_1_0", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp3", + "debuginfo": null, + "guid": "373cdd55-1fd6-4899-aa53-b4390f7eb5d1", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "10", + "dst": "11", + "dst_connector": "__inp", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "other_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "ebd0daa9-e1db-403e-9aa7-d420b50a67e3", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "0", + "dst": "12", + "dst_connector": null, + "src_connector": "OUT_A" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "other_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "6c6c4371-2f5f-4ce1-8d3e-9e103de2bf8c", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "0", + "dst": "13", + "dst_connector": null, + "src_connector": "OUT_B" + }, + { + "type": "MultiConnectorEdge", + 
"attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256*ceiling(N/256)", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "b655475c-0841-4511-8a7e-0314a72ac3e9", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "256*ceiling(N/256)" + } + } + }, + "src": "1", + "dst": "4", + "dst_connector": null, + "src_connector": "OUT_C" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "cc08b6fa-9597-4e8d-b7ed-c8b640f64093", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "256" + } + } + }, + "src": "8", + "dst": "6", + "dst_connector": "IN___tmp_29_16_w_out_of_1_1_out_of_1_1", + "src_connector": "OUT___tmp_29_16_w_out_of_1_1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "a052d8cc-d3fc-4ceb-9f77-5b9cafc2ca66", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "256" + } + } + }, + "src": "6", + "dst": "1", + "dst_connector": "IN_C", + "src_connector": "OUT___tmp_29_16_w_out_of_1_1_out_of_1_1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "j + 256*k", + "end": "j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "95a58f89-1cfe-4e1f-b042-a165849dd0eb", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "j + 256*k", + "end": "j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "7", + "dst": "9", + "dst_connector": "__in1", + "src_connector": "OUT___tmp_29_37_r_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "256*k", + "end": "256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "2ff7abff-b07d-4f3f-89f1-877c46f9d44b", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "256*k", + "end": "256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "5", + "dst": "7", + "dst_connector": "IN___tmp_29_37_r_in_from_1_0", + "src_connector": "OUT___tmp_29_37_r_in_from_1_0_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": 
"1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "j + 256*k", + "end": "j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", + "debuginfo": null, + "guid": "2d56b2f0-9785-47d4-9aec-d79abfd11d5f", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "j + 256*k", + "end": "j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "7", + "dst": "9", + "dst_connector": "__in2", + "src_connector": "OUT___tmp_29_58_r_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "256*k", + "end": "256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", + "debuginfo": null, + "guid": "0a44e0d1-4899-490e-827b-31557145c45d", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "256*k", + "end": "256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "5", + "dst": "7", + "dst_connector": "IN___tmp_29_58_r_in_from_1_0", + "src_connector": "OUT___tmp_29_58_r_in_from_1_0_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp3", + "debuginfo": null, + "guid": "30d11bb4-e9e3-49d3-9e54-2bd7285f0136", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "9", + "dst": "10", + "dst_connector": null, + "src_connector": "__out" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i + j + 256*k", + "end": "i + j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "53b5e6e3-8ae7-4baf-91c2-94fbfdf344d5", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i + j + 256*k", + "end": "i + j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "11", + "dst": "8", + "dst_connector": "IN___tmp_29_16_w_out_of_1_1", + "src_connector": "__out" + } + ], + "attributes": { + "guid": "ee35ff06-cf28-4b20-985b-04aa7d3a2686", + "executions": "1", + "dynamic_executions": false + } + } + ], + "edges": [], + "collapsed": false, + "label": "", + "id": null, + "cfg_list_id": 0, + "start_block": 0, + "dace_version": "1.0.0" +} \ No newline at end of file diff --git a/berkay_workpace/scratch/yakups_examples/weird_global_to_global.sdfg b/berkay_workpace/scratch/yakups_examples/weird_global_to_global.sdfg new file mode 100644 index 0000000000..0804dde52a --- /dev/null +++ b/berkay_workpace/scratch/yakups_examples/weird_global_to_global.sdfg @@ -0,0 +1,1404 @@ +{ + "type": "SDFG", + "attributes": { + "name": "kernel", + "arg_names": [ + "A", + "B", + "C" + ], + "_arrays": { + "A": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + 
"debuginfo": null + } + }, + "B": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "C": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "__tmp3": { + "type": "Scalar", + "attributes": { + "dtype": "float64", + "shape": [ + "1" + ], + "transient": true, + "debuginfo": null + } + }, + "shr_A": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "256", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "256" + ], + "transient": true, + "storage": "GPU_Shared", + "debuginfo": null + } + }, + "shr_B": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "256", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "256" + ], + "transient": true, + "storage": "GPU_Shared", + "debuginfo": null + } + } + }, + "symbols": { + "N": "int64" + }, + "global_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "init_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "exit_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 20, + "end_line": 29, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "using_explicit_control_flow": true, + "guid": "29a0076f-cd9e-40c6-83b8-3792f2274970", + "hash": "6a91f9230ef48fd70071ef71d500050e80a256b5cf8904637d47d3b2ae04184f" + }, + "nodes": [ + { + "type": "SDFGState", + "label": "MapState", + "id": 0, + "collapsed": false, + "scope_dict": { + "-1": [ + 0, + 2, + 3, + 4 + ], + "0": [ + 1, + 5, + 12, + 13, + 14, + 15 + ], + "5": [ + 6, + 7 + ], + "7": [ + 8, + 9, + 10, + 11 + ] + }, + "nodes": [ + { + "type": "MapEntry", + "label": "kernel_26[i=0:N:256]", + "attributes": { + "label": "kernel_26", + "params": [ + "i" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "256", + "tile": "1" + } + ] + }, + "schedule": "GPU_Device", + "debuginfo": { + "type": "DebugInfo", + "start_line": 26, + "end_line": 26, + "start_column": 30, + "end_column": 30, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "in_connectors": { + "IN_A": null, + "IN_B": null + }, + "out_connectors": { + "OUT_A": null, + "OUT_B": null + }, + "guid": "f808be18-fafd-4130-8275-065342d16162" + }, + "id": 0, + "scope_entry": null, + "scope_exit": "1" + }, + { + "type": "MapExit", + "label": "kernel_26[i=0:N:256]", + "attributes": { + "in_connectors": { + "IN_C": null + }, + "out_connectors": { + "OUT_C": null + }, + "guid": "19fb21a2-5228-44dc-9921-5b0daaaa2bca" + }, + "id": 1, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "AccessNode", + "label": "A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 26, + "end_line": 26, + "start_column": 30, + "end_column": 30, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "data": "A", + 
"guid": "0bdcca6e-5c4e-4971-837a-a83b7000befc" + }, + "id": 2, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 26, + "end_line": 26, + "start_column": 30, + "end_column": 30, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "data": "B", + "guid": "d97310f7-f1f4-45b0-9e32-33588cf45de2" + }, + "id": 3, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "C", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 26, + "end_line": 26, + "start_column": 30, + "end_column": 30, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "data": "C", + "guid": "e62d00c9-d840-4fcd-820b-2e6e6cc02db3" + }, + "id": 4, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "MapEntry", + "label": "kernel_26_4_27[k=0]", + "attributes": { + "label": "kernel_26_4_27", + "params": [ + "k" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "schedule": "Sequential", + "debuginfo": { + "type": "DebugInfo", + "start_line": 27, + "end_line": 27, + "start_column": 8, + "end_column": 8, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "in_connectors": { + "IN___tmp_29_37_r_in_from_1_0_in_from_1_0": null, + "IN___tmp_29_58_r_in_from_1_0_in_from_1_0": null + }, + "out_connectors": { + "OUT___tmp_29_37_r_in_from_1_0_in_from_1_0": null, + "OUT___tmp_29_58_r_in_from_1_0_in_from_1_0": null + }, + "guid": "18a4a94c-d93a-4787-aad4-f5f35504caf2" + }, + "id": 5, + "scope_entry": "0", + "scope_exit": "6" + }, + { + "type": "MapExit", + "label": "kernel_26_4_27[k=0]", + "attributes": { + "in_connectors": { + "IN___tmp_29_16_w_out_of_1_1_out_of_1_1": null + }, + "out_connectors": { + "OUT___tmp_29_16_w_out_of_1_1_out_of_1_1": null + }, + "guid": "504168ed-6581-4939-abf1-8533df45e0d6" + }, + "id": 6, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "MapEntry", + "label": "kernel_26_4_27_8_28[j=0:256]", + "attributes": { + "label": "kernel_26_4_27_8_28", + "params": [ + "j" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "schedule": "GPU_ThreadBlock", + "debuginfo": { + "type": "DebugInfo", + "start_line": 28, + "end_line": 28, + "start_column": 12, + "end_column": 12, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "in_connectors": { + "IN___tmp_29_37_r_in_from_1_0": null, + "IN___tmp_29_58_r_in_from_1_0": null + }, + "out_connectors": { + "OUT___tmp_29_37_r_in_from_1_0": null, + "OUT___tmp_29_58_r_in_from_1_0": null + }, + "guid": "b2a484ce-0085-4176-bb6a-f1883b41450e" + }, + "id": 7, + "scope_entry": "5", + "scope_exit": "8" + }, + { + "type": "MapExit", + "label": "kernel_26_4_27_8_28[j=0:256]", + "attributes": { + "in_connectors": { + "IN___tmp_29_16_w_out_of_1_1": null + }, + "out_connectors": { + "OUT___tmp_29_16_w_out_of_1_1": null + }, + "guid": "5b163845-29df-4006-bc69-bd20f003db25" + }, + "id": 8, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "Tasklet", + "label": "_Add_", + "attributes": { + "code": { + 
"string_data": "__out = (__in1 + __in2)", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 29, + "end_line": 29, + "start_column": 72, + "end_column": 72, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "label": "_Add_", + "in_connectors": { + "__in1": null, + "__in2": null + }, + "out_connectors": { + "__out": null + }, + "guid": "89e5325f-ec0d-4d3c-9377-cf96b2c7d445" + }, + "id": 9, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "AccessNode", + "label": "__tmp3", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 29, + "end_line": 29, + "start_column": 72, + "end_column": 72, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "data": "__tmp3", + "guid": "f089e4d9-82ab-443f-bd07-97db1bfcdbff" + }, + "id": 10, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "Tasklet", + "label": "assign_29_16", + "attributes": { + "code": { + "string_data": "__out = __inp", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 29, + "end_line": 29, + "start_column": 30, + "end_column": 30, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "label": "assign_29_16", + "in_connectors": { + "__inp": null + }, + "out_connectors": { + "__out": null + }, + "guid": "5d9c843f-085b-4c53-acc3-f8ea40cfdfda" + }, + "id": 11, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "AccessNode", + "label": "A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 32, + "end_line": 32, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" + }, + "data": "A", + "guid": "d53c93ef-ec10-4a01-8b4d-17ad5c6c6010" + }, + "id": 12, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "AccessNode", + "label": "shr_A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 33, + "end_line": 33, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" + }, + "data": "shr_A", + "guid": "ea981031-8002-4cab-be84-affd26c3fe30" + }, + "id": 13, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "AccessNode", + "label": "B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 32, + "end_line": 32, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" + }, + "data": "B", + "guid": "8eb37d53-7ea4-47eb-8082-d375d39320aa" + }, + "id": 14, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "AccessNode", + "label": "shr_B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 33, + "end_line": 33, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" + }, + "data": "shr_B", + "guid": "62cd600a-509d-4362-b577-c1fb48b7d3e6" + }, + "id": 15, + "scope_entry": "0", + "scope_exit": "1" + } + ], + "edges": [ + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": 
"256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "other_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "15aa0bf1-5e41-4e8c-9cbf-dbf3f2807f0b", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "12", + "dst": "13", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "other_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "ebc72b7f-f62d-4e50-96c9-3b61c345593b", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "14", + "dst": "15", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256*ceiling(N/256)", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "31eeb421-3699-4131-bca0-927923ab3a01", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256*ceiling(N/256)" + } + } + }, + "src": "2", + "dst": "0", + "dst_connector": "IN_A", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256*ceiling(N/256)", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "3a919458-c1c6-49b9-a3ab-100f74087649", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256*ceiling(N/256)" + } + } + }, + "src": "3", + "dst": "0", + "dst_connector": "IN_B", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "eb547942-f49e-4344-8601-9ef92363566e", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "13", + "dst": "5", + 
"dst_connector": "IN___tmp_29_37_r_in_from_1_0_in_from_1_0", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", + "debuginfo": null, + "guid": "9b95e40b-23ca-4d63-8698-f72a5e191e9f", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "15", + "dst": "5", + "dst_connector": "IN___tmp_29_58_r_in_from_1_0_in_from_1_0", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp3", + "debuginfo": null, + "guid": "a19e5bc2-904d-4337-9e4b-83d41cfe29d5", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "10", + "dst": "11", + "dst_connector": "__inp", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "d1281909-b405-44f5-a31e-1f6d3d656aee", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "0", + "dst": "12", + "dst_connector": null, + "src_connector": "OUT_A" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "d6c0b98c-3ce7-40bd-ba2e-94ac0446b651", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "0", + "dst": "14", + "dst_connector": null, + "src_connector": "OUT_B" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256*ceiling(N/256)", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "7eb38a56-2770-46fc-81df-c0c386363b73", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "256*ceiling(N/256)" + } + } + }, + "src": "1", + "dst": "4", + "dst_connector": null, + "src_connector": "OUT_C" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 
255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "aaf84bab-0c41-4b0d-8886-da40683baa16", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "256" + } + } + }, + "src": "8", + "dst": "6", + "dst_connector": "IN___tmp_29_16_w_out_of_1_1_out_of_1_1", + "src_connector": "OUT___tmp_29_16_w_out_of_1_1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "4b2bd584-3030-4cfa-b4e3-cbd9263a1924", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "256" + } + } + }, + "src": "6", + "dst": "1", + "dst_connector": "IN_C", + "src_connector": "OUT___tmp_29_16_w_out_of_1_1_out_of_1_1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "j + 256*k", + "end": "j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "6004ab2e-675e-4b25-8f4c-ac9bb8ebfec3", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "j + 256*k", + "end": "j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "7", + "dst": "9", + "dst_connector": "__in1", + "src_connector": "OUT___tmp_29_37_r_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "256*k", + "end": "256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "e9fcd9dd-119f-49b8-92f8-35dae9b73999", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "256*k", + "end": "256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "5", + "dst": "7", + "dst_connector": "IN___tmp_29_37_r_in_from_1_0", + "src_connector": "OUT___tmp_29_37_r_in_from_1_0_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "j + 256*k", + "end": "j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", + "debuginfo": null, + "guid": "1e6fa9ca-2a9e-4dfc-826c-868c214ac387", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "j + 256*k", + "end": "j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "7", + "dst": "9", + "dst_connector": "__in2", + "src_connector": "OUT___tmp_29_58_r_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "256*k", + "end": "256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", 
+ "debuginfo": null, + "guid": "98d23be2-1bf1-484e-85dc-a408af3c4acd", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "256*k", + "end": "256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "5", + "dst": "7", + "dst_connector": "IN___tmp_29_58_r_in_from_1_0", + "src_connector": "OUT___tmp_29_58_r_in_from_1_0_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp3", + "debuginfo": null, + "guid": "805ec629-7f1a-4533-9488-81aa404fef26", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "9", + "dst": "10", + "dst_connector": null, + "src_connector": "__out" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i + j + 256*k", + "end": "i + j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "5c3be07e-084d-45b5-95a2-39985435aa51", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i + j + 256*k", + "end": "i + j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "11", + "dst": "8", + "dst_connector": "IN___tmp_29_16_w_out_of_1_1", + "src_connector": "__out" + } + ], + "attributes": { + "guid": "1958e62b-83b6-42ac-9a59-bb49bb2433f4", + "executions": "1", + "dynamic_executions": false + } + } + ], + "edges": [], + "collapsed": false, + "label": "", + "id": null, + "cfg_list_id": 0, + "start_block": 0, + "dace_version": "1.0.0" +} \ No newline at end of file diff --git a/berkay_workpace/scratch/yakups_examples/weird_shared_to_shared_copy.sdfg b/berkay_workpace/scratch/yakups_examples/weird_shared_to_shared_copy.sdfg new file mode 100644 index 0000000000..b4b0bc8300 --- /dev/null +++ b/berkay_workpace/scratch/yakups_examples/weird_shared_to_shared_copy.sdfg @@ -0,0 +1,896 @@ +{ + "type": "SDFG", + "attributes": { + "name": "kernel", + "arg_names": [ + "A", + "B", + "C" + ], + "_arrays": { + "A": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "B": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "C": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "__tmp1": { + "type": "Scalar", + "attributes": { + "dtype": "float64", + "shape": [ + "1" + ], + "transient": true, + "debuginfo": null + } + }, + "shr_A": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "32", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ 
+ "32" + ], + "transient": true, + "storage": "GPU_Shared", + "debuginfo": null + } + }, + "shr_B": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "32", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "32" + ], + "transient": true, + "storage": "GPU_Shared", + "debuginfo": null + } + } + }, + "symbols": { + "N": "int64" + }, + "global_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "init_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "exit_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 95, + "end_line": 102, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "using_explicit_control_flow": true, + "guid": "6f84fc47-c268-49be-b4bf-b3db5d8f4afc", + "hash": "02c373fd95a88d2386204512c0f24ede7d6118e3675ac1cf6abf4aef43326074" + }, + "nodes": [ + { + "type": "SDFGState", + "label": "MapState", + "id": 0, + "collapsed": false, + "scope_dict": { + "-1": [ + 0, + 2, + 3, + 4 + ], + "0": [ + 1, + 5, + 6, + 7, + 8, + 9 + ] + }, + "nodes": [ + { + "type": "MapEntry", + "label": "kernel_101[i=0:N]", + "attributes": { + "label": "kernel_101", + "params": [ + "i" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "schedule": "GPU_Device", + "debuginfo": { + "type": "DebugInfo", + "start_line": 101, + "end_line": 101, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "in_connectors": { + "IN_A": null, + "IN_B": null + }, + "out_connectors": { + "OUT_A": null, + "OUT_B": null + }, + "guid": "65bd4c66-c8b1-4bb0-a886-38f7742d80d8" + }, + "id": 0, + "scope_entry": null, + "scope_exit": "1" + }, + { + "type": "MapExit", + "label": "kernel_101[i=0:N]", + "attributes": { + "in_connectors": { + "IN_C": null + }, + "out_connectors": { + "OUT_C": null + }, + "guid": "a4aeac5c-af3c-4250-bdae-6a9c6eed9d0c" + }, + "id": 1, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "AccessNode", + "label": "A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 101, + "end_line": 101, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "data": "A", + "guid": "0f4fdd7b-2487-4972-be44-c65f2e9706a2" + }, + "id": 2, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 101, + "end_line": 101, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "data": "B", + "guid": "0d49a9f9-f91e-43ad-98b0-564a96450129" + }, + "id": 3, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "C", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 101, + "end_line": 101, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "data": "C", + "guid": 
"bfea091e-eac1-4556-835f-a8541497b4fb" + }, + "id": 4, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "Tasklet", + "label": "_Add_", + "attributes": { + "code": { + "string_data": "__out = (__in1 + __in2)", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 102, + "end_line": 102, + "start_column": 32, + "end_column": 32, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "label": "_Add_", + "in_connectors": { + "__in1": null, + "__in2": null + }, + "out_connectors": { + "__out": null + }, + "guid": "f88baeb1-b915-4c5e-bffa-7ba8c5ecc856" + }, + "id": 5, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "AccessNode", + "label": "__tmp1", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 102, + "end_line": 102, + "start_column": 32, + "end_column": 32, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "data": "__tmp1", + "guid": "6dffddb3-1a08-41da-ba88-077ab159c935" + }, + "id": 6, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "Tasklet", + "label": "assign_102_16", + "attributes": { + "code": { + "string_data": "__out = __inp", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 102, + "end_line": 102, + "start_column": 18, + "end_column": 18, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" + }, + "label": "assign_102_16", + "in_connectors": { + "__inp": null + }, + "out_connectors": { + "__out": null + }, + "guid": "749cbeda-09f4-4238-9e12-034c49b8e6df" + }, + "id": 7, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "AccessNode", + "label": "shr_A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 117, + "end_line": 117, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" + }, + "data": "shr_A", + "guid": "b6be5ff8-a202-495d-acf9-436948e970c1" + }, + "id": 8, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "AccessNode", + "label": "shr_B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 117, + "end_line": 117, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" + }, + "data": "shr_B", + "guid": "b7ba403f-3139-4e7a-974f-44f41106ae37" + }, + "id": 9, + "scope_entry": "0", + "scope_exit": "1" + } + ], + "edges": [ + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "N", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "2368aa63-cb9d-4289-b1de-558cbc048fd4", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "N" + } + } + }, + "src": "2", + "dst": "0", + "dst_connector": "IN_A", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "N", + "subset": { + "type": "Range", + "ranges": [ + { + 
"start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "7abc4434-9e30-4369-aad3-d1f6b7c8443f", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "N" + } + } + }, + "src": "3", + "dst": "0", + "dst_connector": "IN_B", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(i, 32)", + "end": "Mod(i, 32)", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "9e7599f9-608f-4c22-ba63-e13fa392c2a3", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(i, 32)", + "end": "Mod(i, 32)", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "8", + "dst": "5", + "dst_connector": "__in1", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(i, 32)", + "end": "Mod(i, 32)", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", + "debuginfo": null, + "guid": "81574450-25db-4cd3-b433-d1e1ff842de8", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(i, 32)", + "end": "Mod(i, 32)", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "9", + "dst": "5", + "dst_connector": "__in2", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp1", + "debuginfo": null, + "guid": "400ec4c5-929d-4d54-8a49-0c699b2611dd", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "6", + "dst": "7", + "dst_connector": "__inp", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + } + ] + }, + "other_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(i, 32)", + "end": "Mod(i, 32)", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "673015a0-2f78-45a9-a1a9-1030396e4f10", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(i, 32)", + "end": "Mod(i, 32)", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "0", + "dst": "8", + "dst_connector": null, + "src_connector": "OUT_A" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + } + ] + }, + 
"other_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(i, 32)", + "end": "Mod(i, 32)", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "b4750339-ce40-4ea4-a6ad-5e89c9860a1b", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(i, 32)", + "end": "Mod(i, 32)", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "0", + "dst": "9", + "dst_connector": null, + "src_connector": "OUT_B" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "N", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "5322f7ea-bfea-4063-9d7b-9b25e05d480c", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "N" + } + } + }, + "src": "1", + "dst": "4", + "dst_connector": null, + "src_connector": "OUT_C" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp1", + "debuginfo": null, + "guid": "2239ae7c-60ae-43dd-b95e-4f4534646662", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "5", + "dst": "6", + "dst_connector": null, + "src_connector": "__out" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "c9b5e42d-2483-40f5-a125-a1693a6687b0", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "7", + "dst": "1", + "dst_connector": "IN_C", + "src_connector": "__out" + } + ], + "attributes": { + "guid": "3b24f0bd-0925-4793-bb62-44efcd062222", + "executions": "1", + "dynamic_executions": false + } + } + ], + "edges": [], + "collapsed": false, + "label": "", + "id": null, + "cfg_list_id": 0, + "start_block": 0, + "dace_version": "1.0.0" +} \ No newline at end of file diff --git a/berkay_workpace/tests/smem_tests/gemm_test.py b/berkay_workpace/tests/smem_tests/gemm_test.py new file mode 100644 index 0000000000..a42afc5f14 --- /dev/null +++ b/berkay_workpace/tests/smem_tests/gemm_test.py @@ -0,0 +1,32 @@ +import dace +from dace import dtypes + +import cupy as cp +import pytest +import os + + +@pytest.mark.gpu +def test_gemm(): + """ + Advanced test: Checks shared memory synchronization and numerical correctness + of a GEMM SDFG using 2D block tiling with custom copy. 
+    """
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    sdfg_path = os.path.join(current_dir, '../../scratch/yakups_examples/2d_blocktiled_gemm_with_custom_copy.sdfg')
+    sdfg = dace.SDFG.from_file(sdfg_path)
+
+    m, n, k = 1024, 1024, 1024
+    A = cp.random.rand(m, k).astype(cp.float32)
+    B = cp.random.rand(k, n).astype(cp.float32)
+    C = cp.random.rand(m, n).astype(cp.float32)
+
+    # Count __syncthreads(); calls across all generated files
+    generated_code = sdfg.generate_code()
+    nr_sync_barriers = sum(f.clean_code.count("__syncthreads();") for f in generated_code)
+    assert nr_sync_barriers == 2, f"Expected exactly 2 '__syncthreads();' calls, but found {nr_sync_barriers}"
+
+    # Compute expected result
+    expected = A @ B
+    sdfg(A=A, B=B, C=C, M=m, N=n, K=k)
+    cp.testing.assert_allclose(C, expected, atol=0.001, err_msg="Mismatch: unexpected GEMM result")
diff --git a/berkay_workpace/tests/smem_tests/special_sync_pass_test.py b/berkay_workpace/tests/smem_tests/special_sync_pass_test.py
new file mode 100644
index 0000000000..5338933f4b
--- /dev/null
+++ b/berkay_workpace/tests/smem_tests/special_sync_pass_test.py
@@ -0,0 +1,37 @@
+import dace
+from dace import dtypes
+
+import cupy as cp
+import pytest
+import os
+
+
+@pytest.mark.gpu
+def test_correctness_and_reuse():
+    """
+    Only one synchronization barrier should be here (other tests already verify
+    that at the end of this sequential map there is no synchronization, because
+    the range has size 1). This test essentially shows that we reuse the sync tasklet
+    (which is more optimal) by checking that only one such barrier is in the generated code
+    (we also check correctness, which is however not interesting here since threads only access
+    smem locations which they also write to, so synchronization is not strictly needed here).
+    """
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    sdfg_path = os.path.join(current_dir, '../../scratch/yakups_examples/nice_global_to_shared_copy.sdfg')
+    sdfg = dace.SDFG.from_file(sdfg_path)
+
+    size = 512
+    a = cp.random.rand(size, dtype=cp.float64)
+    b = cp.random.rand(size, dtype=cp.float64)
+    c = cp.zeros((size,), dtype=cp.float64)
+
+    # count that there is only one __syncthreads(); call. 
You can also inspect the final SDFG in the cache for that + generated_code = sdfg.generate_code()[1].clean_code + nr_sync_barriers = generated_code.count("__syncthreads();") + + assert nr_sync_barriers == 1, f"expected only 1 '__syncthreads(); call, but got '{nr_sync_barriers}" + + # Check whether result is correctly computed + expected_res = a + b + sdfg(A=a, B=b, C=c, N=size) + cp.testing.assert_allclose(c, expected_res, err_msg="Mismatch: Not expected result") From 409971cb6914b9d471691541fbbc6b9401575d85 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Sun, 22 Jun 2025 16:57:44 +0200 Subject: [PATCH 30/94] yapkups sdfg examples, stored for inspection and testing for later --- .../dbuff_related/double_buffering_async.sdfg | 4821 +++++++++++++++++ .../original_sdfg_with_shared_memory.sdfg | 1278 +++++ .../2d_blocktiled_gemm_with_custom_copy.sdfg | 0 .../{ => smem_related}/generate_sdfgs.ipynb | 0 .../nice_global_to_shared_copy.sdfg | 0 .../weird_global_to_global.sdfg | 0 .../weird_shared_to_shared_copy.sdfg | 0 7 files changed, 6099 insertions(+) create mode 100644 berkay_workpace/scratch/yakups_examples/dbuff_related/double_buffering_async.sdfg create mode 100644 berkay_workpace/scratch/yakups_examples/dbuff_related/original_sdfg_with_shared_memory.sdfg rename berkay_workpace/scratch/yakups_examples/{ => smem_related}/2d_blocktiled_gemm_with_custom_copy.sdfg (100%) rename berkay_workpace/scratch/yakups_examples/{ => smem_related}/generate_sdfgs.ipynb (100%) rename berkay_workpace/scratch/yakups_examples/{ => smem_related}/nice_global_to_shared_copy.sdfg (100%) rename berkay_workpace/scratch/yakups_examples/{ => smem_related}/weird_global_to_global.sdfg (100%) rename berkay_workpace/scratch/yakups_examples/{ => smem_related}/weird_shared_to_shared_copy.sdfg (100%) diff --git a/berkay_workpace/scratch/yakups_examples/dbuff_related/double_buffering_async.sdfg b/berkay_workpace/scratch/yakups_examples/dbuff_related/double_buffering_async.sdfg new file mode 100644 index 0000000000..12fe140292 --- /dev/null +++ b/berkay_workpace/scratch/yakups_examples/dbuff_related/double_buffering_async.sdfg @@ -0,0 +1,4821 @@ +{ + "type": "SDFG", + "attributes": { + "name": "kernel_double_buffered_async", + "arg_names": [ + "A", + "B", + "C" + ], + "_arrays": { + "A": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "B": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "C": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "__tmp3": { + "type": "Scalar", + "attributes": { + "dtype": "float64", + "shape": [ + "1" + ], + "transient": true, + "debuginfo": null + } + }, + "shr_B": { + "type": "Array", + "attributes": { + "strides": [ + "256", + "1" + ], + "total_size": "512", + "offset": [ + "0", + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "2", + "256" + ], + "transient": true, + "storage": "GPU_Shared", + "debuginfo": null + } + }, + "shr_A": { + "type": "Array", + "attributes": { + "strides": [ + "256", + "1" + ], + "total_size": "512", 
+ "offset": [ + "0", + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "2", + "256" + ], + "transient": true, + "storage": "GPU_Shared", + "debuginfo": null + } + } + }, + "symbols": { + "N": "int64" + }, + "global_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "init_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "exit_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "orig_sdfg": { + "type": "SDFG", + "attributes": { + "name": "kernel_double_buffered_async", + "arg_names": [ + "A", + "B", + "C" + ], + "_arrays": { + "A": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "B": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "C": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "__tmp3": { + "type": "Scalar", + "attributes": { + "dtype": "float64", + "shape": [ + "1" + ], + "transient": true, + "debuginfo": null + } + }, + "shr_A": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "256", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "256" + ], + "transient": true, + "storage": "GPU_Shared", + "debuginfo": null + } + }, + "shr_B": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "256", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "256" + ], + "transient": true, + "storage": "GPU_Shared", + "debuginfo": null + } + } + }, + "symbols": { + "N": "int64" + }, + "global_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "init_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "exit_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 20, + "end_line": 29, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "using_explicit_control_flow": true, + "guid": "b1393913-a60f-4a10-a006-50ad9ec459e3" + }, + "nodes": [ + { + "type": "SDFGState", + "label": "MapState", + "id": 0, + "collapsed": false, + "scope_dict": { + "-1": [ + 0, + 2, + 3, + 4 + ], + "0": [ + 1, + 5 + ], + "5": [ + 6, + 7, + 12, + 13 + ], + "7": [ + 8, + 9, + 10, + 11 + ] + }, + "nodes": [ + { + "type": "MapEntry", + "label": "kernel_26[i=0:N:512]", + "attributes": { + "label": "kernel_26", + "params": [ + "i" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "512", + "tile": "1" + } + ] + }, + "schedule": "GPU_Device", + "debuginfo": { + "type": "DebugInfo", + "start_line": 26, + "end_line": 26, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "in_connectors": { + "IN_A": null, + "IN_B": null + }, + "out_connectors": { 
+ "OUT_A": null, + "OUT_B": null + }, + "guid": "e572193f-4078-4079-8a7b-f9e60c00c3f9" + }, + "id": 0, + "scope_entry": null, + "scope_exit": "1" + }, + { + "type": "MapExit", + "label": "kernel_26[i=0:N:512]", + "attributes": { + "in_connectors": { + "IN_C": null + }, + "out_connectors": { + "OUT_C": null + }, + "guid": "2692fa6e-5d5b-4152-8604-77292eca079e" + }, + "id": 1, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "AccessNode", + "label": "A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 26, + "end_line": 26, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "data": "A", + "guid": "ab74a37e-b46f-4c4f-a769-f5c7a748410d" + }, + "id": 2, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 26, + "end_line": 26, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "data": "B", + "guid": "61eb4a00-468b-4b1f-860d-09ce3d6d359e" + }, + "id": 3, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "C", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 26, + "end_line": 26, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "data": "C", + "guid": "f166467e-40a6-46e0-9a94-d19c7e8c9b13" + }, + "id": 4, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "MapEntry", + "label": "kernel_26_4_27[k=0:2]", + "attributes": { + "label": "kernel_26_4_27", + "params": [ + "k" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "1", + "step": "1", + "tile": "1" + } + ] + }, + "schedule": "Sequential", + "debuginfo": { + "type": "DebugInfo", + "start_line": 27, + "end_line": 27, + "start_column": 8, + "end_column": 8, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "in_connectors": { + "IN___tmp_29_37_r_in_from_1_0_in_from_1_0": null, + "IN___tmp_29_58_r_in_from_1_0_in_from_1_0": null + }, + "out_connectors": { + "OUT___tmp_29_37_r_in_from_1_0_in_from_1_0": null, + "OUT___tmp_29_58_r_in_from_1_0_in_from_1_0": null + }, + "guid": "148ec12e-ac47-4a5e-bc34-836556a4bb1d" + }, + "id": 5, + "scope_entry": "0", + "scope_exit": "6" + }, + { + "type": "MapExit", + "label": "kernel_26_4_27[k=0:2]", + "attributes": { + "in_connectors": { + "IN___tmp_29_16_w_out_of_1_1_out_of_1_1": null + }, + "out_connectors": { + "OUT___tmp_29_16_w_out_of_1_1_out_of_1_1": null + }, + "guid": "525be732-b8e8-4067-a15d-9ad9a5ca1096" + }, + "id": 6, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "MapEntry", + "label": "kernel_26_4_27_8_28[j=0:256]", + "attributes": { + "label": "kernel_26_4_27_8_28", + "params": [ + "j" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "schedule": "GPU_ThreadBlock", + "debuginfo": { + "type": "DebugInfo", + "start_line": 28, + "end_line": 28, + "start_column": 12, + "end_column": 12, + "filename": 
"/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "in_connectors": { + "IN___tmp_29_37_r_in_from_1_0": null, + "IN___tmp_29_58_r_in_from_1_0": null + }, + "out_connectors": { + "OUT___tmp_29_37_r_in_from_1_0": null, + "OUT___tmp_29_58_r_in_from_1_0": null + }, + "guid": "bf2fc620-040a-48b3-be54-f9d9b1997e41" + }, + "id": 7, + "scope_entry": "5", + "scope_exit": "8" + }, + { + "type": "MapExit", + "label": "kernel_26_4_27_8_28[j=0:256]", + "attributes": { + "in_connectors": { + "IN___tmp_29_16_w_out_of_1_1": null + }, + "out_connectors": { + "OUT___tmp_29_16_w_out_of_1_1": null + }, + "guid": "db988a0e-a1c1-45a3-9905-32dc050c6b76" + }, + "id": 8, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "Tasklet", + "label": "_Add_", + "attributes": { + "code": { + "string_data": "__out = (__in1 + __in2)", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 29, + "end_line": 29, + "start_column": 72, + "end_column": 72, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "label": "_Add_", + "in_connectors": { + "__in1": null, + "__in2": null + }, + "out_connectors": { + "__out": null + }, + "guid": "86df7814-ffdc-41a0-afe1-f211ae009e6a" + }, + "id": 9, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "AccessNode", + "label": "__tmp3", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 29, + "end_line": 29, + "start_column": 72, + "end_column": 72, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "data": "__tmp3", + "guid": "5e5592d0-1ffe-4ddb-9788-b3b72aecae6a" + }, + "id": 10, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "Tasklet", + "label": "assign_29_16", + "attributes": { + "code": { + "string_data": "__out = __inp", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 29, + "end_line": 29, + "start_column": 30, + "end_column": 30, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "label": "assign_29_16", + "in_connectors": { + "__inp": null + }, + "out_connectors": { + "__out": null + }, + "guid": "ee7b20f7-fbc2-49a9-8fba-ccffff2b7780" + }, + "id": 11, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "AccessNode", + "label": "shr_A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 48, + "end_line": 48, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" + }, + "data": "shr_A", + "guid": "4905f521-649a-4796-9472-fad5dd9602a1" + }, + "id": 12, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "AccessNode", + "label": "shr_B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 48, + "end_line": 48, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" + }, + "data": "shr_B", + "guid": "5aea9089-f43c-4272-8cf1-80301f87e5f5" + }, + "id": 13, + "scope_entry": "5", + "scope_exit": "6" + } + ], + "edges": [ + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + 
"attributes": { + "volume": "512*ceiling(N/512)", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "69b08c61-8a53-4825-bd28-3ce6d2fa75f7", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "512*ceiling(N/512)" + } + } + }, + "src": "2", + "dst": "0", + "dst_connector": "IN_A", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512*ceiling(N/512)", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "0185e810-4989-4e8b-b57c-1c12f5647a48", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "512*ceiling(N/512)" + } + } + }, + "src": "3", + "dst": "0", + "dst_connector": "IN_B", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "9053c547-1a07-4e28-9a07-b5866a347a2b", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "12", + "dst": "7", + "dst_connector": "IN___tmp_29_37_r_in_from_1_0", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", + "debuginfo": null, + "guid": "aa3712fc-e871-41e5-ba81-47436eb090a9", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "13", + "dst": "7", + "dst_connector": "IN___tmp_29_58_r_in_from_1_0", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp3", + "debuginfo": null, + "guid": "232a4b3c-63de-4475-a777-63057a798de3", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "10", + "dst": "11", + "dst_connector": "__inp", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "f2fca7c5-eb37-41b5-aa10-1698132a8f8b", + "src_subset": { + "type": 
"Range", + "ranges": [ + { + "start": "i", + "end": "i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "512" + } + } + }, + "src": "0", + "dst": "5", + "dst_connector": "IN___tmp_29_37_r_in_from_1_0_in_from_1_0", + "src_connector": "OUT_A" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "76a9bb89-c68f-4161-8fc0-5d272cb46c5d", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "512" + } + } + }, + "src": "0", + "dst": "5", + "dst_connector": "IN___tmp_29_58_r_in_from_1_0_in_from_1_0", + "src_connector": "OUT_B" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512*ceiling(N/512)", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "1dd8cf39-3f8a-4419-8403-80c1c0393bfb", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "512*ceiling(N/512)" + } + } + }, + "src": "1", + "dst": "4", + "dst_connector": null, + "src_connector": "OUT_C" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "59d4f720-d116-4bb0-be9d-dbee7b4461e5", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "256" + } + } + }, + "src": "8", + "dst": "6", + "dst_connector": "IN___tmp_29_16_w_out_of_1_1_out_of_1_1", + "src_connector": "OUT___tmp_29_16_w_out_of_1_1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "57a2e327-a170-434b-94b1-784054dc4fae", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "512" + } + } + }, + "src": "6", + "dst": "1", + "dst_connector": "IN_C", + "src_connector": "OUT___tmp_29_16_w_out_of_1_1_out_of_1_1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "e1e070d6-e51e-46a7-aa37-6cd892324359", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + 
"is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "7", + "dst": "9", + "dst_connector": "__in1", + "src_connector": "OUT___tmp_29_37_r_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "other_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "1fbf8afd-549b-4b0d-9a0a-64fd318d6034", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "5", + "dst": "12", + "dst_connector": null, + "src_connector": "OUT___tmp_29_37_r_in_from_1_0_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", + "debuginfo": null, + "guid": "57ba67c5-bd2f-4844-9e9c-40753922ad01", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "7", + "dst": "9", + "dst_connector": "__in2", + "src_connector": "OUT___tmp_29_58_r_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "other_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "bfb890ef-cdef-4ebf-8995-f3754b0953bd", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "5", + "dst": "13", + "dst_connector": null, + "src_connector": "OUT___tmp_29_58_r_in_from_1_0_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp3", + "debuginfo": null, + "guid": "9e39802d-f185-4f97-aa0d-24ec5e28fa3a", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "9", + "dst": "10", + "dst_connector": null, + "src_connector": "__out" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i + j + 256*k", + "end": "i + 
j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "cd431e0e-63f8-458d-a4f7-82a85dc09e96", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i + j + 256*k", + "end": "i + j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "11", + "dst": "8", + "dst_connector": "IN___tmp_29_16_w_out_of_1_1", + "src_connector": "__out" + } + ], + "attributes": { + "guid": "328decff-fd21-4fff-881f-74fc87b42fa7", + "executions": "1", + "dynamic_executions": false + } + } + ], + "edges": [], + "collapsed": false, + "label": "", + "id": null, + "cfg_list_id": 0, + "start_block": 0, + "dace_version": "1.0.0" + }, + "transformation_hist": [ + { + "type": "PatternTransformation", + "transformation": "MultipleBuffering", + "prefill_cfg_id": 1, + "prefetch_cfg_id": 1, + "synchronous": false, + "_subgraph": { + "0": 0 + } + } + ], + "debuginfo": { + "type": "DebugInfo", + "start_line": 20, + "end_line": 29, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "using_explicit_control_flow": true, + "guid": "b1393913-a60f-4a10-a006-50ad9ec459e3", + "hash": "986f8867fdf3dcbeb3b07b32b05c3af6ad04a3b67e786fb203ee054b1d2dbd97" + }, + "nodes": [ + { + "type": "SDFGState", + "label": "MapState", + "id": 0, + "collapsed": false, + "scope_dict": { + "-1": [ + 0, + 2, + 3, + 4 + ], + "0": [ + 1, + 5, + 12, + 13, + 14 + ], + "5": [ + 6, + 7, + 15, + 16, + 17, + 18, + 19, + 20 + ], + "7": [ + 8, + 9, + 10, + 11 + ] + }, + "nodes": [ + { + "type": "MapEntry", + "label": "kernel_26[i=0:N:512]", + "attributes": { + "label": "kernel_26", + "params": [ + "i" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "512", + "tile": "1" + } + ] + }, + "schedule": "GPU_Device", + "debuginfo": { + "type": "DebugInfo", + "start_line": 26, + "end_line": 26, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "in_connectors": { + "IN_A": null, + "IN_B": null + }, + "out_connectors": { + "OUT_A": null, + "OUT_B": null + }, + "guid": "e572193f-4078-4079-8a7b-f9e60c00c3f9" + }, + "id": 0, + "scope_entry": null, + "scope_exit": "1" + }, + { + "type": "MapExit", + "label": "kernel_26[i=0:N:512]", + "attributes": { + "in_connectors": { + "IN_C": null + }, + "out_connectors": { + "OUT_C": null + }, + "guid": "2692fa6e-5d5b-4152-8604-77292eca079e" + }, + "id": 1, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "AccessNode", + "label": "A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 26, + "end_line": 26, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "data": "A", + "guid": "0af74d98-4cac-44d4-afd5-dcbb63f357fd" + }, + "id": 2, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 26, + "end_line": 26, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "data": 
"B", + "guid": "43f7746b-9786-42ce-a43c-8b0489e0f7a6" + }, + "id": 3, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "C", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 26, + "end_line": 26, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "data": "C", + "guid": "dd76042d-85d3-47dc-a981-a26bd1c75088" + }, + "id": 4, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "MapEntry", + "label": "kernel_26_4_27[k=0:2]", + "attributes": { + "label": "kernel_26_4_27", + "params": [ + "k" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "1", + "step": "1", + "tile": "1" + } + ] + }, + "schedule": "Sequential", + "debuginfo": { + "type": "DebugInfo", + "start_line": 27, + "end_line": 27, + "start_column": 8, + "end_column": 8, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "in_connectors": { + "IN___tmp_29_37_r_in_from_1_0_in_from_1_0": null, + "IN___tmp_29_58_r_in_from_1_0_in_from_1_0": null, + "IN_prefetch_A": null, + "IN_prefetch_B": null + }, + "out_connectors": { + "OUT___tmp_29_37_r_in_from_1_0_in_from_1_0": null, + "OUT___tmp_29_58_r_in_from_1_0_in_from_1_0": null, + "OUT_prefetch_A": null, + "OUT_prefetch_B": null + }, + "guid": "148ec12e-ac47-4a5e-bc34-836556a4bb1d" + }, + "id": 5, + "scope_entry": "0", + "scope_exit": "6" + }, + { + "type": "MapExit", + "label": "kernel_26_4_27[k=0:2]", + "attributes": { + "in_connectors": { + "IN___tmp_29_16_w_out_of_1_1_out_of_1_1": null + }, + "out_connectors": { + "OUT___tmp_29_16_w_out_of_1_1_out_of_1_1": null + }, + "guid": "525be732-b8e8-4067-a15d-9ad9a5ca1096" + }, + "id": 6, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "MapEntry", + "label": "kernel_26_4_27_8_28[j=0:256]", + "attributes": { + "label": "kernel_26_4_27_8_28", + "params": [ + "j" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "schedule": "GPU_ThreadBlock", + "debuginfo": { + "type": "DebugInfo", + "start_line": 28, + "end_line": 28, + "start_column": 12, + "end_column": 12, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "in_connectors": { + "IN___tmp_29_37_r_in_from_1_0": null, + "IN___tmp_29_58_r_in_from_1_0": null, + "prefetch_shr_A": null, + "prefetch_shr_B": null + }, + "out_connectors": { + "OUT___tmp_29_37_r_in_from_1_0": null, + "OUT___tmp_29_58_r_in_from_1_0": null + }, + "guid": "bf2fc620-040a-48b3-be54-f9d9b1997e41" + }, + "id": 7, + "scope_entry": "5", + "scope_exit": "8" + }, + { + "type": "MapExit", + "label": "kernel_26_4_27_8_28[j=0:256]", + "attributes": { + "in_connectors": { + "IN___tmp_29_16_w_out_of_1_1": null + }, + "out_connectors": { + "OUT___tmp_29_16_w_out_of_1_1": null + }, + "guid": "db988a0e-a1c1-45a3-9905-32dc050c6b76" + }, + "id": 8, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "Tasklet", + "label": "_Add_", + "attributes": { + "code": { + "string_data": "__out = (__in1 + __in2)", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 29, + "end_line": 29, + "start_column": 72, + "end_column": 72, + "filename": 
"/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "label": "_Add_", + "in_connectors": { + "__in1": null, + "__in2": null + }, + "out_connectors": { + "__out": null + }, + "guid": "86df7814-ffdc-41a0-afe1-f211ae009e6a" + }, + "id": 9, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "AccessNode", + "label": "__tmp3", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 29, + "end_line": 29, + "start_column": 72, + "end_column": 72, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "data": "__tmp3", + "guid": "189ae714-2453-4033-9ea3-497068c70521" + }, + "id": 10, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "Tasklet", + "label": "assign_29_16", + "attributes": { + "code": { + "string_data": "__out = __inp", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 29, + "end_line": 29, + "start_column": 30, + "end_column": 30, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "label": "assign_29_16", + "in_connectors": { + "__inp": null + }, + "out_connectors": { + "__out": null + }, + "guid": "ee7b20f7-fbc2-49a9-8fba-ccffff2b7780" + }, + "id": 11, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "NestedSDFG", + "label": "pipeline_prefill_nsdfg_0", + "attributes": { + "sdfg": { + "type": "SDFG", + "attributes": { + "name": "pipeline_prefill_main_sdfg_0", + "_arrays": { + "B": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "shr_B": { + "type": "Array", + "attributes": { + "strides": [ + "256", + "1" + ], + "total_size": "512", + "offset": [ + "0", + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "2", + "256" + ], + "storage": "GPU_Shared", + "debuginfo": null + } + }, + "A": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "shr_A": { + "type": "Array", + "attributes": { + "strides": [ + "256", + "1" + ], + "total_size": "512", + "offset": [ + "0", + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "2", + "256" + ], + "storage": "GPU_Shared", + "debuginfo": null + } + } + }, + "symbols": { + "N": "int64", + "i": "int64" + }, + "global_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "init_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "exit_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "guid": "eea04031-8543-47bf-8b5a-f84b21ebbe6f" + }, + "nodes": [ + { + "type": "LoopRegion", + "attributes": { + "update_statement": { + "string_data": "pipe_stage = (pipe_stage + 1)", + "language": "Python" + }, + "init_statement": { + "string_data": "pipe_stage = 0", + "language": "Python" + }, + "loop_condition": { + "string_data": "(pipe_stage < 1)", + "language": "Python" + }, + "loop_variable": "pipe_stage", + "guid": "5bc4de60-cb97-4474-81ee-2eb518eef744" + }, + "nodes": [ + { + "type": "ConditionalBlock", + "attributes": { + "guid": 
"03025064-bdb3-43db-909c-3488ec9b7504" + }, + "nodes": [ + { + "type": "ControlFlowRegion", + "attributes": { + "guid": "05b5e995-af61-4e76-854c-081bc25f7a42" + }, + "nodes": [ + { + "type": "SDFGState", + "label": "pipeline_prefill_state_0", + "id": 0, + "collapsed": false, + "scope_dict": { + "-1": [ + 0, + 1, + 2, + 3 + ] + }, + "nodes": [ + { + "type": "AccessNode", + "label": "B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 771, + "end_line": 771, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "B", + "guid": "c41027b9-626d-4477-9adb-64d4737d1262" + }, + "id": 0, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "shr_B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 772, + "end_line": 772, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "shr_B", + "guid": "b0787fde-9244-48b5-a20e-aa7eda1dde3b" + }, + "id": 1, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 771, + "end_line": 771, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "A", + "guid": "1b3ebd30-f4f5-4dcd-9d69-268a6198a074" + }, + "id": 2, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "shr_A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 772, + "end_line": 772, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "shr_A", + "guid": "49ff03e3-8c02-4502-8f41-42cfa81acc43" + }, + "id": 3, + "scope_entry": null, + "scope_exit": null + } + ], + "edges": [ + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "256*pipe_stage + i", + "end": "256*pipe_stage + i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "other_subset": { + "type": "Range", + "ranges": [ + { + "start": "pipe_stage", + "end": "pipe_stage", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "f10bf60c-98c6-498b-8bb1-b32392cee7ec", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "256*pipe_stage + i", + "end": "256*pipe_stage + i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "pipe_stage", + "end": "pipe_stage", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "0", + "dst": "1", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "256*pipe_stage + i", + "end": "256*pipe_stage + i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "other_subset": { + "type": "Range", + "ranges": [ + { + "start": "pipe_stage", + "end": "pipe_stage", + "step": "1", + "tile": "1" + }, + { + 
"start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "9289d8e8-28ba-469c-9130-53adce8d0b59", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "256*pipe_stage + i", + "end": "256*pipe_stage + i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "pipe_stage", + "end": "pipe_stage", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "2", + "dst": "3", + "dst_connector": null, + "src_connector": null + } + ], + "attributes": { + "guid": "016184bb-8e43-4101-859a-a2608a20b280" + } + } + ], + "edges": [], + "collapsed": false, + "label": "pipeline_prefill_test_0", + "id": 0, + "cfg_list_id": 4, + "start_block": 0 + } + ], + "edges": [], + "collapsed": false, + "label": "pipeline_prefill_cond_0", + "id": 0, + "cfg_list_id": 3, + "start_block": null, + "branches": [ + [ + { + "string_data": "(pipe_stage <= 1)", + "language": "Python" + }, + { + "type": "ControlFlowRegion", + "attributes": { + "guid": "05b5e995-af61-4e76-854c-081bc25f7a42" + }, + "nodes": [ + { + "type": "SDFGState", + "label": "pipeline_prefill_state_0", + "id": 0, + "collapsed": false, + "scope_dict": { + "-1": [ + 0, + 1, + 2, + 3 + ] + }, + "nodes": [ + { + "type": "AccessNode", + "label": "B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 771, + "end_line": 771, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "B", + "guid": "c41027b9-626d-4477-9adb-64d4737d1262" + }, + "id": 0, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "shr_B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 772, + "end_line": 772, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "shr_B", + "guid": "b0787fde-9244-48b5-a20e-aa7eda1dde3b" + }, + "id": 1, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 771, + "end_line": 771, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "A", + "guid": "1b3ebd30-f4f5-4dcd-9d69-268a6198a074" + }, + "id": 2, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "shr_A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 772, + "end_line": 772, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "shr_A", + "guid": "49ff03e3-8c02-4502-8f41-42cfa81acc43" + }, + "id": 3, + "scope_entry": null, + "scope_exit": null + } + ], + "edges": [ + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "256*pipe_stage + i", + "end": "256*pipe_stage + i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "other_subset": { + "type": "Range", + "ranges": [ + { + "start": "pipe_stage", + "end": "pipe_stage", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + 
"tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "f10bf60c-98c6-498b-8bb1-b32392cee7ec", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "256*pipe_stage + i", + "end": "256*pipe_stage + i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "pipe_stage", + "end": "pipe_stage", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "0", + "dst": "1", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "256*pipe_stage + i", + "end": "256*pipe_stage + i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "other_subset": { + "type": "Range", + "ranges": [ + { + "start": "pipe_stage", + "end": "pipe_stage", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "9289d8e8-28ba-469c-9130-53adce8d0b59", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "256*pipe_stage + i", + "end": "256*pipe_stage + i + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "pipe_stage", + "end": "pipe_stage", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "2", + "dst": "3", + "dst_connector": null, + "src_connector": null + } + ], + "attributes": { + "guid": "016184bb-8e43-4101-859a-a2608a20b280" + } + } + ], + "edges": [], + "collapsed": false, + "label": "pipeline_prefill_test_0", + "id": null, + "cfg_list_id": 4, + "start_block": 0 + } + ] + ] + } + ], + "edges": [], + "collapsed": false, + "label": "pipeline_prefill_loop_0", + "id": 0, + "cfg_list_id": 2, + "start_block": 0 + } + ], + "edges": [], + "collapsed": false, + "label": "", + "id": null, + "cfg_list_id": 1, + "start_block": 0 + }, + "schedule": "Sequential", + "symbol_mapping": { + "N": "N", + "i": "i" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 761, + "end_line": 761, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "label": "pipeline_prefill_nsdfg_0", + "in_connectors": { + "A": null, + "B": null + }, + "out_connectors": { + "shr_A": null, + "shr_B": null + }, + "guid": "431c5863-6cf8-4bff-a7fc-11bea2357e5e" + }, + "id": 12, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "AccessNode", + "label": "shr_B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 398, + "end_line": 398, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "shr_B", + "guid": "9aff2a42-f653-436e-adde-43b71fedb84e" + }, + "id": 13, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "AccessNode", + "label": "shr_A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 398, + "end_line": 398, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "shr_A", + "guid": "d60871ef-9696-4fed-9bac-72fe656f3747" + }, 
+ "id": 14, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "NestedSDFG", + "label": "pipeline_prefetch_nsdfg_0", + "attributes": { + "sdfg": { + "type": "SDFG", + "attributes": { + "name": "pipeline_prefetch_main_sdfg_0", + "_arrays": { + "B": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "shr_B": { + "type": "Array", + "attributes": { + "strides": [ + "256", + "1" + ], + "total_size": "512", + "offset": [ + "0", + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "2", + "256" + ], + "storage": "GPU_Shared", + "debuginfo": null + } + }, + "A": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "shr_A": { + "type": "Array", + "attributes": { + "strides": [ + "256", + "1" + ], + "total_size": "512", + "offset": [ + "0", + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "2", + "256" + ], + "storage": "GPU_Shared", + "debuginfo": null + } + } + }, + "symbols": { + "N": "int64", + "i": "int64", + "k": "int32" + }, + "global_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "init_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "exit_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "guid": "c1a1acfc-7ab9-4f27-af94-09ce0d5f8c8d" + }, + "nodes": [ + { + "type": "ConditionalBlock", + "attributes": { + "guid": "40691bac-7151-4891-984f-8e3a8d3ac974" + }, + "nodes": [ + { + "type": "ControlFlowRegion", + "attributes": { + "guid": "99957ad5-0143-430c-9e3b-6a6d458c2bcb" + }, + "nodes": [ + { + "type": "SDFGState", + "label": "pipeline_prefetch_state_0", + "id": 0, + "collapsed": false, + "scope_dict": { + "-1": [ + 0, + 1, + 2, + 3 + ] + }, + "nodes": [ + { + "type": "AccessNode", + "label": "B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 842, + "end_line": 842, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "B", + "guid": "8629e09d-3256-4746-b257-e1d1633b4d09" + }, + "id": 0, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "shr_B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 843, + "end_line": 843, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "shr_B", + "guid": "19c17ed2-1764-4274-b020-606c61d4b7f8" + }, + "id": 1, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 842, + "end_line": 842, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "A", + "guid": "c231ef74-5202-4f53-a930-10174929ebfc" + }, + "id": 2, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "shr_A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 843, + "end_line": 843, + "start_column": 0, + "end_column": 0, + "filename": 
"/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "shr_A", + "guid": "646deb68-da76-4a92-b937-da96ba475b21" + }, + "id": 3, + "scope_entry": null, + "scope_exit": null + } + ], + "edges": [ + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "256*k + i + 256", + "end": "256*k + i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "other_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k + 1, 2)", + "end": "Mod(k + 1, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "0d43d4bf-add6-433d-9686-67cb71d9dbb0", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "256*k + i + 256", + "end": "256*k + i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k + 1, 2)", + "end": "Mod(k + 1, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "0", + "dst": "1", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "256*k + i + 256", + "end": "256*k + i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "other_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k + 1, 2)", + "end": "Mod(k + 1, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "ea82a854-7399-463e-ad83-d281f8f3e8b3", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "256*k + i + 256", + "end": "256*k + i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k + 1, 2)", + "end": "Mod(k + 1, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "2", + "dst": "3", + "dst_connector": null, + "src_connector": null + } + ], + "attributes": { + "guid": "8ba50dba-4244-4519-86f9-189eb2300202" + } + } + ], + "edges": [], + "collapsed": false, + "label": "pipeline_prefetch_test_0", + "id": 0, + "cfg_list_id": 7, + "start_block": 0 + } + ], + "edges": [], + "collapsed": false, + "label": "pipeline_prefetch_cond_0", + "id": 0, + "cfg_list_id": 6, + "start_block": null, + "branches": [ + [ + { + "string_data": "((k + 1) <= 1)", + "language": "Python" + }, + { + "type": "ControlFlowRegion", + "attributes": { + "guid": "99957ad5-0143-430c-9e3b-6a6d458c2bcb" + }, + "nodes": [ + { + "type": "SDFGState", + "label": "pipeline_prefetch_state_0", + "id": 0, + "collapsed": false, + "scope_dict": { + "-1": [ + 0, + 1, + 2, + 3 + ] + }, + "nodes": [ + { + "type": "AccessNode", + "label": "B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 842, + "end_line": 842, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "B", + "guid": "8629e09d-3256-4746-b257-e1d1633b4d09" + }, + "id": 0, + "scope_entry": null, + 
"scope_exit": null + }, + { + "type": "AccessNode", + "label": "shr_B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 843, + "end_line": 843, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "shr_B", + "guid": "19c17ed2-1764-4274-b020-606c61d4b7f8" + }, + "id": 1, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 842, + "end_line": 842, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "A", + "guid": "c231ef74-5202-4f53-a930-10174929ebfc" + }, + "id": 2, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "shr_A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 843, + "end_line": 843, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "shr_A", + "guid": "646deb68-da76-4a92-b937-da96ba475b21" + }, + "id": 3, + "scope_entry": null, + "scope_exit": null + } + ], + "edges": [ + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "256*k + i + 256", + "end": "256*k + i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "other_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k + 1, 2)", + "end": "Mod(k + 1, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "0d43d4bf-add6-433d-9686-67cb71d9dbb0", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "256*k + i + 256", + "end": "256*k + i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k + 1, 2)", + "end": "Mod(k + 1, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "0", + "dst": "1", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "256*k + i + 256", + "end": "256*k + i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "other_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k + 1, 2)", + "end": "Mod(k + 1, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "ea82a854-7399-463e-ad83-d281f8f3e8b3", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "256*k + i + 256", + "end": "256*k + i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k + 1, 2)", + "end": "Mod(k + 1, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "2", + "dst": "3", + "dst_connector": null, + "src_connector": null + } + ], + "attributes": { + "guid": "8ba50dba-4244-4519-86f9-189eb2300202" 
+ } + } + ], + "edges": [], + "collapsed": false, + "label": "pipeline_prefetch_test_0", + "id": null, + "cfg_list_id": 7, + "start_block": 0 + } + ] + ] + } + ], + "edges": [], + "collapsed": false, + "label": "", + "id": null, + "cfg_list_id": 5, + "start_block": 0 + }, + "schedule": "Sequential", + "symbol_mapping": { + "N": "N", + "i": "i", + "k": "k" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 832, + "end_line": 832, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "label": "pipeline_prefetch_nsdfg_0", + "in_connectors": { + "A": null, + "B": null + }, + "out_connectors": { + "shr_A": null, + "shr_B": null + }, + "guid": "457b9063-b1ef-47fa-ac64-2c212ab5d96c" + }, + "id": 15, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "AccessNode", + "label": "shr_B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 496, + "end_line": 496, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "shr_B", + "guid": "e371ae13-b0fe-4cbb-889d-d4ac8b31c21f" + }, + "id": 16, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "Tasklet", + "label": "sync_pipeline_shr_B", + "attributes": { + "code": { + "string_data": "pipeline_shr_B.consumer_wait();", + "language": "CPP" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 549, + "end_line": 549, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "label": "sync_pipeline_shr_B", + "in_connectors": { + "_in1": null + }, + "out_connectors": { + "_out1": null + }, + "guid": "84793100-da93-4b8c-9456-1533a44515d5" + }, + "id": 17, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "AccessNode", + "label": "shr_A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 496, + "end_line": 496, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "data": "shr_A", + "guid": "f38872ae-3a27-42d3-bf8a-5bb504cc02f8" + }, + "id": 18, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "Tasklet", + "label": "sync_pipeline_shr_A", + "attributes": { + "code": { + "string_data": "pipeline_shr_A.consumer_wait();", + "language": "CPP" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 549, + "end_line": 549, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "label": "sync_pipeline_shr_A", + "in_connectors": { + "_in1": null + }, + "out_connectors": { + "_out1": null + }, + "guid": "e40b5bbc-50e4-4f39-9205-c0e52e50f3dc" + }, + "id": 19, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "Tasklet", + "label": "release_pipelines", + "attributes": { + "code": { + "string_data": "pipeline_shr_B.consumer_release();\npipeline_shr_A.consumer_release();", + "language": "CPP" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 570, + "end_line": 570, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "side_effects": true, + "label": "release_pipelines", + "guid": "385fdeb6-b4b2-4d4a-9d7e-6a2c8150fa70" + }, + "id": 20, + "scope_entry": "5", + "scope_exit": "6" + } + ], + "edges": [ + { + "type": "MultiConnectorEdge", + 
"attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "c5a715ef-8281-40c6-a5ce-6c005d9bdf59", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "8", + "dst": "20", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "ae86542a-8a33-46cf-811f-1d5ea7e4264f", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "20", + "dst": "6", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512*ceiling(N/512)", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "90277fce-a899-4a98-9096-c9bc13119b39", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "512*ceiling(N/512)" + } + } + }, + "src": "2", + "dst": "0", + "dst_connector": "IN_A", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512*ceiling(N/512)", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "6dc9354e-bbcf-4372-b5e6-d47e5b615d27", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "512*ceiling(N/512)" + } + } + }, + "src": "3", + "dst": "0", + "dst_connector": "IN_B", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "17775573-6dfe-42b9-9968-e5138ac5bde0", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "512" + } + } + }, + "src": "14", + "dst": "5", + "dst_connector": "IN___tmp_29_37_r_in_from_1_0_in_from_1_0", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", + "debuginfo": null, + "guid": "d1680be5-efe6-4947-9c23-725e934436bd", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "512" + } + } + }, + "src": "13", + 
"dst": "5", + "dst_connector": "IN___tmp_29_58_r_in_from_1_0_in_from_1_0", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp3", + "debuginfo": null, + "guid": "6861379f-d4ea-4c88-b194-3cf644ff3955", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "10", + "dst": "11", + "dst_connector": "__inp", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k + 1, 2)", + "end": "Mod(k + 1, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "4cd3b695-856a-43a8-aa26-0d356be0414f", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k + 1, 2)", + "end": "Mod(k + 1, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "18", + "dst": "7", + "dst_connector": "prefetch_shr_A", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k + 1, 2)", + "end": "Mod(k + 1, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", + "debuginfo": null, + "guid": "5bfb1bc5-9e69-4171-9f3c-a924d6d87773", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k + 1, 2)", + "end": "Mod(k + 1, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "16", + "dst": "7", + "dst_connector": "prefetch_shr_B", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "N", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "a0d5df7f-f651-489f-878d-0d60e7652cb1", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "N" + } + } + }, + "src": "0", + "dst": "12", + "dst_connector": "A", + "src_connector": "OUT_A" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "N", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "9dac117d-7faa-4fc2-ab7c-586720478046", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "N" + } 
+ } + }, + "src": "0", + "dst": "5", + "dst_connector": "IN_prefetch_A", + "src_connector": "OUT_A" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "N", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "b8def05b-494b-46eb-9bc6-c48a75198385", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "N" + } + } + }, + "src": "0", + "dst": "12", + "dst_connector": "B", + "src_connector": "OUT_B" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "N", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "ff09596e-1fe4-4702-b57f-d6e04402c053", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "N" + } + } + }, + "src": "0", + "dst": "5", + "dst_connector": "IN_prefetch_B", + "src_connector": "OUT_B" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512*ceiling(N/512)", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "ce685d36-588d-4cd6-978b-3546f7e26f93", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "512*ceiling(N/512)" + } + } + }, + "src": "1", + "dst": "4", + "dst_connector": null, + "src_connector": "OUT_C" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "ac7d4bd5-5cfe-4dad-832a-2c3f949f55af", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "256" + } + } + }, + "src": "8", + "dst": "6", + "dst_connector": "IN___tmp_29_16_w_out_of_1_1_out_of_1_1", + "src_connector": "OUT___tmp_29_16_w_out_of_1_1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "63b28d6f-5f69-4f75-b3e2-4823990157fd", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "512" + } + } + }, + "src": "6", + "dst": "1", + "dst_connector": "IN_C", + "src_connector": "OUT___tmp_29_16_w_out_of_1_1_out_of_1_1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + 
"volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k, 2)", + "end": "Mod(k, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "abd95999-8a64-422c-88bb-36906fba9305", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k, 2)", + "end": "Mod(k, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "7", + "dst": "9", + "dst_connector": "__in1", + "src_connector": "OUT___tmp_29_37_r_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k, 2)", + "end": "Mod(k, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "d3d2a22b-6feb-4aac-8363-3e151992cafb", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k, 2)", + "end": "Mod(k, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "5", + "dst": "19", + "dst_connector": "_in1", + "src_connector": "OUT___tmp_29_37_r_in_from_1_0_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k, 2)", + "end": "Mod(k, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", + "debuginfo": null, + "guid": "8637db87-9a9f-4c27-afaa-7c6995a16ce1", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k, 2)", + "end": "Mod(k, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "7", + "dst": "9", + "dst_connector": "__in2", + "src_connector": "OUT___tmp_29_58_r_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k, 2)", + "end": "Mod(k, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", + "debuginfo": null, + "guid": "8d339332-544e-4173-a66c-a335e8993d7c", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k, 2)", + "end": "Mod(k, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "5", + "dst": "17", + "dst_connector": "_in1", + "src_connector": "OUT___tmp_29_58_r_in_from_1_0_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "N", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "e4f49262-d8a1-4255-8e14-715ba57fdaff", 
+ "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "N" + } + } + }, + "src": "5", + "dst": "15", + "dst_connector": "A", + "src_connector": "OUT_prefetch_A" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "N", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "d0d710b5-083c-48f3-91de-7968ddbda1ff", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "N" + } + } + }, + "src": "5", + "dst": "15", + "dst_connector": "B", + "src_connector": "OUT_prefetch_B" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp3", + "debuginfo": null, + "guid": "71ea9976-2759-4850-8f7a-1b5e703bf25a", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "9", + "dst": "10", + "dst_connector": null, + "src_connector": "__out" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i + j + 256*k", + "end": "i + j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "b1ce4418-75a4-4381-9827-fa16b36e97e2", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i + j + 256*k", + "end": "i + j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "11", + "dst": "8", + "dst_connector": "IN___tmp_29_16_w_out_of_1_1", + "src_connector": "__out" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k, 2)", + "end": "Mod(k, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "b889d248-0781-46d4-ab8b-6160c654c033", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k, 2)", + "end": "Mod(k, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "256" + } + } + }, + "src": "19", + "dst": "7", + "dst_connector": "IN___tmp_29_37_r_in_from_1_0", + "src_connector": "_out1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k, 2)", + "end": "Mod(k, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", + "debuginfo": null, + "guid": "8ff7c450-f4c8-4b4a-b25b-baadce878a89", + "src_subset": null, + 
"dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k, 2)", + "end": "Mod(k, 2)", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "256" + } + } + }, + "src": "17", + "dst": "7", + "dst_connector": "IN___tmp_29_58_r_in_from_1_0", + "src_connector": "_out1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "44a55ced-6a67-48ae-bcad-cf9619213664", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "512" + } + } + }, + "src": "12", + "dst": "14", + "dst_connector": null, + "src_connector": "shr_A" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "b9092fdc-de95-4a67-8386-fb195d8e280a", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "512" + } + } + }, + "src": "15", + "dst": "18", + "dst_connector": null, + "src_connector": "shr_A" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", + "debuginfo": null, + "guid": "14f2eda5-56fa-41b9-bf80-44a5102b6343", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "512" + } + } + }, + "src": "12", + "dst": "13", + "dst_connector": null, + "src_connector": "shr_B" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", + "debuginfo": null, + "guid": "23e84f63-c96b-4c3d-bbfc-50ae9e3a365c", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "1", + "step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "512" + } + } + }, + "src": "15", + "dst": "16", + "dst_connector": null, + "src_connector": "shr_B" + } + ], + "attributes": { + "guid": "328decff-fd21-4fff-881f-74fc87b42fa7", + 
"executions": "1", + "dynamic_executions": false + } + } + ], + "edges": [], + "collapsed": false, + "label": "", + "id": null, + "cfg_list_id": 0, + "start_block": 0, + "dace_version": "1.0.0" +} \ No newline at end of file diff --git a/berkay_workpace/scratch/yakups_examples/dbuff_related/original_sdfg_with_shared_memory.sdfg b/berkay_workpace/scratch/yakups_examples/dbuff_related/original_sdfg_with_shared_memory.sdfg new file mode 100644 index 0000000000..e4b5ed96bb --- /dev/null +++ b/berkay_workpace/scratch/yakups_examples/dbuff_related/original_sdfg_with_shared_memory.sdfg @@ -0,0 +1,1278 @@ +{ + "type": "SDFG", + "attributes": { + "name": "kernel_double_buffered", + "arg_names": [ + "A", + "B", + "C" + ], + "_arrays": { + "A": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "B": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "C": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "N", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "N" + ], + "storage": "GPU_Global", + "debuginfo": null + } + }, + "__tmp3": { + "type": "Scalar", + "attributes": { + "dtype": "float64", + "shape": [ + "1" + ], + "transient": true, + "debuginfo": null + } + }, + "shr_A": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "256", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "256" + ], + "transient": true, + "storage": "GPU_Shared", + "debuginfo": null + } + }, + "shr_B": { + "type": "Array", + "attributes": { + "strides": [ + "1" + ], + "total_size": "256", + "offset": [ + "0" + ], + "optional": false, + "dtype": "float64", + "shape": [ + "256" + ], + "transient": true, + "storage": "GPU_Shared", + "debuginfo": null + } + } + }, + "symbols": { + "N": "int64" + }, + "global_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "init_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "exit_code": { + "frame": { + "string_data": "", + "language": "CPP" + } + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 21, + "end_line": 30, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "using_explicit_control_flow": true, + "guid": "3876c3d3-22e3-48a5-a227-0e6227e60775", + "hash": "15d16bc3e33636c2aa74e39db44a85f7abeb0ad003b99d3195b94e51c7c687d9" + }, + "nodes": [ + { + "type": "SDFGState", + "label": "MapState", + "id": 0, + "collapsed": false, + "scope_dict": { + "-1": [ + 0, + 2, + 3, + 4 + ], + "0": [ + 1, + 5 + ], + "5": [ + 6, + 7, + 12, + 13 + ], + "7": [ + 8, + 9, + 10, + 11 + ] + }, + "nodes": [ + { + "type": "MapEntry", + "label": "kernel_27[i=0:N:512]", + "attributes": { + "label": "kernel_27", + "params": [ + "i" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "512", + "tile": "1" + } + ] + }, + "schedule": "GPU_Device", + "debuginfo": { + "type": "DebugInfo", + "start_line": 27, + "end_line": 27, + "start_column": 4, + "end_column": 4, + "filename": 
"/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "in_connectors": { + "IN_A": null, + "IN_B": null + }, + "out_connectors": { + "OUT_A": null, + "OUT_B": null + }, + "guid": "59ba5c8f-2679-4811-9d5c-6612cee18053" + }, + "id": 0, + "scope_entry": null, + "scope_exit": "1" + }, + { + "type": "MapExit", + "label": "kernel_27[i=0:N:512]", + "attributes": { + "in_connectors": { + "IN_C": null + }, + "out_connectors": { + "OUT_C": null + }, + "guid": "937530c2-b4a3-4e83-81d3-ee5db5735ae8" + }, + "id": 1, + "scope_entry": "0", + "scope_exit": "1" + }, + { + "type": "AccessNode", + "label": "A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 27, + "end_line": 27, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "data": "A", + "guid": "06bd864e-06f7-41e8-b701-4907600053ea" + }, + "id": 2, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 27, + "end_line": 27, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "data": "B", + "guid": "000290da-2594-43de-ae95-9a25c7b8c9d0" + }, + "id": 3, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "AccessNode", + "label": "C", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 27, + "end_line": 27, + "start_column": 4, + "end_column": 4, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "data": "C", + "guid": "997761eb-8cfa-4d27-a7d5-cc4bb5005f93" + }, + "id": 4, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "MapEntry", + "label": "kernel_27_4_28[k=0:2]", + "attributes": { + "label": "kernel_27_4_28", + "params": [ + "k" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "1", + "step": "1", + "tile": "1" + } + ] + }, + "schedule": "Sequential", + "debuginfo": { + "type": "DebugInfo", + "start_line": 28, + "end_line": 28, + "start_column": 8, + "end_column": 8, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "in_connectors": { + "IN___tmp_30_37_r_in_from_1_0_in_from_1_0": null, + "IN___tmp_30_58_r_in_from_1_0_in_from_1_0": null + }, + "out_connectors": { + "OUT___tmp_30_37_r_in_from_1_0_in_from_1_0": null, + "OUT___tmp_30_58_r_in_from_1_0_in_from_1_0": null + }, + "guid": "460bbd40-7325-4651-a422-fb41394d8752" + }, + "id": 5, + "scope_entry": "0", + "scope_exit": "6" + }, + { + "type": "MapExit", + "label": "kernel_27_4_28[k=0:2]", + "attributes": { + "in_connectors": { + "IN___tmp_30_16_w_out_of_1_1_out_of_1_1": null + }, + "out_connectors": { + "OUT___tmp_30_16_w_out_of_1_1_out_of_1_1": null + }, + "guid": "fa7eaf22-39d3-4216-b93a-36061c5bb53e" + }, + "id": 6, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "MapEntry", + "label": "kernel_27_4_28_8_29[j=0:256]", + "attributes": { + "label": "kernel_27_4_28_8_29", + "params": [ + "j" + ], + "range": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "schedule": 
"GPU_ThreadBlock", + "debuginfo": { + "type": "DebugInfo", + "start_line": 29, + "end_line": 29, + "start_column": 12, + "end_column": 12, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "in_connectors": { + "IN___tmp_30_37_r_in_from_1_0": null, + "IN___tmp_30_58_r_in_from_1_0": null + }, + "out_connectors": { + "OUT___tmp_30_37_r_in_from_1_0": null, + "OUT___tmp_30_58_r_in_from_1_0": null + }, + "guid": "0f6e50ab-156b-4197-8f75-fcc83bd993ef" + }, + "id": 7, + "scope_entry": "5", + "scope_exit": "8" + }, + { + "type": "MapExit", + "label": "kernel_27_4_28_8_29[j=0:256]", + "attributes": { + "in_connectors": { + "IN___tmp_30_16_w_out_of_1_1": null + }, + "out_connectors": { + "OUT___tmp_30_16_w_out_of_1_1": null + }, + "guid": "67bf3a8a-7f84-4717-a6c6-443d181e0703" + }, + "id": 8, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "Tasklet", + "label": "_Add_", + "attributes": { + "code": { + "string_data": "__out = (__in1 + __in2)", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 30, + "end_line": 30, + "start_column": 72, + "end_column": 72, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "label": "_Add_", + "in_connectors": { + "__in1": null, + "__in2": null + }, + "out_connectors": { + "__out": null + }, + "guid": "467c5541-6c78-466f-9cd0-59b52ab5f3bb" + }, + "id": 9, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "AccessNode", + "label": "__tmp3", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 30, + "end_line": 30, + "start_column": 72, + "end_column": 72, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "data": "__tmp3", + "guid": "e1635af8-7f3b-44e5-90af-05c10b4d0beb" + }, + "id": 10, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "Tasklet", + "label": "assign_30_16", + "attributes": { + "code": { + "string_data": "__out = __inp", + "language": "Python" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 30, + "end_line": 30, + "start_column": 30, + "end_column": 30, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + }, + "label": "assign_30_16", + "in_connectors": { + "__inp": null + }, + "out_connectors": { + "__out": null + }, + "guid": "94a47063-816a-420f-9bcd-e507b8e52932" + }, + "id": 11, + "scope_entry": "7", + "scope_exit": "8" + }, + { + "type": "AccessNode", + "label": "shr_A", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 48, + "end_line": 48, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" + }, + "data": "shr_A", + "guid": "a0583fa5-f8c8-450a-822d-c06bc504e3d8" + }, + "id": 12, + "scope_entry": "5", + "scope_exit": "6" + }, + { + "type": "AccessNode", + "label": "shr_B", + "attributes": { + "debuginfo": { + "type": "DebugInfo", + "start_line": 48, + "end_line": 48, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" + }, + "data": "shr_B", + "guid": "ddfc614b-a21e-4d47-a07f-3377589f5f1e" + }, + "id": 13, + 
"scope_entry": "5", + "scope_exit": "6" + } + ], + "edges": [ + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512*ceiling(N/512)", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "bbbfba59-0966-4964-ade6-951656e12f8b", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "512*ceiling(N/512)" + } + } + }, + "src": "2", + "dst": "0", + "dst_connector": "IN_A", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512*ceiling(N/512)", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "2cc2c126-f362-468c-9541-aafb9ceef5ed", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "512*ceiling(N/512)" + } + } + }, + "src": "3", + "dst": "0", + "dst_connector": "IN_B", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "dfa616c1-fd38-4d9a-90e5-cf9530f3d8d5", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "12", + "dst": "7", + "dst_connector": "IN___tmp_30_37_r_in_from_1_0", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", + "debuginfo": null, + "guid": "805f691f-2ad8-436f-9165-b939865e438a", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "13", + "dst": "7", + "dst_connector": "IN___tmp_30_58_r_in_from_1_0", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp3", + "debuginfo": null, + "guid": "24c8fcc0-12fb-4256-8c81-b1f06d6e0b24", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "10", + "dst": "11", + "dst_connector": "__inp", + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 511", + "step": 
"1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "16438026-7243-4acd-b483-e50afcf2bae1", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "512" + } + } + }, + "src": "0", + "dst": "5", + "dst_connector": "IN___tmp_30_37_r_in_from_1_0_in_from_1_0", + "src_connector": "OUT_A" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "9f86156c-d84e-476a-b4fd-586781af92b3", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "512" + } + } + }, + "src": "0", + "dst": "5", + "dst_connector": "IN___tmp_30_58_r_in_from_1_0_in_from_1_0", + "src_connector": "OUT_B" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512*ceiling(N/512)", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "7de92b91-bf9e-4ae7-a3a5-45120338e67a", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "N - 1", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "512*ceiling(N/512)" + } + } + }, + "src": "1", + "dst": "4", + "dst_connector": null, + "src_connector": "OUT_C" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "fce63159-8480-4884-bc3c-a4e4f3af4ba5", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "256" + } + } + }, + "src": "8", + "dst": "6", + "dst_connector": "IN___tmp_30_16_w_out_of_1_1_out_of_1_1", + "src_connector": "OUT___tmp_30_16_w_out_of_1_1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "512", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "b6addcac-3b8b-44ff-977a-d16564b8689c", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i", + "end": "i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "512" + } + } + }, + "src": "6", + "dst": "1", + "dst_connector": "IN_C", + "src_connector": "OUT___tmp_30_16_w_out_of_1_1_out_of_1_1" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_A", + "debuginfo": null, + "guid": "1b3b0248-b13f-4cc9-a417-69fd0d81f95d", + "src_subset": { 
+ "type": "Range", + "ranges": [ + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "7", + "dst": "9", + "dst_connector": "__in1", + "src_connector": "OUT___tmp_30_37_r_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "other_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "A", + "debuginfo": null, + "guid": "0fd0f751-5a94-4181-b3a9-da66b015a93f", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "5", + "dst": "12", + "dst_connector": null, + "src_connector": "OUT___tmp_30_37_r_in_from_1_0_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "data": "shr_B", + "debuginfo": null, + "guid": "9f88305d-28a9-44c7-b87a-19473ddd2986", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "j", + "end": "j", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": null, + "is_data_src": true, + "num_accesses": "1" + } + } + }, + "src": "7", + "dst": "9", + "dst_connector": "__in2", + "src_connector": "OUT___tmp_30_58_r_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "256", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "other_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "data": "B", + "debuginfo": null, + "guid": "6b67d7f9-a952-4447-a5f4-ff00f9d26712", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "i + 256*k", + "end": "i + 256*k + 255", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "5", + "dst": "13", + "dst_connector": null, + "src_connector": "OUT___tmp_30_58_r_in_from_1_0_in_from_1_0" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "data": "__tmp3", + "debuginfo": null, + "guid": "4cfe2901-4813-4424-9231-49dbab83199b", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "0", + "end": "0", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "9", + "dst": "10", + "dst_connector": null, + "src_connector": "__out" + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": 
"Memlet", + "attributes": { + "volume": "1", + "subset": { + "type": "Range", + "ranges": [ + { + "start": "i + j + 256*k", + "end": "i + j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "data": "C", + "debuginfo": null, + "guid": "62b2982c-4bc7-404b-8ccf-7c8c213034a9", + "src_subset": null, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "i + j + 256*k", + "end": "i + j + 256*k", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "11", + "dst": "8", + "dst_connector": "IN___tmp_30_16_w_out_of_1_1", + "src_connector": "__out" + } + ], + "attributes": { + "guid": "e529de92-f3c5-45b7-8313-ae1ae22a2a4c", + "executions": "1", + "dynamic_executions": false + } + } + ], + "edges": [], + "collapsed": false, + "label": "", + "id": null, + "cfg_list_id": 0, + "start_block": 0, + "dace_version": "1.0.0" +} \ No newline at end of file diff --git a/berkay_workpace/scratch/yakups_examples/2d_blocktiled_gemm_with_custom_copy.sdfg b/berkay_workpace/scratch/yakups_examples/smem_related/2d_blocktiled_gemm_with_custom_copy.sdfg similarity index 100% rename from berkay_workpace/scratch/yakups_examples/2d_blocktiled_gemm_with_custom_copy.sdfg rename to berkay_workpace/scratch/yakups_examples/smem_related/2d_blocktiled_gemm_with_custom_copy.sdfg diff --git a/berkay_workpace/scratch/yakups_examples/generate_sdfgs.ipynb b/berkay_workpace/scratch/yakups_examples/smem_related/generate_sdfgs.ipynb similarity index 100% rename from berkay_workpace/scratch/yakups_examples/generate_sdfgs.ipynb rename to berkay_workpace/scratch/yakups_examples/smem_related/generate_sdfgs.ipynb diff --git a/berkay_workpace/scratch/yakups_examples/nice_global_to_shared_copy.sdfg b/berkay_workpace/scratch/yakups_examples/smem_related/nice_global_to_shared_copy.sdfg similarity index 100% rename from berkay_workpace/scratch/yakups_examples/nice_global_to_shared_copy.sdfg rename to berkay_workpace/scratch/yakups_examples/smem_related/nice_global_to_shared_copy.sdfg diff --git a/berkay_workpace/scratch/yakups_examples/weird_global_to_global.sdfg b/berkay_workpace/scratch/yakups_examples/smem_related/weird_global_to_global.sdfg similarity index 100% rename from berkay_workpace/scratch/yakups_examples/weird_global_to_global.sdfg rename to berkay_workpace/scratch/yakups_examples/smem_related/weird_global_to_global.sdfg diff --git a/berkay_workpace/scratch/yakups_examples/weird_shared_to_shared_copy.sdfg b/berkay_workpace/scratch/yakups_examples/smem_related/weird_shared_to_shared_copy.sdfg similarity index 100% rename from berkay_workpace/scratch/yakups_examples/weird_shared_to_shared_copy.sdfg rename to berkay_workpace/scratch/yakups_examples/smem_related/weird_shared_to_shared_copy.sdfg From 88d29a054426392534e964df66f1d6b8ca0810a3 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Sun, 22 Jun 2025 17:07:20 +0200 Subject: [PATCH 31/94] New smem tests, copy past old tests to new testing folder, apply fixes to failing tests, a first approach on handling asynchronous memory copies under assumptions where sdfg and nodes contain information on it --- .../tests/reusable_tests/cuda_smem2d_test.py | 177 +++++++++++++ .../tests/reusable_tests/cuda_smem_test.py | 62 +++++ .../tests/reusable_tests/halfvec_cudatest.py | 160 ++++++++++++ .../reusable_tests/multiprogram_cudatest.py | 57 ++++ .../multistream_copy_cudatest.py | 93 +++++++ .../multistream_kernel_cudatest.py | 79 ++++++ berkay_workpace/tests/smem_tests/gemm_test.py | 2 +- .../smem_tests/special_sync_pass_test.py | 2 
+- dace/codegen/targets/experimental_cuda.py | 106 +++++--- .../copy_strategies.py | 134 +++++++++- .../experimental_cuda_helpers/gpu_utils.py | 22 +- .../scope_strategies.py | 60 ++++- dace/config_schema.yml | 13 + dace/sdfg/nodes.py | 13 + dace/sdfg/sdfg.py | 8 + .../passes/gpustream_scheduling.py | 245 +++++++++++++++--- 16 files changed, 1143 insertions(+), 90 deletions(-) create mode 100644 berkay_workpace/tests/reusable_tests/cuda_smem2d_test.py create mode 100644 berkay_workpace/tests/reusable_tests/cuda_smem_test.py create mode 100644 berkay_workpace/tests/reusable_tests/halfvec_cudatest.py create mode 100644 berkay_workpace/tests/reusable_tests/multiprogram_cudatest.py create mode 100644 berkay_workpace/tests/reusable_tests/multistream_copy_cudatest.py create mode 100644 berkay_workpace/tests/reusable_tests/multistream_kernel_cudatest.py diff --git a/berkay_workpace/tests/reusable_tests/cuda_smem2d_test.py b/berkay_workpace/tests/reusable_tests/cuda_smem2d_test.py new file mode 100644 index 0000000000..0b2225daef --- /dev/null +++ b/berkay_workpace/tests/reusable_tests/cuda_smem2d_test.py @@ -0,0 +1,177 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +import dace +import numpy as np +from dace import nodes +from dace.transformation.dataflow import GPUTransformMap, InLocalStorage +import pytest + +H = dace.symbol('H') +W = dace.symbol('W') + + +@dace.program(dace.float64[H, W], dace.float64[H, W]) +def cudahello(V, Vout): + + @dace.mapscope(_[0:H:8, 0:W:32]) + def multiplication(i, j): + + @dace.map(_[0:8, 0:32]) + def mult_block(bi, bj): + in_V << V[i + bi, j + bj] + out >> Vout[i + bi, j + bj] + out = in_V * 2.0 + + +def _test(sdfg): + W = 128 + H = 64 + + print('Vector double CUDA (shared memory 2D) %dx%d' % (W, H)) + + V = dace.ndarray([H, W], dace.float64) + Vout = dace.ndarray([H, W], dace.float64) + V[:] = np.random.rand(H, W).astype(dace.float64.type) + Vout[:] = dace.float64(0) + + sdfg(V=V, Vout=Vout, H=H, W=W) + + diff = np.linalg.norm(2 * V - Vout) / (H * W) + print("Difference:", diff) + assert diff <= 1e-5 + + +def test_cpu(): + sdfg = cudahello.to_sdfg() + sdfg.name = "cuda_smem2d_cpu" + _test(sdfg) + + +@pytest.mark.gpu +def test_gpu(): + sdfg = cudahello.to_sdfg() + sdfg.name = "cuda_smem2d_gpu" + _test(sdfg) + + +@pytest.mark.gpu +def test_gpu_localstorage(): + sdfg = cudahello.to_sdfg() + sdfg.name = "cuda_smem2d_gpu_localstorage" + assert sdfg.apply_transformations([GPUTransformMap, InLocalStorage], options=[{}, {'array': 'gpu_V'}]) == 2 + _test(sdfg) + + +@pytest.mark.gpu +def test_gpu_2localstorage(): + + @dace.program + def addtwoandmult(A: dace.float64[H, W], B: dace.float64[H, W], Vout: dace.float64[H, W]): + for i, j in dace.map[0:H:8, 0:W:32]: + for bi, bj in dace.map[0:8, 0:32]: + with dace.tasklet: + a << A[i + bi, j + bj] + b << B[i + bi, j + bj] + out = (a + b) * 2.0 + out >> Vout[i + bi, j + bj] + + sdfg = addtwoandmult.to_sdfg() + sdfg.name = "cuda_2_smem2d_gpu_localstorage" + assert sdfg.apply_transformations([GPUTransformMap, InLocalStorage, InLocalStorage], + options=[{}, { + 'array': 'gpu_A' + }, { + 'array': 'gpu_B' + }]) == 3 + + A = np.random.rand(128, 64) + B = np.random.rand(128, 64) + out = np.random.rand(128, 64) + refout = (A + B) * 2 + sdfg(A, B, out, H=128, W=64) + assert np.allclose(refout, out) + + +@pytest.mark.gpu +def test_gpu_2shared_for(): + + @dace.program + def addtwoandmult(A: dace.float64[H, W], B: dace.float64[H, W], Vout: dace.float64[H, W]): + for i, j in dace.map[0:H:8, 0:W:32]: + for _ in 
range(1): + local_a = dace.ndarray([8, 32], dtype=dace.float64, storage=dace.StorageType.GPU_Shared) + local_b = dace.ndarray([8, 32], dtype=dace.float64, storage=dace.StorageType.GPU_Shared) + local_a << A[i:i + 8, j:j + 32] + local_b << B[i:i + 8, j:j + 32] + for bi, bj in dace.map[0:8, 0:32]: + with dace.tasklet: + a << local_a[bi, bj] + b << local_b[bi, bj] + out = (a + b) * 2.0 + out >> Vout[i + bi, j + bj] + + sdfg = addtwoandmult.to_sdfg() + sdfg.name = "cuda_2_shared_for" + state = sdfg.nodes()[0] + map_entry = -1 + for node in state.nodes(): + if isinstance(node, nodes.MapEntry) and 'i' in node.map.params: + map_entry = state.node_id(node) + break + transformation = GPUTransformMap() + transformation.setup_match(sdfg, 0, 0, {GPUTransformMap.map_entry: map_entry}, 0) + transformation.apply(state, sdfg) + + A = np.random.rand(128, 64) + B = np.random.rand(128, 64) + out = np.random.rand(128, 64) + refout = (A + B) * 2 + sdfg(A, B, out, H=128, W=64) + assert np.allclose(refout, out) + + +def _find_map_by_param(sdfg: dace.SDFG, pname: str) -> dace.nodes.MapEntry: + """ Finds the first map entry node by the given parameter name. """ + return next(n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry) and pname in n.params) + + +@pytest.mark.gpu +def test_gpu_2shared_map(): + K = dace.symbol('K') + + @dace.program + def addtwoandmult(A: dace.float64[H, W], B: dace.float64[H, W], Vout: dace.float64[H, W]): + for i, j in dace.map[0:H:8, 0:W:32]: + for _ in dace.map[0:K]: + local_a = dace.ndarray([8, 32], dtype=dace.float64, storage=dace.StorageType.GPU_Shared) + local_b = dace.ndarray([8, 32], dtype=dace.float64, storage=dace.StorageType.GPU_Shared) + local_a << A[i:i + 8, j:j + 32] + local_b << B[i:i + 8, j:j + 32] + for bi, bj in dace.map[0:8, 0:32]: + with dace.tasklet: + a << local_a[bi, bj] + b << local_b[bi, bj] + out = (a + b) * 2.0 + out >> Vout[i + bi, j + bj] + + sdfg = addtwoandmult.to_sdfg() + sdfg.name = "cuda_2_shared_map" + + me = _find_map_by_param(sdfg, '_') + me.schedule = dace.ScheduleType.Sequential + sdfg.apply_gpu_transformations() + me = _find_map_by_param(sdfg, 'bi') + me.schedule = dace.ScheduleType.GPU_ThreadBlock + + A = np.random.rand(128, 64) + B = np.random.rand(128, 64) + out = np.random.rand(128, 64) + refout = (A + B) * 2 + sdfg(A, B, out, H=128, W=64, K=1) + assert np.allclose(refout, out) + + +if __name__ == "__main__": + test_cpu() + test_gpu_2localstorage() + test_gpu_2shared_for() + test_gpu_2shared_map() diff --git a/berkay_workpace/tests/reusable_tests/cuda_smem_test.py b/berkay_workpace/tests/reusable_tests/cuda_smem_test.py new file mode 100644 index 0000000000..e7191a2631 --- /dev/null +++ b/berkay_workpace/tests/reusable_tests/cuda_smem_test.py @@ -0,0 +1,62 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
+ +import dace +from dace.transformation.dataflow import GPUTransformMap, InLocalStorage +from dace.transformation.passes import gpustream_scheduling +import numpy as np +import pytest + +N = dace.symbol('N') + + +@dace.program(dace.float64[N], dace.float64[N]) +def cudahello(A, Vout): + + @dace.mapscope(_[0:ceiling(N / 32)]) + def multiplication(i): + + @dace.map(_[i * 32:min(N, (i + 1) * 32)]) + def mult_block(bi): + in_V << A[bi] + out >> Vout[bi] + out = in_V * 2.0 + + +def _test(sdfg): + N = 144 + + print('Vector double CUDA (shared memory) %d' % (N)) + + V = dace.ndarray([N], dace.float64) + Vout = dace.ndarray([N], dace.float64) + V[:] = np.random.rand(N).astype(dace.float64.type) + Vout[:] = dace.float64(0) + + sdfg(A=V, Vout=Vout, N=N) + + diff = np.linalg.norm(2 * V - Vout) / N + print("Difference:", diff) + assert diff <= 1e-5 + + +def test_cpu(): + _test(cudahello.to_sdfg()) + + +@pytest.mark.gpu +def test_gpu(): + sdfg = cudahello.to_sdfg() + assert sdfg.apply_transformations(GPUTransformMap) == 1 + _test(sdfg) + + +@pytest.mark.gpu +def test_gpu_localstorage(): + sdfg = cudahello.to_sdfg() + assert sdfg.apply_transformations([GPUTransformMap, InLocalStorage], options=[{}, {'array': 'gpu_A'}]) == 2 + _test(sdfg) + + +if __name__ == "__main__": + test_cpu() + test_gpu() diff --git a/berkay_workpace/tests/reusable_tests/halfvec_cudatest.py b/berkay_workpace/tests/reusable_tests/halfvec_cudatest.py new file mode 100644 index 0000000000..8772d6b24d --- /dev/null +++ b/berkay_workpace/tests/reusable_tests/halfvec_cudatest.py @@ -0,0 +1,160 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +""" Tests for half-precision syntax quirks. """ + +import dace +import math +import numpy as np +import pytest +from dace.transformation.dataflow import MapFusion, Vectorization +from dace.transformation.optimizer import Optimizer + +N = dace.symbol('N') + + +def _config(): + # Prerequisite for test: CUDA compute capability >= 6.0 + dace.Config.set('compiler', 'cuda', 'cuda_arch', value='60') + + +def _test_half(veclen): + """ Tests a set of elementwise operations on a vector half type. """ + _config() + + @dace.program + def halftest(A: dace.float16[N], B: dace.float16[N]): + return A * B + A + + A = np.random.rand(24).astype(np.float16) + B = np.random.rand(24).astype(np.float16) + sdfg = halftest.to_sdfg() + sdfg.simplify() + sdfg.apply_gpu_transformations() + + # Apply vectorization on each map and count applied + applied = 0 + for xform in Optimizer(sdfg).get_pattern_matches(patterns=Vectorization, + options=dict(vector_len=veclen, postamble=False)): + xform.apply(sdfg.node(xform.state_id), sdfg) + applied += 1 + assert applied == 2 + + out = sdfg(A=A, B=B, N=24) + assert np.allclose(out, A * B + A) + + +@pytest.mark.gpu +def test_half4(): + """ Tests a set of elementwise operations on half with vector length 4. """ + _test_half(4) + + +@pytest.mark.gpu +def test_half8(): + """ Tests a set of elementwise operations on half with vector length 8. """ + _test_half(8) + + +@pytest.mark.gpu +def test_exp_vec(): + """ Tests an exp operator on a vector half type. 
""" + _config() + + @dace.program + def halftest(A: dace.float16[N]): + out = np.ndarray([N], dace.float16) + for i in dace.map[0:N]: + with dace.tasklet: + a << A[i] + o >> out[i] + o = math.exp(a) + return out + + A = np.random.rand(24).astype(np.float16) + sdfg = halftest.to_sdfg() + sdfg.apply_gpu_transformations() + assert sdfg.apply_transformations(Vectorization, dict(vector_len=8)) == 1 + out = sdfg(A=A, N=24) + assert np.allclose(out, np.exp(A)) + + +@pytest.mark.gpu +def test_relu_vec(): + """ Tests a ReLU operator on a vector half type. """ + _config() + + @dace.program + def halftest(A: dace.float16[N]): + out = np.ndarray([N], dace.float16) + for i in dace.map[0:N]: + with dace.tasklet: + a << A[i] + o >> out[i] + o = max(a, dace.float16(0)) + return out + + A = np.random.rand(24).astype(np.float16) + sdfg = halftest.to_sdfg() + sdfg.apply_gpu_transformations() + assert sdfg.apply_transformations(Vectorization, dict(vector_len=8)) == 1 + out = sdfg(A=A, N=24) + assert np.allclose(out, np.maximum(A, 0)) + + +@pytest.mark.gpu +def test_dropout_vec(): + """ Tests a dropout operator on a vector half type. """ + _config() + + @dace.program + def halftest(A: dace.float16[N], mask: dace.float16[N]): + out = np.ndarray([N], dace.float16) + for i in dace.map[0:N]: + with dace.tasklet: + a << A[i] + d << mask[i] + o >> out[i] + o = a * d + return out + + A = np.random.rand(24).astype(np.float16) + mask = np.random.randint(0, 2, size=[24]).astype(np.float16) + sdfg: dace.SDFG = halftest.to_sdfg() + sdfg.apply_gpu_transformations() + assert sdfg.apply_transformations(Vectorization, dict(vector_len=8)) == 1 + out = sdfg(A=A, mask=mask, N=24) + assert np.allclose(out, A * mask) + + +@pytest.mark.gpu +def test_gelu_vec(): + """ Tests a GELU operator on a vector half type. """ + _config() + s2pi = math.sqrt(2.0 / math.pi) + + @dace.program + def halftest(A: dace.float16[N]): + out = np.ndarray([N], dace.float16) + for i in dace.map[0:N]: + with dace.tasklet: + a << A[i] + o >> out[i] + o = dace.float16(0.5) * a * (dace.float16(1) + + math.tanh(dace.float16(s2pi) * (a + dace.float16(0.044715) * (a**3)))) + return out + + A = np.random.rand(24).astype(np.float16) + sdfg = halftest.to_sdfg() + sdfg.apply_gpu_transformations() + assert sdfg.apply_transformations(Vectorization, dict(vector_len=4)) == 1 + out = sdfg(A=A, N=24) + expected = 0.5 * A * (1 + np.tanh(math.sqrt(2.0 / math.pi) * (A + 0.044715 * (A**3)))) + assert np.allclose(out, expected, rtol=1e-2, atol=1e-4) + + +if __name__ == '__main__': + test_half4() + test_half8() + test_exp_vec() + test_relu_vec() + test_dropout_vec() + test_gelu_vec() diff --git a/berkay_workpace/tests/reusable_tests/multiprogram_cudatest.py b/berkay_workpace/tests/reusable_tests/multiprogram_cudatest.py new file mode 100644 index 0000000000..1b8dae3247 --- /dev/null +++ b/berkay_workpace/tests/reusable_tests/multiprogram_cudatest.py @@ -0,0 +1,57 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
+import dace +from dace.transformation import optimizer +from dace.transformation.dataflow import GPUTransformMap +import numpy as np +import pytest + + +@dace.program +def prog1(A: dace.float32[32], B: dace.float32[32]): + + @dace.map + def work1(i: _[0:32]): + a << A[i] + b >> B[i] + b = a * 2.0 + + +@dace.program +def prog2(A: dace.float32[32], B: dace.float32[32]): + + @dace.map + def work2(i: _[0:32]): + a << A[i] + b >> B[i] + b = a / 2.0 + + +###################################### +@pytest.mark.gpu +def test_multiprogram(): + print('Multi-program CUDA test') + + A = np.random.rand(32).astype(np.float32) + B = np.random.rand(32).astype(np.float32) + C = np.random.rand(32).astype(np.float32) + + s1 = prog1.to_sdfg() + s1.apply_transformations(GPUTransformMap) + + s2 = prog2.to_sdfg() + s2.apply_transformations(GPUTransformMap) + + s1func = s1.compile() + s2func = s2.compile() + + s1func(A=A, B=B) + s2func(A=B, B=C) + + diff = np.linalg.norm(A - C) + + print('Difference:', diff) + assert diff <= 1e-5 + + +if __name__ == '__main__': + test() diff --git a/berkay_workpace/tests/reusable_tests/multistream_copy_cudatest.py b/berkay_workpace/tests/reusable_tests/multistream_copy_cudatest.py new file mode 100644 index 0000000000..df307d9958 --- /dev/null +++ b/berkay_workpace/tests/reusable_tests/multistream_copy_cudatest.py @@ -0,0 +1,93 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +import dace +import numpy as np +import pytest + + +###################################### +@pytest.mark.gpu +def test_multistream_copy(): + sdfg = dace.SDFG('multistream') + + _, A = sdfg.add_array('A', [2], dace.float32, storage=dace.StorageType.CPU_Pinned) + _, B = sdfg.add_array('B', [2], dace.float32, storage=dace.StorageType.CPU_Pinned) + _, C = sdfg.add_array('C', [2], dace.float32, storage=dace.StorageType.CPU_Pinned) + + gA = sdfg.add_transient('gA', [2], dace.float32, storage=dace.StorageType.GPU_Global) + gB = sdfg.add_transient('gB', [2], dace.float32, storage=dace.StorageType.GPU_Global) + gC = sdfg.add_transient('gC', [2], dace.float32, storage=dace.StorageType.GPU_Global) + + state = sdfg.add_state('s0') + + a1 = state.add_read('A') + a2 = state.add_access('gA') + + b1 = state.add_read('B') + b2 = state.add_access('gB') + + c1 = state.add_access('gC') + c2 = state.add_write('C') + + state.add_nedge(a1, a2, dace.Memlet.from_array('A', A)) + state.add_nedge(b1, b2, dace.Memlet.from_array('B', B)) + state.add_nedge(c1, c2, dace.Memlet.from_array('C', C)) + + state.add_nedge(a2, c1, dace.Memlet.simple('gA', '0')) + state.add_nedge(b2, c1, dace.Memlet.simple('gB', '1', other_subset_str='1')) + + # Validate correctness of initial SDFG + sdfg.validate() + + a = np.random.rand(2).astype(np.float32) + b = np.random.rand(2).astype(np.float32) + c = np.random.rand(2).astype(np.float32) + + sdfg(A=a, B=b, C=c) + + refC = np.array([a[0], b[1]], dtype=np.float32) + diff = np.linalg.norm(c - refC) + print('Difference:', diff) + assert diff <= 1e-5 + + +@pytest.mark.gpu +def test_copy_sync(): + sdfg = dace.SDFG('h2dsync') + sdfg.add_scalar('scal_outer', dace.float32) + sdfg.add_scalar('gpu_scal_outer', dace.float32, dace.StorageType.GPU_Global, transient=True) + sdfg.add_array('output_outer', [1], dace.float32) + + nsdfg = dace.SDFG('nested') + nsdfg.add_scalar('gpu_scal', dace.float32, dace.StorageType.GPU_Global) + nsdfg.add_scalar('cpu_scal', dace.float32, transient=True) + nsdfg.add_array('output', [1], dace.float32) + + nstate = nsdfg.add_state() + r = 
nstate.add_read('gpu_scal') + a = nstate.add_access('cpu_scal') + nt = nstate.add_tasklet('addone', {'inp'}, {'out'}, 'out = inp + 1') + w = nstate.add_write('output') + nstate.add_nedge(r, a, dace.Memlet('gpu_scal')) + nstate.add_edge(a, None, nt, 'inp', dace.Memlet('cpu_scal')) + nstate.add_edge(nt, 'out', w, None, dace.Memlet('output')) + + state = sdfg.add_state() + r = state.add_read('scal_outer') + w = state.add_write('gpu_scal_outer') + state.add_nedge(r, w, dace.Memlet('scal_outer')) + + state = sdfg.add_state_after(state) + ro = state.add_read('gpu_scal_outer') + wo = state.add_write('output_outer') + nsdfg_node = state.add_nested_sdfg(nsdfg, None, {'gpu_scal'}, {'output'}) + state.add_edge(ro, None, nsdfg_node, 'gpu_scal', dace.Memlet('gpu_scal_outer')) + state.add_edge(nsdfg_node, 'output', wo, None, dace.Memlet('output_outer')) + + out = np.random.rand(1).astype(np.float32) + sdfg(scal_outer=np.float32(2), output_outer=out) + assert np.allclose(out, 3) + + +if __name__ == '__main__': + test_multistream_copy() + test_copy_sync() diff --git a/berkay_workpace/tests/reusable_tests/multistream_kernel_cudatest.py b/berkay_workpace/tests/reusable_tests/multistream_kernel_cudatest.py new file mode 100644 index 0000000000..f1451003ac --- /dev/null +++ b/berkay_workpace/tests/reusable_tests/multistream_kernel_cudatest.py @@ -0,0 +1,79 @@ +# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. +import dace +import numpy as np +import pytest + +sdfg = dace.SDFG('multistream_kernel') + +sdfg.add_array('A', [2], dace.float32, storage=dace.StorageType.CPU_Pinned) +sdfg.add_array('B', [2], dace.float32, storage=dace.StorageType.CPU_Pinned) +sdfg.add_array('C', [2], dace.float32, storage=dace.StorageType.CPU_Pinned) + +sdfg.add_transient('gA1', [2], dace.float32, storage=dace.StorageType.GPU_Global) +sdfg.add_transient('gA2', [2], dace.float32, storage=dace.StorageType.GPU_Global) +sdfg.add_transient('gB1', [2], dace.float32, storage=dace.StorageType.GPU_Global) +sdfg.add_transient('gB2', [2], dace.float32, storage=dace.StorageType.GPU_Global) +sdfg.add_transient('gC', [2], dace.float32, storage=dace.StorageType.GPU_Global) + +state = sdfg.add_state('s0') + +a = state.add_read('A') +ga1 = state.add_access('gA1') +ga2 = state.add_access('gA2') +state.add_nedge(a, ga1, dace.Memlet.simple('A', '0:2')) + +b = state.add_read('B') +gb1 = state.add_access('gB1') +gb2 = state.add_access('gB2') +state.add_nedge(b, gb1, dace.Memlet.simple('B', '0:2')) + +gc = state.add_access('gC') +c = state.add_write('C') +state.add_nedge(gc, c, dace.Memlet.simple('gC', '0:2')) + +t1, me1, mx1 = state.add_mapped_tasklet('addone', dict(i='0:2'), + dict(inp=dace.Memlet.simple('gA1', 'i')), 'out = inp + 1', + dict(out=dace.Memlet.simple('gA2', 'i')), dace.ScheduleType.GPU_Device) +t2, me2, mx2 = state.add_mapped_tasklet('addtwo', dict(i='0:2'), + dict(inp=dace.Memlet.simple('gB1', 'i')), 'out = inp + 2', + dict(out=dace.Memlet.simple('gB2', 'i')), dace.ScheduleType.GPU_Device) + +t2, me3, mx3 = state.add_mapped_tasklet('twoarrays', dict(i='0:2'), + dict(inp1=dace.Memlet.simple('gA2', 'i'), + inp2=dace.Memlet.simple('gB2', 'i')), 'out = inp1 * inp2', + dict(out=dace.Memlet.simple('gC', 'i')), dace.ScheduleType.GPU_Device) + +state.add_nedge(ga1, me1, dace.Memlet.simple('gA1', '0:2')) +state.add_nedge(gb1, me2, dace.Memlet.simple('gB1', '0:2')) +state.add_nedge(mx1, ga2, dace.Memlet.simple('gA2', '0:2')) +state.add_nedge(mx2, gb2, dace.Memlet.simple('gB2', '0:2')) + +state.add_nedge(ga2, me3, 
dace.Memlet.simple('gA2', '0:2')) +state.add_nedge(gb2, me3, dace.Memlet.simple('gB2', '0:2')) +state.add_nedge(mx3, gc, dace.Memlet.simple('gC', '0:2')) + +sdfg.fill_scope_connectors() + +# Validate correctness of initial SDFG +sdfg.validate() + + +###################################### +@pytest.mark.gpu +def test_multistream_kernel(): + print('Multi-stream kernel test') + + a = np.random.rand(2).astype(np.float32) + b = np.random.rand(2).astype(np.float32) + c = np.random.rand(2).astype(np.float32) + + sdfg(A=a, B=b, C=c) + + refC = (a + 1) * (b + 2) + diff = np.linalg.norm(c - refC) + print('Difference:', diff) + assert diff <= 1e-5 + + +if __name__ == "__main__": + test_multistream_kernel() diff --git a/berkay_workpace/tests/smem_tests/gemm_test.py b/berkay_workpace/tests/smem_tests/gemm_test.py index a42afc5f14..925d4aac5d 100644 --- a/berkay_workpace/tests/smem_tests/gemm_test.py +++ b/berkay_workpace/tests/smem_tests/gemm_test.py @@ -13,7 +13,7 @@ def test_gemm(): of a GEMM SDFG using 2D block tiling with custom copy. """ current_dir = os.path.dirname(os.path.abspath(__file__)) - sdfg_path = os.path.join(current_dir, '../../scratch/yakups_examples/2d_blocktiled_gemm_with_custom_copy.sdfg') + sdfg_path = os.path.join(current_dir, '../../scratch/yakups_examples/smem_related/2d_blocktiled_gemm_with_custom_copy.sdfg') sdfg = dace.SDFG.from_file(sdfg_path) m, n, k = 1024, 1024, 1024 diff --git a/berkay_workpace/tests/smem_tests/special_sync_pass_test.py b/berkay_workpace/tests/smem_tests/special_sync_pass_test.py index 5338933f4b..1d56f31df9 100644 --- a/berkay_workpace/tests/smem_tests/special_sync_pass_test.py +++ b/berkay_workpace/tests/smem_tests/special_sync_pass_test.py @@ -17,7 +17,7 @@ def test_correctness_and_reuse(): smem locations which they also write to, so synchronization is not stictly needed here) """ current_dir = os.path.dirname(os.path.abspath(__file__)) - sdfg_path = os.path.join(current_dir, '../../scratch/yakups_examples/nice_global_to_shared_copy.sdfg') + sdfg_path = os.path.join(current_dir, '../../scratch/yakups_examples/smem_related/nice_global_to_shared_copy.sdfg') sdfg = dace.SDFG.from_file(sdfg_path) size = 512 diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 9a54f12d7b..d312fcf10d 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -25,7 +25,9 @@ from dace.codegen.targets.cpp import ( codeblock_to_cpp, memlet_copy_to_absolute_strides, - mangle_dace_state_struct_name + mangle_dace_state_struct_name, + ptr, + sym2cpp ) from dace.codegen.targets.target import IllegalCopy, TargetCodeGenerator, make_absolute @@ -36,7 +38,7 @@ # Experimental CUDA helper imports from dace.codegen.targets.experimental_cuda_helpers.gpu_stream_manager import GPUStreamManager -from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import symbolic_to_cpp, product +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import symbolic_to_cpp, product, emit_sync_debug_checks # Type checking imports (conditional) if TYPE_CHECKING: @@ -45,19 +47,13 @@ # TODO's easy: -# 1. Handle memory pools release -# 2. Handle sync properties -# 3. Emit sync +# 3. Emit sync -> yea not easy + +# add symbolic_to_cpp ! # TODO's harder: # 1. Include constant expressions -# Question: Getting "const" expressions leads to some issues. -# So it looks like, that I need to do make this visible to lower -# generation as well. 
- - -# extended todo: get const, like in a general way without a hack in a scope @registry.autoregister_params(name='experimental_cuda') @@ -93,8 +89,6 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): - self._scope_has_collaborative_copy = False - self._localcode = CodeIOStream() self._globalcode = CodeIOStream() @@ -197,6 +191,7 @@ def preprocess(self, sdfg: SDFG) -> None: #------------------------- GPU Stream related Logic -------------------------- + # Register GPU context in state struct self._frame.statestruct.append('dace::cuda::Context *gpu_context;') @@ -213,7 +208,9 @@ def preprocess(self, sdfg: SDFG) -> None: #----------------- Shared Memory Synchronization related Logic ----------------- - DefaultSharedMemorySync().apply_pass(sdfg, None) + auto_sync = Config.get('compiler', 'cuda', 'auto_syncthreads_insertion') + if auto_sync: + DefaultSharedMemorySync().apply_pass(sdfg, None) #------------------------- Memory Pool related Logic -------------------------- @@ -263,10 +260,14 @@ def _compute_pool_release(self, top_sdfg: SDFG): if all(nx.has_path(state.nx, an2, an1) for an2 in ans if an2 is not an1): terminator = an1 break - + + # Old logic below, now we use the gpu_stream manager which returns nullptr automatically + # to all nodes thatdid not got assigned a cuda stream + """ # Enforce a cuda_stream field so that the state-wide deallocation would work if not hasattr(an1, '_cuda_stream'): an1._cuda_stream = 'nullptr' + """ # If access node was found, find the point where all its reads are complete terminators = set() @@ -502,6 +503,7 @@ def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: self._localcode.write(f'DACE_KERNEL_LAUNCH_CHECK(__err, "{kernel_name}", {gdims}, {bdims});') + emit_sync_debug_checks(self.backend, self._localcode) self._localcode.write('}') ########################################################################### @@ -517,8 +519,9 @@ def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView CopyContext, CopyStrategy, OutOfKernelCopyStrategy, - WithinGPUCopyStrategy, - FallBackGPUCopyStrategy, + SyncCollaboritveGPUCopyStrategy, + AsyncCollaboritveGPUCopyStrategy, + FallBackGPUCopyStrategy ) context = CopyContext(self, self._gpu_stream_manager, state_id, src_node, dst_node, edge, @@ -527,7 +530,8 @@ def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView # Order matters: fallback must come last strategies: List[CopyStrategy] = [ OutOfKernelCopyStrategy(), - WithinGPUCopyStrategy(), + SyncCollaboritveGPUCopyStrategy(), + AsyncCollaboritveGPUCopyStrategy(), FallBackGPUCopyStrategy() ] @@ -580,28 +584,37 @@ def generate_state(self, callsite_stream: CodeIOStream, generate_state_footer: bool = False) -> None: - if ExperimentalCUDACodeGen._in_kernel_code: - self.generate_devicelevel_state(sdfg, cfg, state, function_stream, callsite_stream) - else: - self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream, generate_state_footer=False) - - def generate_devicelevel_state(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, - function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - # Special case: if this is a GPU grid state and something is reading - # from a possible result of a collaborative write, sync first - if self._toplevel_schedule == dtypes.ScheduleType.GPU_Device: - for node in state.nodes(): - if (isinstance(node, nodes.AccessNode) and node.desc(sdfg).storage == dtypes.StorageType.GPU_Shared - and state.in_degree(node) 
== 0 and state.out_degree(node) > 0): - break - return - + # User frame code to generate state self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream) - def _emit_sync(self, codestream: CodeIOStream): - if Config.get_bool('compiler', 'cuda', 'syncdebug'): - codestream.write('''DACE_GPU_CHECK({backend}GetLastError()); - DACE_GPU_CHECK({backend}DeviceSynchronize());'''.format(backend=self.backend)) + # Special: Release of pooled memory if not in device code that need to be released her + if not ExperimentalCUDACodeGen._in_kernel_code: + + handled_keys = set() + backend = self.backend + for (pool_sdfg, name), (pool_state, _) in self.pool_release.items(): + + if (pool_sdfg is not sdfg) or (pool_state is not state): + continue + + data_descriptor = pool_sdfg.arrays[name] + ptrname = ptr(name, data_descriptor, pool_sdfg, self._frame) + + # Adjust if there is an offset + if isinstance(data_descriptor, dt.Array) and data_descriptor.start_offset != 0: + ptrname = f'({ptrname} - {sym2cpp(data_descriptor.start_offset)})' + + # Free the memory + callsite_stream.write(f'DACE_GPU_CHECK({backend}Free({ptrname}));\n', pool_sdfg) + + emit_sync_debug_checks(self.backend, callsite_stream) + + # We handled the key (pool_sdfg, name) and can remove it later + handled_keys.add((pool_sdfg, name)) + + # Delete the handled keys here (not in the for loop, which would cause issues) + for key in handled_keys: + del self.pool_release[key] def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -739,14 +752,15 @@ def _prepare_GPU_Global_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta arrsize_malloc = f'{symbolic_to_cpp(arrsize)} * sizeof({nodedesc.dtype.ctype})' if nodedesc.pool: - cudastream = getattr(node, '_cuda_stream', 'nullptr') - if cudastream != 'nullptr': - cudastream = f'__state->gpu_context->streams[{cudastream}]' + gpu_stream_manager = self._gpu_stream_manager + gpu_stream = gpu_stream_manager.get_stream_node(node) + if gpu_stream != 'nullptr': + gpu_stream = f'__state->gpu_context->streams[{gpu_stream}]' allocation_stream.write( - f'DACE_GPU_CHECK({self.backend}MallocAsync((void**)&{dataname}, {arrsize_malloc}, {cudastream}));\n', + f'DACE_GPU_CHECK({self.backend}MallocAsync((void**)&{dataname}, {arrsize_malloc}, {gpu_stream}));\n', cfg, state_id, node ) - self._emit_sync(allocation_stream) + emit_sync_debug_checks(self.backend, allocation_stream) else: # Strides are left to the user's discretion allocation_stream.write( @@ -967,6 +981,12 @@ def get_generated_codeobjects(self): #include <{backend_header}> #include +// New, cooperative groups and asnyc copy +#include +#include + +namespace cg = cooperative_groups; + {file_header} DACE_EXPORTED int __dace_init_experimental_cuda({sdfg_state_name} *__state{params}); diff --git a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py index 9b740d547d..7ee48755fd 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py @@ -5,7 +5,7 @@ from dace import Memlet, dtypes from dace.dtypes import StorageType from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen, GPUStreamManager, KernelSpec -from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import product, symbolic_to_cpp +from 
dace.codegen.targets.experimental_cuda_helpers.gpu_utils import product, symbolic_to_cpp, emit_sync_debug_checks from dace.codegen.prettycode import CodeIOStream @@ -251,6 +251,9 @@ def generate_copy(self, copy_context: CopyContext) -> None: # sanity check assert num_dims > 2, f"Expected copy shape with more than 2 dimensions, but got {num_dims}." self._generate_nd_copy(copy_context) + + # We use library calls thus for debugging we provide sync option + emit_sync_debug_checks(copy_context.backend, copy_context.callsite_stream) def _generate_1d_copy(self, copy_context: CopyContext) -> None: """ @@ -268,10 +271,10 @@ def _generate_1d_copy(self, copy_context: CopyContext) -> None: # ----------------- Generate backend call -------------------- if copy_context.is_contiguous_copy(): # Memory is linear: can use {backend}MemcpyAsync - num_bytes = f'{product(copy_shape)} * sizeof({ctype})' + copysize = ' * '.join(symbolic_to_cpp(copy_shape)) + copysize += f' * sizeof({ctype})' kind = f'{backend}Memcpy{src_location}To{dst_location}' - - call = f'DACE_GPU_CHECK({backend}MemcpyAsync({dst_expr}, {src_expr}, {num_bytes}, {kind}, {cudastream}));\n' + call = f'DACE_GPU_CHECK({backend}MemcpyAsync({dst_expr}, {src_expr}, {copysize}, {kind}, {cudastream}));\n' else: # Memory is strided: use {backend}Memcpy2DAsync with dpitch/spitch @@ -380,14 +383,14 @@ def _generate_nd_copy(self, copy_context: CopyContext) -> None: callsite_stream.write(call, cfg, state_id, [src_node, dst_node]) # Write for-loop footers - for d in range(num_dims - 2): + for dim in range(num_dims - 2): callsite_stream.write("}") ################ TODO, Might need to modified further ############# # Below: Does collaborative copy -class WithinGPUCopyStrategy(CopyStrategy): +class SyncCollaboritveGPUCopyStrategy(CopyStrategy): def applicable(self, copy_context: CopyContext) -> bool: """ @@ -406,6 +409,14 @@ def applicable(self, copy_context: CopyContext) -> bool: copy_context.dst_storage in gpu_storages): return False + + + dst_node = copy_context.dst_node + if isinstance(dst_node, nodes.AccessNode) and dst_node.async_copy: + return False + + + # --- Condition 2: Inside a GPU_Device map scope --- state = copy_context.state_dfg scope_dict = state.scope_dict() @@ -544,6 +555,117 @@ def _get_storagename(self, storage: dtypes.StorageType): + +class AsyncCollaboritveGPUCopyStrategy(CopyStrategy): + + def applicable(self, copy_context: CopyContext)-> bool: + + from dace.sdfg import scope_contains_scope + from dace.transformation import helpers + + # --- Condition 1: GPU to GPU memory transfer --- + gpu_storages = {dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared} + if not (copy_context.src_storage in gpu_storages and + copy_context.dst_storage in gpu_storages): + return False + + + + dst_node = copy_context.dst_node + if not (isinstance(dst_node, nodes.AccessNode) and dst_node.async_copy): + return False + + + # --- Condition 2: Inside a GPU_Device map scope --- + state = copy_context.state_dfg + scope_dict = state.scope_dict() + + # Determine which node (src or dst) is in the deeper scope + src, dst = copy_context.src_node, copy_context.dst_node + deeper_scope_node = dst if scope_contains_scope(scope_dict, src, dst) else src + + # Determine the schedule type of the innermost non-sequential map. + # If no such map exists, use the default schedule. 
+ current_node = deeper_scope_node + while (current_node is None or not isinstance(current_node, nodes.MapEntry) or + current_node.map.schedule == dtypes.ScheduleType.Sequential): + + parent = helpers.get_parent_map(state, current_node) + if parent is None: + current_node = None + break + current_node, state = parent + + if current_node is None: + schedule_type = dtypes.SCOPEDEFAULT_SCHEDULE[None] + else: + schedule_type = current_node.map.schedule + + return schedule_type == dtypes.ScheduleType.GPU_Device + + + + def generate_copy(self, copy_context: CopyContext): + + # Show Yakup: + # Asynchronous memory copies are only allowed if they are contiguous + if not copy_context.is_contiguous_copy(): + raise NotImplementedError("Asynchronous memory copies are not supported for not contigous memory copies") + + + # Get required copy information + copy_shape, src_strides, dst_strides = copy_context.get_transfer_layout() + src_expr, dst_expr = copy_context.src_expr, copy_context.dst_expr + + sdfg = copy_context.sdfg + dtype = copy_context.src_node.desc(sdfg).dtype + ctype = dtype.ctype + + # Get write context: + callsite_stream, cfg, state_id, src_node, dst_node = copy_context.get_write_context() + # copy dimension + num_dims = len(copy_shape) + + if num_dims == 1: + pipeline = dst_node.async_pipeline + size = f'{product(copy_shape)} *sizeof({ctype})' + callsite_stream.write(f"cuda::memcpy_async(block, {dst_expr}, {src_expr}, {size}, {pipeline});\n", cfg, state_id, [src_node, dst_node]) + + elif num_dims > 1: + + # No built-in functionality for higher dimension copies- + # But solvable looping and doing 1D copies + + # write for-loop header: + for dim in range(num_dims - 1): + callsite_stream.write( + f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{") + + + offset_src = ' + '.join(f'(__copyidx{d} * ({s}))' for d, s in enumerate(src_strides[:-1])) + offset_dst = ' + '.join(f'(__copyidx{d} * ({s}))' for d, s in enumerate(dst_strides[:-1])) + + size = f'{copy_shape[-1]} *sizeof({ctype})' + src = f'{src_expr} + {offset_src}' + dst = f'{dst_expr} + {offset_dst}' + + callsite_stream.write(f"cuda::memcpy_async(block, {dst}, {src}, {size}, {pipeline});\n", cfg, state_id, [src_node, dst_node]) + + # Write for-loop footers + for dim in range(num_dims - 2): + callsite_stream.write("}") + + + else: + # Should not be possible- otherwise, doing nothing is also okay + # because a empty copy shape means we don't copy anything + pass + + + emit_sync_debug_checks(copy_context.backend, copy_context.callsite_stream) + + + class FallBackGPUCopyStrategy(CopyStrategy): def applicable(self, copy_context: CopyContext)-> bool: diff --git a/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py index 3a4f3dcde1..f487db5a88 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py +++ b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py @@ -1,8 +1,10 @@ import functools import sympy -from dace import symbolic -from dace.codegen import cppunparse +from dace import Config, symbolic + +from dace.codegen import cppunparse +from dace.codegen.prettycode import CodeIOStream def symbolic_to_cpp(arr): @@ -25,4 +27,18 @@ def product(iterable): Purpose: This function is used to improve readability of the codeGen. 
""" - return functools.reduce(sympy.Mul, iterable, 1) \ No newline at end of file + return functools.reduce(sympy.Mul, iterable, 1) + +def emit_sync_debug_checks(backend: str, codestream: CodeIOStream): + """ + Emit backend sync and error-check calls if synchronous debugging is enabled. + + Args: + backend (str): Backend API prefix (e.g., 'cuda'). + codestream (CodeIOStream): Stream to write code to. + """ + if Config.get_bool('compiler', 'cuda', 'syncdebug'): + codestream.write( + f"DACE_GPU_CHECK({backend}GetLastError());\n" + f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n" + ) \ No newline at end of file diff --git a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py index 5b886d61a0..2d4f91b605 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py @@ -78,7 +78,12 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV function_stream=function_stream, callsite_stream=callsite_stream, comment="Kernel scope") as scope_manager: - # ----------------- Retrieve kernel configuration ----------------------- + # ----------------- Initialize Kernel Scope Constructs ----------------------- + + self._generate_kernel_initialization(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + + # ----------------- Retrieve kernel configuration ----------------------- + kernel_spec = self._current_kernel_spec kernel_entry_node = kernel_spec._kernel_entry_node # = dfg_scope.source_nodes()[0] kernel_map = kernel_spec.kernel_map @@ -193,7 +198,60 @@ def _generate_kernel_signature(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco f'__global__ void {launch_bounds} {kernel_name}({", ".join(kernel_args)}) ', cfg, state_id, node ) + + def _generate_kernel_initialization(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream): + + """ + Tell yakup: + 1. This is as far as I know really cuda specific- maybe I should raise an error if wrong backend is used + 2. What about the shared state allocation? Is it correct to tell about this allocation? generally, did I + tell the dispatcher everything correctly? 
+ """ + + # Skip this if there are no metada, nothing to initialize + metadata = sdfg.metadata + if metadata == None: + return + + node = dfg_scope.source_nodes()[0] + + callsite_stream.write(f"\n", cfg, state_id, node) + # initialize block group using coopertive groups + tblock_obj_name = "block" + tblock_obj_ctype = "auto" + callsite_stream.write(f"{tblock_obj_ctype} {tblock_obj_name} = cg::this_thread_block();\n", cfg, state_id, node) + self._dispatcher.defined_vars.add(tblock_obj_name, DefinedType.Object, tblock_obj_ctype) + + # initialize pipeline + pipelines = dict() + for node_guid, node_meta in metadata.items(): + pipelines = node_meta.get("pipelines", {}) + for pipeline_name, pipeline_info in pipelines.items(): + pipelines[pipeline_name] = pipeline_info["pipeline_depth"] + + + + for pipeline_name, pipeline_depth in pipelines.items(): + callsite_stream.write(f"\n", cfg, state_id, node) + # initialize pipeline depth scalar + depth_name = f"pipeline_depth_{pipeline_name}" + depth_ctype = "const uint" + callsite_stream.write(f"{depth_ctype} {depth_name} = {pipeline_depth};\n", cfg, state_id, node) + self._dispatcher.defined_vars.add(depth_name, DefinedType.Scalar, depth_ctype) + + # allocate shared pipeline state + shared_state_name = f"shared_state_{pipeline_name}" + shared_state_ctype = f"cuda::pipeline_shared_state" + callsite_stream.write(f" __shared__ {shared_state_ctype} {shared_state_name};\n") + self._dispatcher.declared_arrays.add(shared_state_name, DefinedType.Pointer, shared_state_ctype) + + # intialize the pipeline + pipeline_ctype = "auto" + callsite_stream.write(f"{pipeline_ctype} {pipeline_name} = cuda::make_pipeline({tblock_obj_name}, &{shared_state_name});\n", cfg, state_id, node) + self._dispatcher.defined_vars.add(pipeline_name, DefinedType.Object, pipeline_ctype) + callsite_stream.write(f"\n", cfg, state_id, node) class ThreadBlockScopeGenerator(ScopeGenerationStrategy): diff --git a/dace/config_schema.yml b/dace/config_schema.yml index 70ed877dee..ab34f33f05 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -457,6 +457,7 @@ required: Yakup Koray Budanaz for Berkays master-thesis. enum: [legacy, experimental] default: experimental + gpu_index_type: type: str title: Thread/block/warp index data type @@ -469,6 +470,7 @@ required: range, or to reduce memory usage. This replaces ``thread_id_type`` in ``ExperimentalCUDACodeGen`` , as the new name more accurately reflects its broader usage. + cuda_warp_size: type: int title: CUDA warp size @@ -487,6 +489,17 @@ required: The default value for AMD GPUs is typically 64. This setting should only be modified if you have a clear understanding of what you are doing. default: 64 + + auto_syncthreads_insertion: + type: bool + title: Insert Default __syncthreads() Tasklets + description: > + If enabled, inserts default __syncthreads() tasklets during preprocessing + in ExperimentalCUDACodeGen to ensure shared memory is ready before access. + This is a simple safeguard for correctness—it may not be complete, but it + does the job for basic SDFGs. Disable if you handle synchronization manually + or use other mechanisms like async copies or pipelines. 
+ default: True ############################################# # General FPGA flags diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index e71d95d26a..06a8fae71d 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -283,6 +283,16 @@ class AccessNode(Node): default=dtypes.DataInstrumentationType.No_Instrumentation) instrument_condition = CodeProperty(desc="Condition under which to trigger the instrumentation", default=CodeBlock("1", language=dtypes.Language.CPP)) + + # Experimental-CUDA-specific properties + async_copy = Property(dtype=bool, + desc="Marks the data copy to this node (if any) as asynchronous (CUDA-specific).", + default=False) + + async_pipeline = Property(dtype=str, + desc="Name of the CUDA pipeline responsible for synchronization. " + "Only relevant if async_copy is True. May be None.", + allow_none=True) def __init__(self, data, debuginfo=None): super(AccessNode, self).__init__() @@ -311,6 +321,9 @@ def __deepcopy__(self, memo): node._guid = graph.generate_element_id(node) + node._async_copy = self._async_copy + node._async_pipeline = self._async_pipeline + return node @property diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index a612ea9f27..2169de72a2 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -470,6 +470,11 @@ class SDFG(ControlFlowRegion): default=False, desc="Whether the SDFG contains explicit control flow constructs") + metadata = Property(dtype=dict, + desc="Metada attached to the SDFG", + default=None, + allow_none=True) + def __init__(self, name: str, constants: Dict[str, Tuple[dt.Data, Any]] = None, @@ -562,6 +567,9 @@ def __deepcopy__(self, memo): if fixed: warnings.warn(f'Fixed {fixed} nested SDFG parent references during deep copy.') + # copy metadata + result._metadata = copy.deepcopy(self._metadata, memo) + return result @property diff --git a/dace/transformation/passes/gpustream_scheduling.py b/dace/transformation/passes/gpustream_scheduling.py index 76de3e351e..7c335e988f 100644 --- a/dace/transformation/passes/gpustream_scheduling.py +++ b/dace/transformation/passes/gpustream_scheduling.py @@ -52,7 +52,6 @@ def apply_pass(self, sdfg: SDFG, _) -> Dict[nodes.Node, Union[int, str]]: """ Assigns GPU streams and adds synchronization tasklets. """ - assigned_nodes = self._assign_streams_to_sdfg(sdfg) num_assigned_streams = max(assigned_nodes.values(), default=0) @@ -63,7 +62,7 @@ def apply_pass(self, sdfg: SDFG, _) -> Dict[nodes.Node, Union[int, str]]: for k in assigned_nodes.keys(): assigned_nodes[k] = "nullptr" - self._add_sync_tasklet(sdfg, assigned_nodes) + self._add_sync_tasklet_heuristically(sdfg, assigned_nodes) return assigned_nodes @@ -78,11 +77,11 @@ def _assign_streams_to_sdfg(self, sdfg: SDFG, assigned_nodes=None, visited=None) visited = set() for state in sdfg.states(): - self._assign_streams_to_state_recursively(sdfg, state, assigned_nodes, visited, 0) + self._assign_streams_to_state_recursively(sdfg, False, state, assigned_nodes, visited, 0) return assigned_nodes - def _assign_streams_to_state_recursively(self, sdfg: SDFG, state: SDFGState, assigned_nodes: Dict, visited: Set, gpu_stream:int): + def _assign_streams_to_state_recursively(self, sdfg: SDFG, in_nested_sdfg: bool, state: SDFGState, assigned_nodes: Dict, visited: Set, gpu_stream:int): """ Processes connected components in a state, assigning each to a different GPU stream, but only if they contain GPU-related nodes (otherwise, stream assignment is skipped). 
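
For illustration, the synchronization tasklets this pass inserts reduce to a single stream-synchronize call on the chosen backend. A minimal hand-written CUDA host sketch of that behaviour follows; it is not generator output, DACE_GPU_CHECK and the __state->gpu_context->streams[...] lookup are replaced by plain runtime calls here, and the stream setup exists only to keep the example self-contained.

#include <cuda_runtime.h>
#include <cstdio>

// Equivalent of a generated "gpu_stream_sync_<N>" tasklet body:
// DACE_GPU_CHECK(cudaStreamSynchronize(<gpu_stream_access_expr>));
static void sync_stream(cudaStream_t s)
{
    cudaError_t err = cudaStreamSynchronize(s);
    if (err != cudaSuccess)
        std::printf("stream sync failed: %s\n", cudaGetErrorString(err));
}

int main()
{
    cudaStream_t stream1 = nullptr;
    cudaStreamCreate(&stream1);
    sync_stream(stream1);   // node assigned to concurrent stream 1
    sync_stream(nullptr);   // unassigned node, synchronizes the default ("nullptr") stream
    cudaStreamDestroy(stream1);
    return 0;
}
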
@@ -115,15 +114,15 @@ def _assign_streams_to_state_recursively(self, sdfg: SDFG, state: SDFGState, ass # Recursively process nested SDFG states with same stream if isinstance(src, nodes.NestedSDFG): for nested_state in src.sdfg.states(): - self._assign_streams_to_state_recursively(src.sdfg, nested_state, assigned_nodes, visited, gpu_stream) + self._assign_streams_to_state_recursively(src.sdfg, True, nested_state, assigned_nodes, visited, gpu_stream) if isinstance(dst, nodes.NestedSDFG): for nested_state in dst.sdfg.states(): - self._assign_streams_to_state_recursively(dst.sdfg, nested_state, assigned_nodes, visited, gpu_stream) + self._assign_streams_to_state_recursively(dst.sdfg, True, nested_state, assigned_nodes, visited, gpu_stream) - # Move to next stream if we assigned any nodes in this component - if len(assigned_nodes) > nodes_assigned_before: + # Move to next stream if we assigned any nodes in this component (careful: if nested, states are in same component) + if not in_nested_sdfg and len(assigned_nodes) > nodes_assigned_before: gpu_stream = self._next_stream(gpu_stream) def _is_gpu_node(self, node: nodes.Node, sdfg: SDFG) -> bool: @@ -172,37 +171,213 @@ def _next_stream(self, gpu_stream: int) -> int: else: return (gpu_stream + 1) % self.max_concurrent_streams - def _add_sync_tasklet(self, sdfg: SDFG, assigned_nodes: dict): + def _add_sync_tasklet(self, sdfg: SDFG, assigned_nodes: Dict): """ - Adds a synchronization tasklet for each sink node in a connected component, - but only for top-level states (not inside nested SDFGs). + Adds a GPU stream synchronization tasklet after kernels finish execution. - Specifically: - - If a sink node is an AccessNode and has been assigned a GPU stream, - a tasklet is inserted after it to call stream synchronization. - - This ensures proper synchronization. + Synchronization is inserted: + - After kernel writes to a sink GPU AccessNode + - After a GPU-to-CPU (Device to Host) copy + - Raises NotImplementedError for unhandled AccessNode->non-AccessNode GPU edges """ - for state in sdfg.states(): - for snode in state.sink_nodes(): - - if isinstance(snode, nodes.AccessNode) and snode in assigned_nodes.keys(): - - # get correct stream access expr - stream = assigned_nodes[snode] - if stream == "nullptr": - gpu_stream_access_expr = "nullptr" - else: - gpu_stream_access_expr = self.gpu_stream_access_template.format(gpu_stream=stream) - # Add tasklet and connect it to the sink node - tasklet = state.add_tasklet( - name=f"sync_{stream}", inputs=set(), outputs=set(), - code=f"DACE_GPU_CHECK({self.backend}StreamSynchronize({gpu_stream_access_expr}));\n", - language=dtypes.Language.CPP - ) - - state.add_edge(snode, None, tasklet, None, dace.Memlet()) + + # Track all GPU Device kernel exits and their states + gpu_kernel_exits: Dict[nodes.MapExit, SDFGState] = {} + + for node, parent in sdfg.all_nodes_recursive(): + if isinstance(node, nodes.MapExit) and node.schedule == dtypes.ScheduleType.GPU_Device: + if not isinstance(parent, SDFGState): + raise NotImplementedError(f"Unexpected parent type '{type(parent).__name__}' for node '{node}'. " + "Expected 'SDFGState'. 
Please handle this case explicitly.") + + gpu_kernel_exits[node] = parent + + + # Determine where to insert synchronization tasklets + insert_sync_after: Dict[nodes.Node, SDFGState] = {} + + for exit_node, state in gpu_kernel_exits.items(): + for edge in state.dfs_edges(exit_node): + src, dst = edge.src, edge.dst + + + src_is_gpu = isinstance(src, nodes.AccessNode) and \ + src.desc(state.parent).storage == dtypes.StorageType.GPU_Global + dst_is_gpu = isinstance(dst, nodes.AccessNode) and \ + dst.desc(state.parent).storage == dtypes.StorageType.GPU_Global + + # Case 1: MapExit → GPU AccessNode (sink node) + if (src is exit_node and isinstance(dst, nodes.AccessNode) and dst_is_gpu + and state.out_degree(dst) == 0): + insert_sync_after[dst] = state + + # Case 2: GPU AccessNode → Non-GPU AccessNode + elif src_is_gpu and isinstance(dst, nodes.AccessNode) and not dst_is_gpu: + insert_sync_after[dst] = state + + # Case 3: GPU AccessNode → non-AccessNode (unexpected case) + elif src_is_gpu and not isinstance(dst, nodes.AccessNode): + raise NotImplementedError(f"Found edge from GPU AccessNode '{src}' to non-AccessNode '{dst}' after kernel computation. " + "This case is currently not handled and may need special sync logic.") + + # handle this + for edge, state in sdfg.all_edges_recursive(): + src, dst = edge.src, edge.dst + + src_is_not_gpu = isinstance(src, nodes.AccessNode) and \ + src.desc(state.parent).storage != dtypes.StorageType.GPU_Global + dst_is_gpu = isinstance(dst, nodes.AccessNode) and \ + dst.desc(state.parent).storage == dtypes.StorageType.GPU_Global + + if src_is_not_gpu and dst_is_gpu: + if not isinstance(state, SDFGState): + raise NotImplementedError(f"Unexpected parent type '{type(parent).__name__}' for node '{node}'. " + "Expected 'SDFGState'. Please handle this case explicitly.") + insert_sync_after[dst] = state + + # Insert synchronization tasklets + for node, kernel_state in insert_sync_after.items(): + + # get correct stream access expr + stream = assigned_nodes.get(node, "nullptr") + if stream == "nullptr": + gpu_stream_access_expr = "nullptr" + else: + gpu_stream_access_expr = self.gpu_stream_access_template.format(gpu_stream=stream) + + tasklet = kernel_state.add_tasklet( + name=f"gpu_stream_sync_{stream}", inputs=set(), outputs=set(), + code=f"DACE_GPU_CHECK({self.backend}StreamSynchronize({gpu_stream_access_expr}));\n", + language=dtypes.Language.CPP + ) + + # important: First get the successors, then add the tasklet + successors = list(kernel_state.successors(node)) + kernel_state.add_edge(node, None, tasklet, None, dace.Memlet()) + + # Also handles sink nodes (case 1). 
Nothing to add between successors + for succ in successors : + kernel_state.add_edge(tasklet, None, succ, None, dace.Memlet()) + + + + + def _add_sync_tasklet_heuristically(self, sdfg: SDFG, assigned_nodes: Dict): + # Keep track which streams should be synced at the end of a state + # And when we need to sync after a node + sync_state: Dict[SDFGState, Set[str]] = {} + sync_node: Dict[nodes.Node, SDFGState] = {} + + GPU_STORAGE = [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared] + + for edge, state in sdfg.all_edges_recursive(): + + src, dst = edge.src, edge.dst + + # Check node types and storage + src_is_gpu_accNode = isinstance(src, nodes.AccessNode) and \ + src.desc(state.parent).storage == dtypes.StorageType.GPU_Global + + dst_is_gpu_accNode = isinstance(dst, nodes.AccessNode) and \ + dst.desc(state.parent).storage == dtypes.StorageType.GPU_Global + + dst_is_nongpu_accNode = isinstance(dst, nodes.AccessNode) and \ + not dst.desc(state.parent).storage in GPU_STORAGE + src_is_nongpu_accNode = isinstance(src, nodes.AccessNode) and \ + not src.desc(state.parent).storage in GPU_STORAGE + + src_is_kernelExit = isinstance(src, nodes.ExitNode) and src.schedule == dtypes.ScheduleType.GPU_Device + + dst_is_sink_node = state.out_degree(dst) == 0 + + # Initialize sync_state for this state if not exists + if state not in sync_state: + sync_state[state] = set() + + # Apply heuristics + if src_is_gpu_accNode and dst_is_nongpu_accNode and dst_is_sink_node: + # GPU AccessNode -> Non GPU AccessNode which is a sink: Just sync at end of state + if dst not in assigned_nodes: + raise ValueError("Missed to assign gpu stream to dst") + sync_state[state].add(assigned_nodes[dst]) + + elif src_is_gpu_accNode and dst_is_nongpu_accNode and not dst_is_sink_node: + # GPU AccessNode -> Non GPU AccessNode which is NOT a sink -> Sync at end of state and after dst + if dst not in assigned_nodes: + raise ValueError("Missed to assign gpu stream to dst") + sync_state[state].add(assigned_nodes[dst]) + sync_node[dst] = state + + elif src_is_nongpu_accNode and dst_is_gpu_accNode: + # Non_GPU AccessNode -> GPU AccessNode -> No matter what, just at end of state sync is needed + if dst not in assigned_nodes: + raise ValueError("Missed to assign gpu stream to dst") + sync_state[state].add(assigned_nodes[dst]) + + elif src_is_kernelExit and dst_is_gpu_accNode and dst_is_sink_node: + # KernelExit -> GPU AccessNode which is a sink node -> add to state + if dst not in assigned_nodes: + raise ValueError("Missed to assign gpu stream to dst") + sync_state[state].add(assigned_nodes[dst]) + + else: + continue + + # we didn't return, so lets verify that sync state has truly on sdfgstates as keys + if not isinstance(state, SDFGState): + raise NotImplementedError(f"Unexpected parent type '{type(state).__name__}' for edge '{edge}'. " + "Expected 'SDFGState'. 
Please handle this case explicitly.") + + + + # Remove states with no streams to sync + sync_state = {state: streams for state, streams in sync_state.items() if streams} + for state, streams in sync_state.items(): + code = "" + for stream in streams: + if stream == "nullptr": + gpu_stream_access_expr = "nullptr" else: - continue + gpu_stream_access_expr = self.gpu_stream_access_template.format(gpu_stream=stream) + + code += f"DACE_GPU_CHECK({self.backend}StreamSynchronize({gpu_stream_access_expr}));\n" + + + + sink_nodes = list(state.sink_nodes()) + tasklet = state.add_tasklet( + name=f"gpu_stream_sync_{state}", inputs=set(), outputs=set(), + code=code, + language=dtypes.Language.CPP + ) + + for sink_node in sink_nodes: + state.add_edge(sink_node, None, tasklet, None, dace.Memlet()) + + + for node, state in sync_node.items(): + # get correct stream access expr + stream = assigned_nodes.get(node, "nullptr") + if stream == "nullptr": + gpu_stream_access_expr = "nullptr" + else: + gpu_stream_access_expr = self.gpu_stream_access_template.format(gpu_stream=stream) + + tasklet = state.add_tasklet( + name=f"gpu_stream_sync_{stream}", inputs=set(), outputs=set(), + code=f"DACE_GPU_CHECK({self.backend}StreamSynchronize({gpu_stream_access_expr}));\n", + language=dtypes.Language.CPP + ) + + # important: First get the successors, then add the tasklet + successors = list(state.successors(node)) + state.add_edge(node, None, tasklet, None, dace.Memlet()) + + for succ in successors : + state.add_edge(tasklet, None, succ, None, dace.Memlet()) + + + def set_gpu_stream_access_template(self, expr_template: str): """ From 2332a6390b7371b05ba2d3cec914c31cacd31c7b Mon Sep 17 00:00:00 2001 From: aydogdub Date: Wed, 25 Jun 2025 14:51:16 +0200 Subject: [PATCH 32/94] copying yakups validation adaption (and symbolic.py, which has newcode needed in validation.py --- dace/sdfg/validation.py | 66 +++++++++++++-- dace/symbolic.py | 176 +++++++++++++++++++++++++++++++++++----- 2 files changed, 216 insertions(+), 26 deletions(-) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index ccfb0adada..d8bc00c49a 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -9,7 +9,7 @@ import networkx as nx -from dace import dtypes, subsets, symbolic +from dace import dtypes, subsets, symbolic, data from dace.dtypes import DebugInfo if TYPE_CHECKING: @@ -285,6 +285,35 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context f'Cannot use scalar data descriptor ("{name}") as return value of a top-level function.', sdfg, None) + # Check for UndefinedSymbol in transient data shape (needed for memory allocation) + if desc.transient: + # Check dimensions + for i, dim in enumerate(desc.shape): + if symbolic.is_undefined(dim): + raise InvalidSDFGError( + f'Transient data container "{name}" contains undefined symbol in dimension {i}, ' + f'which is required for memory allocation', sdfg, None) + + # Check strides if array + if hasattr(desc, 'strides'): + for i, stride in enumerate(desc.strides): + if symbolic.is_undefined(stride): + raise InvalidSDFGError( + f'Transient data container "{name}" contains undefined symbol in stride {i}, ' + f'which is required for memory allocation', sdfg, None) + + # Check total size + if hasattr(desc, 'total_size') and symbolic.is_undefined(desc.total_size): + raise InvalidSDFGError( + f'Transient data container "{name}" has undefined total size, ' + f'which is required for memory allocation', sdfg, None) + + # Check any other undefined symbols in the data 
descriptor + if any(symbolic.is_undefined(s) for s in desc.used_symbols(all_symbols=False)): + raise InvalidSDFGError( + f'Transient data container "{name}" has undefined symbols, ' + f'which are required for memory allocation', sdfg, None) + # Validate array names if name is not None and not dtypes.validate_name(name): raise InvalidSDFGError("Invalid array name %s" % name, sdfg, None) @@ -333,6 +362,7 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context for desc in sdfg.arrays.values(): for sym in desc.free_symbols: symbols[str(sym)] = sym.dtype + validate_control_flow_region(sdfg, sdfg, initialized_transients, symbols, references, **context) except InvalidSDFGError as ex: @@ -656,7 +686,6 @@ def validate_state(state: 'dace.sdfg.SDFGState', ) ######################################## - # Memlet checks for eid, e in enumerate(state.edges()): # Reference check if id(e) in references: @@ -680,6 +709,27 @@ def validate_state(state: 'dace.sdfg.SDFGState', except Exception as ex: raise InvalidSDFGEdgeError("Edge validation failed: " + str(ex), sdfg, state_id, eid) + # If the edge is a connection between two AccessNodes check if the subset has negative size. + # NOTE: We _should_ do this check in `Memlet.validate()` however, this is not possible, + # because the connection between am AccessNode and a MapEntry, with a negative size, is + # legal because, the Map will not run in that case. However, this constellation can not + # be tested for in the Memlet's validation function, so we have to do it here. + # NOTE: Zero size is explicitly allowed because it is essentially `memcpy(dst, src, 0)` + # which is save. + # TODO: The AN to AN connection is the most obvious one, but it should be extended. + if isinstance(e.src, nd.AccessNode) and isinstance(e.dst, nd.AccessNode): + e_memlet: dace.Memlet = e.data + if e_memlet.subset is not None: + if any((ss < 0) == True for ss in e_memlet.subset.size()): + raise InvalidSDFGEdgeError( + f'`subset` of an AccessNode to AccessNode Memlet contains a negative size; the size was {e_memlet.subset.size()}', + sdfg, state_id, eid) + if e_memlet.other_subset is not None: + if any((ss < 0) == True for ss in e_memlet.other_subset.size()): + raise InvalidSDFGEdgeError( + f'`other_subset` of an AccessNode to AccessNode Memlet contains a negative size; the size was {e_memlet.other_subset.size()}', + sdfg, state_id, eid) + # For every memlet, obtain its full path in the DFG path = state.memlet_path(e) src_node = path[0].src @@ -820,9 +870,13 @@ def validate_state(state: 'dace.sdfg.SDFGState', if e.data.is_empty() and isinstance(dst_node, nd.ExitNode): pass else: - raise InvalidSDFGEdgeError( - f"Memlet creates an invalid path (sink node {dst_node}" - " should be a data node)", sdfg, state_id, eid) + if isinstance(dst_node, nd.Tasklet) and len(dst_node.in_connectors) == 0 and len(dst_node.out_connectors) == 0: + # Tasklets with no input or output connector -> sync tasklet -> OK + pass + else: + raise InvalidSDFGEdgeError( + f"Memlet creates an invalid path (sink node {dst_node}" + " should be a data node)", sdfg, state_id, eid) # If scope(dst) is disjoint from scope(src), it's an illegal memlet else: raise InvalidSDFGEdgeError("Illegal memlet between disjoint scopes", sdfg, state_id, eid) @@ -1102,4 +1156,4 @@ def validate_memlet_data(memlet_data: str, access_data: str) -> bool: access_tokens = access_data.split('.') memlet_tokens = memlet_data.split('.') mem_root = '.'.join(memlet_tokens[:len(access_tokens)]) - return mem_root == access_data + 
return mem_root == access_data \ No newline at end of file diff --git a/dace/symbolic.py b/dace/symbolic.py index 00592c4bd9..3f4edcd6b4 100644 --- a/dace/symbolic.py +++ b/dace/symbolic.py @@ -39,7 +39,7 @@ def equal_valued(x, y): class symbol(sympy.Symbol): - """ Defines a symbolic expression. Extends SymPy symbols with DaCe-related + """ Defines a symbolic variable. Extends SymPy symbols with DaCe-related information. """ s_currentsymbol = 0 @@ -98,28 +98,12 @@ def set_constraints(self, constraint_list): except TypeError: # constraint_list is not iterable self._constraints = [constraint_list] - # Check for the new constraints and reset symbol value if necessary - if symbol.s_values[self.name] is not None: - try: - self.check_constraints(symbol.s_values[self.name]) - except RuntimeError: - self.reset() # Reset current value - raise - def add_constraints(self, constraint_list): try: iter(constraint_list) - symbol.s_constraints[self.name].extend(constraint_list) + self._constraints.extend(constraint_list) except TypeError: # constraint_list is not iterable - symbol.s_constraints[self.name].append(constraint_list) - - # Check for the new constraints and reset symbol value if necessary - if symbol.s_values[self.name] is not None: - try: - self.check_constraints(symbol.s_values[self.name]) - except RuntimeError: - self.reset() # Reset current value - raise + self._constraints.append(constraint_list) @property def constraints(self): @@ -139,6 +123,98 @@ def check_constraints(self, value): raise RuntimeError('Value %s invalidates constraint %s for symbol %s' % (str(value), str(fail), self.name)) +class UndefinedSymbol(symbol): + """ Defines an undefined symbolic expression whose value is deferred to runtime. + + Similar to NaN values, any operation on an undefined symbol results in an + undefined symbol. When used in code generation, an informative exception + will be raised. + + This class is useful in situations where a symbol's value is not known + at compile time but symbolic analysis should continue. For example, when + a data container's size is undefined but other symbols with concrete + values should still be analyzed. 
+ + Examples + -------- + >>> from dace.symbolic import UndefinedSymbol, symbol + >>> N = symbol('N') + >>> undefined = UndefinedSymbol() + >>> N + undefined # Returns an UndefinedSymbol + >>> + >>> # This will eventually raise an exception during code generation: + >>> expr = N * undefined + 5 + """ + + def __new__(cls, dtype=DEFAULT_SYMBOL_TYPE, **assumptions): + # Bypass the name validation + self = sympy.Symbol.__xnew__(cls, "?", **assumptions) + self.dtype = dtype + self._constraints = [] + return self + + # Make undefined symbol behavior propagate through operations + def _eval_subs(self, old, new): + # Consolidated logic for substitution + if isinstance(old, UndefinedSymbol): + return self + # Additional logic from the second _eval_subs definition (if any) + return super()._eval_subs(old, new) + + def __abs__(self): + return UndefinedSymbol(self.dtype) + + def __add__(self, other): + return UndefinedSymbol(self.dtype) + + def __radd__(self, other): + return UndefinedSymbol(self.dtype) + + def __sub__(self, other): + return UndefinedSymbol(self.dtype) + + def __rsub__(self, other): + return UndefinedSymbol(self.dtype) + + def __mul__(self, other): + return UndefinedSymbol(self.dtype) + + def __rmul__(self, other): + return UndefinedSymbol(self.dtype) + + def __truediv__(self, other): + return UndefinedSymbol(self.dtype) + + def __rtruediv__(self, other): + return UndefinedSymbol(self.dtype) + + def __pow__(self, other): + return UndefinedSymbol(self.dtype) + + def __rpow__(self, other): + return UndefinedSymbol(self.dtype) + + # Comparisons always return False to indicate indeterminate equality + def __eq__(self, other): + return False + + def __lt__(self, other): + return None + + def __gt__(self, other): + return None + + def __le__(self, other): + return None + + def __ge__(self, other): + return None + + def __hash__(self): + # Make UndefinedSymbol hashable as required by SymPy + return hash(self.name) + + class SymExpr(object): """ Symbolic expressions with support for an overapproximation expression. """ @@ -337,6 +413,10 @@ def evaluate(expr: Union[sympy.Basic, int, float], symbols: Dict[Union[symbol, s if isinstance(expr, SymExpr): return evaluate(expr.expr, symbols) if issymbolic(expr, set(map(str, symbols.keys()))): + # Check for UndefinedSymbol + for atom in expr.atoms(): + if isinstance(atom, UndefinedSymbol): + raise TypeError(f'Cannot evaluate expression "{expr}" containing undefined symbol') raise TypeError(f'Symbolic expression "{expr}" cannot be evaluated to a constant') if isinstance(expr, (int, float, numpy.number)): return expr @@ -361,10 +441,14 @@ def issymbolic(value, constants=None): constants = constants or {} if isinstance(value, SymExpr): return issymbolic(value.expr) + if isinstance(value, UndefinedSymbol): + return True if isinstance(value, (sympy.Symbol, symbol)) and value.name not in constants: return True if isinstance(value, sympy.Basic): for atom in value.atoms(): + if isinstance(atom, UndefinedSymbol): + return True if isinstance(atom, (sympy.Symbol, symbol)) and atom.name not in constants: return True return False @@ -549,6 +633,30 @@ def free_symbols_and_functions(expr: Union[SymbolicType, str]) -> Set[str]: return result +def is_undefined(expr: Union[SymbolicType, str]) -> bool: + """ + Checks if a symbolic expression contains any UndefinedSymbol atoms. + + :param expr: The expression to check. + :return: True if the expression contains undefined symbols, False otherwise. 
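+
+    Example (illustrative; relies on ``pystr_to_symbolic`` below mapping the "?" token
+    to an ``UndefinedSymbol``):
+
+        >>> is_undefined('N + 1')
+        False
+        >>> is_undefined('?')
+        True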
+ """ + if isinstance(expr, str): + expr = pystr_to_symbolic(expr) + + if isinstance(expr, UndefinedSymbol): + return True + + if not isinstance(expr, sympy.Basic): + return False + + # Check all atoms in the expression + for atom in expr.atoms(): + if isinstance(atom, UndefinedSymbol): + return True + + return False + + def sympy_numeric_fix(expr): """ Fix for printing out integers as floats with ".00000000". Converts the float constants in a given expression to integers. """ @@ -569,6 +677,12 @@ def sympy_numeric_fix(expr): return sympy.oo else: return -sympy.oo + + # Check if expression contains UndefinedSymbol and propagate it + for atom in expr.atoms(): + if isinstance(atom, UndefinedSymbol): + return UndefinedSymbol() + return expr @@ -1155,6 +1269,8 @@ def pystr_to_symbolic(expr, symbol_map=None, simplify=None) -> sympy.Basic: return sympy.Float(float(expr)) except ValueError: pass + if "?" in expr: # Note that this will convert expressions like "a ? b : c" or "some_func(?)" to UndefinedSymbol + return UndefinedSymbol() if dtypes.validate_name(expr): return symbol(expr) @@ -1481,6 +1597,16 @@ def inequal_symbols(a: Union[sympy.Expr, Any], b: Union[sympy.Expr, Any]) -> boo """ Compares 2 symbolic expressions and returns True if they are not equal. """ + # Check for UndefinedSymbol in either expression + if isinstance(a, sympy.Basic): + for atom in a.atoms(): + if isinstance(atom, UndefinedSymbol): + return True + if isinstance(b, sympy.Basic): + for atom in b.atoms(): + if isinstance(atom, UndefinedSymbol): + return True + if not isinstance(a, sympy.Expr) or not isinstance(b, sympy.Expr): return a != b else: @@ -1504,6 +1630,16 @@ def equal(a: SymbolicType, b: SymbolicType, is_length: bool = True) -> Union[boo args = [arg.expr if isinstance(arg, SymExpr) else arg for arg in (a, b)] + # Check for UndefinedSymbol in either expression + if isinstance(a, sympy.Basic): + for atom in a.atoms(): + if isinstance(atom, UndefinedSymbol): + return None + if isinstance(b, sympy.Basic): + for atom in b.atoms(): + if isinstance(atom, UndefinedSymbol): + return None + if any([args is None for args in args]): return False @@ -1535,4 +1671,4 @@ def symbols_in_code(code: str, potential_symbols: Set[str] = None, symbols_to_ig tokens &= potential_symbols if symbols_to_ignore is None: return tokens - return tokens - symbols_to_ignore + return tokens - symbols_to_ignore \ No newline at end of file From df452ba650cdbd058099639d6787dbfec1e98f55 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Wed, 25 Jun 2025 14:53:02 +0200 Subject: [PATCH 33/94] User can now choose name current thread blocks variable name, which may be required for custom tasklets --- .../experimental_cuda_helpers/scope_strategies.py | 8 ++++++-- dace/config_schema.yml | 11 +++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py index 2d4f91b605..c968339947 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py @@ -6,6 +6,8 @@ import dace from dace import dtypes, subsets, symbolic +from dace.config import Config + # DaCe SDFG imports from dace.sdfg import SDFG, ScopeSubgraphView, nodes, SDFGState from dace.sdfg.state import ControlFlowRegion @@ -203,8 +205,9 @@ def _generate_kernel_initialization(self, sdfg: SDFG, cfg: ControlFlowRegion, df state_id: int, function_stream: CodeIOStream, 
callsite_stream: CodeIOStream): """ + NOTE: Under construction Tell yakup: - 1. This is as far as I know really cuda specific- maybe I should raise an error if wrong backend is used + 1. This is as far as I know really cuda specific- maybe I should raise an error if wrong backend (HIP) is used 2. What about the shared state allocation? Is it correct to tell about this allocation? generally, did I tell the dispatcher everything correctly? """ @@ -218,7 +221,7 @@ def _generate_kernel_initialization(self, sdfg: SDFG, cfg: ControlFlowRegion, df callsite_stream.write(f"\n", cfg, state_id, node) # initialize block group using coopertive groups - tblock_obj_name = "block" + tblock_obj_name = Config.get('compiler', 'cuda', 'current_thread_block_name') tblock_obj_ctype = "auto" callsite_stream.write(f"{tblock_obj_ctype} {tblock_obj_name} = cg::this_thread_block();\n", cfg, state_id, node) self._dispatcher.defined_vars.add(tblock_obj_name, DefinedType.Object, tblock_obj_ctype) @@ -253,6 +256,7 @@ def _generate_kernel_initialization(self, sdfg: SDFG, cfg: ControlFlowRegion, df callsite_stream.write(f"\n", cfg, state_id, node) + class ThreadBlockScopeGenerator(ScopeGenerationStrategy): def __init__(self, codegen: ExperimentalCUDACodeGen): diff --git a/dace/config_schema.yml b/dace/config_schema.yml index ab34f33f05..7a5537e9f5 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -501,6 +501,17 @@ required: or use other mechanisms like async copies or pipelines. default: True + current_thread_block_name: + type: str + title: Variable name for the current thread block + description: > + Specifies the name of the variable that holds the current thread block group, + initialized using `cooperative_groups::this_thread_block()`. This is useful in + contexts like custom tasklets, where the variable is explicitly referenced + (e.g., `cooperative_groups::wait(block)`). Setting this allows users to customize the + variable name without modifying the source code or relying on a fixed name. 
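+        For example (illustrative), setting this option to ``tb`` makes the generated kernel
+        preamble read ``auto tb = cg::this_thread_block();``, so custom tasklet code can refer
+        to ``tb`` directly.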
+ default: block + ############################################# # General FPGA flags fpga: From 5aad68a08204095b55aca5c8675548b1b8949a6a Mon Sep 17 00:00:00 2001 From: aydogdub Date: Thu, 26 Jun 2025 16:47:52 +0200 Subject: [PATCH 34/94] Fixed and refactored GPU stream sync pass --- dace/dtypes.py | 7 + .../passes/gpustream_scheduling.py | 503 ++++++++---------- 2 files changed, 233 insertions(+), 277 deletions(-) diff --git a/dace/dtypes.py b/dace/dtypes.py index a94521f728..2156e70503 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -93,6 +93,13 @@ class ScheduleType(aenum.AutoNumberEnum): ScheduleType.GPU_Warp, ] +# A subset of on-GPU storage types for ExperimentalCUDACodeGen +GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN = [ + StorageType.GPU_Global, + StorageType.GPU_Shared, +] + + # A subset of CPU schedule types CPU_SCHEDULES = [ ScheduleType.CPU_Multicore, diff --git a/dace/transformation/passes/gpustream_scheduling.py b/dace/transformation/passes/gpustream_scheduling.py index 7c335e988f..57012d5a48 100644 --- a/dace/transformation/passes/gpustream_scheduling.py +++ b/dace/transformation/passes/gpustream_scheduling.py @@ -1,4 +1,4 @@ -from typing import Union, Dict, Set +from typing import Union, Dict, Set, List, Tuple import dace from dace import SDFG, properties, SDFGState @@ -7,155 +7,185 @@ from dace.config import Config from dace.transformation import pass_pipeline as ppl, transformation from dace.sdfg import nodes -from dace.sdfg.graph import Edge +from dace.sdfg.graph import Edge, Graph, NodeT @properties.make_properties @transformation.explicit_cf_compatible class NaiveGPUStreamScheduler(ppl.Pass): """ - Assigns GPU streams to relevant nodes based on connected components. - Also, it adds synchronization tasklets where required. - - Strategy: - - "Relevant nodes" in connected components within a state are assigned the same stream. - - Each state (except for nested states) starts fresh with stream 0. - - States in nested SDFGs inherit the parent component's stream. - - Only nodes that are either ("relevant nodes"): - * in GPU memory (AccessNodes in GPU memory), - * GPU scheduled (e.g., maps/kernels or library nodes), - * or directly connected to such nodes, - are assigned a stream. - - GPU stream IDs wrap around based on the max_concurrent_streams config. + Assigns GPU streams to relevant nodes and inserts synchronization tasklets where needed. + + Strategy Overview: + ------------------ + - GPU stream assignment is based on weakly connected components (WCCs) within each state. + - "Relevant nodes" in a WCC are assigned to the same stream. + Relevant nodes include: + * AccessNodes in GPU memory, + * GPU-scheduled nodes (Maps or Library nodes), + * Nodes directly connected to the above. + - For top-level states (not within nested SDFGs), each new WCC starts on a new stream (starting from 0). + - In nested SDFGs: + * Stream assignment is inherited from the parent component, + * All internal components share the parent's stream (consider revisiting this for performance tuning). + - GPU stream IDs wrap around according to the `max_concurrent_streams` configuration. + - Synchronization tasklets are inserted using a simple heuristic: + * At the end of a state, if outputs certain patterns regarding GPU memory occur, + * After a node, if its outputs cross GPU boundaries and are reused downstream. 
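+      (For instance, a device-to-host copy whose destination is read again in the same state
+      triggers both a sync at the end of the state and one directly after the copy; see the
+      heuristics in ``_identify_sync_locations`` below.)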
Example: - A state with K1->K2, K3->K4->K5, K6 becomes: - K1,K2 → stream0 - K3,K4,K5 → stream1 - K6 → stream2 - (assuming no limit on the number of CUDA streams) - - NOTE: These are backend streams (CUDA/HIP), not DaCe streams. + -------- + A state with the following independent chains: + K1 → K2 + K3 → K4 → K5 + K6 + + would be scheduled as: + K1, K2 → stream 0 + K3, K4, K5 → stream 1 + K6 → stream 2 + + (assuming no limit on the number of concurrent streams) + + Note: + ----- + These refer to **backend GPU streams** (e.g., CUDA or HIP), not DaCe symbolic streams. """ - - # max configured number of concurrent streams - max_concurrent_streams = int(Config.get('compiler', 'cuda', 'max_concurrent_streams')) - # needed to call correct backend synchronization functions and in correct language - backend: str = common.get_gpu_backend() - language = 'cu' if backend == 'cuda' else 'cpp' + def __init__(self): + # max configured number of concurrent streams + self._max_concurrent_streams = int(Config.get('compiler', 'cuda', 'max_concurrent_streams')) + + # needed to call correct backend synchronization functions + self._backend: str = common.get_gpu_backend() - # This is expected to be set by the calling target codegenerator. - gpu_stream_access_template: str = "" + # This is expected to be set by the calling backend code generator before applying the pass + self._gpu_stream_access_template: str = "" def apply_pass(self, sdfg: SDFG, _) -> Dict[nodes.Node, Union[int, str]]: """ - Assigns GPU streams and adds synchronization tasklets. + Assigns GPU streams to nodes and inserts synchronization tasklets where needed. """ - assigned_nodes = self._assign_streams_to_sdfg(sdfg) - - num_assigned_streams = max(assigned_nodes.values(), default=0) - # If all use 0 stream or max_concurrent_stream is -1 (only default stream) - # then assign to all nodes the nullptr. - if num_assigned_streams == 0: # note: self.max_concurrent_streams == -1 implies num_assigned_streams == 0 + # 1. Traverse each top-level state and assign stream IDs to eligible nodes (starting from stream 0). + assigned_nodes = dict() + for state in sdfg.states(): + self._assign_gpu_streams_in_state(sdfg, False, state, assigned_nodes, 0) + + # 2. If only one stream is used set all assignments to "nullptr". + num_assigned_streams = max(assigned_nodes.values(), default=0) # self.max_concurrent_streams == -1 (default) also handled here + if num_assigned_streams == 0: for k in assigned_nodes.keys(): assigned_nodes[k] = "nullptr" - self._add_sync_tasklet_heuristically(sdfg, assigned_nodes) + # 3. Insert synchronization tasklets based on stream usage. + self._insert_gpu_stream_sync_tasklet(sdfg, assigned_nodes) return assigned_nodes - def _assign_streams_to_sdfg(self, sdfg: SDFG, assigned_nodes=None, visited=None) -> Dict: - """ - Traverse all SDFG states and assign streams to connected components. - Each state (exluding nested states) restarts stream assignment from 0. 
- """ - if assigned_nodes is None: - assigned_nodes = dict() - if visited is None: - visited = set() - - for state in sdfg.states(): - self._assign_streams_to_state_recursively(sdfg, False, state, assigned_nodes, visited, 0) - - return assigned_nodes - - def _assign_streams_to_state_recursively(self, sdfg: SDFG, in_nested_sdfg: bool, state: SDFGState, assigned_nodes: Dict, visited: Set, gpu_stream:int): + def _assign_gpu_streams_in_state(self, sdfg: SDFG, in_nested_sdfg: bool, state: SDFGState, assigned_nodes: Dict, gpu_stream:int): """ - Processes connected components in a state, assigning each to a different GPU stream, - but only if they contain GPU-related nodes (otherwise, stream assignment is skipped). + Processes connected components in a state, assigning each to a different GPU stream if not inside a nested SDFG. + If inside a nested SDFG, components inherit the stream from the parent state/component. - Nested SDFGs inherit the GPU stream of their parent state/component. + Stream assignment is performed only for components that contain GPU-related nodes; + components without such nodes are skipped. """ - for source_node in state.source_nodes(): - if source_node in visited: - continue # Skip already processed components - - nodes_assigned_before = len(assigned_nodes) - - # Process all nodes in this connected component - for edge in state.dfs_edges(source_node): - - # get both ends of the edge - src = edge.src - dst = edge.dst - - # both are visited, potentially again - visited.add(src) - visited.add(dst) - - # Either they are gpu nodes are directly connected to them, - # so they get assigned to the current gpu_stream - if self._is_gpu_node(src, sdfg) or self._is_gpu_node(dst, sdfg): - assigned_nodes[src] = gpu_stream - assigned_nodes[dst] = gpu_stream - - # Recursively process nested SDFG states with same stream - if isinstance(src, nodes.NestedSDFG): - for nested_state in src.sdfg.states(): - self._assign_streams_to_state_recursively(src.sdfg, True, nested_state, assigned_nodes, visited, gpu_stream) - - if isinstance(dst, nodes.NestedSDFG): - for nested_state in dst.sdfg.states(): - self._assign_streams_to_state_recursively(dst.sdfg, True, nested_state, assigned_nodes, visited, gpu_stream) + components = self._get_weakly_connected_nodes(state) + for component in components: + nodes_assigned_before = len(assigned_nodes) - # Move to next stream if we assigned any nodes in this component (careful: if nested, states are in same component) + for node in component: + + if self._is_relevant_for_gpu_stream(node, sdfg, state): + assigned_nodes[node] = gpu_stream + + if isinstance(node, nodes.NestedSDFG): + for nested_state in node.sdfg.states(): + self._assign_gpu_streams_in_state(node.sdfg, True, nested_state, assigned_nodes, gpu_stream) + + # Move to next stream if we assigned streams to any node in this component (careful: if nested, states are in same component) if not in_nested_sdfg and len(assigned_nodes) > nodes_assigned_before: gpu_stream = self._next_stream(gpu_stream) - def _is_gpu_node(self, node: nodes.Node, sdfg: SDFG) -> bool: + def _get_weakly_connected_nodes(self, graph: Graph) -> List[Set[NodeT]]: """ - Determine if a node is a gpu node. - - This includes GPU-scheduled library nodes, kernels (maps), and GPU global memory - access nodes. - + Returns all weakly connected components in the given directed graph. + + A weakly connected component is a maximal group of nodes such that each pair + of nodes is connected by a path when ignoring edge directions. 
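+
+        Example (illustrative): in a state containing the edges A -> B and C -> D plus an
+        isolated node E, the returned components are {A, B}, {C, D} and {E}.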
+ + :param graph: A directed graph (Graph) instance. + :return: A list of sets, each containing the nodes of one weakly connected component. + """ + visited: Set[NodeT] = set() + components: List[Set[NodeT]] = [] + + for node in graph.nodes(): + if node in visited: + continue + + # Start a new weakly connected component + component: Set[NodeT] = set() + stack = [node] + + while stack: + current = stack.pop() + if current in visited: + continue + + visited.add(current) + component.add(current) + + for neighbor in graph.neighbors(current): + if neighbor not in visited: + stack.append(neighbor) + + components.append(component) + + return components + + def _is_relevant_for_gpu_stream(self, node: nodes.Node, sdfg: SDFG, state: SDFGState) -> bool: + """ + Determines whether a node is relevant for GPU stream assignment. + + A node is considered relevant if: + - It is an AccessNode accessing GPU global memory, + - It is a GPU-scheduled map entry/exit node (i.e., a kernel), + - It is a GPU-scheduled library node, + - Or it is directly connected (via in/out edges) to such a node. + Args: - node: Node to check - sdfg: SDFG for context - + node: The node to check. + sdfg: The SDFG for memory/storage context. + state: The state in which the node resides. + Returns: - True if node is a gpu node + True if the node is relevant for GPU stream assignment, False otherwise. """ - # GPU global memory access nodes - if (isinstance(node, nodes.AccessNode) and - node.desc(sdfg).storage == dtypes.StorageType.GPU_Global): - return True - - # GPU-scheduled map entry/exit nodes (kernels) - if (isinstance(node, (nodes.EntryNode, nodes.ExitNode)) and - node.schedule in dtypes.GPU_SCHEDULES): - return True - - # GPU-scheduled library nodes - if (isinstance(node, nodes.LibraryNode) and - node.schedule in dtypes.GPU_SCHEDULES): - return True - - return False + node_and_neighbors = list(state.neighbors(node)) + node_and_neighbors.append(node) + + for n in node_and_neighbors: + # GPU global memory access nodes + if (isinstance(n, nodes.AccessNode) and + n.desc(sdfg).storage == dtypes.StorageType.GPU_Global): + return True + + # GPU-scheduled map entry/exit nodes (kernels) + if (isinstance(n, (nodes.EntryNode, nodes.ExitNode)) and + n.schedule in dtypes.GPU_SCHEDULES): + return True + + # GPU-scheduled library nodes + if (isinstance(n, nodes.LibraryNode) and + n.schedule in dtypes.GPU_SCHEDULES): + return True + + return False + def _next_stream(self, gpu_stream: int) -> int: """ Returns the next CUDA stream index based on the configured concurrency policy. @@ -164,220 +194,139 @@ def _next_stream(self, gpu_stream: int) -> int: - If max_concurrent_streams == -1: default → always return 0 - Else: wrap around within the allowed number of streams """ - if self.max_concurrent_streams == 0: + if self._max_concurrent_streams == 0: return gpu_stream + 1 - elif self.max_concurrent_streams == -1: + elif self._max_concurrent_streams == -1: return 0 else: - return (gpu_stream + 1) % self.max_concurrent_streams + return (gpu_stream + 1) % self._max_concurrent_streams - def _add_sync_tasklet(self, sdfg: SDFG, assigned_nodes: Dict): + def _insert_gpu_stream_sync_tasklet(self, sdfg: SDFG, assigned_nodes: Dict) -> None: """ - Adds a GPU stream synchronization tasklet after kernels finish execution. 
- - Synchronization is inserted: - - After kernel writes to a sink GPU AccessNode - - After a GPU-to-CPU (Device to Host) copy - - Raises NotImplementedError for unhandled AccessNode->non-AccessNode GPU edges + Inserts GPU stream synchronization tasklets at required locations: + - At the end of a state, for streams used in the state. + - After specific nodes, if their outputs need to synchronize before reuse. """ + sync_state, sync_node = self._identify_sync_locations(sdfg, assigned_nodes) - # Track all GPU Device kernel exits and their states - gpu_kernel_exits: Dict[nodes.MapExit, SDFGState] = {} - - for node, parent in sdfg.all_nodes_recursive(): - if isinstance(node, nodes.MapExit) and node.schedule == dtypes.ScheduleType.GPU_Device: - if not isinstance(parent, SDFGState): - raise NotImplementedError(f"Unexpected parent type '{type(parent).__name__}' for node '{node}'. " - "Expected 'SDFGState'. Please handle this case explicitly.") - - gpu_kernel_exits[node] = parent - - - # Determine where to insert synchronization tasklets - insert_sync_after: Dict[nodes.Node, SDFGState] = {} + #----------------- Insert synchronization tasklets at the end of each state ----------------- + for state, streams in sync_state.items(): - for exit_node, state in gpu_kernel_exits.items(): - for edge in state.dfs_edges(exit_node): - src, dst = edge.src, edge.dst + # Important: get sink nodes before adding the tasklet + sink_nodes = list(state.sink_nodes()) + # Generate sync code for all streams used in this state + sync_code_lines = [] + for stream in streams: - src_is_gpu = isinstance(src, nodes.AccessNode) and \ - src.desc(state.parent).storage == dtypes.StorageType.GPU_Global - dst_is_gpu = isinstance(dst, nodes.AccessNode) and \ - dst.desc(state.parent).storage == dtypes.StorageType.GPU_Global + if stream == "nullptr": + gpu_stream_access_expr = "nullptr" + else: + gpu_stream_access_expr = self._gpu_stream_access_template.format(gpu_stream=stream) - # Case 1: MapExit → GPU AccessNode (sink node) - if (src is exit_node and isinstance(dst, nodes.AccessNode) and dst_is_gpu - and state.out_degree(dst) == 0): - insert_sync_after[dst] = state + sync_code_lines.append(f"DACE_GPU_CHECK({self._backend}StreamSynchronize({gpu_stream_access_expr}));") - # Case 2: GPU AccessNode → Non-GPU AccessNode - elif src_is_gpu and isinstance(dst, nodes.AccessNode) and not dst_is_gpu: - insert_sync_after[dst] = state + sync_code = "\n".join(sync_code_lines) - # Case 3: GPU AccessNode → non-AccessNode (unexpected case) - elif src_is_gpu and not isinstance(dst, nodes.AccessNode): - raise NotImplementedError(f"Found edge from GPU AccessNode '{src}' to non-AccessNode '{dst}' after kernel computation. " - "This case is currently not handled and may need special sync logic.") - - # handle this - for edge, state in sdfg.all_edges_recursive(): - src, dst = edge.src, edge.dst + tasklet = state.add_tasklet( + name=f"gpu_stream_sync_{state}", inputs=set(), outputs=set(), + code=sync_code, + language=dtypes.Language.CPP + ) + + for sink_node in sink_nodes: + state.add_edge(sink_node, None, tasklet, None, dace.Memlet()) - src_is_not_gpu = isinstance(src, nodes.AccessNode) and \ - src.desc(state.parent).storage != dtypes.StorageType.GPU_Global - dst_is_gpu = isinstance(dst, nodes.AccessNode) and \ - dst.desc(state.parent).storage == dtypes.StorageType.GPU_Global - if src_is_not_gpu and dst_is_gpu: - if not isinstance(state, SDFGState): - raise NotImplementedError(f"Unexpected parent type '{type(parent).__name__}' for node '{node}'. 
" - "Expected 'SDFGState'. Please handle this case explicitly.") - insert_sync_after[dst] = state + #----------------- Insert synchronization tasklets after specific nodes ----------------- - # Insert synchronization tasklets - for node, kernel_state in insert_sync_after.items(): + for node, state in sync_node.items(): # get correct stream access expr stream = assigned_nodes.get(node, "nullptr") if stream == "nullptr": gpu_stream_access_expr = "nullptr" else: - gpu_stream_access_expr = self.gpu_stream_access_template.format(gpu_stream=stream) + gpu_stream_access_expr = self._gpu_stream_access_template.format(gpu_stream=stream) - tasklet = kernel_state.add_tasklet( + tasklet = state.add_tasklet( name=f"gpu_stream_sync_{stream}", inputs=set(), outputs=set(), - code=f"DACE_GPU_CHECK({self.backend}StreamSynchronize({gpu_stream_access_expr}));\n", + code=f"DACE_GPU_CHECK({self._backend}StreamSynchronize({gpu_stream_access_expr}));\n", language=dtypes.Language.CPP ) # important: First get the successors, then add the tasklet - successors = list(kernel_state.successors(node)) - kernel_state.add_edge(node, None, tasklet, None, dace.Memlet()) + successors = list(state.successors(node)) + state.add_edge(node, None, tasklet, None, dace.Memlet()) - # Also handles sink nodes (case 1). Nothing to add between successors for succ in successors : - kernel_state.add_edge(tasklet, None, succ, None, dace.Memlet()) + state.add_edge(tasklet, None, succ, None, dace.Memlet()) + + def _identify_sync_locations(self, sdfg: SDFG, assigned_nodes: Dict) -> Tuple[Dict[SDFGState, Set[str]], Dict[nodes.Node, SDFGState]]: + """ + Heuristically identifies GPU stream synchronization points in an SDFG. + + Synchronization is needed: + - At the end of a state, if we copy to/from GPU AccessNodes. + - Immediately after a node, if data leaves GPU memory and is further used. + + Returns: + - sync_state: Maps each SDFGState to a set of stream IDs to sync at the end of the state. + - sync_node: Maps individual nodes to the state where a sync is required after the node. 
+ """ + + # ------------------ Helper predicates ----------------------------- + def is_gpu_accessnode(node, state): + return isinstance(node, nodes.AccessNode) and node.desc(state.parent).storage == dtypes.StorageType.GPU_Global + def is_nongpu_accessnode(node, state): + return isinstance(node, nodes.AccessNode) and node.desc(state.parent).storage not in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN + + def is_kernel_exit(node): + return isinstance(node, nodes.ExitNode) and node.schedule == dtypes.ScheduleType.GPU_Device + + def is_sink_node(node, state): + return state.out_degree(node) == 0 + # ------------------ Sync detection logic ----------------------------- - def _add_sync_tasklet_heuristically(self, sdfg: SDFG, assigned_nodes: Dict): - # Keep track which streams should be synced at the end of a state - # And when we need to sync after a node sync_state: Dict[SDFGState, Set[str]] = {} sync_node: Dict[nodes.Node, SDFGState] = {} - - GPU_STORAGE = [dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared] for edge, state in sdfg.all_edges_recursive(): - src, dst = edge.src, edge.dst - - # Check node types and storage - src_is_gpu_accNode = isinstance(src, nodes.AccessNode) and \ - src.desc(state.parent).storage == dtypes.StorageType.GPU_Global - - dst_is_gpu_accNode = isinstance(dst, nodes.AccessNode) and \ - dst.desc(state.parent).storage == dtypes.StorageType.GPU_Global - - dst_is_nongpu_accNode = isinstance(dst, nodes.AccessNode) and \ - not dst.desc(state.parent).storage in GPU_STORAGE - src_is_nongpu_accNode = isinstance(src, nodes.AccessNode) and \ - not src.desc(state.parent).storage in GPU_STORAGE - - src_is_kernelExit = isinstance(src, nodes.ExitNode) and src.schedule == dtypes.ScheduleType.GPU_Device - - dst_is_sink_node = state.out_degree(dst) == 0 - - # Initialize sync_state for this state if not exists + + # Ensure state is initialized in sync_state if state not in sync_state: sync_state[state] = set() - - # Apply heuristics - if src_is_gpu_accNode and dst_is_nongpu_accNode and dst_is_sink_node: - # GPU AccessNode -> Non GPU AccessNode which is a sink: Just sync at end of state - if dst not in assigned_nodes: - raise ValueError("Missed to assign gpu stream to dst") + + # --- Heuristics for when to sync --- + if is_gpu_accessnode(src, state) and is_nongpu_accessnode(dst, state) and is_sink_node(dst, state): sync_state[state].add(assigned_nodes[dst]) - - elif src_is_gpu_accNode and dst_is_nongpu_accNode and not dst_is_sink_node: - # GPU AccessNode -> Non GPU AccessNode which is NOT a sink -> Sync at end of state and after dst - if dst not in assigned_nodes: - raise ValueError("Missed to assign gpu stream to dst") + + elif is_gpu_accessnode(src, state) and is_nongpu_accessnode(dst, state) and not is_sink_node(dst, state): sync_state[state].add(assigned_nodes[dst]) sync_node[dst] = state - - elif src_is_nongpu_accNode and dst_is_gpu_accNode: - # Non_GPU AccessNode -> GPU AccessNode -> No matter what, just at end of state sync is needed - if dst not in assigned_nodes: - raise ValueError("Missed to assign gpu stream to dst") + + elif is_nongpu_accessnode(src, state) and is_gpu_accessnode(dst, state): sync_state[state].add(assigned_nodes[dst]) - - elif src_is_kernelExit and dst_is_gpu_accNode and dst_is_sink_node: - # KernelExit -> GPU AccessNode which is a sink node -> add to state - if dst not in assigned_nodes: - raise ValueError("Missed to assign gpu stream to dst") + + elif is_kernel_exit(src) and is_gpu_accessnode(dst, state) and is_sink_node(dst, state): 
sync_state[state].add(assigned_nodes[dst]) else: continue - # we didn't return, so lets verify that sync state has truly on sdfgstates as keys + # Check that state is indeed a SDFGState when added to the dictionary, to be on the safe side if not isinstance(state, SDFGState): raise NotImplementedError(f"Unexpected parent type '{type(state).__name__}' for edge '{edge}'. " - "Expected 'SDFGState'. Please handle this case explicitly.") - - - - # Remove states with no streams to sync - sync_state = {state: streams for state, streams in sync_state.items() if streams} - for state, streams in sync_state.items(): - code = "" - for stream in streams: - if stream == "nullptr": - gpu_stream_access_expr = "nullptr" - else: - gpu_stream_access_expr = self.gpu_stream_access_template.format(gpu_stream=stream) - - code += f"DACE_GPU_CHECK({self.backend}StreamSynchronize({gpu_stream_access_expr}));\n" - - - - sink_nodes = list(state.sink_nodes()) - tasklet = state.add_tasklet( - name=f"gpu_stream_sync_{state}", inputs=set(), outputs=set(), - code=code, - language=dtypes.Language.CPP - ) - - for sink_node in sink_nodes: - state.add_edge(sink_node, None, tasklet, None, dace.Memlet()) - - - for node, state in sync_node.items(): - # get correct stream access expr - stream = assigned_nodes.get(node, "nullptr") - if stream == "nullptr": - gpu_stream_access_expr = "nullptr" - else: - gpu_stream_access_expr = self.gpu_stream_access_template.format(gpu_stream=stream) - - tasklet = state.add_tasklet( - name=f"gpu_stream_sync_{stream}", inputs=set(), outputs=set(), - code=f"DACE_GPU_CHECK({self.backend}StreamSynchronize({gpu_stream_access_expr}));\n", - language=dtypes.Language.CPP - ) - - # important: First get the successors, then add the tasklet - successors = list(state.successors(node)) - state.add_edge(node, None, tasklet, None, dace.Memlet()) - - for succ in successors : - state.add_edge(tasklet, None, succ, None, dace.Memlet()) - + "Expected 'SDFGState'. Please handle this case explicitly.") + # Remove states with no syncs + sync_state = {state: streams for state, streams in sync_state.items() if len(streams) > 0} + return sync_state, sync_node def set_gpu_stream_access_template(self, expr_template: str): """ @@ -386,5 +335,5 @@ def set_gpu_stream_access_template(self, expr_template: str): gpu code generator. 
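+
+        Example (illustrative; assumes the code generator exposes streams through the state
+        struct, as in ``__state->gpu_context->streams[...]``, and that ``scheduler`` is a
+        ``NaiveGPUStreamScheduler`` instance)::
+
+            scheduler.set_gpu_stream_access_template('__state->gpu_context->streams[{gpu_stream}]')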
""" if "{gpu_stream}" not in expr_template: - raise ValueError("gpu_stream_access_template must include '{gpu_stream}' placeholder.") - self.gpu_stream_access_template = expr_template \ No newline at end of file + raise ValueError("self._gpu_stream_access_template must include '{gpu_stream}' placeholder.") + self._gpu_stream_access_template = expr_template \ No newline at end of file From 34aa669a5da9b9a2079af37becfbe4030d3461ed Mon Sep 17 00:00:00 2001 From: aydogdub Date: Fri, 27 Jun 2025 14:22:23 +0200 Subject: [PATCH 35/94] Fixed mistake in the copy strategy --- .../memcopy_tests/out_of_kernel_memcpy_test.py | 15 +++++++++------ .../experimental_cuda_helpers/copy_strategies.py | 7 +++++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py b/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py index 787b1d8b87..85c9c41147 100644 --- a/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py +++ b/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py @@ -195,13 +195,12 @@ def test_2d_out_of_kernel_memcpy_one_strided(): cp.testing.assert_array_equal(dst, expected) @pytest.mark.gpu -def test_2d_oofkmemcpy_two_strided_fail(): +def test_2d_oofkmemcpy_strided(): """ Test strided 2D out-of-kernel memcpy. - This test should fail (notImplementedError). """ - sdfg = dace.SDFG("failing_2D_memory_copy") + sdfg = dace.SDFG("strided_2D_memory_copy") state = sdfg.add_state("main") # Access nodes @@ -220,9 +219,13 @@ def test_2d_oofkmemcpy_two_strided_fail(): src = cp.ones((2,20), dtype=cp.uint32) dst = cp.zeros((2,10), dtype=cp.uint32) - # notImplementedError should be raised - with pytest.raises(NotImplementedError): - sdfg(src=src, dst=dst) + # Execute program + sdfg(src=src, dst=dst) + + # Compute expected result & verify + expected = cp.zeros((2,10), dtype=cp.uint32) + expected[0:2, 0:10:5] = src[0:2, 0:20:10] + cp.testing.assert_array_equal(dst, expected) # ---------- Higher-Dimensional (>2D) Memory Copy Tests -------- @pytest.mark.gpu diff --git a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py index 7ee48755fd..6932ea992b 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py @@ -312,7 +312,7 @@ def _generate_2d_copy(self, copy_context: CopyContext) -> None: call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {cudastream}));\n' - elif src_strides[-1] == 1 or dst_strides[-1] == 1: + elif src_strides[-1] != 1 or dst_strides[-1] != 1: # TODO: Checks this, I am not sure but the old code and its description # seems to be more complicated here than necessary.. # But worth to mention: we essentiall flatten @@ -330,7 +330,10 @@ def _generate_2d_copy(self, copy_context: CopyContext) -> None: call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {cudastream}));\n' else: - raise NotImplementedError('2D copy only supported with one stride') + raise NotImplementedError( + f"Unsupported 2D memory copy: shape={copy_shape}, src_strides={src_strides}, dst_strides={dst_strides}." + " Please implement this case if it is valid, or raise a more descriptive error if this path should not be taken." 
+ ) # ----------------- Write copy call to code stream -------------------- From d03c5d0fd93a4b51c27d8b4ac45c17274cb73309 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Thu, 3 Jul 2025 10:40:50 +0200 Subject: [PATCH 36/94] Add validation check to check for intersatte edge assignments to scalars or arrays --- dace/sdfg/validation.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 51855e3eea..cfa05fcb5c 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -350,6 +350,9 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context "Arrays that use a multibank access pattern must have the size of the first dimension equal" f" the number of banks and have at least 2 dimensions for array {name}", sdfg, None) + # Check for interstate edges that write to scalars or arrays + _no_writes_to_scalars_or_arrays_on_interstate_edges(sdfg) + # Check if SDFG is located within a GPU kernel context['in_gpu'] = is_devicelevel_gpu(sdfg, None, None) context['in_fpga'] = is_devicelevel_fpga(sdfg, None, None) @@ -372,6 +375,14 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context ex.path = fpath raise +def _no_writes_to_scalars_or_arrays_on_interstate_edges(sdfg: 'dace.sdfg.SDFG'): + for edge, graph in sdfg.all_edges_recursive(): + if edge.data is not None and isinstance(edge.data, dace.sdfg.InterstateEdge): + # sdfg.arrays return arrays and scalars, it is invalid to write to them + if any([key in graph.sdfg.arrays for key in edge.data.assignments]): + raise InvalidSDFGInterstateEdgeError( + f'Assignment to a scalar or an array detected in an interstate edge: "{edge}"', + graph.sdfg, graph.edge_id(edge)) def _accessible(sdfg: 'dace.sdfg.SDFG', container: str, context: Dict[str, bool]): """ From 974573aadbb5f58d05af3822e747a280507c7ed7 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Thu, 3 Jul 2025 13:32:39 +0200 Subject: [PATCH 37/94] Alpha implementation sketch --- dace/sdfg/nodes.py | 48 ++++++++++++ dace/sdfg/utils.py | 169 ++++++++++++++++++++++++++-------------- dace/sdfg/validation.py | 3 +- 3 files changed, 162 insertions(+), 58 deletions(-) diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index e71d95d26a..cf7e516c55 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -877,7 +877,55 @@ def new_symbols(self, sdfg, state, symbols) -> Dict[str, dtypes.typeclass]: return result + def used_symbols(self): + parent_state : dace.SDFGState = self.parent_graph + parent_sdfg : dace.SDFG = parent_state.sdfg + + all_symbols = set() + new_symbols = set() + free_symbols = set() + + # Free symbols from nodes + for n in parent_state.all_nodes_between(self, parent_state.exit_node(self)): + if isinstance(n, EntryNode): + new_symbols |= set(n.new_symbols(parent_sdfg, parent_state, {}).keys()) + elif isinstance(n, AccessNode): + # Add data descriptor symbols + freesyms |= set(map(str, n.desc(parent_sdfg).used_symbols(all_symbols))) + elif isinstance(n, Tasklet): + if n.language == dtypes.Language.Python: + # Consider callbacks defined as symbols as free + for stmt in n.code.code: + for astnode in ast.walk(stmt): + if (isinstance(astnode, ast.Call) and isinstance(astnode.func, ast.Name) + and astnode.func.id in parent_sdfg.symbols): + freesyms.add(astnode.func.id) + else: + # Find all string tokens and filter them to sdfg.symbols, while ignoring connectors + code_symbols = dace.symbolic.symbols_in_code( + n.code.as_string, + potential_symbols=parent_sdfg.symbols.keys(), + 
symbols_to_ignore=(n.in_connectors.keys() | n.out_connectors.keys() | n.ignored_symbols), + ) + free_symbols |= code_symbols + continue + + if hasattr(n, 'used_symbols'): + freesyms |= n.used_symbols(all_symbols) + else: + freesyms |= n.free_symbols + + # Free symbols from memlets + for e in parent_state.all_edges(parent_state.all_nodes_between(self, parent_state.exit_node(self))): + # If used for code generation, only consider memlet tree leaves + if not all_symbols and not self.is_leaf_memlet(e): + continue + + freesyms |= e.data.used_symbols(all_symbols, e) + # Do not consider SDFG constants as symbols + new_symbols.update(set(parent_sdfg.constants.keys())) + return freesyms - new_symbols @dace.serialize.serializable class MapExit(ExitNode): """ Node that closes a Map scope. diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index 6f477d089f..3a17042916 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -2118,77 +2118,132 @@ def set_nested_sdfg_parent_references(sdfg: SDFG): set_nested_sdfg_parent_references(node.sdfg) -def get_used_data(cfg: ControlFlowRegion | SDFGState) -> Set[str]: +def get_used_data(scope: ControlFlowRegion | SDFGState | nd.MapEntry | nd.NestedSDFG) -> Set[str]: """ - Returns a set of all data names that are used in the given control flow region or state. + Returns a set of all data names that are used in the given control flow region, state, map entry or nested SDFG node. Data is considered used if there is an access node - :param cfg: The control flow region or state to check. + :param cfg: The control flow region, state, or map entry node to check. :return: A set of used data names. """ - used_data = set() - for state in cfg.all_states() if not isinstance(cfg, SDFGState) else [cfg]: - for node in state.nodes(): + if isinstance(scope, SDFGState) or isinstance(scope, ControlFlowRegion): + read_data, write_data = scope.read_and_write_sets() + return read_data.union(write_data) + elif isinstance(scope, nd.NestedSDFG): + read_data, write_data = scope.sdfg.read_and_write_sets() + return read_data.union(write_data) + elif isinstance(scope, nd.MapEntry): + state: SDFGState = scope.parent_graph + + # How can data be accessed in an SDFG?: + # Read interstate edges or access nodes using memlets + # Written to access nodes using memlets + # For map inputs the data might be not directly coming through an access node, + # need to check the edges too + # + # To get all used data, within a state iterate access nodes + # If data is passed to a nested SDFG (even if it is only used on an interstate edge), + # the access node must be present in the parent graph. 
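+            # Concretely, the code below walks every node between this MapEntry and its
+            # MapExit and records the data of each access node, then additionally records
+            # the data referenced by the memlets on the edges entering and leaving the
+            # map entry node itself.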
+ used_data = set() + + # All data used in the NestedSDFGs need to be connected through access nodes + for node in state.all_nodes_between(scope, state.exit_node(scope)): if isinstance(node, nd.AccessNode): used_data.add(node.data) + # Need to consider map inputs and outputs too + for ie in state.in_edges(scope): + if ie.data is not None and ie.data.data is not None: + used_data.add(ie.data.data) + for oe in state.out_edges(scope): + if oe.data is not None and oe.data.data is not None: + used_data.add(oe.data.data) + + return used_data + else: + raise Exception("Unsupported scope type for get_constant_data: {}".format(type(scope))) - cfgs_to_check = {cfg.nodes()} if not isinstance(cfg, SDFGState) else {} - - while cfgs_to_check: - node = cfgs_to_check.pop() - if not isinstance(node, SDFGState): - cfgs_to_check.add(node.nodes()) - - for out_edge in cfg.out_edges(node): - assert isinstance(out_edge, InterstateEdge) - edge = out_edge.data - interstate_used_data = edge.used_arrays(arrays=cfg.sdfg.arrays, union_lhs_symbols=True) - used_data.update(interstate_used_data) - - if isinstance(node, ConditionalBlock): - for branch_code in node.branches: - pass - - if isinstance(node, LoopRegion): - pass - - return used_data - -def get_constant_data(cfg: ControlFlowRegion | SDFGState) -> Set[str]: +def get_constant_data(scope: ControlFlowRegion | SDFGState | nd.MapEntry | nd.NestedSDFG) -> Set[str]: """ - Returns a set of all constant data names in the given control flow region or state. + Returns a set of all constant data in the given control flow region, state, or with the map scope. Data is considered constant if there is any incoming edge to an access node of the data. Due to the semantics of SDFG, if a nested SDFG writes to the data container it needs to be visible in the parent graph as well, so the function does not need to be recursive. - :param cfg: The control flow region or state to check. + :param cfg: The control flow region, state or a map entry node to check. :return: A set of constant data names. 
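+
+    Illustrative usage (``state`` is a placeholder name for an SDFGState)::
+
+        read_only = get_constant_data(state)  # data that is read but never written in the state
+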
""" + def _no_incoming_memlet(state: SDFGState, node: nd.AccessNode) -> bool: + return (state.in_degree(node) == 0 or state.in_degree(node) > 0 and all([e.data is None for e in state.in_edges(node)])) - data_written_to = set() - sdfg = cfg.sdfg - # Write accesses to scalars can happen through access nodes and interstate edges (assignments) - # Write accesses to arrays can only happen through access nodes - for state in cfg.all_states() if not isinstance(cfg, SDFGState) else [cfg]: - for node in state.nodes(): + def _incoming_memlet(state: SDFGState, node: nd.AccessNode) -> bool: + return (state.in_degree(node) > 0 and any([e.data is not None for e in state.in_edges(node)])) + + if isinstance(scope, SDFGState) or isinstance(scope, ControlFlowRegion): + read_data, write_data = scope.read_and_write_sets() + return read_data - write_data + elif isinstance(scope, nd.NestedSDFG): + read_data, write_data = scope.sdfg.read_and_write_sets() + return read_data - write_data + elif isinstance(scope, nd.MapEntry): + state: SDFGState = scope.parent_graph + + # Which data are const: + # All access nodes that have no incoming edges + used_data = set() + written_data = set() + + # All data used in the NestedSDFGs need to be connected through access nodes + for node in state.all_nodes_between(scope, state.exit_node(scope)): if isinstance(node, nd.AccessNode): - if state.in_degree(node) > 0: - data_written_to.add(node.data) - - cfgs_to_check = {cfg.nodes()} if not isinstance(cfg, SDFGState) else {} - - while cfgs_to_check: - node = cfgs_to_check.pop() - if not isinstance(node, SDFGState): - cfgs_to_check.add(node.nodes()) - - for out_edge in cfg.out_edges(node): - assert isinstance(out_edge, InterstateEdge) - edge = out_edge.data - written_scalars = [arr_name for arr_name in edge.assignments if arr_name in sdfg.arrays] - if written_scalars: - data_written_to.update(written_scalars) - - all_accessed_data = set() - constants = all_accessed_data - data_written_to - return constants \ No newline at end of file + # Either no incoming edge, or no incoming edge has a Memlet (dependency edge only) + if _incoming_memlet(state, node): + written_data.add(node.data) + + # Need to consider map outputs and outputs too + for ie in state.in_edges(scope): + if ie.data is not None and ie.data.data is not None: + used_data.add(ie.data.data) + for oe in state.out_edges(scope): + if oe.data is not None and oe.data.data is not None: + written_data.add(oe.data.data) + used_data.add(oe.data.data) + + return used_data - written_data + else: + raise Exception("Unsupported scope type for get_constant_data: {}".format(type(scope))) + +def get_constant_symbols(scope: SDFG | ControlFlowRegion | SDFGState | nd.MapEntry | nd.NestedSDFG) -> Set[str]: + """ + Returns a set of all constant symbols in the given control flow region, state, or with the map scope. + A symbol is considered constant if no interstate edge writes to it. + + :param cfg: The control flow region, state or a map entry node to check. + :return: A set of constant symbol names. 
+ """ + def _get_assignments(cfg: ControlFlowRegion | SDFG) -> Set[str]: + written_symbols = set() + for edge in cfg.all_edges(*list(cfg.all_control_flow_blocks())): + if edge.data is not None and isinstance(edge.data, dace.InterstateEdge): + written_symbols = written_symbols.union(edge.data.keys()) + return written_symbols + + if isinstance(scope, SDFGState): + symbols = scope.used_symbols() + # Since no symbol can change within a state we are good to go + return symbols + elif isinstance(scope, SDFG | ControlFlowRegion): + # Need to get all used symbols within the SDFG | CFG + used_symbols = scope.used_symbols() + # Get all symbols that are written to + written_symbols = _get_assignments(scope) + return used_symbols - written_symbols + elif isinstance(scope, nd.NestedSDFG): + used_symbols = scope.sdfg.used_symbols() + # Can't pass them as const if they are written to in the nested SDFG + written_symbols = _get_assignments(scope.sdfg) + return used_symbols - written_symbols + elif isinstance(scope, nd.MapEntry): + used_symbols = scope.used_symbols() + return used_symbols + else: + raise Exception("Unsupported scope type for get_constant_data: {}".format(type(scope))) \ No newline at end of file diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 26089a79d2..050a0e7a22 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -379,8 +379,9 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context raise def _no_writes_to_scalars_or_arrays_on_interstate_edges(sdfg: 'dace.sdfg.SDFG'): + from dace.sdfg import InterstateEdge for edge, graph in sdfg.all_edges_recursive(): - if edge.data is not None and isinstance(edge.data, dace.sdfg.InterstateEdge): + if edge.data is not None and isinstance(edge.data, InterstateEdge): # sdfg.arrays return arrays and scalars, it is invalid to write to them if any([key in graph.sdfg.arrays for key in edge.data.assignments]): raise InvalidSDFGInterstateEdgeError( From 8f7a258b982eb721f2dc244f4fc6d049f69b14e1 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Thu, 3 Jul 2025 15:22:20 +0200 Subject: [PATCH 38/94] Fix scope mistakes- missing brackets. 
Leads to errors for certain weird strided cases --- .../experimental_cuda_helpers/gpu_stream_manager.py | 2 +- .../targets/experimental_cuda_helpers/scope_strategies.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py b/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py index f2fa05e9a9..959b70f573 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py +++ b/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py @@ -20,7 +20,7 @@ def __init__(self, sdfg: SDFG, assigned_streams: Dict[nodes.Node, Union[int, str # Determine the number of streams used (stream IDs start from 0) # Only count integer stream IDs (ignore string values like "nullptr") int_stream_ids = [v for v in assigned_streams.values() if isinstance(v, int)] - self.num_gpu_streams = max(int_stream_ids, default=0) + self.num_gpu_streams = max(int_stream_ids, default=0) + 1 def get_stream_node(self, node: nodes.Node) -> str: """ diff --git a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py index c968339947..d603c58178 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py @@ -139,7 +139,7 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV index_expr = f'(blockIdx.z * {symbolic_to_cpp(kernel_block_dims[2])} + threadIdx.z)' tail_prod = product(kernel_dim_sizes[dim + 1:]) - index_expr = (f"({index_expr} / ({symbolic_to_cpp(tail_prod)})) % ({symbolic_to_cpp(kernel_dim_sizes[dim])})") + index_expr = (f"(({index_expr} / ({symbolic_to_cpp(tail_prod)})) % ({symbolic_to_cpp(kernel_dim_sizes[dim])}))") # Define thread/Block index @@ -317,7 +317,7 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV else: # Dimensions beyond the third: full delinearization tail_prod = product(map_dim_sizes[dim + 1:]) - base_expr = (f"(threadIdx.z / ({symbolic_to_cpp(tail_prod)})) % "f"({symbolic_to_cpp(map_dim_sizes[dim])})") + base_expr = (f"((threadIdx.z / ({symbolic_to_cpp(tail_prod)})) % ({symbolic_to_cpp(map_dim_sizes[dim])}))") var_def = symbolic_to_cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', base_expr) @@ -448,9 +448,9 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV if len(previous_sizes) > 0: divisor = product(previous_sizes) - expr = f"({threadID_name} / {divisor}) % {warp_dim_bounds[i]}" + expr = f"(({threadID_name} / {divisor}) % ({warp_dim_bounds[i]}))" else: - expr = f"{threadID_name} % {warp_dim_bounds[i]}" + expr = f"({threadID_name} % ({warp_dim_bounds[i]}))" callsite_stream.write(f"{ids_ctype} {var_name} = {expr};", cfg, state_id, node) self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, ids_ctype) From fc3cf9b00ca3cb81d6d74ef1997fb6ff71b4d024 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Thu, 3 Jul 2025 15:24:05 +0200 Subject: [PATCH 39/94] change default architecture, might be more of a personal issue. 
Yakups idea, due to failing legacy tests --- dace/config_schema.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dace/config_schema.yml b/dace/config_schema.yml index 7a5537e9f5..877c84e01e 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -319,7 +319,7 @@ required: Additional CUDA architectures (separated by commas) to compile GPU code for, excluding the current architecture on the compiling machine. - default: '60' + default: '86' hip_arch: type: str @@ -457,7 +457,7 @@ required: Yakup Koray Budanaz for Berkays master-thesis. enum: [legacy, experimental] default: experimental - + gpu_index_type: type: str title: Thread/block/warp index data type From b4aa78789af95529b18ebca082c661250de83855 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Thu, 3 Jul 2025 15:31:56 +0200 Subject: [PATCH 40/94] Fixed Yakups tb pass, reported MapTiling issue (applied workaround) and provided jupyter notebook examples for testing strided cases where pass failed before --- .../threadblockPass/simple1dExamplye.ipynb | 129 +++++++++++++ .../threadblockPass/simple2dExample.ipynb | 142 ++++++++++++++ .../threadblockPass/simple4dExample.ipynb | 143 ++++++++++++++ .../dataflow/add_threadblock_map.py | 174 ++++++++++++++++++ 4 files changed, 588 insertions(+) create mode 100644 berkay_workpace/scratch/threadblockPass/simple1dExamplye.ipynb create mode 100644 berkay_workpace/scratch/threadblockPass/simple2dExample.ipynb create mode 100644 berkay_workpace/scratch/threadblockPass/simple4dExample.ipynb create mode 100644 dace/transformation/dataflow/add_threadblock_map.py diff --git a/berkay_workpace/scratch/threadblockPass/simple1dExamplye.ipynb b/berkay_workpace/scratch/threadblockPass/simple1dExamplye.ipynb new file mode 100644 index 0000000000..eaa35d1491 --- /dev/null +++ b/berkay_workpace/scratch/threadblockPass/simple1dExamplye.ipynb @@ -0,0 +1,129 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "029f8a65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import dace\n", + "import cupy as cp" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6a04e64f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (vector_copy_strides1d)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "N = dace.symbol('N')\n", + "s = 33\n", + "@dace.program\n", + "def vector_copy_strides1d(A: dace.uint32[N] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[N] @ dace.dtypes.StorageType.GPU_Global):\n", + " for i in dace.map[0:N:s] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " A[i] = B[i]\n", + "\n", + "sdfg = vector_copy_strides1d.to_sdfg()\n", + "sdfg\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8bbd2799", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "SUCCESS: A matches the expected result.\n" + ] + } + ], + "source": [ + "# Example input \n", + "n = 69\n", + "A = cp.zeros((n,), dtype=cp.uint32)\n", + "B = cp.arange(0, n, dtype=cp.uint32)\n", + "\n", + "\n", + "# Strided copy from B to A\n", + "sdfg(A=A, B=B, N=n)\n", + "\n", + "# Verify correctness\n", + "expected = cp.zeros((n,), dtype=cp.uint32)\n", + "expected[::s] = cp.arange(0, n, dtype=cp.uint32)[::s]\n", + "if cp.array_equal(A, expected):\n", + " print(\"\\n\\nSUCCESS: A matches the expected result.\")\n", + "else:\n", + " print(\"\\n\\nERROR: A does not match the expected result.\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dace_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/berkay_workpace/scratch/threadblockPass/simple2dExample.ipynb b/berkay_workpace/scratch/threadblockPass/simple2dExample.ipynb new file mode 100644 index 0000000000..510f81e74e --- /dev/null +++ b/berkay_workpace/scratch/threadblockPass/simple2dExample.ipynb @@ -0,0 +1,142 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "f11daa92", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import dace\n", + "import cupy as cp" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "827c1a5d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (vector_copy_strides2d)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "N = dace.symbol('N')\n", + "M = dace.symbol('M')\n", + "sN = 33\n", + "sM = 21\n", + "\n", + "@dace.program\n", + "def vector_copy_strides2d(A: dace.uint32[N, M] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[N, M] @ dace.dtypes.StorageType.GPU_Global):\n", + " for i, j in dace.map[0:N:sN, 0:M:sM] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " A[i, j] = B[i, j]\n", + "\n", + "sdfg = vector_copy_strides2d.to_sdfg()\n", + "sdfg" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "20d36b0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "SUCCESS: A matches the expected result.\n" + ] + } + ], + "source": [ + "# Example input \n", + "n = 35\n", + "m = 43\n", + "A = cp.zeros((n, m,), dtype=cp.uint32)\n", + "B = cp.ones((n, m,), dtype=cp.uint32)\n", + "\n", + "\n", + "# Strided copy from B to A\n", + "sdfg(A=A, B=B, N=n, M=m)\n", + "\n", + "\n", + "# Verify correctness for 2D strided copy\n", + "expected = cp.zeros((n, m), dtype=cp.uint32)\n", + "expected[::sN, ::sM] = cp.ones((n, m,), dtype=cp.uint32)[::sN, ::sM]\n", + "if (A == expected).all():\n", + " print(\"\\n\\nSUCCESS: A matches the expected result.\")\n", + "else:\n", + " print(\"\\n\\nERROR: A does not match the expected result.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "700d0007", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dace_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/berkay_workpace/scratch/threadblockPass/simple4dExample.ipynb b/berkay_workpace/scratch/threadblockPass/simple4dExample.ipynb new file mode 100644 index 0000000000..78ce88ed03 --- /dev/null +++ b/berkay_workpace/scratch/threadblockPass/simple4dExample.ipynb @@ -0,0 +1,143 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "f5ba4b8b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import dace\n", + "import cupy as cp" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fca45bfa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (vector_copy_strides4d)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "N = dace.symbol(\"N\")\n", + "M = dace.symbol(\"M\")\n", + "J = dace.symbol(\"J\")\n", + "K = dace.symbol(\"K\")\n", + "\n", + "sN = 7\n", + "sM = 2\n", + "sJ = 5\n", + "sK = 8\n", + "\n", + "@dace.program\n", + "def vector_copy_strides4d(A: dace.uint32[N, M, J, K] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[N, M, J, K] @ dace.dtypes.StorageType.GPU_Global):\n", + " for a, b, c, d in dace.map[0:N:sN, 0:M:sM, 0:J:sJ, 0:K:sK] @ dace.dtypes.ScheduleType.GPU_Device:\n", + " A[a, b, c, d] = B[a, b, c, d]\n", + "\n", + "sdfg = vector_copy_strides4d.to_sdfg()\n", + "sdfg\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2678e814", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "SUCCESS: A matches the expected result.\n" + ] + } + ], + "source": [ + "# Example input \n", + "n = 12\n", + "m = 14\n", + "j = 11\n", + "k = 23\n", + "\n", + "B = cp.ones((n, m, j, k, ), dtype=cp.uint32)\n", + "A = cp.zeros((n, m, j, k, ), dtype=cp.uint32)\n", + "\n", + "\n", + "# Strided copy from B to A\n", + "sdfg(A=A, B=B, N=n, M=m, J=j, K=k)\n", + "\n", + "\n", + "\n", + "# Verify correctness for 2D strided copy\n", + "expected = cp.zeros((n, m, j, k, ), dtype=cp.uint32)\n", + "expected[::sN, ::sM, ::sJ, ::sK] = 1\n", + "if (A == expected).all():\n", + " print(\"\\n\\nSUCCESS: A matches the expected result.\")\n", + "else:\n", + " print(\"\\n\\nERROR: A does not match the expected result.\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dace_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/dace/transformation/dataflow/add_threadblock_map.py b/dace/transformation/dataflow/add_threadblock_map.py new file mode 100644 index 0000000000..ccf2b88ee5 --- /dev/null +++ b/dace/transformation/dataflow/add_threadblock_map.py @@ -0,0 +1,174 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
+""" This module contains classes and functions that implement the grid-strided map tiling + transformation.""" + +import copy + +import dace +from dace.sdfg import SDFG, ControlFlowRegion, SDFGState +from dace.properties import make_properties, SymbolicProperty +from dace.sdfg import nodes +from dace.sdfg import utils as sdutil +from dace.transformation import transformation, helpers +from dace.transformation.dataflow.tiling import MapTiling +from dace import dtypes +import warnings + + +@make_properties +class AddThreadBlockMap(transformation.SingleStateTransformation): + """ + Adds a thread block schedule to a device map scope + """ + + map_entry = transformation.PatternNode(nodes.MapEntry) + + # Properties + thread_block_size_x = SymbolicProperty(dtype=int, + default=None, + allow_none=True, + desc="Number threads in the threadBlock X Dim") + thread_block_size_y = SymbolicProperty(dtype=int, + default=None, + allow_none=True, + desc="Number threads in the threadBlock Y Dim") + thread_block_size_z = SymbolicProperty(dtype=int, + default=None, + allow_none=True, + desc="Number threads in the threadBlock Z Dim") + tiles_evenly = SymbolicProperty(dtype=bool, + default=False, + desc="Whether the map should be tiled evenly or not. If False, the " + "transformation will try to tile the map as evenly as possible.") + + @classmethod + def expressions(cls): + return [sdutil.node_path_graph(cls.map_entry)] + + def preprocess_default_dims(self): + # If None is passed for the pass we will get the default configs + # 1. If arguments are passed: + # 1.1 Is the arguments passed + # 2. If no arguments are passed (at least one arg is None): + # 2.1. First check if the device map has gpu_block_size set + # 2.2. Otherwise check the global default + if self.thread_block_size_x is None or self.thread_block_size_y is None or self.thread_block_size_z is None: + if self.map_entry.gpu_block_size is not None: + # If gpu_block_size ap_entry.gpu_block_sizeis set, use it + self.thread_block_size_x = self.map_entry.gpu_block_size[0] + self.thread_block_size_y = self.map_entry.gpu_block_size[1] + self.thread_block_size_z = self.map_entry.gpu_block_size[2] + else: + x, y, z = dace.config.Config.get('compiler', 'cuda', 'default_block_size').split(',') + try: + self.thread_block_size_x = int(x) + self.thread_block_size_y = int(y) + self.thread_block_size_z = int(z) + except ValueError: + raise ValueError("Invalid default block size format. Expected 'x,y,z' where x, y, z are integers.") + + num_dims_in_map = len(self.map_entry.map.range) + # Collapse missing thread block dimensions into y if 2 dimensions in the map, to x if 1 dimension in the map + if num_dims_in_map < 3: + print_warning = False + old_block = (self.thread_block_size_x, self.thread_block_size_y, self.thread_block_size_z) + if num_dims_in_map == 2: + self.thread_block_size_y *= self.thread_block_size_z + if self.thread_block_size_z > 1: + print_warning = True + self.thread_block_size_z = 1 + elif num_dims_in_map == 1: + self.thread_block_size_x *= self.thread_block_size_y * self.thread_block_size_z + if self.thread_block_size_y > 1 or self.thread_block_size_z > 1: + print_warning = True + self.thread_block_size_y = 1 + self.thread_block_size_z = 1 + new_block = (self.thread_block_size_x, self.thread_block_size_y, self.thread_block_size_z) + if print_warning: + warnings.warn( + UserWarning, f'Default block size has more dimensions ({old_block}) than kernel dimensions ' + f'({num_dims_in_map}) in map "{self.map_entry.map.label}". 
Linearizing block ' + f'size to {new_block}. Consider setting the ``gpu_block_size`` property.') + + + + def can_be_applied(self, graph, expr_index, sdfg, permissive=False): + + self.preprocess_default_dims() + + # Reject if thread block size exceeds GPU hardware limits + total_block_size = self.thread_block_size_x * self.thread_block_size_y * self.thread_block_size_z + + if total_block_size > 1024: + return False + + # Only applicable to GPU_Device maps + if self.map_entry.map.schedule != dtypes.ScheduleType.GPU_Device: + return False + + # Traverse inner scopes (ordered outer -> inner) + for _, inner_entry in helpers.get_internal_scopes(graph, self.map_entry): + schedule = inner_entry.map.schedule + + if schedule in {dtypes.ScheduleType.GPU_ThreadBlock, dtypes.ScheduleType.GPU_ThreadBlock_Dynamic,}: + # Already scheduled with thread block — cannot apply + return False + + if schedule == dtypes.ScheduleType.GPU_Device: + # Found another kernel launch — safe to apply + return True + + + # No thread block schedule found - do apply + return True + + + def apply(self, state: SDFGState, sdfg: SDFG): + self.preprocess_default_dims() + + map_entry = self.map_entry + + tx = self.thread_block_size_x + ty = self.thread_block_size_y + tz = self.thread_block_size_z + block_dims = [tz, ty, tx] + + # Set the gpu_block_size which the GPU_ThreadBlock map will use. This is important, because the CUDACodeGen + # will otherwise try to deduce it, leading to issues + self.map_entry.gpu_block_size = [self.thread_block_size_x, self.thread_block_size_y, self.thread_block_size_z] + + # TODO: Adapt this code once MapTiling transformation also considers existing stride. + # The below tile size works around this by including the existing stride into the tile size + num_dims = len(map_entry.map.params) + existing_strides = map_entry.range.strides() + + len_diff = num_dims - len(block_dims) # Note + if len_diff > 0: # num_dims > block_dims + block_dims = [1] * len_diff + block_dims + else: + block_dims = block_dims[-num_dims:] + + tile_sizes = [stride * block for stride, block in zip(existing_strides, block_dims)] + + # Tile trivial simplifies come checks for the BlockCoarsening and ThreadCoarsening transformations + MapTiling.apply_to( + sdfg=sdfg, + options=dict( + prefix="b", + tile_sizes=tile_sizes, + divides_evenly=self.tiles_evenly, # Todo improve this + tile_trivial=True, + skew=False), + map_entry=map_entry) + + # The old dev_entry is the new tblock_map_entry + map_entry.map.schedule = dtypes.ScheduleType.GPU_ThreadBlock + + + + def update_names(): + pass + + @staticmethod + def annotates_memlets(): + return False From 40f0b31f42e711d6cc1a47186b643cd5ab2f598d Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Thu, 3 Jul 2025 15:47:25 +0200 Subject: [PATCH 41/94] Add pass to detect const --- dace/sdfg/nodes.py | 28 +++++++++------ dace/sdfg/state.py | 3 +- dace/sdfg/utils.py | 29 ++++++++------- dace/sdfg/validation.py | 6 ++-- .../passes/analysis/infer_const_args.py | 35 +++++++++++++++++++ 5 files changed, 75 insertions(+), 26 deletions(-) create mode 100644 dace/transformation/passes/analysis/infer_const_args.py diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index cf7e516c55..e290d1b368 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -877,9 +877,13 @@ def new_symbols(self, sdfg, state, symbols) -> Dict[str, dtypes.typeclass]: return result - def used_symbols(self): - parent_state : dace.SDFGState = self.parent_graph - parent_sdfg : dace.SDFG = parent_state.sdfg + def used_symbols(self, 
parent_state: 'dace.SDFGState', all_symbols: bool = False) -> Set[str]: + """ + Returns a set of symbol names that are used withn the Map scope created by this MapEntry + + :param all_symbols: If False, only returns symbols that are needed as arguments (only used in generated code). + """ + parent_sdfg: dace.SDFG = parent_state.sdfg all_symbols = set() new_symbols = set() @@ -891,7 +895,7 @@ def used_symbols(self): new_symbols |= set(n.new_symbols(parent_sdfg, parent_state, {}).keys()) elif isinstance(n, AccessNode): # Add data descriptor symbols - freesyms |= set(map(str, n.desc(parent_sdfg).used_symbols(all_symbols))) + free_symbols |= set(map(str, n.desc(parent_sdfg).used_symbols(all_symbols))) elif isinstance(n, Tasklet): if n.language == dtypes.Language.Python: # Consider callbacks defined as symbols as free @@ -899,7 +903,7 @@ def used_symbols(self): for astnode in ast.walk(stmt): if (isinstance(astnode, ast.Call) and isinstance(astnode.func, ast.Name) and astnode.func.id in parent_sdfg.symbols): - freesyms.add(astnode.func.id) + free_symbols.add(astnode.func.id) else: # Find all string tokens and filter them to sdfg.symbols, while ignoring connectors code_symbols = dace.symbolic.symbols_in_code( @@ -911,21 +915,23 @@ def used_symbols(self): continue if hasattr(n, 'used_symbols'): - freesyms |= n.used_symbols(all_symbols) + free_symbols |= n.used_symbols(parent_state, all_symbols) else: - freesyms |= n.free_symbols + free_symbols |= n.free_symbols # Free symbols from memlets - for e in parent_state.all_edges(parent_state.all_nodes_between(self, parent_state.exit_node(self))): + for e in parent_state.all_edges(*parent_state.all_nodes_between(self, parent_state.exit_node(self))): # If used for code generation, only consider memlet tree leaves - if not all_symbols and not self.is_leaf_memlet(e): + if not all_symbols and not parent_state.is_leaf_memlet(e): continue - freesyms |= e.data.used_symbols(all_symbols, e) + free_symbols |= e.data.used_symbols(all_symbols, e) # Do not consider SDFG constants as symbols new_symbols.update(set(parent_sdfg.constants.keys())) - return freesyms - new_symbols + return free_symbols - new_symbols + + @dace.serialize.serializable class MapExit(ExitNode): """ Node that closes a Map scope. diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index 5cef723508..65681b64f1 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -692,7 +692,8 @@ def used_symbols(self, continue if hasattr(n, 'used_symbols'): - freesyms |= n.used_symbols(all_symbols) + if not isinstance(n, nd.MapEntry): + freesyms |= n.used_symbols(all_symbols=all_symbols) else: freesyms |= n.free_symbols diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index 3a17042916..0a332dce60 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -2162,7 +2162,9 @@ def get_used_data(scope: ControlFlowRegion | SDFGState | nd.MapEntry | nd.Nested else: raise Exception("Unsupported scope type for get_constant_data: {}".format(type(scope))) -def get_constant_data(scope: ControlFlowRegion | SDFGState | nd.MapEntry | nd.NestedSDFG) -> Set[str]: + +def get_constant_data(scope: ControlFlowRegion | SDFGState | nd.NestedSDFG | nd.MapEntry, + parent_state: SDFGState = None) -> Set[str]: """ Returns a set of all constant data in the given control flow region, state, or with the map scope. Data is considered constant if there is any incoming edge to an access node of the data. 
@@ -2170,10 +2172,9 @@ def get_constant_data(scope: ControlFlowRegion | SDFGState | nd.MapEntry | nd.Ne visible in the parent graph as well, so the function does not need to be recursive. :param cfg: The control flow region, state or a map entry node to check. + :param parent_state: The parent_state of the scope, used only for MapEntry nodes. :return: A set of constant data names. """ - def _no_incoming_memlet(state: SDFGState, node: nd.AccessNode) -> bool: - return (state.in_degree(node) == 0 or state.in_degree(node) > 0 and all([e.data is None for e in state.in_edges(node)])) def _incoming_memlet(state: SDFGState, node: nd.AccessNode) -> bool: return (state.in_degree(node) > 0 and any([e.data is not None for e in state.in_edges(node)])) @@ -2185,7 +2186,7 @@ def _incoming_memlet(state: SDFGState, node: nd.AccessNode) -> bool: read_data, write_data = scope.sdfg.read_and_write_sets() return read_data - write_data elif isinstance(scope, nd.MapEntry): - state: SDFGState = scope.parent_graph + state: SDFGState = parent_state # Which data are const: # All access nodes that have no incoming edges @@ -2203,7 +2204,7 @@ def _incoming_memlet(state: SDFGState, node: nd.AccessNode) -> bool: for ie in state.in_edges(scope): if ie.data is not None and ie.data.data is not None: used_data.add(ie.data.data) - for oe in state.out_edges(scope): + for oe in state.out_edges(state.exit_node(scope)): if oe.data is not None and oe.data.data is not None: written_data.add(oe.data.data) used_data.add(oe.data.data) @@ -2212,38 +2213,42 @@ def _incoming_memlet(state: SDFGState, node: nd.AccessNode) -> bool: else: raise Exception("Unsupported scope type for get_constant_data: {}".format(type(scope))) -def get_constant_symbols(scope: SDFG | ControlFlowRegion | SDFGState | nd.MapEntry | nd.NestedSDFG) -> Set[str]: + +def get_constant_symbols(scope: SDFG | ControlFlowRegion | SDFGState | nd.MapEntry | nd.NestedSDFG, + parent_state: SDFGState = None) -> Set[str]: """ Returns a set of all constant symbols in the given control flow region, state, or with the map scope. A symbol is considered constant if no interstate edge writes to it. :param cfg: The control flow region, state or a map entry node to check. + :param parent_state: The parent graph of the scope, used only for MapEntry nodes. :return: A set of constant symbol names. 
""" + def _get_assignments(cfg: ControlFlowRegion | SDFG) -> Set[str]: written_symbols = set() for edge in cfg.all_edges(*list(cfg.all_control_flow_blocks())): if edge.data is not None and isinstance(edge.data, dace.InterstateEdge): - written_symbols = written_symbols.union(edge.data.keys()) + written_symbols = written_symbols.union(edge.data.assignments.keys()) return written_symbols if isinstance(scope, SDFGState): - symbols = scope.used_symbols() + symbols = scope.used_symbols(all_symbols=False) # Since no symbol can change within a state we are good to go return symbols elif isinstance(scope, SDFG | ControlFlowRegion): # Need to get all used symbols within the SDFG | CFG - used_symbols = scope.used_symbols() + used_symbols = scope.used_symbols(all_symbols=False) # Get all symbols that are written to written_symbols = _get_assignments(scope) return used_symbols - written_symbols elif isinstance(scope, nd.NestedSDFG): - used_symbols = scope.sdfg.used_symbols() + used_symbols = scope.sdfg.used_symbols(all_symbols=True) # Can't pass them as const if they are written to in the nested SDFG written_symbols = _get_assignments(scope.sdfg) return used_symbols - written_symbols elif isinstance(scope, nd.MapEntry): - used_symbols = scope.used_symbols() + used_symbols = scope.used_symbols(parent_state=parent_state) return used_symbols else: - raise Exception("Unsupported scope type for get_constant_data: {}".format(type(scope))) \ No newline at end of file + raise Exception("Unsupported scope type for get_constant_data: {}".format(type(scope))) diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 050a0e7a22..55379e01b2 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -378,6 +378,7 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context ex.path = fpath raise + def _no_writes_to_scalars_or_arrays_on_interstate_edges(sdfg: 'dace.sdfg.SDFG'): from dace.sdfg import InterstateEdge for edge, graph in sdfg.all_edges_recursive(): @@ -385,8 +386,9 @@ def _no_writes_to_scalars_or_arrays_on_interstate_edges(sdfg: 'dace.sdfg.SDFG'): # sdfg.arrays return arrays and scalars, it is invalid to write to them if any([key in graph.sdfg.arrays for key in edge.data.assignments]): raise InvalidSDFGInterstateEdgeError( - f'Assignment to a scalar or an array detected in an interstate edge: "{edge}"', - graph.sdfg, graph.edge_id(edge)) + f'Assignment to a scalar or an array detected in an interstate edge: "{edge}"', graph.sdfg, + graph.edge_id(edge)) + def _accessible(sdfg: 'dace.sdfg.SDFG', container: str, context: Dict[str, bool]): """ diff --git a/dace/transformation/passes/analysis/infer_const_args.py b/dace/transformation/passes/analysis/infer_const_args.py new file mode 100644 index 0000000000..1ba2a2e63d --- /dev/null +++ b/dace/transformation/passes/analysis/infer_const_args.py @@ -0,0 +1,35 @@ +import dace +from dace.transformation import pass_pipeline as ppl, transformation +from typing import Dict, Set, Tuple +from dace import properties +import dace.sdfg.utils as sdutils + +@properties.make_properties +@transformation.explicit_cf_compatible +class StateReachability(ppl.Pass): + """ + Evaluates state reachability (which other states can be executed after each state). 
+ """ + + CATEGORY: str = 'Analysis' + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Nothing + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return modified & ppl.Modifies.CFG & ppl.Modifies.SDFG & ppl.Modifies.Nodes + + def depends_on(self): + return {} + + def apply_pass(self, sdfg: dace.SDFG, pipeline_res: Dict) -> Dict[str, Tuple[Set[str], Set[str]]]: + const_args_dict = dict() + for node, parent_graph in sdfg.all_nodes_recursive(): + if isinstance(node, dace.nodes.MapEntry) and node.map.schedule == dace.dtypes.ScheduleType.GPU_Device: + const_args_dict[node.guid] = (sdutils.get_constant_data(node, parent_state=parent_graph), + sdutils.get_constant_symbols(node, parent_state=parent_graph)) + elif isinstance(node, dace.sdfg.nodes.NestedSDFG): + const_args_dict[node.guid] = (sdutils.get_constant_data(node.sdfg, parent_state=parent_graph), + sdutils.get_constant_symbols(node.sdfg, parent_state=parent_graph)) + + return const_args_dict \ No newline at end of file From 8114c0630a093167768bf8430edf44cfba7bc040 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Thu, 3 Jul 2025 15:55:33 +0200 Subject: [PATCH 42/94] using union instead | --- dace/sdfg/utils.py | 4 ++-- dace/transformation/passes/analysis/infer_const_args.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index 0a332dce60..382c14898a 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -2163,7 +2163,7 @@ def get_used_data(scope: ControlFlowRegion | SDFGState | nd.MapEntry | nd.Nested raise Exception("Unsupported scope type for get_constant_data: {}".format(type(scope))) -def get_constant_data(scope: ControlFlowRegion | SDFGState | nd.NestedSDFG | nd.MapEntry, +def get_constant_data(scope: Union[ControlFlowRegion, SDFGState, nd.NestedSDFG, nd.MapEntry], parent_state: SDFGState = None) -> Set[str]: """ Returns a set of all constant data in the given control flow region, state, or with the map scope. @@ -2214,7 +2214,7 @@ def _incoming_memlet(state: SDFGState, node: nd.AccessNode) -> bool: raise Exception("Unsupported scope type for get_constant_data: {}".format(type(scope))) -def get_constant_symbols(scope: SDFG | ControlFlowRegion | SDFGState | nd.MapEntry | nd.NestedSDFG, +def get_constant_symbols(scope: Union[SDFG, ControlFlowRegion, SDFGState, nd.MapEntry, nd.NestedSDFG], parent_state: SDFGState = None) -> Set[str]: """ Returns a set of all constant symbols in the given control flow region, state, or with the map scope. 
diff --git a/dace/transformation/passes/analysis/infer_const_args.py b/dace/transformation/passes/analysis/infer_const_args.py index 1ba2a2e63d..3d6f35bbda 100644 --- a/dace/transformation/passes/analysis/infer_const_args.py +++ b/dace/transformation/passes/analysis/infer_const_args.py @@ -4,6 +4,7 @@ from dace import properties import dace.sdfg.utils as sdutils + @properties.make_properties @transformation.explicit_cf_compatible class StateReachability(ppl.Pass): @@ -32,4 +33,4 @@ def apply_pass(self, sdfg: dace.SDFG, pipeline_res: Dict) -> Dict[str, Tuple[Set const_args_dict[node.guid] = (sdutils.get_constant_data(node.sdfg, parent_state=parent_graph), sdutils.get_constant_symbols(node.sdfg, parent_state=parent_graph)) - return const_args_dict \ No newline at end of file + return const_args_dict From 8490a64237829e38055b9309d1a8fcf67184532e Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Thu, 3 Jul 2025 16:01:24 +0200 Subject: [PATCH 43/94] Using Union instead | --- dace/sdfg/utils.py | 8 +- tests/const_utilities_test.py | 198 ++++++++++++++++++++++++++++++++++ 2 files changed, 202 insertions(+), 4 deletions(-) create mode 100644 tests/const_utilities_test.py diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index 382c14898a..c801f7b4b5 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -2118,7 +2118,7 @@ def set_nested_sdfg_parent_references(sdfg: SDFG): set_nested_sdfg_parent_references(node.sdfg) -def get_used_data(scope: ControlFlowRegion | SDFGState | nd.MapEntry | nd.NestedSDFG) -> Set[str]: +def get_used_data(scope: Union[ControlFlowRegion, SDFGState, nd.MapEntry, nd.NestedSDFG]) -> Set[str]: """ Returns a set of all data names that are used in the given control flow region, state, map entry or nested SDFG node. Data is considered used if there is an access node @@ -2225,7 +2225,7 @@ def get_constant_symbols(scope: Union[SDFG, ControlFlowRegion, SDFGState, nd.Map :return: A set of constant symbol names. 
""" - def _get_assignments(cfg: ControlFlowRegion | SDFG) -> Set[str]: + def _get_assignments(cfg: Union[ControlFlowRegion, SDFG]) -> Set[str]: written_symbols = set() for edge in cfg.all_edges(*list(cfg.all_control_flow_blocks())): if edge.data is not None and isinstance(edge.data, dace.InterstateEdge): @@ -2236,8 +2236,8 @@ def _get_assignments(cfg: ControlFlowRegion | SDFG) -> Set[str]: symbols = scope.used_symbols(all_symbols=False) # Since no symbol can change within a state we are good to go return symbols - elif isinstance(scope, SDFG | ControlFlowRegion): - # Need to get all used symbols within the SDFG | CFG + elif isinstance(scope, Union[SDFG, ControlFlowRegion]): + # Need to get all used symbols within the SDFG or CFG used_symbols = scope.used_symbols(all_symbols=False) # Get all symbols that are written to written_symbols = _get_assignments(scope) diff --git a/tests/const_utilities_test.py b/tests/const_utilities_test.py new file mode 100644 index 0000000000..045d8113ac --- /dev/null +++ b/tests/const_utilities_test.py @@ -0,0 +1,198 @@ +import copy +import dace +import dace.sdfg.utils as sdutils +import pytest + +def _add_shared_memory(sdfg: dace.SDFG, add_src_access_node: bool = False): + for state in sdfg.all_states(): + for node in state.nodes(): + if isinstance(node, dace.sdfg.nodes.MapEntry) and node.map.schedule == dace.dtypes.ScheduleType.GPU_Device: + next_map = None + for n in state.bfs_nodes(node): + if isinstance(n, dace.sdfg.nodes.MapEntry) and n != node and n.map.schedule == dace.dtypes.ScheduleType.GPU_ThreadBlock: + next_map = n + break + elif isinstance(n, dace.nodes.MapExit): + break + if next_map is None: + raise ValueError("No next map found for the GPU_Device map entry.") + + src_name_dst_name_offset = dict() + edges_to_rm = set() + for in_edge in state.in_edges(next_map): + if in_edge.data is not None: + in_arr_name = in_edge.data.data + copy_shape = [(0, (((e) - b)//s), 1) for b, e, s in in_edge.data.subset] + copied_shape = [(((e + 1) - b)//s) for b, e, s in in_edge.data.subset] + copy_offset = [b for b, _, _ in in_edge.data.subset] + shared_mem_name = "shr_" + in_arr_name + in_arr = sdfg.arrays[in_arr_name] + if shared_mem_name not in sdfg.arrays: + sdfg.add_array(shared_mem_name, copied_shape, in_arr.dtype, storage=dace.dtypes.StorageType.GPU_Shared, transient=True) + + if add_src_access_node is True: + a1 = state.add_access(in_arr_name) + a2 = state.add_access(shared_mem_name) + e1 = state.add_edge(a1, None, a2, None, dace.Memlet( + data=in_arr_name, + subset=in_edge.data.subset, + other_subset=dace.subsets.Range(copy_shape), + wcr=None, + )) + e2 = state.add_edge(a2, None, next_map, in_edge.dst_conn, + dace.Memlet.from_array(shared_mem_name, + sdfg.arrays[shared_mem_name])) + e3 = state.add_edge(in_edge.src, in_edge.src_conn, a1, None, + copy.deepcopy(in_edge.data)) + edges_to_rm.add(in_edge) + src_name_dst_name_offset[in_arr_name] = (shared_mem_name, copy_offset) + else: + a2 = state.add_access(shared_mem_name) + e1 = state.add_edge(in_edge.src, in_edge.src_conn, a2, None, dace.Memlet( + data=in_arr_name, + subset=in_edge.data.subset, + other_subset=dace.subsets.Range(copy_shape), + wcr=None, + )) + e2 = state.add_edge(a2, None, next_map, in_edge.dst_conn, + dace.Memlet.from_array(shared_mem_name, + sdfg.arrays[shared_mem_name])) + edges_to_rm.add(in_edge) + src_name_dst_name_offset[in_arr_name] = (shared_mem_name, copy_offset) + + nodes = state.all_nodes_between(next_map, state.exit_node(next_map)) + for edge in state.all_edges(*nodes): + if 
edge.data is not None and edge.data.data in src_name_dst_name_offset: + dst_name, offset = src_name_dst_name_offset[edge.data.data] + edge.data.data = dst_name + old_subset = [(b,e,s) for b, e, s in edge.data.subset] + new_subset = [(b - offset[i], e - offset[i], s) for i, (b, e, s) in enumerate(old_subset)] + edge.data.subset = dace.subsets.Range(new_subset) + + for edge in edges_to_rm: + state.remove_edge(edge) + +def _check_map_entries(state, schedule, expected_data, expected_symbols): + map_entries = [n for n in state.nodes() if isinstance(n, dace.sdfg.nodes.MapEntry) and n.map.schedule == schedule] + for me in map_entries: + const_data = sdutils.get_constant_data(me, state) + const_symbols = sdutils.get_constant_symbols(me, state) + assert expected_data == const_data + assert expected_symbols == const_symbols + +def _gen_sdfg_with_symbol_use_in_nsdfg(write_only:bool=True) -> dace.SDFG: + sdfg = dace.SDFG(name="reassign_syms_in_nested_sdfg") + sdfg.add_array(name="A", shape=(1,), dtype=dace.int64, transient=False) + sdfg.add_symbol(name="A_sym", stype=dace.int64) + + s0 = sdfg.add_state(label="state0", is_start_block=True) + s1 = sdfg.add_state(label="state1") + + sdfg.add_edge(s0, s1, dace.InterstateEdge(assignments={"A_sym": "A[0]"})) + + inner_sdfg = dace.SDFG(name="inner_sdfg") + A_sym = dace.symbolic.symbol("A_sym", dace.int64) + nsdfg = s1.add_nested_sdfg( + sdfg=inner_sdfg, + parent=s1.sdfg, + inputs={}, + outputs={}, + symbol_mapping={"A_sym": A_sym}, + ) + assert "A_sym" in nsdfg.sdfg.symbols + assert "A_sym" in nsdfg.sdfg.free_symbols + if write_only: + nsdfg.sdfg.add_symbol(name="_inner_sym", stype=dace.int64) + + s1_0 = inner_sdfg.add_state(label="i_state0", is_start_block=True) + s1_1 = inner_sdfg.add_state(label="i_state1") + s1_2 = inner_sdfg.add_state(label="i_state2") + + if write_only: + inner_sdfg.add_edge(s1_0, s1_1, dace.InterstateEdge(assignments={"_inner_sym": "A_sym + 1"})) + inner_sdfg.add_edge(s1_1, s1_2, dace.InterstateEdge(assignments={"A_sym": "_inner_sym"})) + else: + inner_sdfg.add_edge(s1_0, s1_1, dace.InterstateEdge(assignments={"A_sym": "5"})) + inner_sdfg.add_edge(s1_1, s1_2, dace.InterstateEdge(assignments={})) + + s2: dace.SDFGState = sdfg.add_state(label="state2") + sdfg.add_edge(s1, s2, dace.InterstateEdge()) + + t0 = s2.add_tasklet( + name="tasklet", + inputs={"_in_A"}, + outputs={}, + code="printf(\"%ld\\n\", _in_A); printf(\"%ld\\n\", A_sym);", + language=dace.Language.CPP, + side_effects=True, + code_global="#include \n", + ) + an0 = s2.add_access(array_or_stream_name="A") + s2.add_edge( + an0, None, t0, "_in_A", dace.Memlet(expr="A[0]") + ) + sdfg.save(f"test_const_{write_only}.sdfg") + return sdfg, s1, nsdfg + +def test_const_utilities_case_non_const_input_not_present_in_output(): + """Standalone test function that can be run without pytest.""" + + # Create kernel + N = dace.symbol("N", dtype=dace.int64) + K = 5 + + @dace.program + def kernel( + A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, + B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, + C: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, + ): + for i in dace.map[0:N:256*K] @ dace.dtypes.ScheduleType.GPU_Device: + for k in dace.map[0:K] @ dace.dtypes.ScheduleType.Sequential: + for j in dace.map[0:256] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: + C[i + j + k * 256] = A[i + j + k * 256] + B[i + j + k * 256] + + # Create original SDFG + original_sdfg = kernel.to_sdfg(use_cache=False, simplify=False) + original_sdfg.simplify() + + # Create transformed SDFG + 
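+    # _add_shared_memory stages the inputs of the thread-block map (A and B) through
+    # transient GPU_Shared arrays (shr_A, shr_B), so the inner map afterwards only
+    # accesses the shared copies instead of the global arrays.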
transformed_sdfg = copy.deepcopy(original_sdfg) + _add_shared_memory(transformed_sdfg, add_src_access_node=True) + transformed_sdfg.validate() + + # Test cases + original_state = next(iter(original_sdfg.all_states())) + transformed_state = next(iter(transformed_sdfg.all_states())) + + # Original state tests + _check_map_entries(original_state, dace.dtypes.ScheduleType.GPU_Device, {"A", "B"}, {"i"}) + _check_map_entries(original_state, dace.dtypes.ScheduleType.Sequential, {"A", "B"}, {"i", "k"}) + _check_map_entries(original_state, dace.dtypes.ScheduleType.GPU_ThreadBlock, {"A", "B"}, {"i", "j", "k"}) + + # Transformed state tests + _check_map_entries(transformed_state, dace.dtypes.ScheduleType.GPU_Device, set(), {"i"}) + _check_map_entries(transformed_state, dace.dtypes.ScheduleType.Sequential, set(), {"i", "k"}) + _check_map_entries(transformed_state, dace.dtypes.ScheduleType.GPU_ThreadBlock, {"shr_A", "shr_B"}, {"i", "j", "k"}) + + +def test_const_utilities_case_write_only_free_symbol_in_nsdfg(): + sdfg1, s1, nsdfg1 = _gen_sdfg_with_symbol_use_in_nsdfg(write_only=True) + sdfg1.validate() + + const_data = sdutils.get_constant_data(nsdfg1) + const_symbols = sdutils.get_constant_symbols(nsdfg1) + assert set() == const_data + assert set() == const_symbols + + sdfg2, s2, nsdfg2 = _gen_sdfg_with_symbol_use_in_nsdfg(write_only=False) + sdfg2.validate() + const_data = sdutils.get_constant_data(nsdfg2) + const_symbols = sdutils.get_constant_symbols(nsdfg2) + assert set() == const_data + assert set() == const_symbols + + +if __name__ == "__main__": + test_const_utilities_case_non_const_input_not_present_in_output() + test_const_utilities_case_write_only_free_symbol_in_nsdfg() \ No newline at end of file From c932fd24106a20fe7dfa22b1a79dcb88e670df0d Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Thu, 3 Jul 2025 16:16:34 +0200 Subject: [PATCH 44/94] refactor --- tests/const_utilities_test.py | 68 +++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/tests/const_utilities_test.py b/tests/const_utilities_test.py index 045d8113ac..92f2b667a5 100644 --- a/tests/const_utilities_test.py +++ b/tests/const_utilities_test.py @@ -3,13 +3,15 @@ import dace.sdfg.utils as sdutils import pytest + def _add_shared_memory(sdfg: dace.SDFG, add_src_access_node: bool = False): for state in sdfg.all_states(): for node in state.nodes(): if isinstance(node, dace.sdfg.nodes.MapEntry) and node.map.schedule == dace.dtypes.ScheduleType.GPU_Device: next_map = None for n in state.bfs_nodes(node): - if isinstance(n, dace.sdfg.nodes.MapEntry) and n != node and n.map.schedule == dace.dtypes.ScheduleType.GPU_ThreadBlock: + if isinstance(n, dace.sdfg.nodes.MapEntry + ) and n != node and n.map.schedule == dace.dtypes.ScheduleType.GPU_ThreadBlock: next_map = n break elif isinstance(n, dace.nodes.MapExit): @@ -22,41 +24,46 @@ def _add_shared_memory(sdfg: dace.SDFG, add_src_access_node: bool = False): for in_edge in state.in_edges(next_map): if in_edge.data is not None: in_arr_name = in_edge.data.data - copy_shape = [(0, (((e) - b)//s), 1) for b, e, s in in_edge.data.subset] - copied_shape = [(((e + 1) - b)//s) for b, e, s in in_edge.data.subset] + copy_shape = [(0, (((e) - b) // s), 1) for b, e, s in in_edge.data.subset] + copied_shape = [(((e + 1) - b) // s) for b, e, s in in_edge.data.subset] copy_offset = [b for b, _, _ in in_edge.data.subset] shared_mem_name = "shr_" + in_arr_name in_arr = sdfg.arrays[in_arr_name] if shared_mem_name not in sdfg.arrays: - 
sdfg.add_array(shared_mem_name, copied_shape, in_arr.dtype, storage=dace.dtypes.StorageType.GPU_Shared, transient=True) + sdfg.add_array(shared_mem_name, + copied_shape, + in_arr.dtype, + storage=dace.dtypes.StorageType.GPU_Shared, + transient=True) if add_src_access_node is True: a1 = state.add_access(in_arr_name) a2 = state.add_access(shared_mem_name) - e1 = state.add_edge(a1, None, a2, None, dace.Memlet( - data=in_arr_name, - subset=in_edge.data.subset, - other_subset=dace.subsets.Range(copy_shape), - wcr=None, - )) + e1 = state.add_edge( + a1, None, a2, None, + dace.Memlet( + data=in_arr_name, + subset=in_edge.data.subset, + other_subset=dace.subsets.Range(copy_shape), + wcr=None, + )) e2 = state.add_edge(a2, None, next_map, in_edge.dst_conn, - dace.Memlet.from_array(shared_mem_name, - sdfg.arrays[shared_mem_name])) - e3 = state.add_edge(in_edge.src, in_edge.src_conn, a1, None, - copy.deepcopy(in_edge.data)) + dace.Memlet.from_array(shared_mem_name, sdfg.arrays[shared_mem_name])) + e3 = state.add_edge(in_edge.src, in_edge.src_conn, a1, None, copy.deepcopy(in_edge.data)) edges_to_rm.add(in_edge) src_name_dst_name_offset[in_arr_name] = (shared_mem_name, copy_offset) else: a2 = state.add_access(shared_mem_name) - e1 = state.add_edge(in_edge.src, in_edge.src_conn, a2, None, dace.Memlet( - data=in_arr_name, - subset=in_edge.data.subset, - other_subset=dace.subsets.Range(copy_shape), - wcr=None, - )) + e1 = state.add_edge( + in_edge.src, in_edge.src_conn, a2, None, + dace.Memlet( + data=in_arr_name, + subset=in_edge.data.subset, + other_subset=dace.subsets.Range(copy_shape), + wcr=None, + )) e2 = state.add_edge(a2, None, next_map, in_edge.dst_conn, - dace.Memlet.from_array(shared_mem_name, - sdfg.arrays[shared_mem_name])) + dace.Memlet.from_array(shared_mem_name, sdfg.arrays[shared_mem_name])) edges_to_rm.add(in_edge) src_name_dst_name_offset[in_arr_name] = (shared_mem_name, copy_offset) @@ -65,13 +72,14 @@ def _add_shared_memory(sdfg: dace.SDFG, add_src_access_node: bool = False): if edge.data is not None and edge.data.data in src_name_dst_name_offset: dst_name, offset = src_name_dst_name_offset[edge.data.data] edge.data.data = dst_name - old_subset = [(b,e,s) for b, e, s in edge.data.subset] + old_subset = [(b, e, s) for b, e, s in edge.data.subset] new_subset = [(b - offset[i], e - offset[i], s) for i, (b, e, s) in enumerate(old_subset)] edge.data.subset = dace.subsets.Range(new_subset) for edge in edges_to_rm: state.remove_edge(edge) + def _check_map_entries(state, schedule, expected_data, expected_symbols): map_entries = [n for n in state.nodes() if isinstance(n, dace.sdfg.nodes.MapEntry) and n.map.schedule == schedule] for me in map_entries: @@ -80,9 +88,10 @@ def _check_map_entries(state, schedule, expected_data, expected_symbols): assert expected_data == const_data assert expected_symbols == const_symbols -def _gen_sdfg_with_symbol_use_in_nsdfg(write_only:bool=True) -> dace.SDFG: + +def _gen_sdfg_with_symbol_use_in_nsdfg(write_only: bool = True) -> dace.SDFG: sdfg = dace.SDFG(name="reassign_syms_in_nested_sdfg") - sdfg.add_array(name="A", shape=(1,), dtype=dace.int64, transient=False) + sdfg.add_array(name="A", shape=(1, ), dtype=dace.int64, transient=False) sdfg.add_symbol(name="A_sym", stype=dace.int64) s0 = sdfg.add_state(label="state0", is_start_block=True) @@ -128,12 +137,11 @@ def _gen_sdfg_with_symbol_use_in_nsdfg(write_only:bool=True) -> dace.SDFG: code_global="#include \n", ) an0 = s2.add_access(array_or_stream_name="A") - s2.add_edge( - an0, None, t0, "_in_A", 
dace.Memlet(expr="A[0]") - ) + s2.add_edge(an0, None, t0, "_in_A", dace.Memlet(expr="A[0]")) sdfg.save(f"test_const_{write_only}.sdfg") return sdfg, s1, nsdfg + def test_const_utilities_case_non_const_input_not_present_in_output(): """Standalone test function that can be run without pytest.""" @@ -147,7 +155,7 @@ def kernel( B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, C: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, ): - for i in dace.map[0:N:256*K] @ dace.dtypes.ScheduleType.GPU_Device: + for i in dace.map[0:N:256 * K] @ dace.dtypes.ScheduleType.GPU_Device: for k in dace.map[0:K] @ dace.dtypes.ScheduleType.Sequential: for j in dace.map[0:256] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: C[i + j + k * 256] = A[i + j + k * 256] + B[i + j + k * 256] @@ -195,4 +203,4 @@ def test_const_utilities_case_write_only_free_symbol_in_nsdfg(): if __name__ == "__main__": test_const_utilities_case_non_const_input_not_present_in_output() - test_const_utilities_case_write_only_free_symbol_in_nsdfg() \ No newline at end of file + test_const_utilities_case_write_only_free_symbol_in_nsdfg() From a84021e6d67b7a770654211e61ad0060419690b1 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Thu, 3 Jul 2025 17:05:02 +0200 Subject: [PATCH 45/94] Fix name clash --- dace/sdfg/nodes.py | 3 +-- dace/sdfg/state.py | 3 +-- dace/sdfg/utils.py | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index e290d1b368..09f68c074b 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -877,7 +877,7 @@ def new_symbols(self, sdfg, state, symbols) -> Dict[str, dtypes.typeclass]: return result - def used_symbols(self, parent_state: 'dace.SDFGState', all_symbols: bool = False) -> Set[str]: + def used_symbols_within_scope(self, parent_state: 'dace.SDFGState', all_symbols: bool = False) -> Set[str]: """ Returns a set of symbol names that are used withn the Map scope created by this MapEntry @@ -885,7 +885,6 @@ def used_symbols(self, parent_state: 'dace.SDFGState', all_symbols: bool = False """ parent_sdfg: dace.SDFG = parent_state.sdfg - all_symbols = set() new_symbols = set() free_symbols = set() diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index 65681b64f1..5cef723508 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -692,8 +692,7 @@ def used_symbols(self, continue if hasattr(n, 'used_symbols'): - if not isinstance(n, nd.MapEntry): - freesyms |= n.used_symbols(all_symbols=all_symbols) + freesyms |= n.used_symbols(all_symbols) else: freesyms |= n.free_symbols diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index c801f7b4b5..8286c00de6 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -2248,7 +2248,7 @@ def _get_assignments(cfg: Union[ControlFlowRegion, SDFG]) -> Set[str]: written_symbols = _get_assignments(scope.sdfg) return used_symbols - written_symbols elif isinstance(scope, nd.MapEntry): - used_symbols = scope.used_symbols(parent_state=parent_state) + used_symbols = scope.used_symbols_within_scope(parent_state=parent_state) return used_symbols else: raise Exception("Unsupported scope type for get_constant_data: {}".format(type(scope))) From 8a5bb99dd6feb6ab547080f8b7857b116f90223b Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Fri, 4 Jul 2025 11:50:17 +0200 Subject: [PATCH 46/94] Add improved validation test for the interstate_edge_utils --- tests/interstate_edge_utils_test.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/interstate_edge_utils_test.py 
b/tests/interstate_edge_utils_test.py index 6a61b90a5a..0ba83d3fa8 100644 --- a/tests/interstate_edge_utils_test.py +++ b/tests/interstate_edge_utils_test.py @@ -1,6 +1,6 @@ import dace import typing - +import pytest def _get_sdfg() -> typing.Tuple[dace.SDFG, dace.InterstateEdge]: sdfg = dace.SDFG("interstate_util_test") @@ -31,7 +31,6 @@ def _get_sdfg() -> typing.Tuple[dace.SDFG, dace.InterstateEdge]: sym3_name: f"{array1_name}[1]", } e = sdfg.add_edge(state1, state2, dace.InterstateEdge(assignments=interstate_assignments)) - sdfg.validate() return sdfg, e @@ -77,6 +76,13 @@ def test_all_used_arrays(): e: dace.InterstateEdge = sdfg_and_edge[1] assert e.data.used_arrays(arrays=sdfg.arrays, union_lhs_symbols=True) == {"scalar2", "scalar1", "array1"} +def test_validity(): + # SDFG can't write to scalars on interstate edges catch for validity + with pytest.raises(dace.sdfg.validation.InvalidSDFGInterstateEdgeError, match="Assignment to a scalar or an array detected in an interstate edge"): + sdfg_and_edge: typing.Tuple[dace.SDFG, dace.InterstateEdge] = _get_sdfg() + sdfg: dace.SDFG = sdfg_and_edge[0] + sdfg.validate() + if __name__ == "__main__": test_read_symbols() From 37b604de1b7a63591ea0a8515412dae830724dca Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Fri, 4 Jul 2025 11:57:29 +0200 Subject: [PATCH 47/94] Run precommit hook --- tests/interstate_edge_utils_test.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/interstate_edge_utils_test.py b/tests/interstate_edge_utils_test.py index 0ba83d3fa8..77dcf99bf2 100644 --- a/tests/interstate_edge_utils_test.py +++ b/tests/interstate_edge_utils_test.py @@ -2,6 +2,7 @@ import typing import pytest + def _get_sdfg() -> typing.Tuple[dace.SDFG, dace.InterstateEdge]: sdfg = dace.SDFG("interstate_util_test") @@ -76,9 +77,11 @@ def test_all_used_arrays(): e: dace.InterstateEdge = sdfg_and_edge[1] assert e.data.used_arrays(arrays=sdfg.arrays, union_lhs_symbols=True) == {"scalar2", "scalar1", "array1"} + def test_validity(): # SDFG can't write to scalars on interstate edges catch for validity - with pytest.raises(dace.sdfg.validation.InvalidSDFGInterstateEdgeError, match="Assignment to a scalar or an array detected in an interstate edge"): + with pytest.raises(dace.sdfg.validation.InvalidSDFGInterstateEdgeError, + match="Assignment to a scalar or an array detected in an interstate edge"): sdfg_and_edge: typing.Tuple[dace.SDFG, dace.InterstateEdge] = _get_sdfg() sdfg: dace.SDFG = sdfg_and_edge[0] sdfg.validate() From 11129138d7aebe113d2c841765e390a7994fd6f5 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Fri, 4 Jul 2025 13:13:54 +0200 Subject: [PATCH 48/94] Typefix --- dace/sdfg/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index 8286c00de6..dd3d4a7a9b 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -2179,7 +2179,7 @@ def get_constant_data(scope: Union[ControlFlowRegion, SDFGState, nd.NestedSDFG, def _incoming_memlet(state: SDFGState, node: nd.AccessNode) -> bool: return (state.in_degree(node) > 0 and any([e.data is not None for e in state.in_edges(node)])) - if isinstance(scope, SDFGState) or isinstance(scope, ControlFlowRegion): + if isinstance(scope, (SDFGState, ControlFlowRegion)): read_data, write_data = scope.read_and_write_sets() return read_data - write_data elif isinstance(scope, nd.NestedSDFG): @@ -2236,7 +2236,7 @@ def _get_assignments(cfg: Union[ControlFlowRegion, SDFG]) -> Set[str]: symbols = 
scope.used_symbols(all_symbols=False) # Since no symbol can change within a state we are good to go return symbols - elif isinstance(scope, Union[SDFG, ControlFlowRegion]): + elif isinstance(scope, (SDFG, ControlFlowRegion)): # Need to get all used symbols within the SDFG or CFG used_symbols = scope.used_symbols(all_symbols=False) # Get all symbols that are written to From 498e7839d569396f32c0331d01cd3cf2028c11d8 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Fri, 4 Jul 2025 15:49:31 +0200 Subject: [PATCH 49/94] Rename --- dace/transformation/passes/analysis/infer_const_args.py | 4 ++-- tests/interstate_edge_utils_test.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dace/transformation/passes/analysis/infer_const_args.py b/dace/transformation/passes/analysis/infer_const_args.py index 3d6f35bbda..99bd2148c9 100644 --- a/dace/transformation/passes/analysis/infer_const_args.py +++ b/dace/transformation/passes/analysis/infer_const_args.py @@ -7,9 +7,9 @@ @properties.make_properties @transformation.explicit_cf_compatible -class StateReachability(ppl.Pass): +class InfetConstantArguments(ppl.Pass): """ - Evaluates state reachability (which other states can be executed after each state). + Evaluates which symbols and data are const within a scope. """ CATEGORY: str = 'Analysis' diff --git a/tests/interstate_edge_utils_test.py b/tests/interstate_edge_utils_test.py index 77dcf99bf2..103c2b1169 100644 --- a/tests/interstate_edge_utils_test.py +++ b/tests/interstate_edge_utils_test.py @@ -78,7 +78,7 @@ def test_all_used_arrays(): assert e.data.used_arrays(arrays=sdfg.arrays, union_lhs_symbols=True) == {"scalar2", "scalar1", "array1"} -def test_validity(): +def test_writing_to_scalar_on_iedge_is_invalid(): # SDFG can't write to scalars on interstate edges catch for validity with pytest.raises(dace.sdfg.validation.InvalidSDFGInterstateEdgeError, match="Assignment to a scalar or an array detected in an interstate edge"): @@ -94,4 +94,4 @@ def test_validity(): test_all_read_sdfg_symbols() test_all_read_arrays() test_all_used_arrays() - print("All tests passed!") + test_writing_to_scalar_on_iedge_is_invalid() From 3355569476afc712314f0beb38085f2ee279d88d Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 7 Jul 2025 16:24:35 +0200 Subject: [PATCH 50/94] allows to switch bettwen the codegens without local definition of ptr. 
This also fails old tests since they would otherwise call ptr but CUDACodeGen has not been registered when the experimental codegen is chosen in the configs --- dace/codegen/targets/cpp.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index be1136f3b6..20e1e7d399 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -253,11 +253,22 @@ def ptr(name: str, desc: data.Data, sdfg: SDFG = None, framecode=None) -> str: # Special case: If memory is persistent and defined in this SDFG, add state # struct to name if (desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External)): - from dace.codegen.targets.cuda import CUDACodeGen # Avoid import loop + + # Avoid import loop + from dace.codegen.targets.cuda import CUDACodeGen + from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen + + # Check whether we are in kernel/ device code of GPU backend + cuda_impl = Config.get('compiler', 'cuda', 'implementation') + if cuda_impl == "legacy": + in_device_code = CUDACodeGen._in_device_code + elif cuda_impl == "experimental": + in_device_code = ExperimentalCUDACodeGen._in_kernel_code + if desc.storage == dtypes.StorageType.CPU_ThreadLocal: # Use unambiguous name for thread-local arrays return f'__{sdfg.cfg_id}_{name}' - elif not CUDACodeGen._in_device_code: # GPU kernels cannot access state + elif not in_device_code: # GPU kernels cannot access state return f'__state->__{sdfg.cfg_id}_{name}' elif (sdfg, name) in framecode.where_allocated and framecode.where_allocated[(sdfg, name)] is not sdfg: return f'__{sdfg.cfg_id}_{name}' From 9f77a03942c609daaba96c59eae065a875775b65 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 7 Jul 2025 16:36:19 +0200 Subject: [PATCH 51/94] Missing Explicit ThreadBlock Maps will now be handled by a pass that add. Plus A Pass infers now Grid and Block dimensions. This is closely implemented to old implementations to ensure backwards compatibility as much as possible. Also, ExperimentalCUDACodeGen and its helper modules got extended and simplified thanks to these two passes. 
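A minimal sketch of how the new pass pipeline is driven from user code (the kernel, its name and the symbol N are illustrative; the import path and the apply_transformations_once_everywhere call are the ones used by the new preprocess step in this patch):

import dace
from dace.transformation.dataflow.add_threadblock_map import AddThreadBlockMap

N = dace.symbol('N')

@dace.program
def vec_update(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global,
               B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global):
    for i in dace.map[0:N] @ dace.dtypes.ScheduleType.GPU_Device:
        B[i] = A[i] + 1.0

sdfg = vec_update.to_sdfg()

# The GPU_Device map above has no explicit GPU_ThreadBlock map, so the
# transformation nests it inside one (falling back to the
# compiler.cuda.default_block_size configuration entry with a warning).
sdfg.apply_transformations_once_everywhere(AddThreadBlockMap)

# During code generation, ExperimentalCUDACodeGen.preprocess performs the same
# insertion and then runs InferGPUGridAndBlockSize to derive grid/block sizes.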
--- dace/codegen/targets/experimental_cuda.py | 454 +++--------------- .../experimental_cuda_helpers/gpu_utils.py | 69 ++- .../scope_strategies.py | 48 +- .../dataflow/add_threadblock_map.py | 272 ++++++----- .../analysis/infer_gpu_grid_and_block_size.py | 172 +++++++ 5 files changed, 472 insertions(+), 543 deletions(-) create mode 100644 dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index d312fcf10d..80e36bdbac 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -1,6 +1,6 @@ # Standard library imports import warnings -from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union # Third-party imports import networkx as nx @@ -32,9 +32,12 @@ from dace.codegen.targets.target import IllegalCopy, TargetCodeGenerator, make_absolute # DaCe transformation imports +from dace.transformation import helpers from dace.transformation.passes import analysis as ap from dace.transformation.passes.gpustream_scheduling import NaiveGPUStreamScheduler from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync +from dace.transformation.dataflow.add_threadblock_map import AddThreadBlockMap +from dace.transformation.passes.analysis.infer_gpu_grid_and_block_size import InferGPUGridAndBlockSize # Experimental CUDA helper imports from dace.codegen.targets.experimental_cuda_helpers.gpu_stream_manager import GPUStreamManager @@ -46,9 +49,6 @@ from dace.codegen.targets.cpu import CPUCodeGen -# TODO's easy: -# 3. Emit sync -> yea not easy - # add symbolic_to_cpp ! # TODO's harder: @@ -138,15 +138,35 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._current_kernel_spec: Optional[KernelSpec] = None self._gpu_stream_manager: Optional[GPUStreamManager] = None + self._kernel_dimensions_map: Set[nodes.MapEntry] = set() def preprocess(self, sdfg: SDFG) -> None: """ Preprocess the SDFG to prepare it for GPU code generation. This includes: + - Adding explicit ThreadBlock Maps where missing and infer Grid and Block dimensions for + every Kernel in the SDFG - Handling GPU<->GPU strided copies. - Assigning backend GPU streams (e.g., CUDA streams) and creating the GPUStreamManager. - Handling memory pool management """ + #----------------- Add ThreadBlock Maps & Infer Kernel Grid & Block Sizes -------------------- + + # new_nodes - old_nodes gives us all Kernel Entry nodes that were created during the insertion + # of ThreadBlock maps. Note: the original Kernel Entry was transformed into a ThreadBlock map, + # and a new GPU_Device (i.e., Kernel) map was inserted on top of it. 
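+        # Illustrative example: for a single GPU_Device map without an explicit
+        # GPU_ThreadBlock map, AddThreadBlockMap turns that map into a GPU_ThreadBlock
+        # map and nests it under a newly created GPU_Device map; it is this new entry
+        # that the set difference below picks up as a kernel with an added thread-block map.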
+ old_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) + + # Insert default explicit GPU_ThreadBlock maps where they are missing + sdfg.apply_transformations_once_everywhere(AddThreadBlockMap) + + new_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) - old_nodes + kernels_with_added_tb_maps = {n for n in new_nodes if isinstance(n, nodes.MapEntry) and n.schedule == dtypes.ScheduleType.GPU_Device} + + # Infer GPU Grid and Block dimensions + self._kernel_dimensions_map = InferGPUGridAndBlockSize().apply_pass(sdfg, kernels_with_added_tb_maps) + + #------------------------- Hanlde GPU<->GPU strided copies -------------------------- # Find GPU<->GPU strided copies that cannot be represented by a single copy command @@ -188,10 +208,8 @@ def preprocess(self, sdfg: SDFG) -> None: except ValueError: # If transformation doesn't match, continue normally continue - #------------------------- GPU Stream related Logic -------------------------- - # Register GPU context in state struct self._frame.statestruct.append('dace::cuda::Context *gpu_context;') @@ -411,7 +429,7 @@ def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope kernel_spec: KernelSpec = self._current_kernel_spec kernel_name = kernel_spec.kernel_name - kernel_wrapper_args = kernel_spec.kernel_wrapper_args + kernel_wrapper_args_as_input = kernel_spec.kernel_wrapper_args_as_input kernel_wrapper_args_typed = kernel_spec.kernel_wrapper_args_typed # Declaration of the function which launches the kernel (C++ code) @@ -420,7 +438,7 @@ def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope # Calling the function which launches the kernel (C++ code) callsite_stream.write( '__dace_runkernel_%s(%s);\n' % - (kernel_name, ', '.join(kernel_wrapper_args)), cfg, state_id, scope_entry) + (kernel_name, ', '.join(kernel_wrapper_args_as_input)), cfg, state_id, scope_entry) def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -450,12 +468,12 @@ def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: """ DACE_EXPORTED void __dace_runkernel_{fname}({fargs}); void __dace_runkernel_{fname}({fargs}) - {{ """.format(fname=kernel_name, fargs=', '.join(kernel_launch_args_typed)), cfg, state_id, scope_entry ) - + # Open bracket + self._localcode.write('{', cfg, state_id, scope_entry) # ----------------- Guard Checks handling ----------------------- @@ -504,7 +522,9 @@ def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: self._localcode.write(f'DACE_KERNEL_LAUNCH_CHECK(__err, "{kernel_name}", {gdims}, {bdims});') emit_sync_debug_checks(self.backend, self._localcode) - self._localcode.write('}') + + # Close bracket + self._localcode.write('}', cfg, state_id, scope_entry) ########################################################################### # Generation of Memory Copy Logic @@ -1140,51 +1160,8 @@ def process_out_memlets(self, *args, **kwargs): self._cpu_codegen.process_out_memlets(*args, codegen=self, **kwargs) - - - ######################################################################### -# helper classes and functions - -# NOTE: I had to redefine this function locally to not modify other files -# and ensure backwards compatibility with the old cudacodegen -def ptr(name: str, desc: dace.data.Data, sdfg: SDFG = None, framecode=None) -> str: - """ - Returns a string that points to the data based on its name and 
descriptor. - - This function should be in cpp.py, but for ExperimentalCUDACodeGen I defined - it here to not modify it there, s.t. we have backwards compatibility. - - :param name: Data name. - :param desc: Data descriptor. - :return: C-compatible name that can be used to access the data. - """ - from dace.codegen.targets.framecode import DaCeCodeGenerator # Avoid import loop - framecode: DaCeCodeGenerator = framecode - - if '.' in name: - root = name.split('.')[0] - if root in sdfg.arrays and isinstance(sdfg.arrays[root], dace.data.Structure): - name = name.replace('.', '->') - - # Special case: If memory is persistent and defined in this SDFG, add state - # struct to name - if (desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External)): - - if desc.storage == dtypes.StorageType.CPU_ThreadLocal: # Use unambiguous name for thread-local arrays - return f'__{sdfg.cfg_id}_{name}' - elif not ExperimentalCUDACodeGen._in_kernel_code: # GPU kernels cannot access state - return f'__state->__{sdfg.cfg_id}_{name}' - elif (sdfg, name) in framecode.where_allocated and framecode.where_allocated[(sdfg, name)] is not sdfg: - return f'__{sdfg.cfg_id}_{name}' - elif (desc.transient and sdfg is not None and framecode is not None and (sdfg, name) in framecode.where_allocated - and framecode.where_allocated[(sdfg, name)] is not sdfg): - # Array allocated for another SDFG, use unambiguous name - return f'__{sdfg.cfg_id}_{name}' - - return name - - +# helper class # This one is closely linked to the ExperimentalCUDACodeGen. In fact, # it only exists to not have to much attributes and methods in the ExperimentalCUDACodeGen # and to group Kernel specific methods & information. Thus, KernelSpec should remain in this file @@ -1197,65 +1174,50 @@ class KernelSpec: def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int): - + # Get kernel entry/exit nodes and current state kernel_entry_node = dfg_scope.source_nodes()[0] kernel_exit_node = dfg_scope.sink_nodes()[0] state: SDFGState = cfg.state(state_id) - self._kernel_entry_node: nodes.MapEntry = kernel_entry_node - self._kernel_map: nodes.Map = kernel_entry_node.map + self._kernel_entry_node: nodes.MapEntry = kernel_entry_node # Kernel name - self._kernel_name: str = '%s_%d_%d_%d' % (kernel_entry_node.map.label, cfg.cfg_id, state.block_id, state.node_id(kernel_entry_node)) + self._kernel_name: str = f'{kernel_entry_node.map.label}_{cfg.cfg_id}_{state.block_id}_{state.node_id(kernel_entry_node)}' # Kernel arguments - arglist = {} - for state, node, defined_syms in sdutil.traverse_sdfg_with_defined_symbols(sdfg, recursive=True): + arglist: Dict[str, Any] = {} + for state_, node, defined_syms in sdutil.traverse_sdfg_with_defined_symbols(sdfg, recursive=True): if node is kernel_entry_node: - shared_transients = state.parent.shared_transients() - arglist = state.scope_subgraph(node).arglist(defined_syms, shared_transients) + shared_transients = state_.parent.shared_transients() + arglist = state_.scope_subgraph(node).arglist(defined_syms, shared_transients) break - self._args: Dict = arglist - """ - # const args - input_params = set(e.data.data for e in state.in_edges(kernel_entry_node)) - output_params = set(e.data.data for e in state.out_edges(kernel_exit_node)) - toplevel_params = set(node.data for node in dfg_scope.nodes() - if isinstance(node, nodes.AccessNode) and sdfg.arrays[node.data].toplevel) - dynamic_inputs = set(e.data.data 
for e in dace.sdfg.dynamic_map_inputs(state, kernel_entry_node)) - - const_args = input_params - (output_params | toplevel_params | dynamic_inputs) - self._args_typed: list[str] = [('const ' if aname in const_args else '') + adata.as_arg(name=aname) for aname, adata in self._args.items()] - """ + self._args: Dict[str, Any] = arglist - # args typed correctly and as input + # Typed arguments and argument access as input self._args_typed: list[str] = [adata.as_arg(name=aname) for aname, adata in self._args.items()] self._args_as_input: list[str] = [ptr(aname, adata, sdfg, cudaCodeGen._frame) for aname, adata in self._args.items()] # Used for the kernel wrapper function, be careful: a change in the name __state will probably lead to compilation errors state_param: list[str] = [f'{mangle_dace_state_struct_name(cudaCodeGen._global_sdfg)} *__state'] - - self._kernel_wrapper_args: list[str] = ['__state'] + self._args_as_input + self._kernel_wrapper_args_as_input: list[str] = ['__state'] + self._args_as_input self._kernel_wrapper_args_typed: list[str] = state_param + self._args_typed - # Kernel dimensions - self._grid_dims, self._block_dims, self._has_tbmap = self._get_kernel_dimensions(dfg_scope) + # The kernel's grid and block dimensions + self._grid_dims, self._block_dims = cudaCodeGen._kernel_dimensions_map[kernel_entry_node] - # C type (as string) of thread, block and warp indices + # C type of block, thread, and warp indices (as a string) self._gpu_index_ctype: str = self.get_gpu_index_ctype() - # Set warp size of the kernel + # Warp size (backend-dependent) if cudaCodeGen.backend not in ['cuda', 'hip']: - raise ValueError( - f"Unsupported backend '{cudaCodeGen.backend}' in ExperimentalCUDACodeGen. " - "Only 'cuda' and 'hip' are supported." - ) + raise ValueError(f"Unsupported backend '{cudaCodeGen.backend}' in ExperimentalCUDACodeGen. " + "Only 'cuda' and 'hip' are supported.") warp_size_key = 'cuda_warp_size' if cudaCodeGen.backend == 'cuda' else 'hip_warp_size' self._warpSize = Config.get('compiler', 'cuda', warp_size_key) - + def get_gpu_index_ctype(self, config_key='gpu_index_type') -> str: """ Retrieves the GPU index data type as a C type string (for thread, block, warp indices) @@ -1279,312 +1241,58 @@ def get_gpu_index_ctype(self, config_key='gpu_index_type') -> str: ) return dtype.ctype - - def _get_kernel_dimensions(self, dfg_scope: ScopeSubgraphView): - """ - Determines a GPU kernel's grid/block dimensions from map scopes. - - Ruleset for kernel dimensions: - - 1. If only one map (device-level) exists, of an integer set ``S``, - the block size is ``32x1x1`` and grid size is ``ceil(|S|/32)`` in - 1st dimension. - 2. If nested thread-block maps exist ``(T_1,...,T_n)``, grid - size is ``|S|`` and block size is ``max(|T_1|,...,|T_n|)`` with - block specialization. - 3. If block size can be overapproximated, it is (for - dynamically-sized blocks that are bounded by a - predefined size). - 4. If nested device maps exist, behavior is an error is thrown - in the generate_scope function. Nested device maps are not supported - anymore. - - :note: Kernel dimensions are separate from the map - variables, and they should be treated as such. - :note: To make use of the grid/block 3D registers, we use multi- - dimensional kernels up to 3 dimensions, and flatten the - rest into the third dimension. - """ - - - # Extract the subgraph of the kernel entry map - launch_scope = dfg_scope.scope_subgraph(self._kernel_entry_node) - - # Collect all relevant maps affecting launch (i.e. 
grid and block) dimensions - affecting_maps = self._get_maps_affecting_launch_dims(launch_scope) - - # Filter for ThreadBlock maps - threadblock_maps = [(tbmap, sym_map) for tbmap, sym_map in affecting_maps - if tbmap.schedule == dtypes.ScheduleType.GPU_ThreadBlock] - - # Determine if we fall back to default block size (which also affects grid size) - no_block_info: bool = len(threadblock_maps) == 0 and self._kernel_map.gpu_block_size is None - - if no_block_info: - block_size, grid_size = self._compute_default_block_and_grid() - else: - block_size, grid_size = self._compute_block_and_grid_from_maps(threadblock_maps) - - - return grid_size, block_size, len(threadblock_maps) > 0 - - - def _compute_default_block_and_grid(self): - """ - Fallback when no gpu_block_size (i.e. self._kernel_map.gpu_block_size is None) - or GPU_ThreadBlock maps are defined: - - Uses default_block_size (e.g. [32,1,1]) on the whole domain S (assuming 1 dimensional), - producing block=[32,1,1] and grid=[ceil(|S|/32),1,1]. - - Special case: if the block has more active (non-1) dimensions than S, - extra block dimensions are collapsed into the last active slot. - """ - - kernel_map_label = self._kernel_entry_node.map.label - default_block_size_config = Config.get('compiler', 'cuda', 'default_block_size') - - # 1) Reject unsupported 'max' setting - if default_block_size_config == 'max': - # TODO: does this make sense? what is meant with dynamic here? - raise NotImplementedError('max dynamic block size unimplemented') - - # 2) Warn that we're falling back to config - warnings.warn( - f'No `gpu_block_size` property specified on map "{kernel_map_label}". ' - f'Falling back to the configuration entry `compiler.cuda.default_block_size`: {default_block_size_config}. ' - 'You can either specify the block size to use with the gpu_block_size property, ' - 'or by adding nested `GPU_ThreadBlock` maps, which map work to individual threads. ' - 'For more information, see https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') - - - # 3) Normalize the total iteration space size (len(X),len(Y),len(Z)…) to 3D - raw_domain = list(self._kernel_map.range.size(True))[::-1] - kernel_domain_size = self._to_3d_dims(raw_domain) - - # 4) Parse & normalize the default block size to 3D - default_block_size = [int(x) for x in default_block_size_config.split(',')] - default_block_size = self._to_3d_dims(default_block_size) - - # 5) If block has more "active" dims than the domain, collapse extras - active_block_dims = max(1, sum(1 for b in default_block_size if b != 1)) - active_grid_dims = max(1, sum(1 for g in kernel_domain_size if g != 1)) - - if active_block_dims > active_grid_dims: - tail_product = product(default_block_size[active_grid_dims:]) - block_size = default_block_size[:active_grid_dims] + [1] * (3 - active_grid_dims) - block_size[active_grid_dims - 1] *= tail_product - warnings.warn(f'Default block size has more dimensions ({active_block_dims}) than kernel dimensions ' - f'({active_grid_dims}) in map "{kernel_map_label}". Linearizing block ' - f'size to {block_size}. 
Consider setting the ``gpu_block_size`` property.') - else: - block_size = default_block_size - - # 6) Compute the final grid size per axis: ceil(domain / block) - grid_size = [symbolic.int_ceil(gs, bs) for gs, bs in zip(kernel_domain_size, block_size)] - - - # 7) Check block size against configured CUDA hardware limits - self._validate_block_size_limits(block_size) - - return block_size, grid_size - - - def _compute_block_and_grid_from_maps(self, tb_maps_sym_map): - # TODO: also provide a description here in docstring - - - kernel_entry_node = self._kernel_entry_node - - # Compute kernel grid size - raw_grid_size = self._kernel_map.range.size(True)[::-1] - grid_size = self._to_3d_dims(raw_grid_size) - - # Determine block size, using gpu_block_size override if specified - # NOTE: this must be done on the original list! otherwise error - block_size = self._kernel_map.gpu_block_size - if block_size is not None: - block_size = self._to_3d_dims(block_size) - - - # Find all thread-block maps to determine overall block size - detected_block_sizes = [block_size] if block_size is not None else [] - for tbmap, sym_map in tb_maps_sym_map: - tbsize = [s.subs(list(sym_map.items())) for s in tbmap.range.size()[::-1]] - - # Over-approximate block size (e.g. min(N,(i+1)*32)-i*32 --> 32) - # The partial trailing thread-block is emitted as an if-condition - # that returns on some of the participating threads - tbsize = [symbolic.overapproximate(s) for s in tbsize] - - # To Cuda compatible block dimension description - tbsize = self._to_3d_dims(tbsize) - - if len(detected_block_sizes) == 0: - block_size = tbsize - else: - block_size = [sympy.Max(sz, bbsz) for sz, bbsz in zip(block_size, tbsize)] - - if block_size != tbsize or len(detected_block_sizes) == 0: - detected_block_sizes.append(tbsize) - - - - #-------------- Error handling and warnings ------------------------ - - # TODO: If grid/block sizes contain elements only defined within the - # kernel, raise an invalid SDFG exception and recommend - # overapproximation. - - kernel_map_label = kernel_entry_node.map.label - if len(detected_block_sizes) > 1: - # Error when both gpu_block_size and thread-block maps were defined and conflict - if kernel_entry_node.map.gpu_block_size is not None: - raise ValueError('Both the `gpu_block_size` property and internal thread-block ' - 'maps were defined with conflicting sizes for kernel ' - f'"{kernel_map_label}" (sizes detected: {detected_block_sizes}). ' - 'Use `gpu_block_size` only if you do not need access to individual ' - 'thread-block threads, or explicit block-level synchronization (e.g., ' - '`__syncthreads`). Otherwise, use internal maps with the `GPU_Threadblock` or ' - '`GPU_ThreadBlock_Dynamic` schedules. For more information, see ' - 'https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') - - warnings.warn('Multiple thread-block maps with different sizes detected for ' - f'kernel "{kernel_map_label}": {detected_block_sizes}. 
' - f'Over-approximating to block size {block_size}.\n' - 'If this was not the intent, try tiling one of the thread-block maps to match.') - - # Check block size against configured CUDA hardware limits - self._validate_block_size_limits(block_size) - - return block_size, grid_size - - - def _validate_block_size_limits(self, block_size): - """ - Check block size against configured maximum values, if those can be determined - """ - - kernel_map_label = self._kernel_map.label - - total_block_size = product(block_size) - limit = Config.get('compiler', 'cuda', 'block_size_limit') - lastdim_limit = Config.get('compiler', 'cuda', 'block_size_lastdim_limit') - - if (total_block_size > limit) == True: - raise ValueError(f'Block size for kernel "{kernel_map_label}" ({block_size}) ' - f'is larger than the possible number of threads per block ({limit}). ' - 'The kernel will potentially not run, please reduce the thread-block size. ' - 'To increase this limit, modify the `compiler.cuda.block_size_limit` ' - 'configuration entry.') - if (block_size[-1] > lastdim_limit) == True: - raise ValueError(f'Last block size dimension for kernel "{kernel_map_label}" ({block_size}) ' - 'is larger than the possible number of threads in the last block dimension ' - f'({lastdim_limit}). The kernel will potentially not run, please reduce the ' - 'thread-block size. To increase this limit, modify the ' - '`compiler.cuda.block_size_lastdim_limit` configuration entry.') - - - def _to_3d_dims(self, dim_sizes: List) -> List: - """ - Given a list representing the size of each dimension, this function modifies - the list in-place by collapsing all dimensions beyond the second into the - third entry. If the list has fewer than three entries, it is padded with 1's - to ensure it always contains exactly three elements. This is used to format - grid and block size parameters for a kernel launch. - - Examples: - [x] → [x, 1, 1] - [x, y] → [x, y, 1] - [x, y, z] → [x, y, z] - [x, y, z, u, v] → [x, y, z * u * v] - """ - - if len(dim_sizes) > 3: - # multiply everything from the 3rd onward into d[2] - dim_sizes[2] = product(dim_sizes[2:]) - dim_sizes = dim_sizes[:3] - - # pad with 1s if necessary - dim_sizes += [1] * (3 - len(dim_sizes)) - - return dim_sizes - - - def _get_maps_affecting_launch_dims(self, graph: ScopeSubgraphView) -> List[Tuple[nodes.MapEntry, Dict[dace.symbol, dace.symbol]]]: - """ - Recursively collects all GPU_Device and GPU_ThreadBlock maps within the given graph, - including those inside nested SDFGs. For each relevant map, returns a tuple containing - the map object and an identity mapping of its free symbols. - - Args: - graph (ScopeSubgraphView): The subgraph to search for relevant maps. - - Returns: - List[Tuple[nodes.MapEntry, Dict[dace.symbol, dace.symbol]]]: - A list of tuples, each consisting of a MapEntry object and a dictionary mapping - each free symbol in the map's range to itself (identity mapping). - - NOTE: - Currently, dynamic parallelism (nested GPU_Device schedules) is not supported. - The GPU_Device is only used for the top level map, where it is allowed and required. 
- """ - - relevant_maps = [] - - for node in graph.nodes(): - - # Recurse into nested SDFGs - if isinstance(node, nodes.NestedSDFG): - for state in node.sdfg.states(): - relevant_maps.extend(self._get_maps_affecting_launch_dims(state)) - continue - - # MapEntry with schedule affecting launch dimensions - if (isinstance(node, nodes.MapEntry) and - node.schedule in {dtypes.ScheduleType.GPU_Device, dtypes.ScheduleType.GPU_ThreadBlock}): - identity_map = { dace.symbol(sym): dace.symbol(sym) for sym in node.map.range.free_symbols} - relevant_maps.append((node.map, identity_map)) - - return relevant_maps - - - @property def kernel_name(self) -> list[str]: - """Returns the kernel name.""" + """Returns the kernel (function's) name.""" return self._kernel_name @property def kernel_entry_node(self) -> nodes.MapEntry: - """Returns the kernels entry node""" + """ + Returns the entry node of the kernel, which is a MapEntry node + scheduled with dace.dtypes.ScheduleType.GPU_Device. + """ return self._kernel_entry_node @property def kernel_map(self) -> nodes.Map: - """Returns the kernel map node""" - return self._kernel_map + """Returns the kernel's map node.""" + return self._kernel_entry_node.map @property def args_as_input(self) -> list[str]: - """Returns the kernel function arguments - that can be used as an input for calling the function. - It is the __global__ kernel function, NOT the kernel launch function.""" + """ + Returns the kernel function arguments formatted for use as inputs + when calling the kernel function. + """ return self._args_as_input @property def args_typed(self) -> list[str]: - """Returns the typed kernel function arguments - that can be used for declaring the __global__ kernel function. - These arguments include their respective data types.""" + """ + Returns the typed kernel function arguments suitable for declaring + the kernel function. Each argument includes its corresponding data type. + """ return self._args_typed @property - def kernel_wrapper_args(self) -> list[str]: - return self._kernel_wrapper_args + def kernel_wrapper_args_as_input(self) -> list[str]: + """ + Returns the argument names passed to the kernel wrapper function. + + The kernel wrapper is a function defined in the CUDA/HIP code that is called + from the CPU code and is responsible for launching the kernel function. + """ + return self._kernel_wrapper_args_as_input @property def kernel_wrapper_args_typed(self) -> list[str]: + """ + Returns the typed arguments used to declare the kernel wrapper function. + + The kernel wrapper is defined in the CUDA/HIP code, called from the CPU side, + and is responsible for launching the actual kernel function. + """ return self._kernel_wrapper_args_typed @property @@ -1597,11 +1305,6 @@ def block_dims(self) -> list: """Returns the block dimensions of the kernel.""" return self._block_dims - @property - def has_tbmap(self) -> bool: - """Returns whether the kernel has a thread-block map.""" - return self._has_tbmap - @property def warpSize(self) -> int: """ @@ -1619,4 +1322,3 @@ def gpu_index_ctype(self) -> str: setting in the configuration and matches with a DaCe typeclass. 
""" return self._gpu_index_ctype - diff --git a/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py index f487db5a88..48cf5b662c 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py +++ b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py @@ -1,10 +1,11 @@ import functools import sympy +from typing import List -from dace import Config, symbolic - +from dace import Config, symbolic from dace.codegen import cppunparse from dace.codegen.prettycode import CodeIOStream +from dace.sdfg import nodes def symbolic_to_cpp(arr): @@ -29,6 +30,68 @@ def product(iterable): """ return functools.reduce(sympy.Mul, iterable, 1) +def to_3d_dims(dim_sizes: List) -> List: + """ + Converts a list of dimension sizes to a 3D format. + + If the list has more than three dimensions, all dimensions beyond the second are + collapsed into the third (via multiplication). If the list has fewer than three + entries, it is padded with 1s to ensure a fixed length of three. + + Examples: + [x] → [x, 1, 1] + [x, y] → [x, y, 1] + [x, y, z] → [x, y, z] + [x, y, z, u, v] → [x, y, z * u * v] + """ + + if len(dim_sizes) > 3: + # multiply everything from the 3rd onward into d[2] + dim_sizes[2] = product(dim_sizes[2:]) + dim_sizes = dim_sizes[:3] + + # pad with 1s if necessary + dim_sizes += [1] * (3 - len(dim_sizes)) + + return dim_sizes + +def validate_block_size_limits(kernel_map_entry: nodes.MapEntry, block_size: List): + """ + Validates that the given block size for a kernel does not exceed typical CUDA hardware limits. + + These limits are not enforced by the CUDA compiler itself, but are configurable checks + performed by DaCe during GPU code generation. They are based on common hardware + restrictions and can be adjusted via the configuration system. + + Specifically, this function checks: + - That the total number of threads in the block does not exceed `compiler.cuda.block_size_limit`. + - That the number of threads in the last (z) dimension does not exceed + `compiler.cuda.block_size_lastdim_limit`. + + Raises: + ValueError: If either limit is exceeded. + """ + + kernel_map_label = kernel_map_entry.map.label + + total_block_size = product(block_size) + limit = Config.get('compiler', 'cuda', 'block_size_limit') + lastdim_limit = Config.get('compiler', 'cuda', 'block_size_lastdim_limit') + + if (total_block_size > limit) == True: + raise ValueError(f'Block size for kernel "{kernel_map_label}" ({block_size}) ' + f'is larger than the possible number of threads per block ({limit}). ' + 'The kernel will potentially not run, please reduce the thread-block size. ' + 'To increase this limit, modify the `compiler.cuda.block_size_limit` ' + 'configuration entry.') + + if (block_size[-1] > lastdim_limit) == True: + raise ValueError(f'Last block size dimension for kernel "{kernel_map_label}" ({block_size}) ' + 'is larger than the possible number of threads in the last block dimension ' + f'({lastdim_limit}). The kernel will potentially not run, please reduce the ' + 'thread-block size. To increase this limit, modify the ' + '`compiler.cuda.block_size_lastdim_limit` configuration entry.') + def emit_sync_debug_checks(backend: str, codestream: CodeIOStream): """ Emit backend sync and error-check calls if synchronous debugging is enabled. 
@@ -41,4 +104,4 @@ def emit_sync_debug_checks(backend: str, codestream: CodeIOStream): codestream.write( f"DACE_GPU_CHECK({backend}GetLastError());\n" f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n" - ) \ No newline at end of file + ) \ No newline at end of file diff --git a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py index d603c58178..9610651d20 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py @@ -87,11 +87,8 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV # ----------------- Retrieve kernel configuration ----------------------- kernel_spec = self._current_kernel_spec - kernel_entry_node = kernel_spec._kernel_entry_node # = dfg_scope.source_nodes()[0] + kernel_entry_node = kernel_spec._kernel_entry_node # == dfg_scope.source_nodes()[0] kernel_map = kernel_spec.kernel_map - has_tbmap = kernel_spec.has_tbmap - kernel_block_dims = self._current_kernel_spec.block_dims - # ----------------- Kernel/Map Range Preprocessing ----------------------- @@ -100,20 +97,15 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV kernel_dimensions = len(kernel_range) kernel_dim_sizes = kernel_range.size() - # ----------------- Set up symbolic index expressions ----------------------- symbolic_indices = [ symbolic.symbol(f'__SYM_IDX{dim}', nonnegative=True, integer=True) for dim in range(kernel_dimensions)] - symbolic_index_bounds = [ idx + block_dim - 1 for idx, block_dim in zip(symbolic_indices, kernel_block_dims)] symbolic_coordinates = kernel_range.coord_at(symbolic_indices) - # ----------------- Generate Thread or Block index Definitions ----------------------- - thread_id_ctype = kernel_spec.gpu_index_ctype # Data type of CUDA thread/block indices - # In case there is no ThreadBlock map used in a submap, the map variables will # be mapped to thread IDs instead of block IDs for dim in range(kernel_dimensions): @@ -122,56 +114,22 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV # Compute index expressions for up to 3 dimensions (x, y, z) if dim < 3: - if has_tbmap: - index_expr = f'blockIdx.{get_cuda_dim(dim)}' - else: - index_expr = f'(blockIdx.{get_cuda_dim(dim)} * {symbolic_to_cpp(kernel_block_dims[dim])} + threadIdx.{get_cuda_dim(dim)})' - + index_expr = f'blockIdx.{get_cuda_dim(dim)}' # Delinearize third dimension if more than 3D (used in 3D+ mapping) if dim == 2 and kernel_dimensions > 3: tail_prod = product(kernel_dim_sizes[3:]) index_expr = f"({index_expr} / ({symbolic_to_cpp(tail_prod)}))" else: # Handle dimensions beyond the third (delinearize and modulo) - if has_tbmap: - index_expr = f'blockIdx.z' - else: - index_expr = f'(blockIdx.z * {symbolic_to_cpp(kernel_block_dims[2])} + threadIdx.z)' - + index_expr = f'blockIdx.z' tail_prod = product(kernel_dim_sizes[dim + 1:]) index_expr = (f"(({index_expr} / ({symbolic_to_cpp(tail_prod)})) % ({symbolic_to_cpp(kernel_dim_sizes[dim])}))") - # Define thread/Block index var_def = symbolic_to_cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', index_expr) callsite_stream.write(f'{thread_id_ctype} {var_name} = {var_def};', cfg, state_id, kernel_entry_node) self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, thread_id_ctype) - - # ----------------- Guard Conditions for Block Execution ----------------------- - - if not has_tbmap: - minels = 
kernel_range.min_element() - maxels = kernel_range.max_element() - - for dim, (var_name, start, end) in enumerate(zip(kernel_map.params[::-1], minels, maxels)): - condition = '' - - # Optimize conditions if they are always true - if dim >= 3 or (symbolic_indices[dim] >= start) != True: - condition += f'{var_name} >= {symbolic_to_cpp(start)}' - - if (dim >= 3 or ((symbolic_index_bounds[dim] < end) != False - and ((symbolic_index_bounds[dim] % kernel_block_dims[dim]) != 0) == True) or (kernel_block_dims[dim] > end) == True): - - if len(condition) > 0: - condition += ' && ' - condition += f'{var_name} < {symbolic_to_cpp(end + 1)}' - - if len(condition) > 0: - scope_manager.open(condition=condition) - - # ----------------- Dispatch Subgraph code generation ----------------------- self._dispatcher.dispatch_subgraph(sdfg, cfg, dfg_scope, state_id, function_stream, diff --git a/dace/transformation/dataflow/add_threadblock_map.py b/dace/transformation/dataflow/add_threadblock_map.py index ccf2b88ee5..d1f0f09267 100644 --- a/dace/transformation/dataflow/add_threadblock_map.py +++ b/dace/transformation/dataflow/add_threadblock_map.py @@ -1,111 +1,123 @@ # Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. """ This module contains classes and functions that implement the grid-strided map tiling transformation.""" +import warnings -import copy +import sympy import dace -from dace.sdfg import SDFG, ControlFlowRegion, SDFGState -from dace.properties import make_properties, SymbolicProperty -from dace.sdfg import nodes -from dace.sdfg import utils as sdutil -from dace.transformation import transformation, helpers +from dace import Config, dtypes, symbolic +from dace.properties import make_properties +from dace.sdfg import SDFG, SDFGState, nodes, utils as sdutil +from dace.codegen.targets.experimental_cuda_helpers import gpu_utils +from dace.transformation import helpers, transformation from dace.transformation.dataflow.tiling import MapTiling -from dace import dtypes -import warnings - @make_properties class AddThreadBlockMap(transformation.SingleStateTransformation): """ - Adds a thread block schedule to a device map scope - """ + Ensures that all `GPU_Device`-scheduled maps (kernel maps) in the SDFG + without an explicit `GPU_ThreadBlock` or `GPU_ThreadBlock_Dynamic` map + are nested within one. + This is achieved by applying the `MapTiling` transformation to each such map, + inserting a corresponding thread block scope. + """ map_entry = transformation.PatternNode(nodes.MapEntry) - # Properties - thread_block_size_x = SymbolicProperty(dtype=int, - default=None, - allow_none=True, - desc="Number threads in the threadBlock X Dim") - thread_block_size_y = SymbolicProperty(dtype=int, - default=None, - allow_none=True, - desc="Number threads in the threadBlock Y Dim") - thread_block_size_z = SymbolicProperty(dtype=int, - default=None, - allow_none=True, - desc="Number threads in the threadBlock Z Dim") - tiles_evenly = SymbolicProperty(dtype=bool, - default=False, - desc="Whether the map should be tiled evenly or not. If False, the " - "transformation will try to tile the map as evenly as possible.") - @classmethod def expressions(cls): return [sdutil.node_path_graph(cls.map_entry)] def preprocess_default_dims(self): - # If None is passed for the pass we will get the default configs - # 1. If arguments are passed: - # 1.1 Is the arguments passed - # 2. If no arguments are passed (at least one arg is None): - # 2.1. First check if the device map has gpu_block_size set - # 2.2. 
Otherwise check the global default - if self.thread_block_size_x is None or self.thread_block_size_y is None or self.thread_block_size_z is None: - if self.map_entry.gpu_block_size is not None: - # If gpu_block_size ap_entry.gpu_block_sizeis set, use it - self.thread_block_size_x = self.map_entry.gpu_block_size[0] - self.thread_block_size_y = self.map_entry.gpu_block_size[1] - self.thread_block_size_z = self.map_entry.gpu_block_size[2] - else: - x, y, z = dace.config.Config.get('compiler', 'cuda', 'default_block_size').split(',') - try: - self.thread_block_size_x = int(x) - self.thread_block_size_y = int(y) - self.thread_block_size_z = int(z) - except ValueError: - raise ValueError("Invalid default block size format. Expected 'x,y,z' where x, y, z are integers.") - - num_dims_in_map = len(self.map_entry.map.range) - # Collapse missing thread block dimensions into y if 2 dimensions in the map, to x if 1 dimension in the map - if num_dims_in_map < 3: - print_warning = False - old_block = (self.thread_block_size_x, self.thread_block_size_y, self.thread_block_size_z) - if num_dims_in_map == 2: - self.thread_block_size_y *= self.thread_block_size_z - if self.thread_block_size_z > 1: - print_warning = True - self.thread_block_size_z = 1 - elif num_dims_in_map == 1: - self.thread_block_size_x *= self.thread_block_size_y * self.thread_block_size_z - if self.thread_block_size_y > 1 or self.thread_block_size_z > 1: - print_warning = True - self.thread_block_size_y = 1 - self.thread_block_size_z = 1 - new_block = (self.thread_block_size_x, self.thread_block_size_y, self.thread_block_size_z) - if print_warning: - warnings.warn( - UserWarning, f'Default block size has more dimensions ({old_block}) than kernel dimensions ' - f'({num_dims_in_map}) in map "{self.map_entry.map.label}". Linearizing block ' - f'size to {new_block}. Consider setting the ``gpu_block_size`` property.') - - + """ + Computes a 3D GPU thread block size for a kernel `MapEntry` without an explicit `GPU_ThreadBlock` map. - def can_be_applied(self, graph, expr_index, sdfg, permissive=False): + Assumes that `self.map_entry` is a GPU kernel map (i.e., schedule is `ScheduleType.GPU_Device`) without + an explicit thread block map. - self.preprocess_default_dims() + If set, the `gpu_block_size` property on the map `self.map_entry` is used. Otherwise, a default is taken from + `Config('compiler', 'cuda', 'default_block_size')`, with basic validation and dimension normalization. - # Reject if thread block size exceeds GPU hardware limits - total_block_size = self.thread_block_size_x * self.thread_block_size_y * self.thread_block_size_z + Returns: + List[int]: A normalized [blockDim.x, blockDim.y, blockDim.z] list representing the GPU block size. - if total_block_size > 1024: - return False + Raises: + NotImplementedError: If the configuration sets the block size to `"max"`. + ValueError: If the computed block size exceeds hardware limits. + + Warnings: + - If falling back to the default block size from configuration. + - If the default block size has more dimensions than the kernel iteration space and gets linearized. 
+ """ + kernel_map_entry = self.map_entry + preset_block_size = kernel_map_entry.map.gpu_block_size + + if preset_block_size is not None: + block_size = gpu_utils.to_3d_dims(preset_block_size) + + else: + kernel_map = kernel_map_entry.map + kernel_map_label = kernel_map.label + default_block_size_config = Config.get('compiler', 'cuda', 'default_block_size') + + # 1) Warn that we are falling back to config + warnings.warn( + f'No `gpu_block_size` property specified on map "{kernel_map_label}". ' + f'Falling back to the configuration entry `compiler.cuda.default_block_size`: {default_block_size_config}. ' + 'You can either specify the block size to use with the gpu_block_size property, ' + 'or by adding nested `GPU_ThreadBlock` maps, which map work to individual threads. ' + 'For more information, see https://spcldace.readthedocs.io/en/latest/optimization/gpu.html' + ) + + # 2) Reject unsupported 'max' setting + if default_block_size_config == 'max': + raise NotImplementedError('max dynamic block size unimplemented') + + # 3) Parse & normalize the default block size to 3D + default_block_size = [int(x) for x in default_block_size_config.split(',')] + default_block_size = gpu_utils.to_3d_dims(default_block_size) + + # 4) Normalize the total iteration space size (len(X),len(Y),len(Z)…) to 3D + # This is needed for X + raw_domain = list(kernel_map.range.size(True))[::-1] + kernel_domain_size = gpu_utils.to_3d_dims(raw_domain) + + # 5) If block has more "active" dims than the grid, collapse extras + active_block_dims = max(1, sum(1 for b in default_block_size if b != 1)) + active_grid_dims = max(1, sum(1 for g in kernel_domain_size if g != 1)) + + + if active_block_dims > active_grid_dims: + tail_product = gpu_utils.product(default_block_size[active_grid_dims:]) + block_size = default_block_size[:active_grid_dims] + [1] * (3 - active_grid_dims) + block_size[active_grid_dims - 1] *= tail_product + warnings.warn(f'Default block size has more dimensions ({active_block_dims}) than kernel dimensions ' + f'({active_grid_dims}) in map "{kernel_map_label}". Linearizing block ' + f'size to {block_size}. Consider setting the ``gpu_block_size`` property.') + else: + block_size = default_block_size + + # Validate that the block size does not exeed any limits + gpu_utils.validate_block_size_limits(kernel_map_entry, block_size) + + # Note order is [blockDim.x, blockDim.y, blockDim.z] + return block_size + + def can_be_applied(self, graph, expr_index, sdfg, permissive=False): + """ + Determines whether the transformation can be applied to the given map entry. + + The transformation only applies to maps with a GPU_Device schedule (i.e., kernel map entries). + It is not applicable if a nested GPU_ThreadBlock or GPU_ThreadBlock_Dynamic map exists + within the kernel scope, as that indicates the thread-block schedule is already defined. + The same restriction applies in the case of dynamic parallelism (nested kernel launches). 
+ """ # Only applicable to GPU_Device maps if self.map_entry.map.schedule != dtypes.ScheduleType.GPU_Device: return False - + # Traverse inner scopes (ordered outer -> inner) for _, inner_entry in helpers.get_internal_scopes(graph, self.map_entry): schedule = inner_entry.map.schedule @@ -118,53 +130,75 @@ def can_be_applied(self, graph, expr_index, sdfg, permissive=False): # Found another kernel launch — safe to apply return True - # No thread block schedule found - do apply return True - def apply(self, state: SDFGState, sdfg: SDFG): - self.preprocess_default_dims() - - map_entry = self.map_entry - - tx = self.thread_block_size_x - ty = self.thread_block_size_y - tz = self.thread_block_size_z - block_dims = [tz, ty, tx] - - # Set the gpu_block_size which the GPU_ThreadBlock map will use. This is important, because the CUDACodeGen - # will otherwise try to deduce it, leading to issues - self.map_entry.gpu_block_size = [self.thread_block_size_x, self.thread_block_size_y, self.thread_block_size_z] - - # TODO: Adapt this code once MapTiling transformation also considers existing stride. - # The below tile size works around this by including the existing stride into the tile size - num_dims = len(map_entry.map.params) - existing_strides = map_entry.range.strides() - - len_diff = num_dims - len(block_dims) # Note - if len_diff > 0: # num_dims > block_dims - block_dims = [1] * len_diff + block_dims + """ + Ensures that `self.map_entry`, a `GPU_Device`-scheduled map, is explicitly nested + within a `GPU_ThreadBlock` map. + + This is achieved by applying the `MapTiling` transformation to `self.map_entry`, + using a computed block size. Essentially `self.map_entry` becomes the thread block map and + the new inserted parent map is the new kernel map. The schedules are set accordingly. + A final consistency check verifies that the resulting thread block map's range fits into the + computed block size. + + Raises: + ValueError: If the overapproximated extent of the thread block map does not match + the derived block size. + """ + gpu_block_size = self.preprocess_default_dims() + kernel_map_entry = self.map_entry + + # Reverse for map tiling to prioritize later dimensions for better memory/performance + reversed_block_size = gpu_block_size[::-1] + + # TODO: Update this once MapTiling accounts for existing strides when applying tile sizes. + # The code below is a workaround that manually adjusts tile sizes to account for existing strides. 
+ num_dims = len(kernel_map_entry.map.params) + existing_strides = kernel_map_entry.range.strides() + + len_diff = num_dims - len(reversed_block_size) + if len_diff > 0: + # More dimensions than block size elements - pad with 1s + adjusted_block_size = [1] * len_diff + reversed_block_size else: - block_dims = block_dims[-num_dims:] + # Fewer or equal dimensions - truncate from the beginning + adjusted_block_size = reversed_block_size[-num_dims:] - tile_sizes = [stride * block for stride, block in zip(existing_strides, block_dims)] + tile_sizes = [stride * block for stride, block in zip(existing_strides, adjusted_block_size)] - # Tile trivial simplifies come checks for the BlockCoarsening and ThreadCoarsening transformations + # Apply map tiling transformation MapTiling.apply_to( sdfg=sdfg, - options=dict( - prefix="b", - tile_sizes=tile_sizes, - divides_evenly=self.tiles_evenly, # Todo improve this - tile_trivial=True, - skew=False), - map_entry=map_entry) - - # The old dev_entry is the new tblock_map_entry - map_entry.map.schedule = dtypes.ScheduleType.GPU_ThreadBlock - - + options={ + "prefix": "b", + "tile_sizes": tile_sizes, + "tile_trivial": True, + "skew": False + }, + map_entry=kernel_map_entry + ) + + # After tiling: kernel_map_entry is now the thread block map, configure its schedule + thread_block_map_entry = kernel_map_entry + thread_block_map_entry.map.schedule = dtypes.ScheduleType.GPU_ThreadBlock + + # Set the new kernel_entry's gpu_block_size attribute + new_kernel_entry, *_ = helpers.get_parent_map(state, kernel_map_entry) + new_kernel_entry.map.gpu_block_size = gpu_block_size + + # Catch any unexpected mismatches of inserted threadblock map's block size and the used block size + tb_size = gpu_utils.to_3d_dims([symbolic.overapproximate(sz) for sz in thread_block_map_entry.map.range.size()[::-1]]) + max_block_size = [sympy.Max(sz, bbsz) for sz, bbsz in zip(tb_size, gpu_block_size)] + + if max_block_size != gpu_block_size: + raise ValueError( + f"Block size mismatch: the overapproximated extent of the thread block map " + f"({tb_size}) is not enclosed by the derived block size ({gpu_block_size}). " + "They are expected to be equal or the derived block size to be larger." + ) def update_names(): pass diff --git a/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py b/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py new file mode 100644 index 0000000000..f8ef54fb23 --- /dev/null +++ b/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py @@ -0,0 +1,172 @@ +import warnings +from typing import Dict, List, Set, Tuple + +import sympy + +from dace import SDFG, SDFGState, dtypes, symbolic +from dace.codegen.targets.experimental_cuda_helpers import gpu_utils +from dace.sdfg import nodes +from dace.transformation import helpers, pass_pipeline as ppl + + +class InferGPUGridAndBlockSize(ppl.Pass): + """ + Infers the 3D CUDA launch configuration (grid and block sizes) for all GPU_Device map entries in the SDFG. + + This pass assumes the `AddThreadBlockMap` transformation has already been applied, ensuring that each kernel + either has an explicit thread block map. However it is applicable as long as each GPU_Device scheduled map + has an inner explicit GPU_ThreadBlock scheduled map. + + Block sizes are determined based on: + - Whether an explicit GPU_ThreadBlock map was inserted by `AddThreadBlockMap`. In this case, + the `gpu_block_size` attribute holds this information. 
+ - Existing nested thread block maps and also the `gpu_block_size`, if present. + + Grid sizes are computed from the kernel map's range, normalized to a 3D shape. + + NOTE: + This pass does not handle dynamic parallelism (i.e., nested GPU_Device maps), + nor does it support GPU_ThreadBlock_Dynamic maps inside kernels. Behavior is unclear in + such cases. + """ + + def apply_pass(self, sdfg: SDFG, kernels_with_added_tb_maps: Set[nodes.MapEntry]) -> Dict[nodes.MapEntry, Tuple[List, List]]: + """ + Analyzes the given SDFG to determine the 3D grid and block sizes for all GPU_Device map entries. + + Returns: + A dictionary mapping each GPU_Device MapEntry node to a tuple (grid_dimensions, block_dimensions). + """ + # Collect all GPU_Device map entries across the SDFG + kernel_maps: Set[Tuple[nodes.MapEntry, SDFGState,]] = set() + for node, state in sdfg.all_nodes_recursive(): + if isinstance(node, nodes.MapEntry) and node.schedule == dtypes.ScheduleType.GPU_Device: + kernel_maps.add((node, state)) + + + kernel_dimensions_map: Dict[nodes.MapEntry, Tuple[List, List]] = dict() + for map_entry, state in kernel_maps: + # Compute grid size + raw_grid = map_entry.map.range.size(True)[::-1] + grid_size = gpu_utils.to_3d_dims(raw_grid) + + # Compute Block size + if map_entry in kernels_with_added_tb_maps: + block_size = self._get_inserted_gpu_block_size(map_entry) + else: + block_size = self._infer_gpu_block_size(state, map_entry) + + block_size = gpu_utils.to_3d_dims(block_size) + gpu_utils.validate_block_size_limits(map_entry, block_size) + + kernel_dimensions_map[map_entry] = (grid_size, block_size) + + return kernel_dimensions_map + + def _get_inserted_gpu_block_size(self, kernel_map_entry: nodes.MapEntry) -> List: + """ + Returns the block size from a kernel map entry with an inserted thread-block map. + + Assumes the `gpu_block_size` attribute is set by the AddThreadBlockMap transformation. + """ + gpu_block_size = kernel_map_entry.map.gpu_block_size + + if gpu_block_size is None: + raise ValueError( + "Expected 'gpu_block_size' to be set. This kernel map entry should have been processed " + "by the AddThreadBlockMap transformation." + ) + + return gpu_block_size + + def _infer_gpu_block_size(self, state:SDFGState, kernel_map_entry: nodes.MapEntry) -> List: + """ + Infers the GPU block size for a kernel map entry based on nested GPU_ThreadBlock maps. + + If the `gpu_block_size` attribute is set, it is assumed to be user-defined (not set by + a transformation like `AddThreadBlockMap`), and all nested thread-block maps must fit within it. + Otherwise, the block size is inferred by overapproximating the range sizes of all inner + GPU_ThreadBlock maps of kernel_map_entry. + + + Example: + for i in dace.map[0:N:32] @ GPU_Device: + for j in dace.map[0:32] @ GPU_ThreadBlock: + ... + for l in dace.map[0:23] @ GPU_ThreadBlock: + for k in dace.map[0:16] @ GPU_ThreadBlock: + ... + + Inferred GPU block size is [32, 1, 1] + """ + # Identify nested threadblock maps + threadblock_maps = self._get_internal_threadblock_maps(state, kernel_map_entry) + + # guard check + if not threadblock_maps: + state.sdfg.save("failure.sdfg") + raise ValueError( + f"{self.__class__.__name__} expects at least one explicit nested GPU_ThreadBlock map, " + "as it assumes AddThreadBlockMap was applied beforehand.\n" + f"Check for issues in that transformation or ensure AddThreadBlockMap was applied." 
+ ) + + # Overapproximated block size enclosing all inner ThreadBlock maps + block_size = kernel_map_entry.map.gpu_block_size + detected_block_sizes = [block_size] if block_size is not None else [] + for tb_map in threadblock_maps: + + # Over-approximate block size (e.g. min(N,(i+1)*32)-i*32 --> 32) + # and collapse to GPU-compatible 3D dimensions + tb_size = [symbolic.overapproximate(s) for s in tb_map.range.size()[::-1]] + tb_size = gpu_utils.to_3d_dims(tb_size) + + if block_size is None: + block_size = tb_size + else: + block_size = [sympy.Max(sz1, sz2) for sz1, sz2 in zip(block_size, tb_size)] + + if block_size != tb_size or len(detected_block_sizes) == 0: + detected_block_sizes.append(tb_size) + + + # Check for conflicting or multiple thread-block sizes + # - If gpu_block_size is explicitly defined (by the user) and conflicts with detected map sizes, raise an error + # - Otherwise, emit a warning when multiple differing sizes are detected, and over-approximate + if len(detected_block_sizes) > 1: + kernel_map_label = kernel_map_entry.map.label + + if kernel_map_entry.map.gpu_block_size is not None: + raise ValueError('Both the `gpu_block_size` property and internal thread-block ' + 'maps were defined with conflicting sizes for kernel ' + f'"{kernel_map_label}" (sizes detected: {detected_block_sizes}). ' + 'Use `gpu_block_size` only if you do not need access to individual ' + 'thread-block threads, or explicit block-level synchronization (e.g., ' + '`__syncthreads`). Otherwise, use internal maps with the `GPU_ThreadBlock` or ' + '`GPU_ThreadBlock_Dynamic` schedules. For more information, see ' + 'https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') + + else: + warnings.warn('Multiple thread-block maps with different sizes detected for ' + f'kernel "{kernel_map_label}": {detected_block_sizes}. ' + f'Over-approximating to block size {block_size}.\n' + 'If this was not the intent, try tiling one of the thread-block maps to match.') + + return block_size + + def _get_internal_threadblock_maps(self, state: SDFGState, kernel_map_entry: nodes.MapEntry) -> List[nodes.MapEntry]: + """ + Returns the GPU_ThreadBlock MapEntries nested within the given GPU_Device scheduled kernel map + (kernel_map_entry). + + Returns: + A List of GPU_ThreadBlock scheduled maps. 
+ """ + threadblock_maps = [] + + for _, scope in helpers.get_internal_scopes(state, kernel_map_entry): + if isinstance(scope, nodes.MapEntry) and scope.schedule == dtypes.ScheduleType.GPU_ThreadBlock: + threadblock_maps.append(scope) + + return threadblock_maps + \ No newline at end of file From 15b9e1c6969efcc181b6bd9315f589811dded2b9 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 7 Jul 2025 16:58:57 +0200 Subject: [PATCH 52/94] small change --- dace/transformation/passes/gpustream_scheduling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/transformation/passes/gpustream_scheduling.py b/dace/transformation/passes/gpustream_scheduling.py index 57012d5a48..761f747b2e 100644 --- a/dace/transformation/passes/gpustream_scheduling.py +++ b/dace/transformation/passes/gpustream_scheduling.py @@ -7,7 +7,7 @@ from dace.config import Config from dace.transformation import pass_pipeline as ppl, transformation from dace.sdfg import nodes -from dace.sdfg.graph import Edge, Graph, NodeT +from dace.sdfg.graph import Graph, NodeT @properties.make_properties From db2c87483097e3ba5df0fd355362fb61b78e869b Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 7 Jul 2025 17:05:18 +0200 Subject: [PATCH 53/94] cleaning up --- berkay_workpace/reports/important_notes.txt | 9 +- .../scratch/async_copy/async_copy.ipynb | 123 +++ .../scratch/async_copy/testbed.ipynb | 853 ++++++++++++++++++ berkay_workpace/scratch/testbed.ipynb | 164 ---- 4 files changed, 984 insertions(+), 165 deletions(-) create mode 100644 berkay_workpace/scratch/async_copy/async_copy.ipynb create mode 100644 berkay_workpace/scratch/async_copy/testbed.ipynb delete mode 100644 berkay_workpace/scratch/testbed.ipynb diff --git a/berkay_workpace/reports/important_notes.txt b/berkay_workpace/reports/important_notes.txt index f03bdb40a3..440cbd4dcf 100644 --- a/berkay_workpace/reports/important_notes.txt +++ b/berkay_workpace/reports/important_notes.txt @@ -3,4 +3,11 @@ depends on it. Instead of removing it, I decided to let it be and just say 0 CudaEvents are created and used. Generally: The CudaStreamManager assumes that the NaiveGPUScheduler pass was called before. Also, the CudaStreamManager should define the functions "get_stream_edge" (and maybe "get_stream_node"), since the the copystrategies might - depend on it \ No newline at end of file + depend on it + +2. I think we should rename ExperimentalCUDACodegen to ExperimentalGPUCodegen (or similarly, rename CUDACodeGen to GPUCodeGen), since it + is also intended to handle HIP code. However, this should be verified for HIP first. Otherwise, it might be better to build two separate + codegens — this is confusing as it stands. + +3. "Struct" memory copies in old codegen are hacks. These are omitted in the new ExperimentalCUDACodegen, because they should be implemented + in a planned and structured way, which is out of scope for my Master's Thesis. 
\ No newline at end of file diff --git a/berkay_workpace/scratch/async_copy/async_copy.ipynb b/berkay_workpace/scratch/async_copy/async_copy.ipynb new file mode 100644 index 0000000000..f86a432725 --- /dev/null +++ b/berkay_workpace/scratch/async_copy/async_copy.ipynb @@ -0,0 +1,123 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "ebf929d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import dace\n", + "import cupy as cp\n", + "import numpy as np\n", + "from IPython.display import Code\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "024e65c9", + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "Memlet.__init__() got an unexpected keyword argument 'is_asynchronous'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 20\u001b[39m\n\u001b[32m 16\u001b[39m tb_map_entry, tb_map_exit = state.add_map(\u001b[33m\"\u001b[39m\u001b[33mtb_map\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28mdict\u001b[39m(tid=\u001b[33m\"\u001b[39m\u001b[33m0:128\u001b[39m\u001b[33m\"\u001b[39m), schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock)\n\u001b[32m 18\u001b[39m \u001b[38;5;66;03m# Edges with proper data flow\u001b[39;00m\n\u001b[32m 19\u001b[39m \u001b[38;5;66;03m# Global to device scope\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m20\u001b[39m state.add_edge(a_acc, \u001b[38;5;28;01mNone\u001b[39;00m, gpu_map_entry, \u001b[38;5;28;01mNone\u001b[39;00m, \u001b[43mdace\u001b[49m\u001b[43m.\u001b[49m\u001b[43mMemlet\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mA[0:128]\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_asynchronous\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m)\n\u001b[32m 21\u001b[39m \u001b[38;5;66;03m# Device scope to thread-block scope\u001b[39;00m\n\u001b[32m 22\u001b[39m state.add_edge(gpu_map_entry, \u001b[38;5;28;01mNone\u001b[39;00m, s_acc, \u001b[38;5;28;01mNone\u001b[39;00m, dace.Memlet(\u001b[33m\"\u001b[39m\u001b[33mA[0:128]->S[0:128]\u001b[39m\u001b[33m\"\u001b[39m, is_asynchronous=\u001b[38;5;28;01mTrue\u001b[39;00m))\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/master-thesis/dace/dace/properties.py:337\u001b[39m, in \u001b[36mmake_properties..initialize_properties\u001b[39m\u001b[34m(obj, *args, **kwargs)\u001b[39m\n\u001b[32m 335\u001b[39m \u001b[38;5;28msetattr\u001b[39m(obj, name, prop.default)\n\u001b[32m 336\u001b[39m \u001b[38;5;66;03m# Now call vanilla __init__, which can initialize members\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m337\u001b[39m \u001b[43minit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 338\u001b[39m \u001b[38;5;66;03m# Assert that all properties have been set\u001b[39;00m\n\u001b[32m 339\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m name, prop \u001b[38;5;129;01min\u001b[39;00m properties.items():\n", + 
"\u001b[31mTypeError\u001b[39m: Memlet.__init__() got an unexpected keyword argument 'is_asynchronous'" + ] + } + ], + "source": [ + "\n", + "# SDFG and the main state\n", + "sdfg = dace.SDFG(\"asyn_cpy_sdfg\")\n", + "state = sdfg.add_state(\"main\")\n", + "\n", + "# Arrays and access nodes\n", + "sdfg.add_array(\"A\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", + "sdfg.add_array(\"B\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", + "sdfg.add_array(\"S\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True)\n", + "\n", + "a_acc = state.add_read(\"A\")\n", + "b_acc = state.add_access(\"B\")\n", + "s_acc = state.add_access(\"S\")\n", + "\n", + "# Device and thread-block maps\n", + "gpu_map_entry, gpu_map_exit = state.add_map(\"gpu_map\", dict(bid=\"0:128:128\"), schedule=dace.dtypes.ScheduleType.GPU_Device)\n", + "tb_map_entry, tb_map_exit = state.add_map(\"tb_map\", dict(tid=\"0:128\"), schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock)\n", + "\n", + "# Edges with proper data flow\n", + "# Global to device scope\n", + "state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet(\"A[0:128]\", is_asynchronous=True))\n", + "# Device scope to thread-block scope\n", + "state.add_edge(gpu_map_entry, None, s_acc, None, dace.Memlet(\"A[0:128]->S[0:128]\", is_asynchronous=True))\n", + "state.add_edge(s_acc, None, tb_map_entry, None, dace.Memlet(\"S[0:128]\", is_asynchronous=True))\n", + "\n", + "assign_tasklet = state.add_tasklet(\n", + " \"assign\", inputs={\"__in_S\"}, outputs={\"__out_S\"},\n", + " code=\"__out_S = __in_S;\",\n", + " language=dace.dtypes.Language.CPP\n", + ")\n", + "\n", + "state.add_edge(tb_map_entry, None, assign_tasklet, \"__in_S\", dace.Memlet(\"S[tid]\", is_asynchronous=True))\n", + "state.add_edge(assign_tasklet, \"__out_S\", tb_map_exit, None, dace.Memlet(\"B[tid]\", is_asynchronous=True))\n", + "state.add_edge(tb_map_exit, None, gpu_map_exit, None, dace.Memlet(\"B[0:128]\", is_asynchronous=True))\n", + "state.add_edge(gpu_map_exit, None, b_acc, None, dace.Memlet(\"B[0:128]\", is_asynchronous=True))\n", + "\n", + "# Fill scope connectors\n", + "state.fill_scope_connectors()\n", + "\n", + "\n", + "# Display the SDFG\n", + "sdfg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03fef73b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dace_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/berkay_workpace/scratch/async_copy/testbed.ipynb b/berkay_workpace/scratch/async_copy/testbed.ipynb new file mode 100644 index 0000000000..2b0a520529 --- /dev/null +++ b/berkay_workpace/scratch/async_copy/testbed.ipynb @@ -0,0 +1,853 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a5aeb1f5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import dace\n", + "import cupy as cp\n", + "import numpy as np\n", + "from IPython.display import Code\n", + "from typing import Optional\n", + "\n", + "from dace import SDFG, properties\n", + "from dace.config 
import Config\n", + "from dace.transformation import pass_pipeline as ppl, transformation\n", + "from dace.sdfg import nodes\n", + "from dace import dtypes\n", + "from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync\n", + "from dace.sdfg.state import LoopRegion, ConditionalBlock\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2f891963", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "SDFG (asyn_cpy_sdfg)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "# SDFG and the main state\n", + "sdfg = dace.SDFG(\"asyn_cpy_sdfg\")\n", + "state = sdfg.add_state(\"main\")\n", + "\n", + "# Arrays and access nodes\n", + "sdfg.add_array(\"A\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", + "sdfg.add_array(\"B\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", + "sdfg.add_array(\"S\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True)\n", + "\n", + "a_acc = state.add_read(\"A\")\n", + "b_acc = state.add_access(\"B\")\n", + "s_acc = state.add_access(\"S\")\n", + "\n", + "\n", + "\n", + "\n", + "# Device and thread-block maps\n", + "gpu_map_entry, gpu_map_exit = state.add_map(\"gpu_map\", dict(bid=\"0:128:128\"), schedule=dace.dtypes.ScheduleType.GPU_Device)\n", + "tb_map_entry, tb_map_exit = state.add_map(\"tb_map\", dict(tid=\"0:128\"), schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock)\n", + "\n", + "# Edges with proper data flow\n", + "# Global to device scope\n", + "state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet(\"A[0:128]\"))\n", + "# Device scope to thread-block scope\n", + "state.add_edge(gpu_map_entry, None, s_acc, None, dace.Memlet(\"A[0:128]->S[0:128]\"))\n", + "state.add_edge(s_acc, None, tb_map_entry, None, dace.Memlet(\"S[0:128]\"))\n", + "\n", + "assign_tasklet = state.add_tasklet(\n", + " \"assign\", inputs={\"__in_S\"}, outputs={\"__out_S\"},\n", + " code=\"__out_S = __in_S;\",\n", + " language=dace.dtypes.Language.CPP\n", + ")\n", + "\n", + "\n", + "state.add_edge(tb_map_entry, None, assign_tasklet, \"__in_S\", dace.Memlet(\"S[tid]\"))\n", + "state.add_edge(assign_tasklet, \"__out_S\", tb_map_exit, None, dace.Memlet(\"B[tid]\"))\n", + "state.add_edge(tb_map_exit, None, gpu_map_exit, None, dace.Memlet(\"B[0:128]\"))\n", + "state.add_edge(gpu_map_exit, None, b_acc, None, dace.Memlet(\"B[0:128]\"))\n", + "\n", + "\n", + "# pipeline and async related \n", + "\n", + "pipeline_name = \"pipeline\"\n", + "s_acc.async_copy = True\n", + "s_acc.async_pipeline = pipeline_name\n", + "sdfg.metadata = {\n", + " s_acc.guid: {\n", + " \"pipelines\": {\n", + " pipeline_name: {\n", + " \"pipeline_depth\" : 1\n", + " }\n", + " }\n", + " }\n", + "}\n", + "\n", + "\n", + "acquire_pipeline_tasklet = state.add_tasklet(\n", + " \"acquire\", inputs={}, outputs={},\n", + " code=f\"{pipeline_name}.producer_acquire();\",\n", + " language=dace.dtypes.Language.CPP\n", + ")\n", + "\n", + "commit_pipeline_tasklet = state.add_tasklet(\n", + " \"commit\", inputs={}, outputs={},\n", + " code=f\"{pipeline_name}.producer_commit();\",\n", + " language=dace.dtypes.Language.CPP\n", + ")\n", + "\n", + "wait_pipeline_tasklet = state.add_tasklet(\n", + " \"wait\", inputs={}, outputs={},\n", + " code=f\"{pipeline_name}.consumer_wait();\",\n", + " language=dace.dtypes.Language.CPP\n", + ")\n", + "\n", + "release_pipeline_tasklet = state.add_tasklet(\n", + " \"release\", inputs={}, outputs={},\n", + " code=f\"{pipeline_name}.consumer_release();\",\n", + " language=dace.dtypes.Language.CPP\n", + ")\n", + "\n", + "\n", + "\n", + "state.add_edge(gpu_map_entry, None, acquire_pipeline_tasklet, None, dace.Memlet())\n", + "state.add_edge(acquire_pipeline_tasklet, None, s_acc, None, dace.Memlet())\n", + "\n", + "state.add_edge(s_acc, None, commit_pipeline_tasklet, None, dace.Memlet())\n", + 
"state.add_edge(commit_pipeline_tasklet, None, wait_pipeline_tasklet, None, dace.Memlet())\n", + "state.add_edge(wait_pipeline_tasklet, None, tb_map_entry, None, dace.Memlet())\n", + "\n", + "state.add_edge(tb_map_exit, None, release_pipeline_tasklet, None, dace.Memlet())\n", + "state.add_edge(release_pipeline_tasklet, None, gpu_map_exit, None, dace.Memlet())\n", + "\n", + "\n", + "\n", + "\n", + "# Fill scope connectors\n", + "state.fill_scope_connectors()\n", + "\n", + "\n", + "# Display the SDFG\n", + "sdfg" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c905cb3f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
#include <cuda_runtime.h>\n",
+       "#include <dace/dace.h>\n",
+       "\n",
+       "// New, cooperative groups and asnyc copy\n",
+       "#include <cooperative_groups/memcpy_async.h>\n",
+       "#include <cuda/pipeline>\n",
+       "\n",
+       "namespace cg = cooperative_groups;\n",
+       "\n",
+       "\n",
+       "struct asyn_cpy_sdfg_state_t {\n",
+       "    dace::cuda::Context *gpu_context;\n",
+       "};\n",
+       "\n",
+       "\n",
+       "\n",
+       "DACE_EXPORTED int __dace_init_experimental_cuda(asyn_cpy_sdfg_state_t *__state);\n",
+       "DACE_EXPORTED int __dace_exit_experimental_cuda(asyn_cpy_sdfg_state_t *__state);\n",
+       "\n",
+       "\n",
+       "\n",
+       "int __dace_init_experimental_cuda(asyn_cpy_sdfg_state_t *__state) {\n",
+       "    int count;\n",
+       "\n",
+       "    // Check that we are able to run cuda code\n",
+       "    if (cudaGetDeviceCount(&count) != cudaSuccess)\n",
+       "    {\n",
+       "        printf("ERROR: GPU drivers are not configured or cuda-capable device "\n",
+       "               "not found\\n");\n",
+       "        return 1;\n",
+       "    }\n",
+       "    if (count == 0)\n",
+       "    {\n",
+       "        printf("ERROR: No cuda-capable devices found\\n");\n",
+       "        return 2;\n",
+       "    }\n",
+       "\n",
+       "    // Initialize cuda before we run the application\n",
+       "    float *dev_X;\n",
+       "    DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n",
+       "    DACE_GPU_CHECK(cudaFree(dev_X));\n",
+       "\n",
+       "    \n",
+       "\n",
+       "    __state->gpu_context = new dace::cuda::Context(0, 0);\n",
+       "\n",
+       "    // Create cuda streams and events\n",
+       "    for(int i = 0; i < 0; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n",
+       "        __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n",
+       "    }\n",
+       "    for(int i = 0; i < 0; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n",
+       "    }\n",
+       "\n",
+       "    \n",
+       "\n",
+       "    return 0;\n",
+       "}\n",
+       "\n",
+       "int __dace_exit_experimental_cuda(asyn_cpy_sdfg_state_t *__state) {\n",
+       "    \n",
+       "\n",
+       "    // Synchronize and check for CUDA errors\n",
+       "    int __err = static_cast<int>(__state->gpu_context->lasterror);\n",
+       "    if (__err == 0)\n",
+       "        __err = static_cast<int>(cudaDeviceSynchronize());\n",
+       "\n",
+       "    // Destroy cuda streams and events\n",
+       "    for(int i = 0; i < 0; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n",
+       "    }\n",
+       "    for(int i = 0; i < 0; ++i) {\n",
+       "        DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n",
+       "    }\n",
+       "\n",
+       "    delete __state->gpu_context;\n",
+       "    return __err;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED bool __dace_gpu_set_stream(asyn_cpy_sdfg_state_t *__state, int streamid, gpuStream_t stream)\n",
+       "{\n",
+       "    if (streamid < 0 || streamid >= 0)\n",
+       "        return false;\n",
+       "\n",
+       "    __state->gpu_context->streams[streamid] = stream;\n",
+       "\n",
+       "    return true;\n",
+       "}\n",
+       "\n",
+       "DACE_EXPORTED void __dace_gpu_set_all_streams(asyn_cpy_sdfg_state_t *__state, gpuStream_t stream)\n",
+       "{\n",
+       "    for (int i = 0; i < 0; ++i)\n",
+       "        __state->gpu_context->streams[i] = stream;\n",
+       "}\n",
+       "\n",
+       "__global__ void __launch_bounds__(128) gpu_map_0_0_3(dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n",
+       "{\n",
+       "    __shared__ dace::uint S[128];\n",
+       "\n",
+       "    auto block = cg::this_thread_block();\n",
+       "\n",
+       "    const uint pipeline_depth_pipeline = 1;\n",
+       "    __shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, pipeline_depth_pipeline> shared_state_pipeline;\n",
+       "    auto pipeline = cuda::make_pipeline(block, &shared_state_pipeline);\n",
+       "\n",
+       "    int bid = (128 * blockIdx.x);\n",
+       "    {\n",
+       "\n",
+       "        ///////////////////\n",
+       "        pipeline.producer_acquire();\n",
+       "        ///////////////////\n",
+       "\n",
+       "    }\n",
+       "    cuda::memcpy_async(block, S, A, 128 *sizeof(dace::uint), pipeline);\n",
+       "    {\n",
+       "\n",
+       "        ///////////////////\n",
+       "        pipeline.producer_commit();\n",
+       "        ///////////////////\n",
+       "\n",
+       "    }\n",
+       "    {\n",
+       "\n",
+       "        ///////////////////\n",
+       "        pipeline.consumer_wait();\n",
+       "        ///////////////////\n",
+       "\n",
+       "    }\n",
+       "    {\n",
+       "        int tid = threadIdx.x;\n",
+       "        {\n",
+       "            dace::uint __in_S = S[tid];\n",
+       "            dace::uint __out_S;\n",
+       "\n",
+       "            ///////////////////\n",
+       "            __out_S = __in_S;\n",
+       "            ///////////////////\n",
+       "\n",
+       "            B[tid] = __out_S;\n",
+       "        }\n",
+       "    }\n",
+       "    {\n",
+       "\n",
+       "        ///////////////////\n",
+       "        pipeline.consumer_release();\n",
+       "        ///////////////////\n",
+       "\n",
+       "    }\n",
+       "}\n",
+       "\n",
+       "\n",
+       "DACE_EXPORTED void __dace_runkernel_gpu_map_0_0_3(asyn_cpy_sdfg_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n",
+       "void __dace_runkernel_gpu_map_0_0_3(asyn_cpy_sdfg_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n",
+       "{\n",
+       "\n",
+       "\n",
+       "    void  *gpu_map_0_0_3_args[] = { (void *)&A, (void *)&B };\n",
+       "    gpuError_t __err = cudaLaunchKernel( (void*)gpu_map_0_0_3, dim3(1, 1, 1), dim3(128, 1, 1), gpu_map_0_0_3_args, 0, nullptr\n",
+       "    );\n",
+       "\n",
+       "    DACE_KERNEL_LAUNCH_CHECK(__err, "gpu_map_0_0_3", 1, 1, 1, 128, 1, 1);\n",
+       "}\n",
+       "
\n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "\\PY{n}{\\PYZsh{}include}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{n}{cuda\\PYZus{}runtime}\\PY{p}{.}\\PY{n}{h}\\PY{o}{\\PYZgt{}}\n", + "\\PY{n}{\\PYZsh{}include}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{n}{dace}\\PY{o}{/}\\PY{n}{dace}\\PY{p}{.}\\PY{n}{h}\\PY{o}{\\PYZgt{}}\n", + "\n", + "\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{New}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cooperative}\\PY{+w}{ }\\PY{n}{groups}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{n}{asnyc}\\PY{+w}{ }\\PY{n}{copy}\n", + "\\PY{n}{\\PYZsh{}include}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{n}{cooperative\\PYZus{}groups}\\PY{o}{/}\\PY{n}{memcpy\\PYZus{}async}\\PY{p}{.}\\PY{n}{h}\\PY{o}{\\PYZgt{}}\n", + "\\PY{n}{\\PYZsh{}include}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{n}{cuda}\\PY{o}{/}\\PY{n}{pipeline}\\PY{o}{\\PYZgt{}}\n", + "\n", + "\\PY{n}{namespace}\\PY{+w}{ }\\PY{n}{cg}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cooperative\\PYZus{}groups}\\PY{p}{;}\n", + "\n", + "\n", + "\\PY{n}{struct}\\PY{+w}{ }\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", + "\\PY{err}{\\PYZcb{}}\\PY{p}{;}\n", + "\n", + "\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\n", + "\n", + "\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n+nf}{count}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Check}\\PY{+w}{ }\\PY{n}{that}\\PY{+w}{ }\\PY{n}{we}\\PY{+w}{ }\\PY{k}{are}\\PY{+w}{ }\\PY{n}{able}\\PY{+w}{ }\\PY{k}{to}\\PY{+w}{ }\\PY{n}{run}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{code}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{cudaGetDeviceCount}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n+nf}{count}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{!=}\\PY{+w}{ }\\PY{n}{cudaSuccess}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s+ss}{\\PYZdq{}ERROR: GPU drivers are not configured or cuda\\PYZhy{}capable device \\PYZdq{}}\n", + "\\PY{+w}{ }\\PY{l+s+ss}{\\PYZdq{}not found\\PYZbs{}n\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n+nf}{count}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s+ss}{\\PYZdq{}ERROR: No cuda\\PYZhy{}capable devices found\\PYZbs{}n\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{;}\n", + 
"\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Initialize}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{k}{before}\\PY{+w}{ }\\PY{n}{we}\\PY{+w}{ }\\PY{n}{run}\\PY{+w}{ }\\PY{n}{the}\\PY{+w}{ }\\PY{n}{application}\n", + "\\PY{+w}{ }\\PY{n+nc}{float}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{dev\\PYZus{}X}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMalloc}\\PY{p}{(}\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{dev\\PYZus{}X}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaFree}\\PY{p}{(}\\PY{n}{dev\\PYZus{}X}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{Context}\\PY{p}{(}\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Create}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{streams}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{n}{events}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaStreamNonBlocking}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Allow}\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{n}{externals}\\PY{+w}{ }\\PY{k}{to}\\PY{+w}{ }\\PY{k}{modify}\\PY{+w}{ }\\PY{n}{streams}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaEventDisableTiming}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + 
"\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Synchronize}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{k}{check}\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{n}{CUDA}\\PY{+w}{ }\\PY{n}{errors}\n", + "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{n+nc}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{lasterror}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{n+nc}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{cudaDeviceSynchronize}\\PY{p}{(}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Destroy}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{streams}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{n}{events}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", + "\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{bool}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}stream}\\PY{p}{(}\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{streamid}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", + "\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ 
}\\PY{p}{(}\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{|}\\PY{+w}{ }\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZgt{}=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{false}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{streamid}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{true}\\PY{p}{;}\n", + "\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}all\\PYZus{}streams}\\PY{p}{(}\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", + "\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", + "\\PY{err}{\\PYZcb{}}\n", + "\n", + "\\PY{n}{\\PYZus{}\\PYZus{}global\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}launch\\PYZus{}bounds\\PYZus{}\\PYZus{}}\\PY{p}{(}\\PY{l+m+mi}{128}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", + "\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}shared\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{S}\\PY{o}{[}\\PY{n}{128}\\PY{o}{]}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{auto}\\PY{+w}{ }\\PY{n}{block}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n+nl}{cg}\\PY{p}{:}\\PY{err}{:}\\PY{n}{this\\PYZus{}thread\\PYZus{}block}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{const}\\PY{+w}{ }\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{pipeline\\PYZus{}depth\\PYZus{}pipeline}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}shared\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{pipeline\\PYZus{}shared\\PYZus{}state}\\PY{o}{\\PYZlt{}}\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n+nl}{thread\\PYZus{}scope}\\PY{p}{:}\\PY{err}{:}\\PY{n}{thread\\PYZus{}scope\\PYZus{}block}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{pipeline\\PYZus{}depth\\PYZus{}pipeline}\\PY{o}{\\PYZgt{}}\\PY{+w}{ }\\PY{n}{shared\\PYZus{}state\\PYZus{}pipeline}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{auto}\\PY{+w}{ }\\PY{n}{pipeline}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{make\\PYZus{}pipeline}\\PY{p}{(}\\PY{n}{block}\\PY{p}{,}\\PY{+w}{ 
}\\PY{o}{\\PYZam{}}\\PY{n}{shared\\PYZus{}state\\PYZus{}pipeline}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{bid}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{128}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{blockIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\\PY{+w}{ }\\PY{n}{pipeline}\\PY{p}{.}\\PY{n}{producer\\PYZus{}acquire}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{memcpy\\PYZus{}async}\\PY{p}{(}\\PY{n}{block}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{S}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{128}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{sizeof}\\PY{p}{(}\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{pipeline}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\\PY{+w}{ }\\PY{n}{pipeline}\\PY{p}{.}\\PY{n}{producer\\PYZus{}commit}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\\PY{+w}{ }\\PY{n}{pipeline}\\PY{p}{.}\\PY{n}{consumer\\PYZus{}wait}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{tid}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}in\\PYZus{}S}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{S}\\PY{o}{[}\\PY{n}{tid}\\PY{o}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out\\PYZus{}S}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out\\PYZus{}S}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}in\\PYZus{}S}\\PY{p}{;}\n", + "\\PY{+w}{ 
}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{B}\\PY{o}{[}\\PY{n}{tid}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out\\PYZus{}S}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", + "\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\\PY{+w}{ }\\PY{n}{pipeline}\\PY{p}{.}\\PY{n}{consumer\\PYZus{}release}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", + "\n", + "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", + "\\PY{err}{\\PYZcb{}}\n", + "\n", + "\n", + "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", + "\\PY{err}{\\PYZob{}}\n", + "\n", + "\n", + "\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3\\PYZus{}args}\\PY{err}{[}\\PY{err}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{B}\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{gpuError\\PYZus{}t}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaLaunchKernel}\\PY{p}{(}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{o}{*}\\PY{p}{)}\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{128}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3\\PYZus{}args}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ 
}\\PY{n}{nullptr}\n", + "\\PY{+w}{ }\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}KERNEL\\PYZus{}LAUNCH\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{,}\\PY{+w}{ }\\PY{l+s+ss}{\\PYZdq{}gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3\\PYZdq{}}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{128}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{err}{\\PYZcb{}}\n", + "\\end{Verbatim}\n" + ], + "text/plain": [ + "\n", + "#include \n", + "#include \n", + "\n", + "// New, cooperative groups and asnyc copy\n", + "#include \n", + "#include \n", + "\n", + "namespace cg = cooperative_groups;\n", + "\n", + "\n", + "struct asyn_cpy_sdfg_state_t {\n", + " dace::cuda::Context *gpu_context;\n", + "};\n", + "\n", + "\n", + "\n", + "DACE_EXPORTED int __dace_init_experimental_cuda(asyn_cpy_sdfg_state_t *__state);\n", + "DACE_EXPORTED int __dace_exit_experimental_cuda(asyn_cpy_sdfg_state_t *__state);\n", + "\n", + "\n", + "\n", + "int __dace_init_experimental_cuda(asyn_cpy_sdfg_state_t *__state) {\n", + " int count;\n", + "\n", + " // Check that we are able to run cuda code\n", + " if (cudaGetDeviceCount(&count) != cudaSuccess)\n", + " {\n", + " printf(\"ERROR: GPU drivers are not configured or cuda-capable device \"\n", + " \"not found\\n\");\n", + " return 1;\n", + " }\n", + " if (count == 0)\n", + " {\n", + " printf(\"ERROR: No cuda-capable devices found\\n\");\n", + " return 2;\n", + " }\n", + "\n", + " // Initialize cuda before we run the application\n", + " float *dev_X;\n", + " DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n", + " DACE_GPU_CHECK(cudaFree(dev_X));\n", + "\n", + " \n", + "\n", + " __state->gpu_context = new dace::cuda::Context(0, 0);\n", + "\n", + " // Create cuda streams and events\n", + " for(int i = 0; i < 0; ++i) {\n", + " DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n", + " __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n", + " }\n", + " for(int i = 0; i < 0; ++i) {\n", + " DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n", + " }\n", + "\n", + " \n", + "\n", + " return 0;\n", + "}\n", + "\n", + "int __dace_exit_experimental_cuda(asyn_cpy_sdfg_state_t *__state) {\n", + " \n", + "\n", + " // Synchronize and check for CUDA errors\n", + " int __err = static_cast(__state->gpu_context->lasterror);\n", + " if (__err == 0)\n", + " __err = static_cast(cudaDeviceSynchronize());\n", + "\n", + " // Destroy cuda streams and events\n", + " for(int i = 0; i < 0; ++i) {\n", + " DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n", + " }\n", + " for(int i = 0; i < 0; ++i) {\n", + " DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n", + " }\n", + "\n", + " delete __state->gpu_context;\n", + " return __err;\n", + "}\n", + "\n", + "DACE_EXPORTED bool __dace_gpu_set_stream(asyn_cpy_sdfg_state_t *__state, int streamid, gpuStream_t stream)\n", + "{\n", + " if (streamid < 0 || streamid >= 0)\n", + " return false;\n", + "\n", + " __state->gpu_context->streams[streamid] = stream;\n", + "\n", + " return true;\n", + "}\n", + "\n", + "DACE_EXPORTED void __dace_gpu_set_all_streams(asyn_cpy_sdfg_state_t *__state, gpuStream_t stream)\n", + "{\n", + " for (int i = 0; i < 0; ++i)\n", + " 
__state->gpu_context->streams[i] = stream;\n", + "}\n", + "\n", + "__global__ void __launch_bounds__(128) gpu_map_0_0_3(dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", + "{\n", + " __shared__ dace::uint S[128];\n", + "\n", + " auto block = cg::this_thread_block();\n", + "\n", + " const uint pipeline_depth_pipeline = 1;\n", + " __shared__ cuda::pipeline_shared_state shared_state_pipeline;\n", + " auto pipeline = cuda::make_pipeline(block, &shared_state_pipeline);\n", + "\n", + " int bid = (128 * blockIdx.x);\n", + " {\n", + "\n", + " ///////////////////\n", + " pipeline.producer_acquire();\n", + " ///////////////////\n", + "\n", + " }\n", + " cuda::memcpy_async(block, S, A, 128 *sizeof(dace::uint), pipeline);\n", + " {\n", + "\n", + " ///////////////////\n", + " pipeline.producer_commit();\n", + " ///////////////////\n", + "\n", + " }\n", + " {\n", + "\n", + " ///////////////////\n", + " pipeline.consumer_wait();\n", + " ///////////////////\n", + "\n", + " }\n", + " {\n", + " int tid = threadIdx.x;\n", + " {\n", + " dace::uint __in_S = S[tid];\n", + " dace::uint __out_S;\n", + "\n", + " ///////////////////\n", + " __out_S = __in_S;\n", + " ///////////////////\n", + "\n", + " B[tid] = __out_S;\n", + " }\n", + " }\n", + " {\n", + "\n", + " ///////////////////\n", + " pipeline.consumer_release();\n", + " ///////////////////\n", + "\n", + " }\n", + "}\n", + "\n", + "\n", + "DACE_EXPORTED void __dace_runkernel_gpu_map_0_0_3(asyn_cpy_sdfg_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n", + "void __dace_runkernel_gpu_map_0_0_3(asyn_cpy_sdfg_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", + "{\n", + "\n", + "\n", + " void *gpu_map_0_0_3_args[] = { (void *)&A, (void *)&B };\n", + " gpuError_t __err = cudaLaunchKernel( (void*)gpu_map_0_0_3, dim3(1, 1, 1), dim3(128, 1, 1), gpu_map_0_0_3_args, 0, nullptr\n", + " );\n", + "\n", + " DACE_KERNEL_LAUNCH_CHECK(__err, \"gpu_map_0_0_3\", 1, 1, 1, 128, 1, 1);\n", + "}\n" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "Code(sdfg.generate_code()[1].clean_code)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "18bbca39", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A before:\n", + "[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]\n", + "B before:\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "A after:\n", + "[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]\n", + "B after:\n", + "[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 
1 1 1 1 1]\n" + ] + } + ], + "source": [ + "\n", + "A = cp.ones((128,), dtype=cp.uint32)\n", + "B = cp.zeros((128,), dtype=cp.uint32)\n", + "\n", + "print(f\"A before:\\n{A}\")\n", + "print(f\"B before:\\n{B}\")\n", + "\n", + "sdfg(A=A, B=B)\n", + "\n", + "print(f\"A after:\\n{A}\")\n", + "print(f\"B after:\\n{B}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ce1ef33", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9aa368f1", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dace_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/berkay_workpace/scratch/testbed.ipynb b/berkay_workpace/scratch/testbed.ipynb deleted file mode 100644 index 02ee911dd0..0000000000 --- a/berkay_workpace/scratch/testbed.ipynb +++ /dev/null @@ -1,164 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "a5aeb1f5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import dace\n", - "import cupy as cp\n", - "import numpy as np\n", - "from IPython.display import Code\n", - "from typing import Optional\n", - "\n", - "from dace import SDFG, properties\n", - "from dace.config import Config\n", - "from dace.transformation import pass_pipeline as ppl, transformation\n", - "from dace.sdfg import nodes\n", - "from dace import dtypes\n", - "from dace.transformation.passes.gpustream_scheduling import NaiveGPUStreamScheduler\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "e66c2551", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (test2)" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "@dace.program\n", - "def test1(A: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", - " B: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", - " C: dace.uint32[10],\n", - " D: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global\n", - " ):\n", - " A[:] = B[:]\n", - " C[:] = D[:]\n", - "\n", - "\n", - "@dace.program\n", - "def test2(A: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", - " B: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", - " C: dace.uint32[10],\n", - " D: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global\n", - " ):\n", - " \n", - " for i in dace.map[0:10] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " A[i] = B[i]\n", - " \n", - " for j in dace.map[0:10] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " C[j] = D[j]\n", - "\n", - "sdfg = test2.to_sdfg()\n", - "sdfg" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "0aaef92c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (test2)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\n", - "gpu_stream_access_template = \"__state->gpu_context->streams[{gpu_stream}]\" \n", - "\n", - "# Initialize and configure GPU stream scheduling pass\n", - "gpu_stream_pass = NaiveGPUStreamScheduler()\n", - "gpu_stream_pass.set_gpu_stream_access_template(gpu_stream_access_template)\n", - "assigned_streams = gpu_stream_pass.apply_pass(sdfg, None)\n", - "sdfg" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "dace_env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 443292a258f79bdcd91c5f7b7f334f0893265205 Mon Sep 17 00:00:00 2001 From: Yakup Koray Budanaz Date: Tue, 8 Jul 2025 11:16:07 +0200 Subject: [PATCH 54/94] Update infer_const_args.py --- dace/transformation/passes/analysis/infer_const_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/transformation/passes/analysis/infer_const_args.py b/dace/transformation/passes/analysis/infer_const_args.py index 99bd2148c9..0f66d49732 100644 --- a/dace/transformation/passes/analysis/infer_const_args.py +++ b/dace/transformation/passes/analysis/infer_const_args.py @@ -7,7 +7,7 @@ @properties.make_properties @transformation.explicit_cf_compatible -class InfetConstantArguments(ppl.Pass): +class InferConstantArguments(ppl.Pass): """ Evaluates which symbols and data are const within a scope. """ From c0f4633a0b81af72bb9ee60f46d121c3f9daf1d4 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Tue, 8 Jul 2025 11:26:02 +0200 Subject: [PATCH 55/94] stuff to make async memcpy work and AddThreadBlockMap for cpu.py --- .../dbuff_related/double_buffering_async.sdfg | 1479 +++++++++++------ dace/codegen/targets/cpu.py | 1 + dace/codegen/targets/cuda.py | 22 +- 3 files changed, 1001 insertions(+), 501 deletions(-) diff --git a/berkay_workpace/scratch/yakups_examples/dbuff_related/double_buffering_async.sdfg b/berkay_workpace/scratch/yakups_examples/dbuff_related/double_buffering_async.sdfg index 12fe140292..4a884d1171 100644 --- a/berkay_workpace/scratch/yakups_examples/dbuff_related/double_buffering_async.sdfg +++ b/berkay_workpace/scratch/yakups_examples/dbuff_related/double_buffering_async.sdfg @@ -286,14 +286,14 @@ }, "debuginfo": { "type": "DebugInfo", - "start_line": 20, - "end_line": 29, + "start_line": 89, + "end_line": 98, "start_column": 0, "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "using_explicit_control_flow": true, - "guid": "b1393913-a60f-4a10-a006-50ad9ec459e3" + "guid": "d86432fb-dd83-4d68-b1a3-5443e218087b" }, "nodes": [ { @@ -328,9 +328,9 @@ "nodes": [ { "type": "MapEntry", - "label": "kernel_26[i=0:N:512]", + "label": "kernel_95[i=0:N:512]", "attributes": { - "label": "kernel_26", + "label": "kernel_95", "params": [ "i" ], @@ -348,11 +348,11 @@ "schedule": "GPU_Device", "debuginfo": { "type": "DebugInfo", - "start_line": 26, - "end_line": 26, + "start_line": 95, + "end_line": 95, "start_column": 4, "end_column": 4, - 
"filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "in_connectors": { "IN_A": null, @@ -362,7 +362,7 @@ "OUT_A": null, "OUT_B": null }, - "guid": "e572193f-4078-4079-8a7b-f9e60c00c3f9" + "guid": "4748988f-5a8b-4bb9-979c-22cba4912dc0" }, "id": 0, "scope_entry": null, @@ -370,7 +370,7 @@ }, { "type": "MapExit", - "label": "kernel_26[i=0:N:512]", + "label": "kernel_95[i=0:N:512]", "attributes": { "in_connectors": { "IN_C": null @@ -378,7 +378,7 @@ "out_connectors": { "OUT_C": null }, - "guid": "2692fa6e-5d5b-4152-8604-77292eca079e" + "guid": "6b03359b-afbe-4cc4-92de-ebedc84f829b" }, "id": 1, "scope_entry": "0", @@ -390,14 +390,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 26, - "end_line": 26, + "start_line": 95, + "end_line": 95, "start_column": 4, "end_column": 4, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "data": "A", - "guid": "ab74a37e-b46f-4c4f-a769-f5c7a748410d" + "guid": "10bb78f3-f3ce-46ff-abd1-b5d1e8e19e17" }, "id": 2, "scope_entry": null, @@ -409,14 +409,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 26, - "end_line": 26, + "start_line": 95, + "end_line": 95, "start_column": 4, "end_column": 4, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "data": "B", - "guid": "61eb4a00-468b-4b1f-860d-09ce3d6d359e" + "guid": "b1b04e6f-26af-4e3a-bb90-ed435f77a0df" }, "id": 3, "scope_entry": null, @@ -428,14 +428,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 26, - "end_line": 26, + "start_line": 95, + "end_line": 95, "start_column": 4, "end_column": 4, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "data": "C", - "guid": "f166467e-40a6-46e0-9a94-d19c7e8c9b13" + "guid": "c93c561b-1a7f-42b5-af9d-6bdb9cbb586b" }, "id": 4, "scope_entry": null, @@ -443,9 +443,9 @@ }, { "type": "MapEntry", - "label": "kernel_26_4_27[k=0:2]", + "label": "kernel_95_4_96[k=0:2]", "attributes": { - "label": "kernel_26_4_27", + "label": "kernel_95_4_96", "params": [ "k" ], @@ -463,21 +463,21 @@ "schedule": "Sequential", "debuginfo": { "type": "DebugInfo", - "start_line": 27, - "end_line": 27, + "start_line": 96, + "end_line": 96, "start_column": 8, "end_column": 8, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "in_connectors": { - "IN___tmp_29_37_r_in_from_1_0_in_from_1_0": null, - "IN___tmp_29_58_r_in_from_1_0_in_from_1_0": null + "IN___tmp_98_37_r_in_from_1_0_in_from_1_0": null, + "IN___tmp_98_58_r_in_from_1_0_in_from_1_0": null }, "out_connectors": { - "OUT___tmp_29_37_r_in_from_1_0_in_from_1_0": null, - "OUT___tmp_29_58_r_in_from_1_0_in_from_1_0": null + "OUT___tmp_98_37_r_in_from_1_0_in_from_1_0": null, + 
"OUT___tmp_98_58_r_in_from_1_0_in_from_1_0": null }, - "guid": "148ec12e-ac47-4a5e-bc34-836556a4bb1d" + "guid": "93ff2336-bf88-4205-9416-d8d5ada0fa43" }, "id": 5, "scope_entry": "0", @@ -485,15 +485,15 @@ }, { "type": "MapExit", - "label": "kernel_26_4_27[k=0:2]", + "label": "kernel_95_4_96[k=0:2]", "attributes": { "in_connectors": { - "IN___tmp_29_16_w_out_of_1_1_out_of_1_1": null + "IN___tmp_98_16_w_out_of_1_1_out_of_1_1": null }, "out_connectors": { - "OUT___tmp_29_16_w_out_of_1_1_out_of_1_1": null + "OUT___tmp_98_16_w_out_of_1_1_out_of_1_1": null }, - "guid": "525be732-b8e8-4067-a15d-9ad9a5ca1096" + "guid": "d36fa430-b62a-4ccf-8aab-69ea2476dff9" }, "id": 6, "scope_entry": "5", @@ -501,9 +501,9 @@ }, { "type": "MapEntry", - "label": "kernel_26_4_27_8_28[j=0:256]", + "label": "kernel_95_4_96_8_97[j=0:256]", "attributes": { - "label": "kernel_26_4_27_8_28", + "label": "kernel_95_4_96_8_97", "params": [ "j" ], @@ -521,21 +521,21 @@ "schedule": "GPU_ThreadBlock", "debuginfo": { "type": "DebugInfo", - "start_line": 28, - "end_line": 28, + "start_line": 97, + "end_line": 97, "start_column": 12, "end_column": 12, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "in_connectors": { - "IN___tmp_29_37_r_in_from_1_0": null, - "IN___tmp_29_58_r_in_from_1_0": null + "IN___tmp_98_37_r_in_from_1_0": null, + "IN___tmp_98_58_r_in_from_1_0": null }, "out_connectors": { - "OUT___tmp_29_37_r_in_from_1_0": null, - "OUT___tmp_29_58_r_in_from_1_0": null + "OUT___tmp_98_37_r_in_from_1_0": null, + "OUT___tmp_98_58_r_in_from_1_0": null }, - "guid": "bf2fc620-040a-48b3-be54-f9d9b1997e41" + "guid": "cbaaa2f3-f301-4716-a836-6eb53ee8698f" }, "id": 7, "scope_entry": "5", @@ -543,15 +543,15 @@ }, { "type": "MapExit", - "label": "kernel_26_4_27_8_28[j=0:256]", + "label": "kernel_95_4_96_8_97[j=0:256]", "attributes": { "in_connectors": { - "IN___tmp_29_16_w_out_of_1_1": null + "IN___tmp_98_16_w_out_of_1_1": null }, "out_connectors": { - "OUT___tmp_29_16_w_out_of_1_1": null + "OUT___tmp_98_16_w_out_of_1_1": null }, - "guid": "db988a0e-a1c1-45a3-9905-32dc050c6b76" + "guid": "935f0df2-f1ce-4886-9afe-4eaef71e5b09" }, "id": 8, "scope_entry": "7", @@ -567,11 +567,11 @@ }, "debuginfo": { "type": "DebugInfo", - "start_line": 29, - "end_line": 29, + "start_line": 98, + "end_line": 98, "start_column": 72, "end_column": 72, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "label": "_Add_", "in_connectors": { @@ -581,7 +581,7 @@ "out_connectors": { "__out": null }, - "guid": "86df7814-ffdc-41a0-afe1-f211ae009e6a" + "guid": "92adee05-eb8b-4d37-8aa3-85e480d724e9" }, "id": 9, "scope_entry": "7", @@ -593,14 +593,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 29, - "end_line": 29, + "start_line": 98, + "end_line": 98, "start_column": 72, "end_column": 72, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "data": "__tmp3", - "guid": "5e5592d0-1ffe-4ddb-9788-b3b72aecae6a" + "guid": "9bcc8c35-eea5-4862-a6d8-3228fca200f2" }, "id": 10, "scope_entry": "7", @@ -608,7 
+608,7 @@ }, { "type": "Tasklet", - "label": "assign_29_16", + "label": "assign_98_16", "attributes": { "code": { "string_data": "__out = __inp", @@ -616,20 +616,20 @@ }, "debuginfo": { "type": "DebugInfo", - "start_line": 29, - "end_line": 29, + "start_line": 98, + "end_line": 98, "start_column": 30, "end_column": 30, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, - "label": "assign_29_16", + "label": "assign_98_16", "in_connectors": { "__inp": null }, "out_connectors": { "__out": null }, - "guid": "ee7b20f7-fbc2-49a9-8fba-ccffff2b7780" + "guid": "64433f94-2825-4b47-a308-361d508d77f6" }, "id": 11, "scope_entry": "7", @@ -641,14 +641,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 48, - "end_line": 48, + "start_line": 54, + "end_line": 54, "start_column": 0, "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "data": "shr_A", - "guid": "4905f521-649a-4796-9472-fad5dd9602a1" + "guid": "c7684bbd-ea58-48d4-a3e4-59e81016df47" }, "id": 12, "scope_entry": "5", @@ -660,14 +660,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 48, - "end_line": 48, + "start_line": 54, + "end_line": 54, "start_column": 0, "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "data": "shr_B", - "guid": "5aea9089-f43c-4272-8cf1-80301f87e5f5" + "guid": "de996230-3c3d-4c05-8527-257ceb9dd045" }, "id": 13, "scope_entry": "5", @@ -695,7 +695,7 @@ }, "data": "A", "debuginfo": null, - "guid": "69b08c61-8a53-4825-bd28-3ce6d2fa75f7", + "guid": "65758726-87ca-42be-89ba-a9bb75ddebe7", "src_subset": { "type": "Range", "ranges": [ @@ -738,7 +738,7 @@ }, "data": "B", "debuginfo": null, - "guid": "0185e810-4989-4e8b-b57c-1c12f5647a48", + "guid": "17dcfd79-4fcc-4314-a7c1-b845be9c77c0", "src_subset": { "type": "Range", "ranges": [ @@ -781,7 +781,7 @@ }, "data": "shr_A", "debuginfo": null, - "guid": "9053c547-1a07-4e28-9a07-b5866a347a2b", + "guid": "123eace6-f607-4d9c-8b7a-b1d91d7cd033", "src_subset": { "type": "Range", "ranges": [ @@ -801,7 +801,7 @@ }, "src": "12", "dst": "7", - "dst_connector": "IN___tmp_29_37_r_in_from_1_0", + "dst_connector": "IN___tmp_98_37_r_in_from_1_0", "src_connector": null }, { @@ -824,7 +824,7 @@ }, "data": "shr_B", "debuginfo": null, - "guid": "aa3712fc-e871-41e5-ba81-47436eb090a9", + "guid": "197f512b-c4e2-4b15-a7fa-f31a8f1f3863", "src_subset": { "type": "Range", "ranges": [ @@ -844,7 +844,7 @@ }, "src": "13", "dst": "7", - "dst_connector": "IN___tmp_29_58_r_in_from_1_0", + "dst_connector": "IN___tmp_98_58_r_in_from_1_0", "src_connector": null }, { @@ -867,7 +867,7 @@ }, "data": "__tmp3", "debuginfo": null, - "guid": "232a4b3c-63de-4475-a777-63057a798de3", + "guid": "d97379d3-04f1-4d25-93e4-29e3953be24d", "src_subset": { "type": "Range", "ranges": [ @@ -910,7 +910,7 @@ }, "data": "A", "debuginfo": null, - "guid": "f2fca7c5-eb37-41b5-aa10-1698132a8f8b", + "guid": "9ac193e2-3c72-48f7-9ef0-f50495152775", "src_subset": { "type": "Range", "ranges": [ @@ -930,7 +930,7 @@ }, "src": "0", "dst": "5", - 
"dst_connector": "IN___tmp_29_37_r_in_from_1_0_in_from_1_0", + "dst_connector": "IN___tmp_98_37_r_in_from_1_0_in_from_1_0", "src_connector": "OUT_A" }, { @@ -953,7 +953,7 @@ }, "data": "B", "debuginfo": null, - "guid": "76a9bb89-c68f-4161-8fc0-5d272cb46c5d", + "guid": "30da7eee-07be-4d97-9fe7-6385057477b6", "src_subset": { "type": "Range", "ranges": [ @@ -973,7 +973,7 @@ }, "src": "0", "dst": "5", - "dst_connector": "IN___tmp_29_58_r_in_from_1_0_in_from_1_0", + "dst_connector": "IN___tmp_98_58_r_in_from_1_0_in_from_1_0", "src_connector": "OUT_B" }, { @@ -996,7 +996,7 @@ }, "data": "C", "debuginfo": null, - "guid": "1dd8cf39-3f8a-4419-8403-80c1c0393bfb", + "guid": "1f0ccad2-3828-4366-9a82-2847c3398b35", "src_subset": null, "dst_subset": { "type": "Range", @@ -1039,7 +1039,7 @@ }, "data": "C", "debuginfo": null, - "guid": "59d4f720-d116-4bb0-be9d-dbee7b4461e5", + "guid": "a92efac1-2678-4303-aa04-5be51f196e25", "src_subset": null, "dst_subset": { "type": "Range", @@ -1059,8 +1059,8 @@ }, "src": "8", "dst": "6", - "dst_connector": "IN___tmp_29_16_w_out_of_1_1_out_of_1_1", - "src_connector": "OUT___tmp_29_16_w_out_of_1_1" + "dst_connector": "IN___tmp_98_16_w_out_of_1_1_out_of_1_1", + "src_connector": "OUT___tmp_98_16_w_out_of_1_1" }, { "type": "MultiConnectorEdge", @@ -1082,7 +1082,7 @@ }, "data": "C", "debuginfo": null, - "guid": "57a2e327-a170-434b-94b1-784054dc4fae", + "guid": "bf7f1dc2-7ac0-4507-8aef-a67684cc5c44", "src_subset": null, "dst_subset": { "type": "Range", @@ -1103,7 +1103,7 @@ "src": "6", "dst": "1", "dst_connector": "IN_C", - "src_connector": "OUT___tmp_29_16_w_out_of_1_1_out_of_1_1" + "src_connector": "OUT___tmp_98_16_w_out_of_1_1_out_of_1_1" }, { "type": "MultiConnectorEdge", @@ -1125,7 +1125,7 @@ }, "data": "shr_A", "debuginfo": null, - "guid": "e1e070d6-e51e-46a7-aa37-6cd892324359", + "guid": "5428f419-c4e3-4cf5-9a13-d42edd7f7fb2", "src_subset": { "type": "Range", "ranges": [ @@ -1146,7 +1146,7 @@ "src": "7", "dst": "9", "dst_connector": "__in1", - "src_connector": "OUT___tmp_29_37_r_in_from_1_0" + "src_connector": "OUT___tmp_98_37_r_in_from_1_0" }, { "type": "MultiConnectorEdge", @@ -1179,7 +1179,7 @@ }, "data": "A", "debuginfo": null, - "guid": "1fbf8afd-549b-4b0d-9a0a-64fd318d6034", + "guid": "eb9ff732-544b-4c5b-80cd-aa0b9dc83fa1", "src_subset": { "type": "Range", "ranges": [ @@ -1210,7 +1210,7 @@ "src": "5", "dst": "12", "dst_connector": null, - "src_connector": "OUT___tmp_29_37_r_in_from_1_0_in_from_1_0" + "src_connector": "OUT___tmp_98_37_r_in_from_1_0_in_from_1_0" }, { "type": "MultiConnectorEdge", @@ -1232,7 +1232,7 @@ }, "data": "shr_B", "debuginfo": null, - "guid": "57ba67c5-bd2f-4844-9e9c-40753922ad01", + "guid": "3478573b-f7f4-4441-a7a3-bf3ebf93e64b", "src_subset": { "type": "Range", "ranges": [ @@ -1253,7 +1253,7 @@ "src": "7", "dst": "9", "dst_connector": "__in2", - "src_connector": "OUT___tmp_29_58_r_in_from_1_0" + "src_connector": "OUT___tmp_98_58_r_in_from_1_0" }, { "type": "MultiConnectorEdge", @@ -1286,7 +1286,7 @@ }, "data": "B", "debuginfo": null, - "guid": "bfb890ef-cdef-4ebf-8995-f3754b0953bd", + "guid": "0ce32b4d-333f-459c-a576-2d2e79303500", "src_subset": { "type": "Range", "ranges": [ @@ -1317,7 +1317,7 @@ "src": "5", "dst": "13", "dst_connector": null, - "src_connector": "OUT___tmp_29_58_r_in_from_1_0_in_from_1_0" + "src_connector": "OUT___tmp_98_58_r_in_from_1_0_in_from_1_0" }, { "type": "MultiConnectorEdge", @@ -1339,7 +1339,7 @@ }, "data": "__tmp3", "debuginfo": null, - "guid": "9e39802d-f185-4f97-aa0d-24ec5e28fa3a", + "guid": 
"0c0703e8-1a00-4cf7-98d6-7047af804d38", "src_subset": null, "dst_subset": { "type": "Range", @@ -1382,7 +1382,7 @@ }, "data": "C", "debuginfo": null, - "guid": "cd431e0e-63f8-458d-a4f7-82a85dc09e96", + "guid": "176a1ba4-31ef-4819-8186-173e826e42ac", "src_subset": null, "dst_subset": { "type": "Range", @@ -1402,12 +1402,12 @@ }, "src": "11", "dst": "8", - "dst_connector": "IN___tmp_29_16_w_out_of_1_1", + "dst_connector": "IN___tmp_98_16_w_out_of_1_1", "src_connector": "__out" } ], "attributes": { - "guid": "328decff-fd21-4fff-881f-74fc87b42fa7", + "guid": "8d9c9dc6-b7fe-4305-9b0e-f344e6cb96bc", "executions": "1", "dynamic_executions": false } @@ -1435,15 +1435,15 @@ ], "debuginfo": { "type": "DebugInfo", - "start_line": 20, - "end_line": 29, + "start_line": 89, + "end_line": 98, "start_column": 0, "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "using_explicit_control_flow": true, - "guid": "b1393913-a60f-4a10-a006-50ad9ec459e3", - "hash": "986f8867fdf3dcbeb3b07b32b05c3af6ad04a3b67e786fb203ee054b1d2dbd97" + "guid": "d86432fb-dd83-4d68-b1a3-5443e218087b", + "hash": "6272783aa43803c69147bfe8f7b1459cb863365e3e0452f06b3881ab8a0997ee" }, "nodes": [ { @@ -1472,8 +1472,7 @@ 16, 17, 18, - 19, - 20 + 19 ], "7": [ 8, @@ -1485,9 +1484,9 @@ "nodes": [ { "type": "MapEntry", - "label": "kernel_26[i=0:N:512]", + "label": "kernel_95[i=0:N:512]", "attributes": { - "label": "kernel_26", + "label": "kernel_95", "params": [ "i" ], @@ -1505,11 +1504,11 @@ "schedule": "GPU_Device", "debuginfo": { "type": "DebugInfo", - "start_line": 26, - "end_line": 26, + "start_line": 95, + "end_line": 95, "start_column": 4, "end_column": 4, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "in_connectors": { "IN_A": null, @@ -1519,7 +1518,7 @@ "OUT_A": null, "OUT_B": null }, - "guid": "e572193f-4078-4079-8a7b-f9e60c00c3f9" + "guid": "4748988f-5a8b-4bb9-979c-22cba4912dc0" }, "id": 0, "scope_entry": null, @@ -1527,7 +1526,7 @@ }, { "type": "MapExit", - "label": "kernel_26[i=0:N:512]", + "label": "kernel_95[i=0:N:512]", "attributes": { "in_connectors": { "IN_C": null @@ -1535,7 +1534,7 @@ "out_connectors": { "OUT_C": null }, - "guid": "2692fa6e-5d5b-4152-8604-77292eca079e" + "guid": "6b03359b-afbe-4cc4-92de-ebedc84f829b" }, "id": 1, "scope_entry": "0", @@ -1547,14 +1546,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 26, - "end_line": 26, + "start_line": 95, + "end_line": 95, "start_column": 4, "end_column": 4, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "data": "A", - "guid": "0af74d98-4cac-44d4-afd5-dcbb63f357fd" + "guid": "e2fa79c1-e57c-4267-8019-43579c1c1c09" }, "id": 2, "scope_entry": null, @@ -1566,14 +1565,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 26, - "end_line": 26, + "start_line": 95, + "end_line": 95, "start_column": 4, "end_column": 4, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": 
"/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "data": "B", - "guid": "43f7746b-9786-42ce-a43c-8b0489e0f7a6" + "guid": "68049d0f-8940-43a2-8509-f9a028be8f46" }, "id": 3, "scope_entry": null, @@ -1585,14 +1584,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 26, - "end_line": 26, + "start_line": 95, + "end_line": 95, "start_column": 4, "end_column": 4, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "data": "C", - "guid": "dd76042d-85d3-47dc-a981-a26bd1c75088" + "guid": "bd228903-54ad-4b1c-b473-2eed986516fd" }, "id": 4, "scope_entry": null, @@ -1600,9 +1599,9 @@ }, { "type": "MapEntry", - "label": "kernel_26_4_27[k=0:2]", + "label": "kernel_95_4_96[k=0:2]", "attributes": { - "label": "kernel_26_4_27", + "label": "kernel_95_4_96", "params": [ "k" ], @@ -1620,25 +1619,25 @@ "schedule": "Sequential", "debuginfo": { "type": "DebugInfo", - "start_line": 27, - "end_line": 27, + "start_line": 96, + "end_line": 96, "start_column": 8, "end_column": 8, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "in_connectors": { - "IN___tmp_29_37_r_in_from_1_0_in_from_1_0": null, - "IN___tmp_29_58_r_in_from_1_0_in_from_1_0": null, + "IN___tmp_98_37_r_in_from_1_0_in_from_1_0": null, + "IN___tmp_98_58_r_in_from_1_0_in_from_1_0": null, "IN_prefetch_A": null, "IN_prefetch_B": null }, "out_connectors": { - "OUT___tmp_29_37_r_in_from_1_0_in_from_1_0": null, - "OUT___tmp_29_58_r_in_from_1_0_in_from_1_0": null, + "OUT___tmp_98_37_r_in_from_1_0_in_from_1_0": null, + "OUT___tmp_98_58_r_in_from_1_0_in_from_1_0": null, "OUT_prefetch_A": null, "OUT_prefetch_B": null }, - "guid": "148ec12e-ac47-4a5e-bc34-836556a4bb1d" + "guid": "93ff2336-bf88-4205-9416-d8d5ada0fa43" }, "id": 5, "scope_entry": "0", @@ -1646,15 +1645,15 @@ }, { "type": "MapExit", - "label": "kernel_26_4_27[k=0:2]", + "label": "kernel_95_4_96[k=0:2]", "attributes": { "in_connectors": { - "IN___tmp_29_16_w_out_of_1_1_out_of_1_1": null + "IN___tmp_98_16_w_out_of_1_1_out_of_1_1": null }, "out_connectors": { - "OUT___tmp_29_16_w_out_of_1_1_out_of_1_1": null + "OUT___tmp_98_16_w_out_of_1_1_out_of_1_1": null }, - "guid": "525be732-b8e8-4067-a15d-9ad9a5ca1096" + "guid": "d36fa430-b62a-4ccf-8aab-69ea2476dff9" }, "id": 6, "scope_entry": "5", @@ -1662,9 +1661,9 @@ }, { "type": "MapEntry", - "label": "kernel_26_4_27_8_28[j=0:256]", + "label": "kernel_95_4_96_8_97[j=0:256]", "attributes": { - "label": "kernel_26_4_27_8_28", + "label": "kernel_95_4_96_8_97", "params": [ "j" ], @@ -1682,23 +1681,21 @@ "schedule": "GPU_ThreadBlock", "debuginfo": { "type": "DebugInfo", - "start_line": 28, - "end_line": 28, + "start_line": 97, + "end_line": 97, "start_column": 12, "end_column": 12, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "in_connectors": { - "IN___tmp_29_37_r_in_from_1_0": null, - "IN___tmp_29_58_r_in_from_1_0": null, - "prefetch_shr_A": null, - "prefetch_shr_B": null + "IN___tmp_98_37_r_in_from_1_0": null, + "IN___tmp_98_58_r_in_from_1_0": null }, 
"out_connectors": { - "OUT___tmp_29_37_r_in_from_1_0": null, - "OUT___tmp_29_58_r_in_from_1_0": null + "OUT___tmp_98_37_r_in_from_1_0": null, + "OUT___tmp_98_58_r_in_from_1_0": null }, - "guid": "bf2fc620-040a-48b3-be54-f9d9b1997e41" + "guid": "cbaaa2f3-f301-4716-a836-6eb53ee8698f" }, "id": 7, "scope_entry": "5", @@ -1706,15 +1703,15 @@ }, { "type": "MapExit", - "label": "kernel_26_4_27_8_28[j=0:256]", + "label": "kernel_95_4_96_8_97[j=0:256]", "attributes": { "in_connectors": { - "IN___tmp_29_16_w_out_of_1_1": null + "IN___tmp_98_16_w_out_of_1_1": null }, "out_connectors": { - "OUT___tmp_29_16_w_out_of_1_1": null + "OUT___tmp_98_16_w_out_of_1_1": null }, - "guid": "db988a0e-a1c1-45a3-9905-32dc050c6b76" + "guid": "935f0df2-f1ce-4886-9afe-4eaef71e5b09" }, "id": 8, "scope_entry": "7", @@ -1730,11 +1727,11 @@ }, "debuginfo": { "type": "DebugInfo", - "start_line": 29, - "end_line": 29, + "start_line": 98, + "end_line": 98, "start_column": 72, "end_column": 72, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "label": "_Add_", "in_connectors": { @@ -1744,7 +1741,7 @@ "out_connectors": { "__out": null }, - "guid": "86df7814-ffdc-41a0-afe1-f211ae009e6a" + "guid": "92adee05-eb8b-4d37-8aa3-85e480d724e9" }, "id": 9, "scope_entry": "7", @@ -1756,14 +1753,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 29, - "end_line": 29, + "start_line": 98, + "end_line": 98, "start_column": 72, "end_column": 72, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, "data": "__tmp3", - "guid": "189ae714-2453-4033-9ea3-497068c70521" + "guid": "480998aa-e861-4092-a043-5639a355c629" }, "id": 10, "scope_entry": "7", @@ -1771,7 +1768,7 @@ }, { "type": "Tasklet", - "label": "assign_29_16", + "label": "assign_98_16", "attributes": { "code": { "string_data": "__out = __inp", @@ -1779,20 +1776,20 @@ }, "debuginfo": { "type": "DebugInfo", - "start_line": 29, - "end_line": 29, + "start_line": 98, + "end_line": 98, "start_column": 30, "end_column": 30, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" + "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" }, - "label": "assign_29_16", + "label": "assign_98_16", "in_connectors": { "__inp": null }, "out_connectors": { "__out": null }, - "guid": "ee7b20f7-fbc2-49a9-8fba-ccffff2b7780" + "guid": "64433f94-2825-4b47-a308-361d508d77f6" }, "id": 11, "scope_entry": "7", @@ -1912,7 +1909,7 @@ "language": "CPP" } }, - "guid": "eea04031-8543-47bf-8b5a-f84b21ebbe6f" + "guid": "f94e5217-c4f1-46a4-afdc-da0185f8f45b" }, "nodes": [ { @@ -1931,19 +1928,19 @@ "language": "Python" }, "loop_variable": "pipe_stage", - "guid": "5bc4de60-cb97-4474-81ee-2eb518eef744" + "guid": "1953b28b-8f55-45ac-9f33-168fb08dca3e" }, "nodes": [ { "type": "ConditionalBlock", "attributes": { - "guid": "03025064-bdb3-43db-909c-3488ec9b7504" + "guid": "15bc7f27-eb3b-4991-ab35-89220f06f406" }, "nodes": [ { "type": "ControlFlowRegion", "attributes": { - "guid": "05b5e995-af61-4e76-854c-081bc25f7a42" + "guid": "0c174dbd-474e-4634-b190-491e0c5ab8b2" }, "nodes": [ { @@ -1956,7 +1953,9 @@ 0, 1, 2, - 3 + 3, + 4, + 5 ] 
}, "nodes": [ @@ -1966,14 +1965,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 771, - "end_line": 771, + "start_line": 856, + "end_line": 856, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "B", - "guid": "c41027b9-626d-4477-9adb-64d4737d1262" + "guid": "7a229696-1b0e-4cc3-bfba-b0d5526ebb0d" }, "id": 0, "scope_entry": null, @@ -1985,14 +1984,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 772, - "end_line": 772, + "start_line": 857, + "end_line": 857, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "shr_B", - "guid": "b0787fde-9244-48b5-a20e-aa7eda1dde3b" + "guid": "948f2521-89e6-463d-b326-4f39fc82c7a0" }, "id": 1, "scope_entry": null, @@ -2004,14 +2003,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 771, - "end_line": 771, + "start_line": 856, + "end_line": 856, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "A", - "guid": "1b3ebd30-f4f5-4dcd-9d69-268a6198a074" + "guid": "5bdcde52-e626-4979-a389-b73ba6e6f927" }, "id": 2, "scope_entry": null, @@ -2023,18 +2022,66 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 772, - "end_line": 772, + "start_line": 857, + "end_line": 857, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "shr_A", - "guid": "49ff03e3-8c02-4502-8f41-42cfa81acc43" + "guid": "355f3366-55d5-4d29-a305-dfba28639f89" }, "id": 3, "scope_entry": null, "scope_exit": null + }, + { + "type": "Tasklet", + "label": "producer_acquire_pipe", + "attributes": { + "code": { + "string_data": "pipe.consumer_acquire();", + "language": "CPP" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 380, + "end_line": 380, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "side_effects": true, + "label": "producer_acquire_pipe", + "guid": "c478ba2a-9f56-45f7-942f-6a6c9153a1d0" + }, + "id": 4, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "Tasklet", + "label": "producer_commit_pipe", + "attributes": { + "code": { + "string_data": "pipe.consumer_commit();", + "language": "CPP" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 390, + "end_line": 390, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "side_effects": true, + "label": "producer_commit_pipe", + "guid": "fb90138b-abb2-4752-9bc4-d9d55324886c" + }, + "id": 5, + "scope_entry": null, + "scope_exit": null } ], "edges": [ @@ -2075,7 +2122,7 @@ }, "data": "B", "debuginfo": null, - "guid": "f10bf60c-98c6-498b-8bb1-b32392cee7ec", + "guid": "07276b3c-739f-46ae-bb20-390a445ca75c", "src_subset": { "type": "Range", "ranges": [ @@ -2151,7 +2198,7 @@ }, "data": "A", "debuginfo": null, - "guid": "9289d8e8-28ba-469c-9130-53adce8d0b59", + "guid": "07d7acea-e0db-45d9-8c2f-246296f71ffa", "src_subset": { "type": "Range", "ranges": [ @@ -2189,10 +2236,94 @@ "dst": "3", "dst_connector": null, "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": 
"0f9c8337-a0df-470c-b984-6ffe36730ec9", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "4", + "dst": "0", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "c57cf2c0-8238-443e-b743-e264dc9723db", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "1", + "dst": "5", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "a0f2b6f9-dbdd-4d37-9b0e-e75a74f4621b", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "4", + "dst": "2", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "ada740d2-5199-4f0d-a75e-e9327ea681f7", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "3", + "dst": "5", + "dst_connector": null, + "src_connector": null } ], "attributes": { - "guid": "016184bb-8e43-4101-859a-a2608a20b280" + "guid": "3a623ac0-f8e9-4b15-9f7a-eecd1ccfadb3" } } ], @@ -2219,7 +2350,7 @@ { "type": "ControlFlowRegion", "attributes": { - "guid": "05b5e995-af61-4e76-854c-081bc25f7a42" + "guid": "0c174dbd-474e-4634-b190-491e0c5ab8b2" }, "nodes": [ { @@ -2232,7 +2363,9 @@ 0, 1, 2, - 3 + 3, + 4, + 5 ] }, "nodes": [ @@ -2242,14 +2375,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 771, - "end_line": 771, + "start_line": 856, + "end_line": 856, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "B", - "guid": "c41027b9-626d-4477-9adb-64d4737d1262" + "guid": "7a229696-1b0e-4cc3-bfba-b0d5526ebb0d" }, "id": 0, "scope_entry": null, @@ -2261,14 +2394,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 772, - "end_line": 772, + "start_line": 857, + "end_line": 857, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "shr_B", - "guid": "b0787fde-9244-48b5-a20e-aa7eda1dde3b" + "guid": "948f2521-89e6-463d-b326-4f39fc82c7a0" }, "id": 1, "scope_entry": null, @@ -2280,14 +2413,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 771, - "end_line": 771, + "start_line": 856, + "end_line": 856, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "A", - "guid": "1b3ebd30-f4f5-4dcd-9d69-268a6198a074" + "guid": "5bdcde52-e626-4979-a389-b73ba6e6f927" }, "id": 2, "scope_entry": null, @@ -2299,18 +2432,66 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 772, - "end_line": 772, + "start_line": 857, + "end_line": 857, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "shr_A", - "guid": "49ff03e3-8c02-4502-8f41-42cfa81acc43" + "guid": "355f3366-55d5-4d29-a305-dfba28639f89" }, "id": 3, "scope_entry": null, "scope_exit": null + }, + { + "type": "Tasklet", + "label": "producer_acquire_pipe", + "attributes": { + 
"code": { + "string_data": "pipe.consumer_acquire();", + "language": "CPP" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 380, + "end_line": 380, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "side_effects": true, + "label": "producer_acquire_pipe", + "guid": "c478ba2a-9f56-45f7-942f-6a6c9153a1d0" + }, + "id": 4, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "Tasklet", + "label": "producer_commit_pipe", + "attributes": { + "code": { + "string_data": "pipe.consumer_commit();", + "language": "CPP" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 390, + "end_line": 390, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "side_effects": true, + "label": "producer_commit_pipe", + "guid": "fb90138b-abb2-4752-9bc4-d9d55324886c" + }, + "id": 5, + "scope_entry": null, + "scope_exit": null } ], "edges": [ @@ -2351,7 +2532,7 @@ }, "data": "B", "debuginfo": null, - "guid": "f10bf60c-98c6-498b-8bb1-b32392cee7ec", + "guid": "07276b3c-739f-46ae-bb20-390a445ca75c", "src_subset": { "type": "Range", "ranges": [ @@ -2427,7 +2608,7 @@ }, "data": "A", "debuginfo": null, - "guid": "9289d8e8-28ba-469c-9130-53adce8d0b59", + "guid": "07d7acea-e0db-45d9-8c2f-246296f71ffa", "src_subset": { "type": "Range", "ranges": [ @@ -2465,10 +2646,94 @@ "dst": "3", "dst_connector": null, "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "0f9c8337-a0df-470c-b984-6ffe36730ec9", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "4", + "dst": "0", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "c57cf2c0-8238-443e-b743-e264dc9723db", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "1", + "dst": "5", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "a0f2b6f9-dbdd-4d37-9b0e-e75a74f4621b", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "4", + "dst": "2", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "ada740d2-5199-4f0d-a75e-e9327ea681f7", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "3", + "dst": "5", + "dst_connector": null, + "src_connector": null } ], "attributes": { - "guid": "016184bb-8e43-4101-859a-a2608a20b280" + "guid": "3a623ac0-f8e9-4b15-9f7a-eecd1ccfadb3" } } ], @@ -2505,8 +2770,8 @@ }, "debuginfo": { "type": "DebugInfo", - "start_line": 761, - "end_line": 761, + "start_line": 846, + "end_line": 846, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" @@ -2520,7 +2785,7 @@ "shr_A": null, "shr_B": null }, - "guid": "431c5863-6cf8-4bff-a7fc-11bea2357e5e" + "guid": 
"4b7bb879-2f87-4b27-ab91-5bff3773b75f" }, "id": 12, "scope_entry": "0", @@ -2532,14 +2797,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 398, - "end_line": 398, + "start_line": 446, + "end_line": 446, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "shr_B", - "guid": "9aff2a42-f653-436e-adde-43b71fedb84e" + "guid": "18dada66-b1ce-4caa-8c94-a57d620d853c" }, "id": 13, "scope_entry": "0", @@ -2551,14 +2816,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 398, - "end_line": 398, + "start_line": 446, + "end_line": 446, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "shr_A", - "guid": "d60871ef-9696-4fed-9bac-72fe656f3747" + "guid": "a9a91038-4a36-450f-8e1c-199ce6279bb9" }, "id": 14, "scope_entry": "0", @@ -2679,19 +2944,19 @@ "language": "CPP" } }, - "guid": "c1a1acfc-7ab9-4f27-af94-09ce0d5f8c8d" + "guid": "1e62ca8f-efd0-4b2b-bd41-f3b62ee3155a" }, "nodes": [ { "type": "ConditionalBlock", "attributes": { - "guid": "40691bac-7151-4891-984f-8e3a8d3ac974" + "guid": "c702c15e-b328-4ff6-a444-290cf8272a16" }, "nodes": [ { "type": "ControlFlowRegion", "attributes": { - "guid": "99957ad5-0143-430c-9e3b-6a6d458c2bcb" + "guid": "49120be2-b132-4968-9b1b-e214c4e4a23b" }, "nodes": [ { @@ -2704,7 +2969,9 @@ 0, 1, 2, - 3 + 3, + 4, + 5 ] }, "nodes": [ @@ -2714,14 +2981,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 842, - "end_line": 842, + "start_line": 924, + "end_line": 924, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "B", - "guid": "8629e09d-3256-4746-b257-e1d1633b4d09" + "guid": "c428e17a-c2db-456c-9db3-2e2242a08f43" }, "id": 0, "scope_entry": null, @@ -2733,14 +3000,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 843, - "end_line": 843, + "start_line": 925, + "end_line": 925, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "shr_B", - "guid": "19c17ed2-1764-4274-b020-606c61d4b7f8" + "guid": "e071dd77-90ce-4f0e-9a5e-64695ac2d70c" }, "id": 1, "scope_entry": null, @@ -2752,14 +3019,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 842, - "end_line": 842, + "start_line": 924, + "end_line": 924, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "A", - "guid": "c231ef74-5202-4f53-a930-10174929ebfc" + "guid": "278ca990-59cf-4e1b-968a-485487fb0fe2" }, "id": 2, "scope_entry": null, @@ -2771,18 +3038,66 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 843, - "end_line": 843, + "start_line": 925, + "end_line": 925, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "shr_A", - "guid": "646deb68-da76-4a92-b937-da96ba475b21" + "guid": "a5cc31c2-b856-430e-bc9f-0b8f93022b50" }, "id": 3, "scope_entry": null, "scope_exit": null + }, + { + "type": "Tasklet", + "label": "producer_acquire_pipe", + "attributes": { + "code": { + "string_data": "pipe.consumer_acquire();", + "language": "CPP" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 486, + "end_line": 486, + "start_column": 0, + "end_column": 0, + "filename": 
"/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "side_effects": true, + "label": "producer_acquire_pipe", + "guid": "694dbb32-9d8f-408e-88cd-de8062f963e5" + }, + "id": 4, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "Tasklet", + "label": "producer_commit_pipe", + "attributes": { + "code": { + "string_data": "pipe.consumer_commit();", + "language": "CPP" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 496, + "end_line": 496, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "side_effects": true, + "label": "producer_commit_pipe", + "guid": "ec98c77f-a979-4684-9bd7-744aa5d4bd04" + }, + "id": 5, + "scope_entry": null, + "scope_exit": null } ], "edges": [ @@ -2823,7 +3138,7 @@ }, "data": "B", "debuginfo": null, - "guid": "0d43d4bf-add6-433d-9686-67cb71d9dbb0", + "guid": "1780aa25-071a-4d0f-bd6b-7f93339acdf3", "src_subset": { "type": "Range", "ranges": [ @@ -2899,7 +3214,7 @@ }, "data": "A", "debuginfo": null, - "guid": "ea82a854-7399-463e-ad83-d281f8f3e8b3", + "guid": "1e78b324-bfac-40aa-9418-ee8e10e6a76b", "src_subset": { "type": "Range", "ranges": [ @@ -2937,10 +3252,94 @@ "dst": "3", "dst_connector": null, "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "9293a358-75c3-47fb-8ebc-73d67cd79f2c", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "4", + "dst": "0", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "35a5a5a9-3beb-4393-8361-9776e95af37f", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "1", + "dst": "5", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "db651f05-1390-4998-bf43-2e7441d9798b", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "4", + "dst": "2", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "5a6b79a5-f34f-4d4f-baca-5f234b66b1ef", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "3", + "dst": "5", + "dst_connector": null, + "src_connector": null } ], "attributes": { - "guid": "8ba50dba-4244-4519-86f9-189eb2300202" + "guid": "842bf787-acb7-47e5-b7ab-b52500c61cdd" } } ], @@ -2967,7 +3366,7 @@ { "type": "ControlFlowRegion", "attributes": { - "guid": "99957ad5-0143-430c-9e3b-6a6d458c2bcb" + "guid": "49120be2-b132-4968-9b1b-e214c4e4a23b" }, "nodes": [ { @@ -2980,7 +3379,9 @@ 0, 1, 2, - 3 + 3, + 4, + 5 ] }, "nodes": [ @@ -2990,14 +3391,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 842, - "end_line": 842, + "start_line": 924, + "end_line": 924, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "B", - "guid": "8629e09d-3256-4746-b257-e1d1633b4d09" + "guid": 
"c428e17a-c2db-456c-9db3-2e2242a08f43" }, "id": 0, "scope_entry": null, @@ -3009,14 +3410,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 843, - "end_line": 843, + "start_line": 925, + "end_line": 925, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "shr_B", - "guid": "19c17ed2-1764-4274-b020-606c61d4b7f8" + "guid": "e071dd77-90ce-4f0e-9a5e-64695ac2d70c" }, "id": 1, "scope_entry": null, @@ -3028,14 +3429,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 842, - "end_line": 842, + "start_line": 924, + "end_line": 924, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "A", - "guid": "c231ef74-5202-4f53-a930-10174929ebfc" + "guid": "278ca990-59cf-4e1b-968a-485487fb0fe2" }, "id": 2, "scope_entry": null, @@ -3047,18 +3448,66 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 843, - "end_line": 843, + "start_line": 925, + "end_line": 925, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "shr_A", - "guid": "646deb68-da76-4a92-b937-da96ba475b21" + "guid": "a5cc31c2-b856-430e-bc9f-0b8f93022b50" }, "id": 3, "scope_entry": null, "scope_exit": null + }, + { + "type": "Tasklet", + "label": "producer_acquire_pipe", + "attributes": { + "code": { + "string_data": "pipe.consumer_acquire();", + "language": "CPP" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 486, + "end_line": 486, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "side_effects": true, + "label": "producer_acquire_pipe", + "guid": "694dbb32-9d8f-408e-88cd-de8062f963e5" + }, + "id": 4, + "scope_entry": null, + "scope_exit": null + }, + { + "type": "Tasklet", + "label": "producer_commit_pipe", + "attributes": { + "code": { + "string_data": "pipe.consumer_commit();", + "language": "CPP" + }, + "debuginfo": { + "type": "DebugInfo", + "start_line": 496, + "end_line": 496, + "start_column": 0, + "end_column": 0, + "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" + }, + "side_effects": true, + "label": "producer_commit_pipe", + "guid": "ec98c77f-a979-4684-9bd7-744aa5d4bd04" + }, + "id": 5, + "scope_entry": null, + "scope_exit": null } ], "edges": [ @@ -3099,7 +3548,7 @@ }, "data": "B", "debuginfo": null, - "guid": "0d43d4bf-add6-433d-9686-67cb71d9dbb0", + "guid": "1780aa25-071a-4d0f-bd6b-7f93339acdf3", "src_subset": { "type": "Range", "ranges": [ @@ -3175,48 +3624,132 @@ }, "data": "A", "debuginfo": null, - "guid": "ea82a854-7399-463e-ad83-d281f8f3e8b3", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "256*k + i + 256", - "end": "256*k + i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k + 1, 2)", - "end": "Mod(k + 1, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "256" + "guid": "1e78b324-bfac-40aa-9418-ee8e10e6a76b", + "src_subset": { + "type": "Range", + "ranges": [ + { + "start": "256*k + i + 256", + "end": "256*k + i + 511", + "step": "1", + "tile": "1" + } + ] + }, + "dst_subset": { + "type": "Range", + "ranges": [ + { + "start": "Mod(k + 1, 2)", + "end": "Mod(k + 1, 2)", + 
"step": "1", + "tile": "1" + }, + { + "start": "0", + "end": "255", + "step": "1", + "tile": "1" + } + ] + }, + "is_data_src": true, + "num_accesses": "256" + } + } + }, + "src": "2", + "dst": "3", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "9293a358-75c3-47fb-8ebc-73d67cd79f2c", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "4", + "dst": "0", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "35a5a5a9-3beb-4393-8361-9776e95af37f", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "1", + "dst": "5", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "db651f05-1390-4998-bf43-2e7441d9798b", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" } } }, - "src": "2", - "dst": "3", + "src": "4", + "dst": "2", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "5a6b79a5-f34f-4d4f-baca-5f234b66b1ef", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "3", + "dst": "5", "dst_connector": null, "src_connector": null } ], "attributes": { - "guid": "8ba50dba-4244-4519-86f9-189eb2300202" + "guid": "842bf787-acb7-47e5-b7ab-b52500c61cdd" } } ], @@ -3246,8 +3779,8 @@ }, "debuginfo": { "type": "DebugInfo", - "start_line": 832, - "end_line": 832, + "start_line": 914, + "end_line": 914, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" @@ -3261,7 +3794,7 @@ "shr_A": null, "shr_B": null }, - "guid": "457b9063-b1ef-47fa-ac64-2c212ab5d96c" + "guid": "4aa1eab4-6385-41ba-9f63-630f122cf942" }, "id": 15, "scope_entry": "5", @@ -3273,14 +3806,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 496, - "end_line": 496, + "start_line": 578, + "end_line": 578, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "shr_B", - "guid": "e371ae13-b0fe-4cbb-889d-d4ac8b31c21f" + "guid": "a1070f22-8d28-42f2-b21b-3bbef82280d3" }, "id": 16, "scope_entry": "5", @@ -3288,28 +3821,31 @@ }, { "type": "Tasklet", - "label": "sync_pipeline_shr_B", + "label": "acquire_pipe", "attributes": { "code": { - "string_data": "pipeline_shr_B.consumer_wait();", + "string_data": "pipe.consumer_wait();", "language": "CPP" }, "debuginfo": { "type": "DebugInfo", - "start_line": 549, - "end_line": 549, + "start_line": 630, + "end_line": 630, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, - "label": "sync_pipeline_shr_B", + "side_effects": true, + "label": "acquire_pipe", "in_connectors": { - "_in1": null + "_in_shr_A": null, + "_in_shr_B": null }, "out_connectors": { - "_out1": null + "_out_shr_A": null, + "_out_shr_B": null }, - "guid": 
"84793100-da93-4b8c-9456-1533a44515d5" + "guid": "8e7f4f84-04aa-4f85-8e05-9ad31ac3de3e" }, "id": 17, "scope_entry": "5", @@ -3321,14 +3857,14 @@ "attributes": { "debuginfo": { "type": "DebugInfo", - "start_line": 496, - "end_line": 496, + "start_line": 578, + "end_line": 578, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, "data": "shr_A", - "guid": "f38872ae-3a27-42d3-bf8a-5bb504cc02f8" + "guid": "95674815-1eab-4bac-885e-59957aff6161" }, "id": 18, "scope_entry": "5", @@ -3336,59 +3872,114 @@ }, { "type": "Tasklet", - "label": "sync_pipeline_shr_A", + "label": "consumer_release_pipe", "attributes": { "code": { - "string_data": "pipeline_shr_A.consumer_wait();", + "string_data": "pipe.consumer_release();", "language": "CPP" }, "debuginfo": { "type": "DebugInfo", - "start_line": 549, - "end_line": 549, + "start_line": 652, + "end_line": 652, "start_column": 0, "end_column": 0, "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" }, - "label": "sync_pipeline_shr_A", - "in_connectors": { - "_in1": null - }, - "out_connectors": { - "_out1": null - }, - "guid": "e40b5bbc-50e4-4f39-9205-c0e52e50f3dc" + "side_effects": true, + "label": "consumer_release_pipe", + "guid": "8b55ad07-19c9-49bf-98af-017959caef88" }, "id": 19, "scope_entry": "5", "scope_exit": "6" + } + ], + "edges": [ + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "c5fc20f0-ec85-4953-8979-239785765110", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "16", + "dst": "7", + "dst_connector": null, + "src_connector": null }, { - "type": "Tasklet", - "label": "release_pipelines", + "type": "MultiConnectorEdge", "attributes": { - "code": { - "string_data": "pipeline_shr_B.consumer_release();\npipeline_shr_A.consumer_release();", - "language": "CPP" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 570, - "end_line": 570, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "side_effects": true, - "label": "release_pipelines", - "guid": "385fdeb6-b4b2-4d4a-9d7e-6a2c8150fa70" + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "013e641a-ac55-49f1-babe-4e32a7f2b8aa", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } }, - "id": 20, - "scope_entry": "5", - "scope_exit": "6" - } - ], - "edges": [ + "src": "18", + "dst": "7", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "ed562ca6-4b10-4d27-ab12-938e49350c95", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "8", + "dst": "19", + "dst_connector": null, + "src_connector": null + }, + { + "type": "MultiConnectorEdge", + "attributes": { + "data": { + "type": "Memlet", + "attributes": { + "volume": "1", + "debuginfo": null, + "guid": "91a43c11-181e-4692-9cac-51de9dd6e251", + "src_subset": null, + "dst_subset": null, + "is_data_src": false, + "num_accesses": "1" + } + } + }, + "src": "19", + "dst": "6", + "dst_connector": null, + "src_connector": null + }, { "type": "MultiConnectorEdge", "attributes": { @@ -3397,7 
+3988,7 @@ "attributes": { "volume": "1", "debuginfo": null, - "guid": "c5a715ef-8281-40c6-a5ce-6c005d9bdf59", + "guid": "6f824266-bfb1-4be1-863c-a2e22af2543f", "src_subset": null, "dst_subset": null, "is_data_src": false, @@ -3406,7 +3997,7 @@ } }, "src": "8", - "dst": "20", + "dst": "19", "dst_connector": null, "src_connector": null }, @@ -3418,7 +4009,7 @@ "attributes": { "volume": "1", "debuginfo": null, - "guid": "ae86542a-8a33-46cf-811f-1d5ea7e4264f", + "guid": "b78539e0-3fd7-4e62-a51e-f4a473d90901", "src_subset": null, "dst_subset": null, "is_data_src": false, @@ -3426,7 +4017,7 @@ } } }, - "src": "20", + "src": "19", "dst": "6", "dst_connector": null, "src_connector": null @@ -3451,7 +4042,7 @@ }, "data": "A", "debuginfo": null, - "guid": "90277fce-a899-4a98-9096-c9bc13119b39", + "guid": "9b3c23b5-e386-4edf-a8a1-e7274aac54ae", "src_subset": { "type": "Range", "ranges": [ @@ -3494,7 +4085,7 @@ }, "data": "B", "debuginfo": null, - "guid": "6dc9354e-bbcf-4372-b5e6-d47e5b615d27", + "guid": "dda3a79b-8c79-4417-a7aa-c4a9fcfa0352", "src_subset": { "type": "Range", "ranges": [ @@ -3543,7 +4134,7 @@ }, "data": "shr_A", "debuginfo": null, - "guid": "17775573-6dfe-42b9-9968-e5138ac5bde0", + "guid": "ad716fc5-69a5-4bfe-ad9b-f2bbb16d9d3e", "src_subset": { "type": "Range", "ranges": [ @@ -3569,7 +4160,7 @@ }, "src": "14", "dst": "5", - "dst_connector": "IN___tmp_29_37_r_in_from_1_0_in_from_1_0", + "dst_connector": "IN___tmp_98_37_r_in_from_1_0_in_from_1_0", "src_connector": null }, { @@ -3598,7 +4189,7 @@ }, "data": "shr_B", "debuginfo": null, - "guid": "d1680be5-efe6-4947-9c23-725e934436bd", + "guid": "6c0c9af6-e806-4031-9270-c9031886819a", "src_subset": { "type": "Range", "ranges": [ @@ -3624,7 +4215,7 @@ }, "src": "13", "dst": "5", - "dst_connector": "IN___tmp_29_58_r_in_from_1_0_in_from_1_0", + "dst_connector": "IN___tmp_98_58_r_in_from_1_0_in_from_1_0", "src_connector": null }, { @@ -3647,7 +4238,7 @@ }, "data": "__tmp3", "debuginfo": null, - "guid": "6861379f-d4ea-4c88-b194-3cf644ff3955", + "guid": "ef635686-7699-4f7e-9a4b-cd529006d9cf", "src_subset": { "type": "Range", "ranges": [ @@ -3670,116 +4261,6 @@ "dst_connector": "__inp", "src_connector": null }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k + 1, 2)", - "end": "Mod(k + 1, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "4cd3b695-856a-43a8-aa26-0d356be0414f", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k + 1, 2)", - "end": "Mod(k + 1, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "18", - "dst": "7", - "dst_connector": "prefetch_shr_A", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k + 1, 2)", - "end": "Mod(k + 1, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_B", - "debuginfo": null, - "guid": "5bfb1bc5-9e69-4171-9f3c-a924d6d87773", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k + 1, 2)", - "end": "Mod(k + 
1, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "16", - "dst": "7", - "dst_connector": "prefetch_shr_B", - "src_connector": null - }, { "type": "MultiConnectorEdge", "attributes": { @@ -3800,7 +4281,7 @@ }, "data": "A", "debuginfo": null, - "guid": "a0d5df7f-f651-489f-878d-0d60e7652cb1", + "guid": "245d76b7-91a4-44fd-8a6d-6af0a8e44f4d", "src_subset": { "type": "Range", "ranges": [ @@ -3843,7 +4324,7 @@ }, "data": "A", "debuginfo": null, - "guid": "9dac117d-7faa-4fc2-ab7c-586720478046", + "guid": "a136c9c6-2449-417c-8408-2cba9cc8b13f", "src_subset": { "type": "Range", "ranges": [ @@ -3886,7 +4367,7 @@ }, "data": "B", "debuginfo": null, - "guid": "b8def05b-494b-46eb-9bc6-c48a75198385", + "guid": "0f7d6f6f-67a1-4e2d-98a7-558e53741dd4", "src_subset": { "type": "Range", "ranges": [ @@ -3929,7 +4410,7 @@ }, "data": "B", "debuginfo": null, - "guid": "ff09596e-1fe4-4702-b57f-d6e04402c053", + "guid": "88623b99-236d-4786-8e32-811bea940b62", "src_subset": { "type": "Range", "ranges": [ @@ -3972,7 +4453,7 @@ }, "data": "C", "debuginfo": null, - "guid": "ce685d36-588d-4cd6-978b-3546f7e26f93", + "guid": "fd0d5304-9689-47cf-8c8c-9c105dad6c06", "src_subset": null, "dst_subset": { "type": "Range", @@ -4015,7 +4496,7 @@ }, "data": "C", "debuginfo": null, - "guid": "ac7d4bd5-5cfe-4dad-832a-2c3f949f55af", + "guid": "b0173626-eff3-4525-aa85-b96e45b5400e", "src_subset": null, "dst_subset": { "type": "Range", @@ -4035,8 +4516,8 @@ }, "src": "8", "dst": "6", - "dst_connector": "IN___tmp_29_16_w_out_of_1_1_out_of_1_1", - "src_connector": "OUT___tmp_29_16_w_out_of_1_1" + "dst_connector": "IN___tmp_98_16_w_out_of_1_1_out_of_1_1", + "src_connector": "OUT___tmp_98_16_w_out_of_1_1" }, { "type": "MultiConnectorEdge", @@ -4058,7 +4539,7 @@ }, "data": "C", "debuginfo": null, - "guid": "63b28d6f-5f69-4f75-b3e2-4823990157fd", + "guid": "961a4232-d88a-460b-9b0b-3ee74068670e", "src_subset": null, "dst_subset": { "type": "Range", @@ -4079,7 +4560,7 @@ "src": "6", "dst": "1", "dst_connector": "IN_C", - "src_connector": "OUT___tmp_29_16_w_out_of_1_1_out_of_1_1" + "src_connector": "OUT___tmp_98_16_w_out_of_1_1_out_of_1_1" }, { "type": "MultiConnectorEdge", @@ -4107,7 +4588,7 @@ }, "data": "shr_A", "debuginfo": null, - "guid": "abd95999-8a64-422c-88bb-36906fba9305", + "guid": "32b45c8a-21ff-4d9c-98da-d79ecda534b8", "src_subset": null, "dst_subset": { "type": "Range", @@ -4134,7 +4615,7 @@ "src": "7", "dst": "9", "dst_connector": "__in1", - "src_connector": "OUT___tmp_29_37_r_in_from_1_0" + "src_connector": "OUT___tmp_98_37_r_in_from_1_0" }, { "type": "MultiConnectorEdge", @@ -4162,7 +4643,7 @@ }, "data": "shr_A", "debuginfo": null, - "guid": "d3d2a22b-6feb-4aac-8363-3e151992cafb", + "guid": "dd942060-3ca5-4375-a2e6-24692b4b54c3", "src_subset": { "type": "Range", "ranges": [ @@ -4187,9 +4668,9 @@ } }, "src": "5", - "dst": "19", - "dst_connector": "_in1", - "src_connector": "OUT___tmp_29_37_r_in_from_1_0_in_from_1_0" + "dst": "17", + "dst_connector": "_in_shr_A", + "src_connector": "OUT___tmp_98_37_r_in_from_1_0_in_from_1_0" }, { "type": "MultiConnectorEdge", @@ -4217,7 +4698,7 @@ }, "data": "shr_B", "debuginfo": null, - "guid": "8637db87-9a9f-4c27-afaa-7c6995a16ce1", + "guid": "b1380382-374e-4cd8-85b4-0775615b298e", "src_subset": null, "dst_subset": { "type": "Range", @@ -4244,7 +4725,7 @@ "src": "7", "dst": "9", "dst_connector": "__in2", - "src_connector": 
"OUT___tmp_29_58_r_in_from_1_0" + "src_connector": "OUT___tmp_98_58_r_in_from_1_0" }, { "type": "MultiConnectorEdge", @@ -4272,7 +4753,7 @@ }, "data": "shr_B", "debuginfo": null, - "guid": "8d339332-544e-4173-a66c-a335e8993d7c", + "guid": "b8eb6fb0-9e21-467e-be63-2588e4c02c4a", "src_subset": { "type": "Range", "ranges": [ @@ -4298,8 +4779,8 @@ }, "src": "5", "dst": "17", - "dst_connector": "_in1", - "src_connector": "OUT___tmp_29_58_r_in_from_1_0_in_from_1_0" + "dst_connector": "_in_shr_B", + "src_connector": "OUT___tmp_98_58_r_in_from_1_0_in_from_1_0" }, { "type": "MultiConnectorEdge", @@ -4321,7 +4802,7 @@ }, "data": "A", "debuginfo": null, - "guid": "e4f49262-d8a1-4255-8e14-715ba57fdaff", + "guid": "b9b72928-b6cd-4e23-b898-3c0d55df4550", "src_subset": { "type": "Range", "ranges": [ @@ -4364,7 +4845,7 @@ }, "data": "B", "debuginfo": null, - "guid": "d0d710b5-083c-48f3-91de-7968ddbda1ff", + "guid": "c96b2013-4ba5-493e-9125-c3ea8005baac", "src_subset": { "type": "Range", "ranges": [ @@ -4407,7 +4888,7 @@ }, "data": "__tmp3", "debuginfo": null, - "guid": "71ea9976-2759-4850-8f7a-1b5e703bf25a", + "guid": "bab10d3a-bad3-4ccd-bf52-b7a83b41304d", "src_subset": null, "dst_subset": { "type": "Range", @@ -4450,7 +4931,7 @@ }, "data": "C", "debuginfo": null, - "guid": "b1ce4418-75a4-4381-9827-fa16b36e97e2", + "guid": "1490e37f-693d-4c9f-80be-37a03bfda736", "src_subset": null, "dst_subset": { "type": "Range", @@ -4470,7 +4951,7 @@ }, "src": "11", "dst": "8", - "dst_connector": "IN___tmp_29_16_w_out_of_1_1", + "dst_connector": "IN___tmp_98_16_w_out_of_1_1", "src_connector": "__out" }, { @@ -4499,7 +4980,7 @@ }, "data": "shr_A", "debuginfo": null, - "guid": "b889d248-0781-46d4-ab8b-6160c654c033", + "guid": "980b5189-720c-47e2-939b-1977b0b8d0a1", "src_subset": null, "dst_subset": { "type": "Range", @@ -4523,10 +5004,10 @@ } } }, - "src": "19", + "src": "17", "dst": "7", - "dst_connector": "IN___tmp_29_37_r_in_from_1_0", - "src_connector": "_out1" + "dst_connector": "IN___tmp_98_37_r_in_from_1_0", + "src_connector": "_out_shr_A" }, { "type": "MultiConnectorEdge", @@ -4554,7 +5035,7 @@ }, "data": "shr_B", "debuginfo": null, - "guid": "8ff7c450-f4c8-4b4a-b25b-baadce878a89", + "guid": "f6cb964f-52e7-4801-9ef5-1ca3eb6a1ba4", "src_subset": null, "dst_subset": { "type": "Range", @@ -4580,8 +5061,8 @@ }, "src": "17", "dst": "7", - "dst_connector": "IN___tmp_29_58_r_in_from_1_0", - "src_connector": "_out1" + "dst_connector": "IN___tmp_98_58_r_in_from_1_0", + "src_connector": "_out_shr_B" }, { "type": "MultiConnectorEdge", @@ -4609,7 +5090,7 @@ }, "data": "shr_A", "debuginfo": null, - "guid": "44a55ced-6a67-48ae-bcad-cf9619213664", + "guid": "dc10acf1-66f1-4e1b-8ea3-f01f0586f952", "src_subset": null, "dst_subset": { "type": "Range", @@ -4664,7 +5145,7 @@ }, "data": "shr_A", "debuginfo": null, - "guid": "b9092fdc-de95-4a67-8386-fb195d8e280a", + "guid": "c4e69a5c-f961-4972-a015-753ab9322fb5", "src_subset": null, "dst_subset": { "type": "Range", @@ -4719,7 +5200,7 @@ }, "data": "shr_B", "debuginfo": null, - "guid": "14f2eda5-56fa-41b9-bf80-44a5102b6343", + "guid": "e2cc2f76-58e9-4f34-bd4c-cb9d5294e2b0", "src_subset": null, "dst_subset": { "type": "Range", @@ -4774,7 +5255,7 @@ }, "data": "shr_B", "debuginfo": null, - "guid": "23e84f63-c96b-4c3d-bbfc-50ae9e3a365c", + "guid": "e4b2e651-174c-47b5-bdfe-a0ed6469e29b", "src_subset": null, "dst_subset": { "type": "Range", @@ -4805,7 +5286,7 @@ } ], "attributes": { - "guid": "328decff-fd21-4fff-881f-74fc87b42fa7", + "guid": "8d9c9dc6-b7fe-4305-9b0e-f344e6cb96bc", 
"executions": "1", "dynamic_executions": false } diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 65b4e9e216..2955230a52 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -1034,6 +1034,7 @@ def process_out_memlets(self, # Tasklet -> array if isinstance(node, nodes.CodeNode): if not uconn: + return raise SyntaxError("Cannot copy memlet without a local connector: {} to {}".format( str(edge.src), str(edge.dst))) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 3dd50667cb..123e9eeba0 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -30,6 +30,7 @@ from dace.sdfg.state import ControlFlowRegion, StateSubgraphView from dace.transformation import helpers as xfh from dace.transformation.passes import analysis as ap +from dace.transformation.dataflow.add_threadblock_map import AddThreadBlockMap if TYPE_CHECKING: from dace.codegen.targets.framecode import DaCeCodeGenerator @@ -137,6 +138,10 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): # End of dispatcher registration ###################################### + # new + self._kernels_with_inserted_tb_maps: Set[nodes.MapEntry] = set() + + def _emit_sync(self, codestream: CodeIOStream): if Config.get_bool('compiler', 'cuda', 'syncdebug'): codestream.write('''DACE_GPU_CHECK({backend}GetLastError()); @@ -153,6 +158,16 @@ def preprocess(self, sdfg: SDFG) -> None: CUDACodeGen, 'CUDA', target_type=target_type) + + + + old_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) + + sdfg.apply_transformations_once_everywhere(AddThreadBlockMap, ) + + new_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) - old_nodes + self._kernels_with_inserted_tb_maps = {n for n in new_nodes if isinstance(n, nodes.MapEntry) and n.schedule == dtypes.ScheduleType.GPU_Device} + # Find GPU<->GPU strided copies that cannot be represented by a single copy command from dace.transformation.dataflow import CopyToMap @@ -1929,8 +1944,11 @@ def get_kernel_dimensions(self, dfg_scope): if len(detected_block_sizes) > 1: - # Error when both gpu_block_size and thread-block maps were defined and conflict - if kernelmap_entry.map.gpu_block_size is not None: + # Error when both user has manually set gpu_block_size and thread-block maps were defined and conflict in block size + preset_block_size = kernelmap_entry.map.gpu_block_size + conflicting_block_sizes = (preset_block_size is not None) and not (kernelmap_entry in self._kernels_with_inserted_tb_maps) + + if conflicting_block_sizes: raise ValueError('Both the `gpu_block_size` property and internal thread-block ' 'maps were defined with conflicting sizes for kernel ' f'"{kernelmap_entry.map.label}" (sizes detected: {detected_block_sizes}). 
' From 84b585b53818d4bdbdb11b915653a849f5990a8c Mon Sep 17 00:00:00 2001 From: aydogdub Date: Tue, 8 Jul 2025 13:05:26 +0200 Subject: [PATCH 56/94] format files to dace style, using pre-commit run --all --- berkay_workpace/reports/important_notes.txt | 2 +- berkay_workpace/reports/notes.txt | 10 +- berkay_workpace/reports/report.py | 12 +- berkay_workpace/reports/report_1.md | 52 +- .../dbuff_related/double_buffering_async.sdfg | 2 +- .../original_sdfg_with_shared_memory.sdfg | 2 +- .../2d_blocktiled_gemm_with_custom_copy.sdfg | 2 +- .../nice_global_to_shared_copy.sdfg | 2 +- .../smem_related/weird_global_to_global.sdfg | 2 +- .../weird_shared_to_shared_copy.sdfg | 2 +- .../warp_level_test.py | 186 +++----- .../tests/gpu_map_tests/device_map_test.py | 34 +- .../tests/gpu_map_tests/threadBlock_test.py | 15 +- .../out_of_kernel_memcpy_test.py | 90 ++-- .../tests/reusable_tests/cuda_block_test.py | 5 +- .../smem_tests/default_smem_sync_pass_test.py | 149 +++--- berkay_workpace/tests/smem_tests/gemm_test.py | 3 +- .../smem_tests/special_sync_pass_test.py | 2 +- dace/codegen/targets/__init__.py | 2 +- dace/codegen/targets/cpp.py | 3 +- dace/codegen/targets/cuda.py | 12 +- dace/codegen/targets/experimental_cuda.py | 451 ++++++++---------- .../copy_strategies.py | 242 ++++------ .../gpu_stream_manager.py | 5 +- .../experimental_cuda_helpers/gpu_utils.py | 45 +- .../scope_strategies.py | 290 +++++------ dace/config_schema.yml | 22 +- dace/dtypes.py | 3 +- dace/registry.py | 1 - dace/sdfg/nodes.py | 8 +- dace/sdfg/sdfg.py | 5 +- dace/sdfg/validation.py | 5 +- dace/symbolic.py | 2 +- .../dataflow/add_threadblock_map.py | 62 ++- .../analysis/infer_gpu_grid_and_block_size.py | 52 +- .../passes/gpustream_scheduling.py | 88 ++-- .../passes/shared_memory_synchronization.py | 184 +++---- 37 files changed, 935 insertions(+), 1119 deletions(-) diff --git a/berkay_workpace/reports/important_notes.txt b/berkay_workpace/reports/important_notes.txt index 440cbd4dcf..1726c8e8f7 100644 --- a/berkay_workpace/reports/important_notes.txt +++ b/berkay_workpace/reports/important_notes.txt @@ -10,4 +10,4 @@ codegens — this is confusing as it stands. 3. "Struct" memory copies in old codegen are hacks. These are omitted in the new ExperimentalCUDACodegen, because they should be implemented - in a planned and structured way, which is out of scope for my Master's Thesis. \ No newline at end of file + in a planned and structured way, which is out of scope for my Master's Thesis. diff --git a/berkay_workpace/reports/notes.txt b/berkay_workpace/reports/notes.txt index 767275669d..07714dc1d3 100644 --- a/berkay_workpace/reports/notes.txt +++ b/berkay_workpace/reports/notes.txt @@ -1,9 +1,9 @@ -What was bad: +What was bad: - Also dead code at "copy_memory", the first "dst_schedule" in the src_node if-else case it dead code. In Fact, "dst_schedule" is not even needed at all. So we have "double dead code", once a computation - which gets overwritten, and once we compute smth we don't need. + which gets overwritten, and once we compute smth we don't need. - Damn, even the copy_memory input named "memlet" is wrong.. this should be an edge type, not a memlet type. @@ -13,7 +13,7 @@ What was bad: - Again dead code: Computes "copy_shape" twice, first definition seems wrong and is not even used. -- Stream handling in CudaCodeGen check is just random- streams are not handled by the codegen. +- Stream handling in CudaCodeGen check is just random- streams are not handled by the codegen. - again, define local variables but then not use it. 
In this case: dtype @@ -25,12 +25,10 @@ What was bad: Tell Yakup: - I removed any logic that should handle cudaStream synchronization since I am not responsible for it. - In order to help to extend it in the future, I have two options. + In order to help to extend it in the future, I have two options. 1. I can add function calls (which are empty i.e. do nothing) that signal what should be implemented once there is a good solution of handling cuda streams in DaCe's new version 2. Document it and say that several streams are not supported (people could come up with completely new approaches to handle streams maybe) 3. We got smth wrong. CopyToMap handles only GPU<->GPU code cases. 4. I tried to handle "special case" as I understood... maybe worth to look at it closer with you - - diff --git a/berkay_workpace/reports/report.py b/berkay_workpace/reports/report.py index 1855e2fd40..c144fd74c9 100644 --- a/berkay_workpace/reports/report.py +++ b/berkay_workpace/reports/report.py @@ -1,22 +1,18 @@ # TODO: GENERAL, discuss with Yakup # 1. Modularity for Deallocate? -# 2. KernelScopeManager: What I like: +# 2. KernelScopeManager: What I like: # - simple, easy to understand, modular and clean # what I dont like: # - Kind of messes with _generate_exit and how dace generates code # Your opinion? do or dont? -# 3. __syncthread example ? Or better: General examples? +# 3. __syncthread example ? Or better: General examples? # 3.5 See below # 4. GPU streams- now or wait? # 5. Config for thread_id - why is this even a config? # 6. Used no instrumentation because I have no clue what it is - # I think the rest can wait before getting refactored (I don't need to reinvent the wheel) -# New tasks for now? - - - +# New tasks for now? # My personal TODO's # TODO: when tired @@ -24,4 +20,4 @@ # 4 dimensional example # TODO: depending on what happens next -# change in_device_code to maybe in_kernel_code? \ No newline at end of file +# change in_device_code to maybe in_kernel_code? diff --git a/berkay_workpace/reports/report_1.md b/berkay_workpace/reports/report_1.md index 040cef38ac..d2c4cb65f5 100644 --- a/berkay_workpace/reports/report_1.md +++ b/berkay_workpace/reports/report_1.md @@ -1,48 +1,48 @@ # Master's Thesis Report -**Thesis Title:** Code-generation for Modern GPUs in DaCe -**Student:** Berkay Aydogdu -**Supervisor:** Yakup Koray Budanaz -**Date:** 2025-05-23 +**Thesis Title:** Code-generation for Modern GPUs in DaCe +**Student:** Berkay Aydogdu +**Supervisor:** Yakup Koray Budanaz +**Date:** 2025-05-23 **Short description:** The objectives of this Master's thesis are to refactor the CUDA code generator in DaCe and to extend it with new features. The refactoring focuses on improving the structure, readability, and maintainability of the code. ## Progress Overview -By inspecting the source code of the CUDA code generator, we identified several poor coding -practices. These included, among others, intertwined functionality, non-descriptive variable -and function names, and numerous code fragments that appeared more like quick fixes or hacks +By inspecting the source code of the CUDA code generator, we identified several poor coding +practices. These included, among others, intertwined functionality, non-descriptive variable +and function names, and numerous code fragments that appeared more like quick fixes or hacks than thoughtfully designed solutions. To address these issues, we implemented a new CUDA code generator class `ExperimentalCUDACodeGen`, which can be enabled via configuration settings. 
We began by -running simple programs using the new generator, reusing parts of the existing code to get +running simple programs using the new generator, reusing parts of the existing code to get minimal examples working. -We deliberately chose not to build a completely new generator from scratch, as improving code -quality is only one part of the overall goal. Moreover, the existing implementation contains +We deliberately chose not to build a completely new generator from scratch, as improving code +quality is only one part of the overall goal. Moreover, the existing implementation contains well-designed components that are worth preserving—there is no need to reinvent the wheel. The following section highlights the notable aspects of the new implementation: -- Only simple features are supported for now, in order to eliminate the complexity introduced +- Only simple features are supported for now, in order to eliminate the complexity introduced by rarely used features such as dynamic parallelism. - The generation of scopes — specifically GPU maps— has been almost completely reworked. - In the existing CUDA code generator, this component has major issues, with several hundred - lines of dense code packed into just a few functions, even though it could be logically - split. For example, the generation of different map types (based on schedule types), the - kernel launch, and the kernel wrapper function are now implemented in separate functions. - We also improved naming throughout the code by replacing vague variable names with more + In the existing CUDA code generator, this component has major issues, with several hundred + lines of dense code packed into just a few functions, even though it could be logically + split. For example, the generation of different map types (based on schedule types), the + kernel launch, and the kernel wrapper function are now implemented in separate functions. + We also improved naming throughout the code by replacing vague variable names with more meaningful ones. -- The existing CUDA code generator opens and closes brackets in inconsistent +- The existing CUDA code generator opens and closes brackets in inconsistent locations—sometimes even at another file. This is not only error-prone, but also makes the code appear more complex than necessary. To address this, we implemented a Python - class (`KernelScopeManager`) that uses the `with` construct to clearly define when scopes + class (`KernelScopeManager`) that uses the `with` construct to clearly define when scopes are entered and exited, making bracket management more structured and easier to control. -- In our view, the existing CUDA code generator class relies on too many attributes, some of - which are specific to individual kernels—such as inputs, block and grid dimensions. These - are currently derived ad hoc and stored directly on the generator, leading to clutter and - reduced clarity. To address this, we introduced a `KernelSpec` class that encapsulates all - kernel-specific information. This allows such attributes to be accessed cleanly from a - KernelSpec instance, reducing the number of attributes in the code generator and improving +- In our view, the existing CUDA code generator class relies on too many attributes, some of + which are specific to individual kernels—such as inputs, block and grid dimensions. These + are currently derived ad hoc and stored directly on the generator, leading to clutter and + reduced clarity. 
To address this, we introduced a `KernelSpec` class that encapsulates all + kernel-specific information. This allows such attributes to be accessed cleanly from a + KernelSpec instance, reducing the number of attributes in the code generator and improving structure and maintainability. - We also implemented a first extension, namely the support of WarpLevel schedules, by introducing a new GPU schedule type called `GPU_Warp`. With this, the we can specify which @@ -54,7 +54,3 @@ The following section highlights the notable aspects of the new implementation: The next steps include enabling asynchronous memory copies and continuing to refactor the remaining parts of the code generator. This will require support for shared memory and further discussions around key design decisions. - - - - diff --git a/berkay_workpace/scratch/yakups_examples/dbuff_related/double_buffering_async.sdfg b/berkay_workpace/scratch/yakups_examples/dbuff_related/double_buffering_async.sdfg index 4a884d1171..bbbd88132c 100644 --- a/berkay_workpace/scratch/yakups_examples/dbuff_related/double_buffering_async.sdfg +++ b/berkay_workpace/scratch/yakups_examples/dbuff_related/double_buffering_async.sdfg @@ -5299,4 +5299,4 @@ "cfg_list_id": 0, "start_block": 0, "dace_version": "1.0.0" -} \ No newline at end of file +} diff --git a/berkay_workpace/scratch/yakups_examples/dbuff_related/original_sdfg_with_shared_memory.sdfg b/berkay_workpace/scratch/yakups_examples/dbuff_related/original_sdfg_with_shared_memory.sdfg index e4b5ed96bb..2f0132c264 100644 --- a/berkay_workpace/scratch/yakups_examples/dbuff_related/original_sdfg_with_shared_memory.sdfg +++ b/berkay_workpace/scratch/yakups_examples/dbuff_related/original_sdfg_with_shared_memory.sdfg @@ -1275,4 +1275,4 @@ "cfg_list_id": 0, "start_block": 0, "dace_version": "1.0.0" -} \ No newline at end of file +} diff --git a/berkay_workpace/scratch/yakups_examples/smem_related/2d_blocktiled_gemm_with_custom_copy.sdfg b/berkay_workpace/scratch/yakups_examples/smem_related/2d_blocktiled_gemm_with_custom_copy.sdfg index c49fc87dfa..087ca70efc 100644 --- a/berkay_workpace/scratch/yakups_examples/smem_related/2d_blocktiled_gemm_with_custom_copy.sdfg +++ b/berkay_workpace/scratch/yakups_examples/smem_related/2d_blocktiled_gemm_with_custom_copy.sdfg @@ -4162,4 +4162,4 @@ "cfg_list_id": 0, "start_block": 0, "dace_version": "1.0.0" -} \ No newline at end of file +} diff --git a/berkay_workpace/scratch/yakups_examples/smem_related/nice_global_to_shared_copy.sdfg b/berkay_workpace/scratch/yakups_examples/smem_related/nice_global_to_shared_copy.sdfg index e040a6f997..1f276cf47a 100644 --- a/berkay_workpace/scratch/yakups_examples/smem_related/nice_global_to_shared_copy.sdfg +++ b/berkay_workpace/scratch/yakups_examples/smem_related/nice_global_to_shared_copy.sdfg @@ -1275,4 +1275,4 @@ "cfg_list_id": 0, "start_block": 0, "dace_version": "1.0.0" -} \ No newline at end of file +} diff --git a/berkay_workpace/scratch/yakups_examples/smem_related/weird_global_to_global.sdfg b/berkay_workpace/scratch/yakups_examples/smem_related/weird_global_to_global.sdfg index 0804dde52a..afc8a6b4a8 100644 --- a/berkay_workpace/scratch/yakups_examples/smem_related/weird_global_to_global.sdfg +++ b/berkay_workpace/scratch/yakups_examples/smem_related/weird_global_to_global.sdfg @@ -1401,4 +1401,4 @@ "cfg_list_id": 0, "start_block": 0, "dace_version": "1.0.0" -} \ No newline at end of file +} diff --git a/berkay_workpace/scratch/yakups_examples/smem_related/weird_shared_to_shared_copy.sdfg 
b/berkay_workpace/scratch/yakups_examples/smem_related/weird_shared_to_shared_copy.sdfg index b4b0bc8300..5842b832b1 100644 --- a/berkay_workpace/scratch/yakups_examples/smem_related/weird_shared_to_shared_copy.sdfg +++ b/berkay_workpace/scratch/yakups_examples/smem_related/weird_shared_to_shared_copy.sdfg @@ -893,4 +893,4 @@ "cfg_list_id": 0, "start_block": 0, "dace_version": "1.0.0" -} \ No newline at end of file +} diff --git a/berkay_workpace/tests/experimental_features_tests/warp_level_test.py b/berkay_workpace/tests/experimental_features_tests/warp_level_test.py index 89374322f2..78a4c7da72 100644 --- a/berkay_workpace/tests/experimental_features_tests/warp_level_test.py +++ b/berkay_workpace/tests/experimental_features_tests/warp_level_test.py @@ -5,23 +5,21 @@ from IPython.display import Code from dace.config import Config - ####################### Testing correct mapping of indices to WarpIds ################## -# NOTE: Focus in these section is not on the tasklet (just used to have a simple + +# NOTE: Focus in these section is not on the tasklet (just used to have a simple # verification option) and the SDFG is not correct, dataFlow to warps includes 32 elements # and not only 1 element. But there is no support for correct representation (yet). However, -# the construction of the warpIds is not affected by this. Correct SDFGs appear in the next -# test section +# the construction of the warpIds is not affected by this. Correct SDFGs appear in the next +# test section @pytest.mark.gpu -@pytest.mark.parametrize("start, end, stride", [ - (0, 32, 1), - (3, 16, 1), - (5, 17, 3) -]) +@pytest.mark.parametrize("start, end, stride", [(0, 32, 1), (3, 16, 1), (5, 17, 3)]) def test_warp_map_single_TB(start, end, stride): + @dace.program - def simple_warp_map(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global): + def simple_warp_map(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, + B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global): """ 1D check with different start, end and strides. """ @@ -38,14 +36,13 @@ def simple_warp_map(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, B """ out_result = __reduce_add_sync(inp_mask, inp_value); """ - - B[j] = result + B[j] = result sdfg = simple_warp_map.to_sdfg() - A = cp.ones(1024, dtype=cp.uint32) - B = cp.zeros(1024, dtype=cp.uint32) + A = cp.ones(1024, dtype=cp.uint32) + B = cp.zeros(1024, dtype=cp.uint32) sdfg(A=A, B=B) @@ -58,16 +55,13 @@ def simple_warp_map(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, B cp.testing.assert_array_equal(B, expected) - - @pytest.mark.gpu -@pytest.mark.parametrize("start, end, stride", [ - (2, 16, 6), - (3, 15, 3) -]) +@pytest.mark.parametrize("start, end, stride", [(2, 16, 6), (3, 15, 3)]) def test_warp_map_multiple_TB(start, end, stride): + @dace.program - def multTB_warp_map(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global): + def multTB_warp_map(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, + B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global): """ The case where we have more than one ThreadBlock. 
""" @@ -84,14 +78,13 @@ def multTB_warp_map(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, B """ out_result = __reduce_add_sync(inp_mask, inp_value); """ - - B[i + j] = result + B[i + j] = result sdfg = multTB_warp_map.to_sdfg() - A = cp.ones(1024, dtype=cp.uint32) - B = cp.zeros(1024, dtype=cp.uint32) + A = cp.ones(1024, dtype=cp.uint32) + B = cp.zeros(1024, dtype=cp.uint32) sdfg(A=A, B=B) @@ -105,15 +98,16 @@ def multTB_warp_map(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, B cp.testing.assert_array_equal(B, expected) - @pytest.mark.gpu @pytest.mark.parametrize("b1, e1, s1, b2, e2, s2", [ (0, 4, 1, 0, 4, 1), (0, 3, 2, 0, 5, 3), ]) def test_warp_map_2D(b1, e1, s1, b2, e2, s2): + @dace.program - def multTB_warp_map_2D(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global): + def multTB_warp_map_2D(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, + B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global): """ Simple functionality check of 2D maps, focus is on 2D and less on multible TB. """ @@ -130,14 +124,13 @@ def multTB_warp_map_2D(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global """ out_result = __reduce_add_sync(inp_mask, inp_value); """ - - B[i + j] = result + B[i + j] = result sdfg = multTB_warp_map_2D.to_sdfg() - A = cp.ones(1024, dtype=cp.uint32) - B = cp.zeros(1024, dtype=cp.uint32) + A = cp.ones(1024, dtype=cp.uint32) + B = cp.zeros(1024, dtype=cp.uint32) sdfg(A=A, B=B) @@ -148,25 +141,24 @@ def multTB_warp_map_2D(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global warpId = (tid // 32) if warpId >= e1 * e2: continue - warpIdx = (warpId % e2 ) - warpIdy = (warpId // e2 ) % e1 + warpIdx = (warpId % e2) + warpIdy = (warpId // e2) % e1 if (warpIdx - b2) % s2 == 0 and (warpIdy - b1) % s1 == 0: expected[block_start + tid] = 32 - cp.testing.assert_array_equal(B, expected) - - @pytest.mark.gpu @pytest.mark.parametrize("b1, e1, s1, b2, e2, s2, b3, e3, s3", [ (0, 4, 1, 0, 4, 2, 0, 2, 1), (0, 3, 2, 1, 5, 3, 1, 2, 1), ]) def test_warp_map_3D(b1, e1, s1, b2, e2, s2, b3, e3, s3): + @dace.program - def warp_map_3D(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global): + def warp_map_3D(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, + B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global): """ Simple functionality check of 3D maps """ @@ -183,14 +175,13 @@ def warp_map_3D(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, B: da """ out_result = __reduce_add_sync(inp_mask, inp_value); """ - - B[i + j] = result + B[i + j] = result sdfg = warp_map_3D.to_sdfg() - A = cp.ones(1024, dtype=cp.uint32) - B = cp.zeros(1024, dtype=cp.uint32) + A = cp.ones(1024, dtype=cp.uint32) + B = cp.zeros(1024, dtype=cp.uint32) sdfg(A=A, B=B) @@ -201,21 +192,16 @@ def warp_map_3D(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, B: da warpId = (tid // 32) if warpId >= e1 * e2 * e3: continue - warpIdx = warpId % e3 - warpIdy = (warpId // e3 ) % e2 - warpIdz = (warpId // (e3 * e2) ) % e1 - if ((warpIdx - b3) % s3 == 0 and warpIdx >= b3 and - (warpIdy - b2) % s2 == 0 and warpIdx >= b2 and - (warpIdz - b1) % s1 == 0 and warpIdx >= b1): + warpIdx = warpId % e3 + warpIdy = (warpId // e3) % e2 + warpIdz = (warpId // (e3 * e2)) % e1 + if ((warpIdx - b3) % s3 == 0 and warpIdx >= b3 and (warpIdy - b2) % s2 == 0 and warpIdx >= b2 + and (warpIdz - b1) % s1 == 0 and warpIdx >= b1): expected[block_start + tid] = 32 - 
cp.testing.assert_array_equal(B, expected) - - - @pytest.mark.gpu @pytest.mark.parametrize("bs, ns", [(512, 1024), (1024, 2048)]) def test_symbolic_warp_map(bs, ns): @@ -230,8 +216,10 @@ def test_symbolic_warp_map(bs, ns): start = 2 stride = 3 ws = bs // 32 + @dace.program - def symbolic_warp_map(A: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global): + def symbolic_warp_map(A: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global, + B: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global): """ Focus is in the use of symbolic variables in the MAP. """ @@ -249,16 +237,15 @@ def symbolic_warp_map(A: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global, B """ out_result = __reduce_add_sync(inp_mask, inp_value); """ - - B[i + j] = result + B[i + j] = result sdfg = symbolic_warp_map.to_sdfg() - A = cp.ones(ns, dtype=cp.uint32) - B = cp.zeros(ns, dtype=cp.uint32) + A = cp.ones(ns, dtype=cp.uint32) + B = cp.zeros(ns, dtype=cp.uint32) - sdfg(A=A, B=B, START= start, WS=ws, STRIDE=stride, BS=bs, NS=ns) + sdfg(A=A, B=B, START=start, WS=ws, STRIDE=stride, BS=bs, NS=ns) expected = cp.full(ns, 0, dtype=cp.uint32) for block_start in range(0, ns, bs): @@ -270,22 +257,19 @@ def symbolic_warp_map(A: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global, B cp.testing.assert_array_equal(B, expected) - - - - - @pytest.mark.gpu def test_dynamic_warpSize_warp_map(): - STRIDE = 3 # just smth else than 1, 1 is easy to pass + STRIDE = 3 # just smth else than 1, 1 is easy to pass BS = dace.symbol('BS') NS = dace.symbol('NS') bs = 1024 ns = 2024 + @dace.program - def symbolic_warp_map(A: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global): + def symbolic_warp_map(A: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global, + B: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global): """ What if warpSize is determined at runtime. 
""" @@ -303,14 +287,13 @@ def symbolic_warp_map(A: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global, B """ out_result = __reduce_add_sync(inp_mask, inp_value); """ - - B[i + j] = result + B[i + j] = result sdfg = symbolic_warp_map.to_sdfg() - A = cp.ones(ns, dtype=cp.uint32) - B = cp.zeros(ns, dtype=cp.uint32) + A = cp.ones(ns, dtype=cp.uint32) + B = cp.zeros(ns, dtype=cp.uint32) sdfg(A=A, B=B, BS=bs, NS=ns) @@ -324,8 +307,10 @@ def symbolic_warp_map(A: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global, B cp.testing.assert_array_equal(B, expected) + ####################### Testing simple warplevel programs ################# + @pytest.mark.gpu def test_warp_reduce_add(): """ @@ -341,37 +326,32 @@ def test_warp_reduce_add(): state = sdfg.add_state("main") # Generate access nodes - a_dev = sdfg.add_array("A", (32,), dace.uint32, dace.dtypes.StorageType.GPU_Global) - b_dev = sdfg.add_array("B", (32,), dace.uint32, dace.dtypes.StorageType.GPU_Global) + a_dev = sdfg.add_array("A", (32, ), dace.uint32, dace.dtypes.StorageType.GPU_Global) + b_dev = sdfg.add_array("B", (32, ), dace.uint32, dace.dtypes.StorageType.GPU_Global) a_acc = state.add_access("A") b_acc = state.add_access("B") - # Generate maps, connect entries with access data - gpu_map_entry, gpu_map_exit = state.add_map(name = "GPU_Map", - ndrange = dict(i='0:32:32'), - schedule = dace.dtypes.ScheduleType.GPU_Device) + gpu_map_entry, gpu_map_exit = state.add_map(name="GPU_Map", + ndrange=dict(i='0:32:32'), + schedule=dace.dtypes.ScheduleType.GPU_Device) state.add_edge(a_acc, None, gpu_map_entry, None, dace.memlet.Memlet('A[0:32]')) - - tblock_map_entry, tblock_map_exit = state.add_map(name = "Block_Map", - ndrange = dict(j='0:32'), - schedule = dace.dtypes.ScheduleType.GPU_ThreadBlock) + tblock_map_entry, tblock_map_exit = state.add_map(name="Block_Map", + ndrange=dict(j='0:32'), + schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock) state.add_edge(gpu_map_entry, None, tblock_map_entry, None, dace.memlet.Memlet('A[0:32]')) - tasklet, warp_scope_entry, warp_scope_exit = state.add_mapped_tasklet( name='WarpLevel_Operation', map_ranges=dict(_='0:1'), inputs=dict(inp=dace.Memlet('A[0:32]', volume=32)), - code= -""" + code=""" value = inp[j] out = __reduce_add_sync(0xFFFFFFFF, value); """, outputs=dict(out=dace.Memlet("B[j]")), - schedule=dace.dtypes.ScheduleType.GPU_Warp - ) + schedule=dace.dtypes.ScheduleType.GPU_Warp) state.add_edge(tblock_map_entry, None, warp_scope_entry, None, dace.memlet.Memlet('A[0:32]')) @@ -382,8 +362,8 @@ def test_warp_reduce_add(): sdfg.fill_scope_connectors() - A = cp.ones(32, dtype=cp.uint32) - B = cp.zeros(32, dtype=cp.uint32) + A = cp.ones(32, dtype=cp.uint32) + B = cp.zeros(32, dtype=cp.uint32) sdfg(A=A, B=B) @@ -391,7 +371,6 @@ def test_warp_reduce_add(): cp.testing.assert_array_equal(B, all_32) - @pytest.mark.gpu def test_warp_shfl_op(): """ @@ -403,31 +382,27 @@ def test_warp_shfl_op(): state = sdfg.add_state("main") # Generate access nodes - a_dev = sdfg.add_array("A", (32,), dace.uint32, dace.dtypes.StorageType.GPU_Global) - b_dev = sdfg.add_array("B", (32,), dace.uint32, dace.dtypes.StorageType.GPU_Global) + a_dev = sdfg.add_array("A", (32, ), dace.uint32, dace.dtypes.StorageType.GPU_Global) + b_dev = sdfg.add_array("B", (32, ), dace.uint32, dace.dtypes.StorageType.GPU_Global) a_acc = state.add_access("A") b_acc = state.add_access("B") - # Generate maps, connect entries with access data - gpu_map_entry, gpu_map_exit = state.add_map(name = "GPU_Map", - ndrange = dict(i='0:32:32'), - schedule = 
dace.dtypes.ScheduleType.GPU_Device) + gpu_map_entry, gpu_map_exit = state.add_map(name="GPU_Map", + ndrange=dict(i='0:32:32'), + schedule=dace.dtypes.ScheduleType.GPU_Device) state.add_edge(a_acc, None, gpu_map_entry, None, dace.memlet.Memlet('A[0:32]')) - - tblock_map_entry, tblock_map_exit = state.add_map(name = "Block_Map", - ndrange = dict(j='0:32'), - schedule = dace.dtypes.ScheduleType.GPU_ThreadBlock) + tblock_map_entry, tblock_map_exit = state.add_map(name="Block_Map", + ndrange=dict(j='0:32'), + schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock) state.add_edge(gpu_map_entry, None, tblock_map_entry, None, dace.memlet.Memlet('A[0:32]')) - tasklet, warp_scope_entry, warp_scope_exit = state.add_mapped_tasklet( name='WarpLevel_Operation', map_ranges=dict(_='0:1'), inputs=dict(inp=dace.Memlet('A[0:32]', volume=32)), - code= -""" + code=""" tid = j; value = inp[tid]; up = __shfl_down_sync(0xFFFFFFFF, value, 16); @@ -440,8 +415,7 @@ def test_warp_shfl_op(): """, outputs=dict(out=dace.Memlet("B[j]")), - schedule=dace.dtypes.ScheduleType.GPU_Warp - ) + schedule=dace.dtypes.ScheduleType.GPU_Warp) state.add_edge(tblock_map_entry, None, warp_scope_entry, None, dace.memlet.Memlet('A[0:32]')) @@ -458,18 +432,14 @@ def test_warp_shfl_op(): sdfg(A=A, B=B) expected = cp.array(cp.concatenate((A[16:32], A[0:16]))) - cp.testing.assert_array_equal(B,expected) - - - - + cp.testing.assert_array_equal(B, expected) if __name__ == '__main__': - + # Warnings are ignored #test_warp_map(0, 32, 1) pytest.main(["-v", "-p", "no:warnings", __file__]) # Use this if you want to see the warning - # pytest.main(["-v", __file__]) \ No newline at end of file + # pytest.main(["-v", __file__]) diff --git a/berkay_workpace/tests/gpu_map_tests/device_map_test.py b/berkay_workpace/tests/gpu_map_tests/device_map_test.py index 2cbf2a51e4..618b3073f4 100644 --- a/berkay_workpace/tests/gpu_map_tests/device_map_test.py +++ b/berkay_workpace/tests/gpu_map_tests/device_map_test.py @@ -7,9 +7,10 @@ @pytest.mark.gpu -@pytest.mark.parametrize("vec_size", [0, 15, 32, 67]) # default block size is 32, so these parameters handle interesting groups +@pytest.mark.parametrize("vec_size", + [0, 15, 32, 67]) # default block size is 32, so these parameters handle interesting groups def test_1d_maps_fixed_sizes(vec_size): - """ + """ Tests flat 1D vector copy from B to A using a single GPU_Device map (no thread blocking) for fixed size arrays. The vector sizes are chosen to cover interesting cases considering a default block size is 32. """ @@ -37,21 +38,20 @@ def vector_copy_flat(A: dace.float64[vec_size] @ dace.dtypes.StorageType.GPU_Glo cp.testing.assert_array_equal(A, B) - - @pytest.mark.gpu @pytest.mark.parametrize("n", [0, 15, 32, 67]) def test_1d_maps_dynamic_sizes(n): - """ + """ Tests flat 1D vector copy from B to A using a single GPU_Device map (no thread blocking) for variable size arrays. The vector sizes are chosen to cover interesting cases considering a default block size is 32. 
""" N = dace.symbol('N') @dace.program - def vector_copy_dyn_sizes(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global): + def vector_copy_dyn_sizes(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, + B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global): for i in dace.map[0:N] @ dace.dtypes.ScheduleType.GPU_Device: - A[i] = B[i] + A[i] = B[i] sdfg = vector_copy_dyn_sizes.to_sdfg() @@ -69,11 +69,10 @@ def vector_copy_dyn_sizes(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Globa cp.testing.assert_array_equal(A, B) - @pytest.mark.gpu @pytest.mark.parametrize("s", [1, 2, 32, 33]) def test_1d_maps_strides(s): - """ + """ Tests flat 1D vector copy from B to A using a single GPU_Device map (no thread blocking) for different strides. N is variable in the sdfg/code but we just test for N = 67 here. """ @@ -81,9 +80,10 @@ def test_1d_maps_strides(s): n = 67 @dace.program - def vector_copy_strides(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global): + def vector_copy_strides(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, + B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global): for i in dace.map[0:N:s] @ dace.dtypes.ScheduleType.GPU_Device: - A[i] = B[i] + A[i] = B[i] sdfg = vector_copy_strides.to_sdfg() @@ -106,7 +106,6 @@ def vector_copy_strides(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, cp.testing.assert_array_equal(A[mask], cp.zeros_like(A[mask])) - @pytest.mark.gpu @pytest.mark.parametrize("shape", [(2, 16), (3, 32)]) def test_2d_maps_dynamic_sizes(shape): @@ -139,17 +138,16 @@ def matrix_copy(A: dace.float64[M, N] @ dace.dtypes.StorageType.GPU_Global, cp.testing.assert_array_equal(A, B) - # higher dimensions in old tests - - if __name__ == '__main__': - print(f"\n\n\033[94m[INFO] You are using the \033[92m{Config.get('compiler', 'cuda', 'implementation')}\033[94m CUDA implementation.\033[0m \n\n") - + print( + f"\n\n\033[94m[INFO] You are using the \033[92m{Config.get('compiler', 'cuda', 'implementation')}\033[94m CUDA implementation.\033[0m \n\n" + ) + # Warnings are ignored pytest.main(["-v", "-p", "no:warnings", __file__]) # Use this if you want to see the warning - # pytest.main(["-v", __file__]) \ No newline at end of file + # pytest.main(["-v", __file__]) diff --git a/berkay_workpace/tests/gpu_map_tests/threadBlock_test.py b/berkay_workpace/tests/gpu_map_tests/threadBlock_test.py index 93ca7e757f..9898875565 100644 --- a/berkay_workpace/tests/gpu_map_tests/threadBlock_test.py +++ b/berkay_workpace/tests/gpu_map_tests/threadBlock_test.py @@ -6,6 +6,7 @@ # More tests at old tests, see /reusable_test + @pytest.mark.gpu @pytest.mark.parametrize("vec_size, block_size, stride", [ (32, 32, 2), @@ -47,13 +48,11 @@ def vector_copy_strided(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, cp.testing.assert_array_equal(A[mask], cp.zeros_like(A[mask])) - - @pytest.mark.gpu @pytest.mark.parametrize("n", [40, 64, 100, 128, 149]) def test_skewed_like_map_range_flat_add(n): """ - Tests vector addition C = A + B using a skewed-style inner map: + Tests vector addition C = A + B using a skewed-style inner map: outer GPU_Device map over blocks of size 32, and inner GPU_ThreadBlock map over absolute indices. 
""" @@ -83,14 +82,14 @@ def vadd_flat_skew_like(A: dace.float32[N] @ dace.StorageType.GPU_Global, cp.testing.assert_allclose(C, C_expected, rtol=1e-5, err_msg=f"Mismatch in output vector C for n={n}") - - if __name__ == '__main__': - print(f"\n\n\033[94m[INFO] You are using the \033[92m{Config.get('compiler', 'cuda', 'implementation')}\033[94m CUDA implementation.\033[0m \n\n") - + print( + f"\n\n\033[94m[INFO] You are using the \033[92m{Config.get('compiler', 'cuda', 'implementation')}\033[94m CUDA implementation.\033[0m \n\n" + ) + # Warnings are ignored pytest.main(["-v", "-p", "no:warnings", __file__]) # Use this if you want to see the warning - # pytest.main(["-v", __file__]) \ No newline at end of file + # pytest.main(["-v", __file__]) diff --git a/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py b/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py index 85c9c41147..509aa767e8 100644 --- a/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py +++ b/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py @@ -4,10 +4,9 @@ import pytest from dace.codegen import common - """ NOTE: -This test suite focuses on GPU memory copies that are generated outside the kernel code using DaCe and aims to +This test suite focuses on GPU memory copies that are generated outside the kernel code using DaCe and aims to remain backend-agnostic (CUDA/HIP). While HIP support has not been verified, care was taken to ensure tests are not backend-specific. @@ -15,7 +14,7 @@ - A small number of test cases is used intentionally to avoid redundancy while still covering a broad set of scenarios. - The test set alternates between different offsets, symbolic sizes, fixed sizes and different locations of the source and destination (GPU or CPU) to simulate common usage patterns. -- At the time of writing, the DaCe Python frontend does not correctly translate some advanced slicing patterns +- At the time of writing, the DaCe Python frontend does not correctly translate some advanced slicing patterns (e.g., `dst[b1:e1:s1] = src[b2:e2:s2]`) into valid SDFG representations. Therefore, such cases are implemented directly through the SDFG API for full control and correctness. """ @@ -28,7 +27,7 @@ def test_1d_out_of_kernel_memcpy(): """ Test simple 1D out-of-kernel memory copy. - The size of both arrays is symbolic, both are defined on + The size of both arrays is symbolic, both are defined on the GPU. 
""" # Symbolic array size @@ -38,8 +37,8 @@ def test_1d_out_of_kernel_memcpy(): state = sdfg.add_state("main") # Access nodes - sdfg.add_array("src", (N,), dace.uint32, dace.dtypes.StorageType.GPU_Global) - sdfg.add_array("dst", (N,), dace.uint32, dace.dtypes.StorageType.GPU_Global) + sdfg.add_array("src", (N, ), dace.uint32, dace.dtypes.StorageType.GPU_Global) + sdfg.add_array("dst", (N, ), dace.uint32, dace.dtypes.StorageType.GPU_Global) src_acc = state.add_access("src") dst_acc = state.add_access("dst") @@ -48,7 +47,7 @@ def test_1d_out_of_kernel_memcpy(): sdfg.fill_scope_connectors() # Check correctness - + # Initialize arrays on GPU n = 100 src = cp.ones(n, dtype=cp.uint32) @@ -66,6 +65,7 @@ def test_1d_out_of_kernel_memcpy(): # Check correctness cp.testing.assert_array_equal(dst, src) + @pytest.mark.gpu def test_1d_out_of_kernel_memcpy_strided(): """ @@ -78,8 +78,8 @@ def test_1d_out_of_kernel_memcpy_strided(): state = sdfg.add_state("main") # Access nodes of fixed shapes - sdfg.add_array("src", (40,), dace.uint32) - sdfg.add_array("dst", (20,), dace.uint32, dace.dtypes.StorageType.GPU_Global) + sdfg.add_array("src", (40, ), dace.uint32) + sdfg.add_array("dst", (20, ), dace.uint32, dace.dtypes.StorageType.GPU_Global) src_acc = state.add_access("src") dst_acc = state.add_access("dst") @@ -90,13 +90,13 @@ def test_1d_out_of_kernel_memcpy_strided(): # Check correctness # Initialize arrays - src = np.ones(40, dtype=cp.uint32) + src = np.ones(40, dtype=cp.uint32) dst = cp.zeros(20, dtype=cp.uint32) # Run program sdfg(src=src, dst=dst) - # Check generated code for expected memcpy usage + # Check generated code for expected memcpy usage # NOTE: Memcpy2DAsync is used! Check the codegen, neat trick :) func_name = f"{BACKEND}Memcpy2DAsync" kind = f"{BACKEND}MemcpyHostToDevice" @@ -108,12 +108,13 @@ def test_1d_out_of_kernel_memcpy_strided(): expected[::2] = 1 cp.testing.assert_array_equal(expected, dst) + #------------------ 2D Memory Copy Tests ----------------------- @pytest.mark.gpu def test_2d_out_of_kernel_memcpy(): """ Test 2D out-of-kernel memcpy. - Here, the copy shape is contigous (copy contiguous src to contigous dst), + Here, the copy shape is contigous (copy contiguous src to contigous dst), we use fixed sizes and only copy a subset of the array. Source is on GPU, destination an array on CPU. 
""" @@ -121,8 +122,14 @@ def test_2d_out_of_kernel_memcpy(): state = sdfg.add_state("main") # Access nodes of fixed shape (5,10) - sdfg.add_array("src", (5,10,), dace.uint32, dace.dtypes.StorageType.GPU_Global) - sdfg.add_array("dst", (5,10,), dace.uint32) + sdfg.add_array("src", ( + 5, + 10, + ), dace.uint32, dace.dtypes.StorageType.GPU_Global) + sdfg.add_array("dst", ( + 5, + 10, + ), dace.uint32) src_acc = state.add_access("src") dst_acc = state.add_access("dst") @@ -133,23 +140,24 @@ def test_2d_out_of_kernel_memcpy(): # Check correctness # Initialize arrays - src = cp.ones((5,10), dtype=cp.uint32) - dst = np.zeros((5,10), dtype=cp.uint32) + src = cp.ones((5, 10), dtype=cp.uint32) + dst = np.zeros((5, 10), dtype=cp.uint32) # Run program sdfg(src=src, dst=dst) - # Check generated code for expected memcpy usage + # Check generated code for expected memcpy usage func_name = f"{BACKEND}Memcpy2DAsync" kind = f"{BACKEND}MemcpyDeviceToHost" code = sdfg.generate_code()[0].code assert func_name in code and kind in code #Check whether result is as expected - expected = np.zeros((5,10), dtype=cp.uint32) + expected = np.zeros((5, 10), dtype=cp.uint32) expected[2:4, 5:8] = 1 np.testing.assert_array_equal(dst, expected) + @pytest.mark.gpu def test_2d_out_of_kernel_memcpy_one_strided(): """ @@ -164,8 +172,14 @@ def test_2d_out_of_kernel_memcpy_one_strided(): state = sdfg.add_state("main") # Access nodes - sdfg.add_array("src", (N,2*M,), dace.uint32) - sdfg.add_array("dst", (N,M,), dace.uint32, dace.dtypes.StorageType.GPU_Global) + sdfg.add_array("src", ( + N, + 2 * M, + ), dace.uint32) + sdfg.add_array("dst", ( + N, + M, + ), dace.uint32, dace.dtypes.StorageType.GPU_Global) src_acc = state.add_access("src") dst_acc = state.add_access("dst") @@ -178,22 +192,23 @@ def test_2d_out_of_kernel_memcpy_one_strided(): # Initialize arrays n = 3 m = 10 - src = np.ones((n,2*m), dtype=cp.uint32) - dst = cp.zeros((n,m), dtype=cp.uint32) + src = np.ones((n, 2 * m), dtype=cp.uint32) + dst = cp.zeros((n, m), dtype=cp.uint32) # Run program sdfg(src=src, dst=dst, N=n, M=m) - # Check generated code for expected memcpy usage + # Check generated code for expected memcpy usage func_name = f"{BACKEND}Memcpy2DAsync" kind = f"{BACKEND}MemcpyHostToDevice" code = sdfg.generate_code()[0].code assert func_name in code and kind in code #Check whether result is as expected - expected = cp.ones((n,m), dtype=cp.uint32) + expected = cp.ones((n, m), dtype=cp.uint32) cp.testing.assert_array_equal(dst, expected) + @pytest.mark.gpu def test_2d_oofkmemcpy_strided(): """ @@ -204,8 +219,14 @@ def test_2d_oofkmemcpy_strided(): state = sdfg.add_state("main") # Access nodes - sdfg.add_array("src", (2,20,), dace.uint32, dace.dtypes.StorageType.GPU_Global) - sdfg.add_array("dst", (2,10,), dace.uint32, dace.dtypes.StorageType.GPU_Global) + sdfg.add_array("src", ( + 2, + 20, + ), dace.uint32, dace.dtypes.StorageType.GPU_Global) + sdfg.add_array("dst", ( + 2, + 10, + ), dace.uint32, dace.dtypes.StorageType.GPU_Global) src_acc = state.add_access("src") dst_acc = state.add_access("dst") @@ -216,17 +237,18 @@ def test_2d_oofkmemcpy_strided(): # Check correctness # Initialize arrays - src = cp.ones((2,20), dtype=cp.uint32) - dst = cp.zeros((2,10), dtype=cp.uint32) + src = cp.ones((2, 20), dtype=cp.uint32) + dst = cp.zeros((2, 10), dtype=cp.uint32) # Execute program sdfg(src=src, dst=dst) # Compute expected result & verify - expected = cp.zeros((2,10), dtype=cp.uint32) + expected = cp.zeros((2, 10), dtype=cp.uint32) expected[0:2, 0:10:5] = src[0:2, 
0:20:10] cp.testing.assert_array_equal(dst, expected) + # ---------- Higher-Dimensional (>2D) Memory Copy Tests -------- @pytest.mark.gpu def test_3d_oofkmemcpy(): @@ -238,8 +260,8 @@ def test_3d_oofkmemcpy(): state = sdfg.add_state("main") # Access nodes - sdfg.add_array("src", (2,2,4), dace.uint32, dace.dtypes.StorageType.GPU_Global) - sdfg.add_array("dst", (2,2,4), dace.uint32, dace.dtypes.StorageType.GPU_Global) + sdfg.add_array("src", (2, 2, 4), dace.uint32, dace.dtypes.StorageType.GPU_Global) + sdfg.add_array("dst", (2, 2, 4), dace.uint32, dace.dtypes.StorageType.GPU_Global) src_acc = state.add_access("src") dst_acc = state.add_access("dst") @@ -250,9 +272,9 @@ def test_3d_oofkmemcpy(): # Check correctness # Initialize arrays - src = cp.ones((2,2,4), dtype=cp.uint32) - dst = cp.zeros((2,2,4), dtype=cp.uint32) + src = cp.ones((2, 2, 4), dtype=cp.uint32) + dst = cp.zeros((2, 2, 4), dtype=cp.uint32) # run and check sdfg(src=src, dst=dst) - cp.testing.assert_array_equal(dst, src) \ No newline at end of file + cp.testing.assert_array_equal(dst, src) diff --git a/berkay_workpace/tests/reusable_tests/cuda_block_test.py b/berkay_workpace/tests/reusable_tests/cuda_block_test.py index c716b7b117..aead6ea25f 100644 --- a/berkay_workpace/tests/reusable_tests/cuda_block_test.py +++ b/berkay_workpace/tests/reusable_tests/cuda_block_test.py @@ -15,7 +15,7 @@ def cudahello(V, Vout): def multiplication(i): # I don't understand why this is here - # Probably will be removed later? + # Probably will be removed later? @dace.map(_[0:32]) def mult_block(bi): in_V << V[i + bi] @@ -161,7 +161,6 @@ def tester(A: dace.float64[400, 300, 2, 32]): sdfg.compile() - """ # Not implemened @pytest.mark.gpu @@ -196,8 +195,6 @@ def tester(A: dace.float64[200]): assert np.allclose(a, ref) """ - - if __name__ == "__main__": test_cpu() test_gpu() diff --git a/berkay_workpace/tests/smem_tests/default_smem_sync_pass_test.py b/berkay_workpace/tests/smem_tests/default_smem_sync_pass_test.py index 0f11ff2764..554c7b68a2 100644 --- a/berkay_workpace/tests/smem_tests/default_smem_sync_pass_test.py +++ b/berkay_workpace/tests/smem_tests/default_smem_sync_pass_test.py @@ -4,8 +4,6 @@ from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync import pytest - - """ Simple tests checking core functionality of the "DefaultSharedMemorySync" pass. """ @@ -17,12 +15,12 @@ def test_scalar_multiplic(): Constructs an SDFG that performs scalar multiplication on a vector. In this test, a sequential loop is placed inside the GPU kernel, reusing shared memory. - As a result, the 'DefaultSharedMemorySync' pass should insert a "__syncthreads();" + As a result, the 'DefaultSharedMemorySync' pass should insert a "__syncthreads();" at the end of each iteration to ensure correctness. - Note: This test is designed to evaluate where the 'DefaultSharedMemorySync' pass places - synchronization tasklets. In this particular example, the inserted synchronizations are - not strictly necessary and could be avoided with more advanced analysis, which is beyond + Note: This test is designed to evaluate where the 'DefaultSharedMemorySync' pass places + synchronization tasklets. In this particular example, the inserted synchronizations are + not strictly necessary and could be avoided with more advanced analysis, which is beyond the scope of this pass. 
""" @@ -33,17 +31,21 @@ def test_scalar_multiplic(): state = sdfg.add_state("main") # Add arrays - sdfg.add_array("A", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global) + sdfg.add_array("A", (128, ), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global) sdfg.add_scalar("scalar", dace.uint32) - sdfg.add_array("S", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True, lifetime=dace.dtypes.AllocationLifetime.Scope) + sdfg.add_array("S", (32, ), + dace.uint32, + storage=dace.dtypes.StorageType.GPU_Shared, + transient=True, + lifetime=dace.dtypes.AllocationLifetime.Scope) # Add access nodes a_acc = state.add_read("A") a_store = state.add_write("A") scalar_acc = state.add_access("scalar") - s_acc= state.add_access("S") + s_acc = state.add_access("S") - # Sequential map (outermost) + # Sequential map (outermost) seq_map_entry, seq_map_exit = state.add_map( "seq_map", dict(k="0:4"), @@ -65,21 +67,17 @@ def test_scalar_multiplic(): ) # Add tasklets for A -> S -> B - tasklet1 = state.add_tasklet( - "addMult", - inputs={"__inp_A", "__inp_scalar"}, - outputs={"__out"}, - code="__out = __inp_A * __inp_scalar;", - language=dace.dtypes.Language.CPP - ) - - tasklet2 = state.add_tasklet( - "store_to_global", - inputs={"__inp"}, - outputs={"__out"}, - code="__out = __inp;", - language=dace.dtypes.Language.CPP - ) + tasklet1 = state.add_tasklet("addMult", + inputs={"__inp_A", "__inp_scalar"}, + outputs={"__out"}, + code="__out = __inp_A * __inp_scalar;", + language=dace.dtypes.Language.CPP) + + tasklet2 = state.add_tasklet("store_to_global", + inputs={"__inp"}, + outputs={"__out"}, + code="__out = __inp;", + language=dace.dtypes.Language.CPP) # Edges @@ -104,22 +102,20 @@ def test_scalar_multiplic(): state.add_edge(tb_map_exit, None, seq_map_exit, None, dace.Memlet("A[32 * k: 32 * (k+1)]")) state.add_edge(seq_map_exit, None, gpu_map_exit, None, dace.Memlet("A[0:128]")) state.add_edge(gpu_map_exit, None, a_store, None, dace.Memlet("A[0:128]")) - - sdfg.fill_scope_connectors() + sdfg.fill_scope_connectors() #----------------- Apply pass -------------------- DefaultSharedMemorySync().apply_pass(sdfg, None) - #----------------- Check correct insertion of sync tasklets -------------------- # s_acc has a sync tasklet successor found = None for succ in state.successors(s_acc): - if (hasattr(succ, "_label") and succ._label == "pre_sync_barrier" and - isinstance(succ, nodes.Tasklet) and "__syncthreads();" in succ.code.code): + if (hasattr(succ, "_label") and succ._label == "pre_sync_barrier" and isinstance(succ, nodes.Tasklet) + and "__syncthreads();" in succ.code.code): found = succ break @@ -128,13 +124,14 @@ def test_scalar_multiplic(): # smem is reused in seq map, so we need synchronization after each iteration found = None for pred in state.predecessors(seq_map_exit): - if (hasattr(pred, "_label") and pred._label == "post_sync_barrier" and - isinstance(pred, nodes.Tasklet) and "__syncthreads();" in pred.code.code): + if (hasattr(pred, "_label") and pred._label == "post_sync_barrier" and isinstance(pred, nodes.Tasklet) + and "__syncthreads();" in pred.code.code): found = pred break assert found is not None, "There should be a synchronization tasklet after each iteration of the sequential map" + @pytest.mark.gpu def test_scalar_multiplic_special(): """ @@ -153,17 +150,21 @@ def test_scalar_multiplic_special(): state = sdfg.add_state("main") # Add arrays - sdfg.add_array("A", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global) + sdfg.add_array("A", (32, ), 
dace.uint32, storage=dace.dtypes.StorageType.GPU_Global) sdfg.add_scalar("scalar", dace.uint32) - sdfg.add_array("S", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True, lifetime=dace.dtypes.AllocationLifetime.Scope) + sdfg.add_array("S", (32, ), + dace.uint32, + storage=dace.dtypes.StorageType.GPU_Shared, + transient=True, + lifetime=dace.dtypes.AllocationLifetime.Scope) # Add access nodes a_acc = state.add_read("A") a_store = state.add_write("A") scalar_acc = state.add_access("scalar") - s_acc= state.add_access("S") + s_acc = state.add_access("S") - # Sequential map (outermost) + # Sequential map (outermost) seq_map_entry, seq_map_exit = state.add_map( "seq_map", dict(k="0:1"), @@ -185,21 +186,17 @@ def test_scalar_multiplic_special(): ) # Add tasklets for A -> S -> B - tasklet1 = state.add_tasklet( - "addMult", - inputs={"__inp_A", "__inp_scalar"}, - outputs={"__out"}, - code="__out = __inp_A * __inp_scalar;", - language=dace.dtypes.Language.CPP - ) - - tasklet2 = state.add_tasklet( - "store_to_global", - inputs={"__inp"}, - outputs={"__out"}, - code="__out = __inp;", - language=dace.dtypes.Language.CPP - ) + tasklet1 = state.add_tasklet("addMult", + inputs={"__inp_A", "__inp_scalar"}, + outputs={"__out"}, + code="__out = __inp_A * __inp_scalar;", + language=dace.dtypes.Language.CPP) + + tasklet2 = state.add_tasklet("store_to_global", + inputs={"__inp"}, + outputs={"__out"}, + code="__out = __inp;", + language=dace.dtypes.Language.CPP) # Edges @@ -224,22 +221,20 @@ def test_scalar_multiplic_special(): state.add_edge(tb_map_exit, None, seq_map_exit, None, dace.Memlet("A[32 * k: 32 * (k+1)]")) state.add_edge(seq_map_exit, None, gpu_map_exit, None, dace.Memlet("A[0:32]")) state.add_edge(gpu_map_exit, None, a_store, None, dace.Memlet("A[0:32]")) - - sdfg.fill_scope_connectors() + sdfg.fill_scope_connectors() #----------------- Apply pass -------------------- DefaultSharedMemorySync().apply_pass(sdfg, None) - #----------------- Check correct insertion of sync tasklets -------------------- # s_acc has a sync tasklet successor found = None for succ in state.successors(s_acc): - if (hasattr(succ, "_label") and succ._label == "pre_sync_barrier" and - isinstance(succ, nodes.Tasklet) and "__syncthreads();" in succ.code.code): + if (hasattr(succ, "_label") and succ._label == "pre_sync_barrier" and isinstance(succ, nodes.Tasklet) + and "__syncthreads();" in succ.code.code): found = succ break @@ -248,13 +243,14 @@ def test_scalar_multiplic_special(): # smem is NOT reused in seq map found = None for pred in state.predecessors(seq_map_exit): - if (hasattr(pred, "_label") and pred._label == "post_sync_barrier" and - isinstance(pred, nodes.Tasklet) and "__syncthreads();" in pred.code.code): + if (hasattr(pred, "_label") and pred._label == "post_sync_barrier" and isinstance(pred, nodes.Tasklet) + and "__syncthreads();" in pred.code.code): found = pred break assert found is None, "The DefaultSharedMemorySync pass should not have inserted at the end of the sequential map body" + @pytest.mark.gpu def test_scalar_multiplic_loopRegion(): """ @@ -270,19 +266,17 @@ def test_scalar_multiplic_loopRegion(): state = sdfg.add_state("main") # Arrays and access nodes - sdfg.add_array("A", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global) + sdfg.add_array("A", (128, ), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global) sdfg.add_scalar("scalar", dace.uint32) a_acc = state.add_read("A") a_store = state.add_write("A") scalar_acc = state.add_access("scalar") # Device and 
thread-block maps - gpu_map_entry, gpu_map_exit = state.add_map( - "gpu_map", dict(i="0:32:32"), schedule=dace.dtypes.ScheduleType.GPU_Device - ) - tb_map_entry, tb_map_exit = state.add_map( - "tb", dict(j="0:32"), schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock - ) + gpu_map_entry, gpu_map_exit = state.add_map("gpu_map", + dict(i="0:32:32"), + schedule=dace.dtypes.ScheduleType.GPU_Device) + tb_map_entry, tb_map_exit = state.add_map("tb", dict(j="0:32"), schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock) # Nested SDFG setup inner_sdfg = dace.SDFG('nested_sdfg') @@ -293,22 +287,22 @@ def test_scalar_multiplic_loopRegion(): inner_state = loopreg.add_state("use_smem") # Shared memory and result - inner_sdfg.add_array("S", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True) + inner_sdfg.add_array("S", (32, ), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True) inner_sdfg.add_scalar("tmp_ret", dace.uint32) s_acc = inner_state.add_access("S") ret = inner_state.add_write("tmp_ret") # Tasklets - tasklet1 = inner_state.add_tasklet( - "assign_to_smem", inputs={}, outputs={"__out1"}, - code="__out1 = __inp_A[j + 32 * k]", - language=dace.dtypes.Language.CPP - ) - tasklet2 = inner_state.add_tasklet( - "addMult", inputs={"__inp2"}, outputs={"__out2"}, - code="__out2 = __inp2 * __inp_scalar;", - language=dace.dtypes.Language.CPP - ) + tasklet1 = inner_state.add_tasklet("assign_to_smem", + inputs={}, + outputs={"__out1"}, + code="__out1 = __inp_A[j + 32 * k]", + language=dace.dtypes.Language.CPP) + tasklet2 = inner_state.add_tasklet("addMult", + inputs={"__inp2"}, + outputs={"__out2"}, + code="__out2 = __inp2 * __inp_scalar;", + language=dace.dtypes.Language.CPP) # Main SDFG edges state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet("A[0:128]")) @@ -332,7 +326,6 @@ def test_scalar_multiplic_loopRegion(): DefaultSharedMemorySync().apply_pass(sdfg, None) - #----------------- Check correct insertion of sync tasklets -------------------- try: @@ -341,4 +334,4 @@ def test_scalar_multiplic_loopRegion(): assert "__syncthreads();" in post_sync_tasklet.code.code, "Post synchronization tasklet is not correctly inserted" except: # Any other weird failures - assert False, "Post synchronization tasklet is not correctly inserted" \ No newline at end of file + assert False, "Post synchronization tasklet is not correctly inserted" diff --git a/berkay_workpace/tests/smem_tests/gemm_test.py b/berkay_workpace/tests/smem_tests/gemm_test.py index 925d4aac5d..f6c7b9b081 100644 --- a/berkay_workpace/tests/smem_tests/gemm_test.py +++ b/berkay_workpace/tests/smem_tests/gemm_test.py @@ -13,7 +13,8 @@ def test_gemm(): of a GEMM SDFG using 2D block tiling with custom copy. 
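For context, a hedged sketch of the flow this test exercises: load a serialized SDFG, run it on GPU arrays, and compare against a CuPy reference. The file path, array names, and symbol names (M, N, K) below are illustrative assumptions, not taken from the serialized SDFG.

import os
import cupy as cp
import dace

# Load a serialized SDFG and validate a GEMM result against a CuPy reference.
sdfg = dace.SDFG.from_file(os.path.join("path", "to", "gemm.sdfg"))  # hypothetical path

m = n = k = 1024
A = cp.random.rand(m, k)
B = cp.random.rand(k, n)
C = cp.zeros((m, n))

# Argument and symbol names are assumptions for illustration.
sdfg(A=A, B=B, C=C, M=m, N=n, K=k)
cp.testing.assert_allclose(C, A @ B, rtol=1e-6)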
""" current_dir = os.path.dirname(os.path.abspath(__file__)) - sdfg_path = os.path.join(current_dir, '../../scratch/yakups_examples/smem_related/2d_blocktiled_gemm_with_custom_copy.sdfg') + sdfg_path = os.path.join(current_dir, + '../../scratch/yakups_examples/smem_related/2d_blocktiled_gemm_with_custom_copy.sdfg') sdfg = dace.SDFG.from_file(sdfg_path) m, n, k = 1024, 1024, 1024 diff --git a/berkay_workpace/tests/smem_tests/special_sync_pass_test.py b/berkay_workpace/tests/smem_tests/special_sync_pass_test.py index 1d56f31df9..b3e98312ea 100644 --- a/berkay_workpace/tests/smem_tests/special_sync_pass_test.py +++ b/berkay_workpace/tests/smem_tests/special_sync_pass_test.py @@ -23,7 +23,7 @@ def test_correctness_and_reuse(): size = 512 a = cp.random.rand(size, dtype=cp.float64) b = cp.random.rand(size, dtype=cp.float64) - c = cp.zeros((size,), dtype=cp.float64) + c = cp.zeros((size, ), dtype=cp.float64) # count that there is only one __syncthread(); call. You can also inspect the final SDFG in the cache for that generated_code = sdfg.generate_code()[1].clean_code diff --git a/dace/codegen/targets/__init__.py b/dace/codegen/targets/__init__.py index a0c2065524..5c9027e68e 100644 --- a/dace/codegen/targets/__init__.py +++ b/dace/codegen/targets/__init__.py @@ -9,4 +9,4 @@ from .mlir.mlir import MLIRCodeGen from .sve.codegen import SVECodeGen from .snitch import SnitchCodeGen -from .experimental_cuda import ExperimentalCUDACodeGen \ No newline at end of file +from .experimental_cuda import ExperimentalCUDACodeGen diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 7d599ed292..89bade5d7e 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -274,7 +274,7 @@ def ptr(name: str, desc: data.Data, sdfg: SDFG = None, framecode=None) -> str: # Avoid import loop from dace.codegen.targets.cuda import CUDACodeGen from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen - + # Check whether we are in kernel/ device code of GPU backend cuda_impl = Config.get('compiler', 'cuda', 'implementation') if cuda_impl == "legacy": @@ -282,7 +282,6 @@ def ptr(name: str, desc: data.Data, sdfg: SDFG = None, framecode=None) -> str: elif cuda_impl == "experimental": in_device_code = ExperimentalCUDACodeGen._in_kernel_code - if desc.storage == dtypes.StorageType.CPU_ThreadLocal: # Use unambiguous name for thread-local arrays return f'__{sdfg.cfg_id}_{name}' elif not in_device_code: # GPU kernels cannot access state diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index 4ee0f38694..da0a92e899 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -140,7 +140,6 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): # new self._kernels_with_inserted_tb_maps: Set[nodes.MapEntry] = set() - def _emit_sync(self, codestream: CodeIOStream): if Config.get_bool('compiler', 'cuda', 'syncdebug'): codestream.write('''DACE_GPU_CHECK({backend}GetLastError()); @@ -157,16 +156,16 @@ def preprocess(self, sdfg: SDFG) -> None: CUDACodeGen, 'CUDA', target_type=target_type) - - old_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) sdfg.apply_transformations_once_everywhere(AddThreadBlockMap, ) new_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) - old_nodes - self._kernels_with_inserted_tb_maps = {n for n in new_nodes if isinstance(n, nodes.MapEntry) and n.schedule == dtypes.ScheduleType.GPU_Device} - + self._kernels_with_inserted_tb_maps = { + n + for n in new_nodes if isinstance(n, 
nodes.MapEntry) and n.schedule == dtypes.ScheduleType.GPU_Device + } # Find GPU<->GPU strided copies that cannot be represented by a single copy command for e, state in list(sdfg.all_edges_recursive()): @@ -2056,7 +2055,8 @@ def get_kernel_dimensions(self, dfg_scope): # Error when both user has manually set gpu_block_size and thread-block maps were defined and conflict in block size preset_block_size = kernelmap_entry.map.gpu_block_size - conflicting_block_sizes = (preset_block_size is not None) and not (kernelmap_entry in self._kernels_with_inserted_tb_maps) + conflicting_block_sizes = (preset_block_size + is not None) and not (kernelmap_entry in self._kernels_with_inserted_tb_maps) if conflicting_block_sizes: raise ValueError('Both the `gpu_block_size` property and internal thread-block ' diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 80e36bdbac..fd676e338f 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -22,13 +22,8 @@ from dace.codegen.dispatcher import DefinedType, TargetDispatcher from dace.codegen.prettycode import CodeIOStream from dace.codegen.common import update_persistent_desc -from dace.codegen.targets.cpp import ( - codeblock_to_cpp, - memlet_copy_to_absolute_strides, - mangle_dace_state_struct_name, - ptr, - sym2cpp -) +from dace.codegen.targets.cpp import (codeblock_to_cpp, memlet_copy_to_absolute_strides, mangle_dace_state_struct_name, + ptr, sym2cpp) from dace.codegen.targets.target import IllegalCopy, TargetCodeGenerator, make_absolute # DaCe transformation imports @@ -48,36 +43,31 @@ from dace.codegen.targets.framecode import DaCeCodeGenerator from dace.codegen.targets.cpu import CPUCodeGen - # add symbolic_to_cpp ! # TODO's harder: # 1. 
Include constant expressions - @registry.autoregister_params(name='experimental_cuda') class ExperimentalCUDACodeGen(TargetCodeGenerator): """ Experimental CUDA code generator.""" target_name = 'experimental_cuda' title = 'CUDA' - ########################################################################### - # Initialization & Preprocessing + # Initialization & Preprocessing def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): - self._frame: DaCeCodeGenerator = frame_codegen # creates the frame code, orchestrates the code generation for targets - self._dispatcher: TargetDispatcher = frame_codegen.dispatcher # responsible for dispatching code generation to the appropriate target + self._frame: DaCeCodeGenerator = frame_codegen # creates the frame code, orchestrates the code generation for targets + self._dispatcher: TargetDispatcher = frame_codegen.dispatcher # responsible for dispatching code generation to the appropriate target - - ExperimentalCUDACodeGen._in_kernel_code = False + ExperimentalCUDACodeGen._in_kernel_code = False self._cpu_codegen: Optional['CPUCodeGen'] = None - # NOTE: Moved from preprossessing to here - self.backend: str = common.get_gpu_backend() + self.backend: str = common.get_gpu_backend() self.language = 'cu' if self.backend == 'cuda' else 'cpp' target_type = '' if self.backend == 'cuda' else self.backend self._codeobject = CodeObject(sdfg.name + '_' + 'cuda', @@ -87,8 +77,6 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): 'CUDA', target_type=target_type) - - self._localcode = CodeIOStream() self._globalcode = CodeIOStream() @@ -99,13 +87,11 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._global_sdfg: SDFG = sdfg self._toplevel_schedule = None - # Positions at which to deallocate memory pool arrays self.pool_release: Dict[Tuple[SDFG, str], Tuple[SDFGState, Set[nodes.Node]]] = {} self.has_pool = False - - # INFO: + # INFO: # Register GPU schedules and storage types for ExperimentalCUDACodeGen. # The dispatcher maps GPU-related schedules and storage types to the # appropriate code generation functions in this code generator. @@ -128,16 +114,14 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._dispatcher.register_copy_dispatcher(storage, other_storage, None, self) self._dispatcher.register_copy_dispatcher(other_storage, storage, None, self) - - # NOTE: + # NOTE: # "Register illegal copies" code NOT copied from cuda.py # Behavior unclear for me yet. - ################## New variables ########################## self._current_kernel_spec: Optional[KernelSpec] = None - self._gpu_stream_manager: Optional[GPUStreamManager] = None + self._gpu_stream_manager: Optional[GPUStreamManager] = None self._kernel_dimensions_map: Set[nodes.MapEntry] = set() def preprocess(self, sdfg: SDFG) -> None: @@ -147,13 +131,13 @@ def preprocess(self, sdfg: SDFG) -> None: every Kernel in the SDFG - Handling GPU<->GPU strided copies. - Assigning backend GPU streams (e.g., CUDA streams) and creating the GPUStreamManager. - - Handling memory pool management + - Handling memory pool management """ - + #----------------- Add ThreadBlock Maps & Infer Kernel Grid & Block Sizes -------------------- - # new_nodes - old_nodes gives us all Kernel Entry nodes that were created during the insertion - # of ThreadBlock maps. Note: the original Kernel Entry was transformed into a ThreadBlock map, + # new_nodes - old_nodes gives us all Kernel Entry nodes that were created during the insertion + # of ThreadBlock maps. 
Note: the original Kernel Entry was transformed into a ThreadBlock map, # and a new GPU_Device (i.e., Kernel) map was inserted on top of it. old_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) @@ -161,12 +145,14 @@ def preprocess(self, sdfg: SDFG) -> None: sdfg.apply_transformations_once_everywhere(AddThreadBlockMap) new_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) - old_nodes - kernels_with_added_tb_maps = {n for n in new_nodes if isinstance(n, nodes.MapEntry) and n.schedule == dtypes.ScheduleType.GPU_Device} + kernels_with_added_tb_maps = { + n + for n in new_nodes if isinstance(n, nodes.MapEntry) and n.schedule == dtypes.ScheduleType.GPU_Device + } # Infer GPU Grid and Block dimensions self._kernel_dimensions_map = InferGPUGridAndBlockSize().apply_pass(sdfg, kernels_with_added_tb_maps) - #------------------------- Hanlde GPU<->GPU strided copies -------------------------- # Find GPU<->GPU strided copies that cannot be represented by a single copy command @@ -214,7 +200,7 @@ def preprocess(self, sdfg: SDFG) -> None: self._frame.statestruct.append('dace::cuda::Context *gpu_context;') # Define backend stream access expression (e.g., CUDA stream handle) - gpu_stream_access_template = "__state->gpu_context->streams[{gpu_stream}]" + gpu_stream_access_template = "__state->gpu_context->streams[{gpu_stream}]" # Initialize and configure GPU stream scheduling pass gpu_stream_pass = NaiveGPUStreamScheduler() @@ -278,7 +264,7 @@ def _compute_pool_release(self, top_sdfg: SDFG): if all(nx.has_path(state.nx, an2, an1) for an2 in ans if an2 is not an1): terminator = an1 break - + # Old logic below, now we use the gpu_stream manager which returns nullptr automatically # to all nodes thatdid not got assigned a cuda stream """ @@ -327,42 +313,39 @@ def _compute_pool_release(self, top_sdfg: SDFG): for arr in unfreed: self.pool_release[(sdfg, arr)] = (sink, set()) - ########################################################################### # Determine wheter initializer and finalizer should be called @property def has_initializer(self) -> bool: return True + @property def has_finalizer(self) -> bool: return True - ########################################################################### - # Scope generation + # Scope generation def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - # Import strategies here to avoid circular dependencies - from dace.codegen.targets.experimental_cuda_helpers.scope_strategies import ( - ScopeGenerationStrategy, - KernelScopeGenerator, - ThreadBlockScopeGenerator, - WarpScopeGenerator - ) - + from dace.codegen.targets.experimental_cuda_helpers.scope_strategies import (ScopeGenerationStrategy, + KernelScopeGenerator, + ThreadBlockScopeGenerator, + WarpScopeGenerator) #--------------- Start of Kernel Function Code Generation -------------------- if not ExperimentalCUDACodeGen._in_kernel_code: # Prepare and cache kernel metadata (name, dimensions, arguments, etc.) 
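For orientation, a hedged, distilled view of the metadata cached in the spec constructed just below, inferred only from the properties this file reads back later; the real KernelSpec derives these values from the kernel's map scope rather than taking them as fields.

from dataclasses import dataclass, field
from typing import List

# Sketch only: field names mirror the KernelSpec properties used in this file,
# values are made-up placeholders.
@dataclass
class KernelMetadataSketch:
    kernel_name: str = 'kernel_0_0_0'                                   # placeholder name
    grid_dims: List[object] = field(default_factory=lambda: [32, 1, 1])
    block_dims: List[object] = field(default_factory=lambda: [256, 1, 1])
    args_typed: List[str] = field(default_factory=list)                 # typed kernel signature entries
    args_as_input: List[str] = field(default_factory=list)              # expressions passed to the kernel
    kernel_wrapper_args_typed: List[str] = field(default_factory=list)  # signature of __dace_runkernel_*
    kernel_wrapper_args_as_input: List[str] = field(default_factory=list)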
- self._current_kernel_spec = KernelSpec( - cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id - ) + self._current_kernel_spec = KernelSpec(cudaCodeGen=self, + sdfg=sdfg, + cfg=cfg, + dfg_scope=dfg_scope, + state_id=state_id) # Generate wrapper function self._generate_kernel_wrapper(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) @@ -376,10 +359,8 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub if kernel_scope_generator.applicable(sdfg, cfg, dfg_scope, state_id, kernel_function_stream, kernel_stream): kernel_scope_generator.generate(sdfg, cfg, dfg_scope, state_id, kernel_function_stream, kernel_stream) else: - raise ValueError( - "Invalid kernel configuration: This strategy is only applicable if the " - "outermost GPU schedule is of type GPU_Device (most likely cause)." - ) + raise ValueError("Invalid kernel configuration: This strategy is only applicable if the " + "outermost GPU schedule is of type GPU_Device (most likely cause).") # Append generated kernel code to localcode self._localcode.write(kernel_stream.getvalue() + '\n') @@ -392,7 +373,6 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub return - #--------------- Nested GPU Scope -------------------- supported_strategies: List[ScopeGenerationStrategy] = [ @@ -412,140 +392,124 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub schedule_type = node.map.schedule if schedule_type == dace.ScheduleType.GPU_Device: - raise NotImplementedError( - "Dynamic parallelism (nested GPU_Device schedules) is not supported." - ) + raise NotImplementedError("Dynamic parallelism (nested GPU_Device schedules) is not supported.") raise NotImplementedError( f"Scope generation for schedule type '{schedule_type}' is not implemented in ExperimentalCUDACodeGen. " - "Please check for supported schedule types or implement the corresponding strategy." - ) - - def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, - state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - - - scope_entry = dfg_scope.source_nodes()[0] - - kernel_spec: KernelSpec = self._current_kernel_spec - kernel_name = kernel_spec.kernel_name - kernel_wrapper_args_as_input = kernel_spec.kernel_wrapper_args_as_input - kernel_wrapper_args_typed = kernel_spec.kernel_wrapper_args_typed - - # Declaration of the function which launches the kernel (C++ code) - function_stream.write('DACE_EXPORTED void __dace_runkernel_%s(%s);\n' % - (kernel_name, ', '.join(kernel_wrapper_args_typed)), cfg, state_id, scope_entry) - - # Calling the function which launches the kernel (C++ code) - callsite_stream.write( '__dace_runkernel_%s(%s);\n' % - (kernel_name, ', '.join(kernel_wrapper_args_as_input)), cfg, state_id, scope_entry) - - def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, - state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - - # NOTE: This generates the function that launches the kernel. - # Do not confuse it with CUDA's internal "LaunchKernel" API — - # the generated function *calls* that API, but we also refer to it as a "launch function". 
- - scope_entry = dfg_scope.source_nodes()[0] - - kernel_spec: KernelSpec = self._current_kernel_spec - kernel_name = kernel_spec.kernel_name - kernel_args_as_input = kernel_spec.args_as_input - kernel_launch_args_typed = kernel_spec.kernel_wrapper_args_typed - - # get kernel dimensions and transform into a c++ string - grid_dims = kernel_spec.grid_dims - block_dims = kernel_spec.block_dims - gdims = ', '.join(symbolic_to_cpp(grid_dims)) - bdims = ', '.join(symbolic_to_cpp(block_dims)) - - # cuda/hip stream the kernel belongs to - gpu_stream = self._gpu_stream_manager.get_stream_node(scope_entry) - - # ----------------- Kernel Launch Function Declaration ----------------------- - self._localcode.write( - """ + "Please check for supported schedule types or implement the corresponding strategy.") + + def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + + scope_entry = dfg_scope.source_nodes()[0] + + kernel_spec: KernelSpec = self._current_kernel_spec + kernel_name = kernel_spec.kernel_name + kernel_wrapper_args_as_input = kernel_spec.kernel_wrapper_args_as_input + kernel_wrapper_args_typed = kernel_spec.kernel_wrapper_args_typed + + # Declaration of the function which launches the kernel (C++ code) + function_stream.write( + 'DACE_EXPORTED void __dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(kernel_wrapper_args_typed)), cfg, + state_id, scope_entry) + + # Calling the function which launches the kernel (C++ code) + callsite_stream.write('__dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(kernel_wrapper_args_as_input)), + cfg, state_id, scope_entry) + + def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + + # NOTE: This generates the function that launches the kernel. + # Do not confuse it with CUDA's internal "LaunchKernel" API — + # the generated function *calls* that API, but we also refer to it as a "launch function". 
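To make the emitted text concrete, an illustrative-only sketch that fills the launch template used below with placeholder values; kernel name, dimensions, arguments, and the stream expression are made up, not taken from an SDFG.

# Pure string assembly; mirrors the {backend}LaunchKernel template emitted below.
kernel_name = 'kernel_0_0_0'                           # placeholder
gdims, bdims = '32, 1, 1', '256, 1, 1'                 # grid / block dims as C++ lists
args = ['A', 'B', 'N']                                 # call-site argument expressions
stream = '__state->gpu_context->streams[0]'
backend = 'cuda'

launch = f"""
void *{kernel_name}_args[] = {{ {', '.join('(void *)&' + a for a in args)} }};
gpuError_t __err = {backend}LaunchKernel(
    (void*){kernel_name}, dim3({gdims}), dim3({bdims}),
    {kernel_name}_args, 0, {stream});
"""
print(launch)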
+ + scope_entry = dfg_scope.source_nodes()[0] + + kernel_spec: KernelSpec = self._current_kernel_spec + kernel_name = kernel_spec.kernel_name + kernel_args_as_input = kernel_spec.args_as_input + kernel_launch_args_typed = kernel_spec.kernel_wrapper_args_typed + + # get kernel dimensions and transform into a c++ string + grid_dims = kernel_spec.grid_dims + block_dims = kernel_spec.block_dims + gdims = ', '.join(symbolic_to_cpp(grid_dims)) + bdims = ', '.join(symbolic_to_cpp(block_dims)) + + # cuda/hip stream the kernel belongs to + gpu_stream = self._gpu_stream_manager.get_stream_node(scope_entry) + + # ----------------- Kernel Launch Function Declaration ----------------------- + self._localcode.write( + """ DACE_EXPORTED void __dace_runkernel_{fname}({fargs}); void __dace_runkernel_{fname}({fargs}) - """.format(fname=kernel_name, fargs=', '.join(kernel_launch_args_typed)), - cfg, state_id, scope_entry - ) + """.format(fname=kernel_name, fargs=', '.join(kernel_launch_args_typed)), cfg, state_id, scope_entry) - # Open bracket - self._localcode.write('{', cfg, state_id, scope_entry) - - # ----------------- Guard Checks handling ----------------------- + # Open bracket + self._localcode.write('{', cfg, state_id, scope_entry) - # Ensure that iteration space is neither empty nor negative sized - single_dimchecks = [] - for gdim in grid_dims: - # Only emit a guard if we can't statically prove gdim > 0 - if (gdim > 0) != True: - single_dimchecks.append(f'(({symbolic_to_cpp(gdim)}) <= 0)') + # ----------------- Guard Checks handling ----------------------- - dimcheck = ' || '.join(single_dimchecks) + # Ensure that iteration space is neither empty nor negative sized + single_dimchecks = [] + for gdim in grid_dims: + # Only emit a guard if we can't statically prove gdim > 0 + if (gdim > 0) != True: + single_dimchecks.append(f'(({symbolic_to_cpp(gdim)}) <= 0)') - if dimcheck: - emptygrid_warning = '' - if Config.get('debugprint') == 'verbose' or Config.get_bool('compiler', 'cuda', 'syncdebug'): - emptygrid_warning = (f'printf("Warning: Skipping launching kernel \\"{kernel_name}\\" ' - 'due to an empty grid.\\n");') + dimcheck = ' || '.join(single_dimchecks) - self._localcode.write( - f''' + if dimcheck: + emptygrid_warning = '' + if Config.get('debugprint') == 'verbose' or Config.get_bool('compiler', 'cuda', 'syncdebug'): + emptygrid_warning = (f'printf("Warning: Skipping launching kernel \\"{kernel_name}\\" ' + 'due to an empty grid.\\n");') + + self._localcode.write( + f''' if ({dimcheck}) {{ {emptygrid_warning} return; }}''', cfg, state_id, scope_entry) - - - # ----------------- Kernel Launch Invocation ----------------------- - self._localcode.write( - ''' + # ----------------- Kernel Launch Invocation ----------------------- + self._localcode.write( + ''' void *{kname}_args[] = {{ {kargs} }}; gpuError_t __err = {backend}LaunchKernel( (void*){kname}, dim3({gdims}), dim3({bdims}), {kname}_args, {dynsmem}, {stream} ); - '''.format( - kname=kernel_name, - kargs=', '.join(['(void *)&' + arg for arg in kernel_args_as_input]), - gdims=gdims, - bdims=bdims, - dynsmem='0', - stream=gpu_stream, - backend=self.backend - ), - cfg, state_id, scope_entry - ) - + '''.format(kname=kernel_name, + kargs=', '.join(['(void *)&' + arg for arg in kernel_args_as_input]), + gdims=gdims, + bdims=bdims, + dynsmem='0', + stream=gpu_stream, + backend=self.backend), cfg, state_id, scope_entry) - self._localcode.write(f'DACE_KERNEL_LAUNCH_CHECK(__err, "{kernel_name}", {gdims}, {bdims});') - 
emit_sync_debug_checks(self.backend, self._localcode) + self._localcode.write(f'DACE_KERNEL_LAUNCH_CHECK(__err, "{kernel_name}", {gdims}, {bdims});') + emit_sync_debug_checks(self.backend, self._localcode) - # Close bracket - self._localcode.write('}', cfg, state_id, scope_entry) + # Close bracket + self._localcode.write('}', cfg, state_id, scope_entry) ########################################################################### # Generation of Memory Copy Logic def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, src_node: Union[nodes.Tasklet, nodes.AccessNode], dst_node: Union[nodes.CodeNode, nodes.AccessNode], - edge: Tuple[nodes.Node, str, nodes.Node, str, Memlet], - function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - - - from dace.codegen.targets.experimental_cuda_helpers.copy_strategies import ( - CopyContext, - CopyStrategy, - OutOfKernelCopyStrategy, - SyncCollaboritveGPUCopyStrategy, - AsyncCollaboritveGPUCopyStrategy, - FallBackGPUCopyStrategy - ) - - context = CopyContext(self, self._gpu_stream_manager, state_id, src_node, dst_node, edge, - sdfg, cfg, dfg, callsite_stream) + edge: Tuple[nodes.Node, str, nodes.Node, str, + Memlet], function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + + from dace.codegen.targets.experimental_cuda_helpers.copy_strategies import (CopyContext, CopyStrategy, + OutOfKernelCopyStrategy, + SyncCollaboritveGPUCopyStrategy, + AsyncCollaboritveGPUCopyStrategy, + FallBackGPUCopyStrategy) + + context = CopyContext(self, self._gpu_stream_manager, state_id, src_node, dst_node, edge, sdfg, cfg, dfg, + callsite_stream) # Order matters: fallback must come last strategies: List[CopyStrategy] = [ @@ -587,12 +551,12 @@ def node_dispatch_predicate(self, sdfg, state, node): if schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: return True - + if ExperimentalCUDACodeGen._in_kernel_code: return True - + return False - + ############################################################################# # Nested SDFG related, testing phase @@ -603,7 +567,7 @@ def generate_state(self, function_stream: CodeIOStream, callsite_stream: CodeIOStream, generate_state_footer: bool = False) -> None: - + # User frame code to generate state self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream) @@ -635,16 +599,15 @@ def generate_state(self, # Delete the handled keys here (not in the for loop, which would cause issues) for key in handled_keys: del self.pool_release[key] - + def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - # get the generating function's name gen = getattr(self, '_generate_' + type(node).__name__, False) # if it is not implemented, use generate node of cpu impl - if gen is not False: + if gen is not False: gen(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) elif type(node).__name__ == 'MapExit' and node.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: # Special case: It is a MapExit but from a GPU_schedule- the MapExit is already @@ -658,8 +621,13 @@ def generate_nsdfg_header(self, sdfg, cfg, state, state_id, node, memlet_referen sdfg, cfg, state, state_id, node, memlet_references, sdfg_label, state_struct=False) def generate_nsdfg_call(self, sdfg, cfg, state, node, memlet_references, sdfg_label): - return self._cpu_codegen.generate_nsdfg_call(sdfg, cfg, state, node, memlet_references, - 
sdfg_label, state_struct=False) + return self._cpu_codegen.generate_nsdfg_call(sdfg, + cfg, + state, + node, + memlet_references, + sdfg_label, + state_struct=False) def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node): result = self._cpu_codegen.generate_nsdfg_arguments(sdfg, cfg, dfg, state, node) @@ -677,15 +645,13 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub self._cpu_codegen.calling_codegen = old_codegen self._toplevel_schedule = old_schedule - ####################################################################### # Array Declaration, Allocation and Deallocation def declare_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, - node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, - declaration_stream: CodeIOStream) -> None: - + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream) -> None: ptrname = ptr(node.data, nodedesc, sdfg, self._frame) fsymbols = self._frame.symbols_and_constants(sdfg) @@ -697,7 +663,7 @@ def declare_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphVi if dfg and not sdutil.is_nonfree_sym_dependent(node, nodedesc, dfg, fsymbols): raise NotImplementedError( "declare_array is only for variables that require separate declaration and allocation.") - + if nodedesc.storage == dtypes.StorageType.GPU_Shared: raise NotImplementedError("Dynamic shared memory unsupported") @@ -705,13 +671,11 @@ def declare_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphVi raise ValueError("Dynamic allocation of registers is not allowed") if nodedesc.storage not in {dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned}: - raise NotImplementedError( - f"CUDA: Unimplemented storage type {nodedesc.storage.name}.") + raise NotImplementedError(f"CUDA: Unimplemented storage type {nodedesc.storage.name}.") if self._dispatcher.declared_arrays.has(ptrname): return # Already declared - # ----------------- Declaration -------------------- dataname = node.data array_ctype = f'{nodedesc.dtype.ctype} *' @@ -754,8 +718,8 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV raise NotImplementedError(f'CUDA: Unimplemented storage type {nodedesc.storage}') def _prepare_GPU_Global_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, - node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, - declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): dataname = ptr(node.data, nodedesc, sdfg, self._frame) # ------------------- Declaration ------------------- @@ -778,35 +742,27 @@ def _prepare_GPU_Global_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta gpu_stream = f'__state->gpu_context->streams[{gpu_stream}]' allocation_stream.write( f'DACE_GPU_CHECK({self.backend}MallocAsync((void**)&{dataname}, {arrsize_malloc}, {gpu_stream}));\n', - cfg, state_id, node - ) + cfg, state_id, node) emit_sync_debug_checks(self.backend, allocation_stream) else: # Strides are left to the user's discretion - allocation_stream.write( - f'DACE_GPU_CHECK({self.backend}Malloc((void**)&{dataname}, {arrsize_malloc}));\n', - cfg, state_id, node - ) + allocation_stream.write(f'DACE_GPU_CHECK({self.backend}Malloc((void**)&{dataname}, {arrsize_malloc}));\n', + cfg, state_id, node) # 
------------------- Initialization ------------------- if node.setzero: - allocation_stream.write( - f'DACE_GPU_CHECK({self.backend}Memset({dataname}, 0, {arrsize_malloc}));\n', - cfg, state_id, node - ) + allocation_stream.write(f'DACE_GPU_CHECK({self.backend}Memset({dataname}, 0, {arrsize_malloc}));\n', cfg, + state_id, node) if isinstance(nodedesc, dt.Array) and nodedesc.start_offset != 0: - allocation_stream.write( - f'{dataname} += {symbolic_to_cpp(nodedesc.start_offset)};\n', - cfg, state_id, node - ) + allocation_stream.write(f'{dataname} += {symbolic_to_cpp(nodedesc.start_offset)};\n', cfg, state_id, node) def _prepare_CPU_Pinned_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, - node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, - declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): - + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): + dataname = ptr(node.data, nodedesc, sdfg, self._frame) - + # ------------------- Declaration ------------------- array_ctype = f'{nodedesc.dtype.ctype} *' declared = self._dispatcher.declared_arrays.has(dataname) @@ -816,65 +772,49 @@ def _prepare_CPU_Pinned_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) - # ------------------- Allocation ------------------- arrsize = nodedesc.total_size arrsize_malloc = f'{symbolic_to_cpp(arrsize)} * sizeof({nodedesc.dtype.ctype})' # Strides are left to the user's discretion - allocation_stream.write( - f'DACE_GPU_CHECK({self.backend}MallocHost(&{dataname}, {arrsize_malloc}));\n', - cfg, state_id, node - ) + allocation_stream.write(f'DACE_GPU_CHECK({self.backend}MallocHost(&{dataname}, {arrsize_malloc}));\n', cfg, + state_id, node) if node.setzero: - allocation_stream.write( - f'memset({dataname}, 0, {arrsize_malloc});\n', - cfg, state_id, node - ) - + allocation_stream.write(f'memset({dataname}, 0, {arrsize_malloc});\n', cfg, state_id, node) + if nodedesc.start_offset != 0: - allocation_stream.write( - f'{dataname} += {symbolic_to_cpp(nodedesc.start_offset)};\n', - cfg, state_id, node - ) + allocation_stream.write(f'{dataname} += {symbolic_to_cpp(nodedesc.start_offset)};\n', cfg, state_id, node) def _prepare_GPU_Shared_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, - node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, - declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): dataname = ptr(node.data, nodedesc, sdfg, self._frame) arrsize = nodedesc.total_size - # ------------------- Guard checks ------------------- - if symbolic.issymbolic(arrsize, sdfg.constants): + if symbolic.issymbolic(arrsize, sdfg.constants): raise NotImplementedError('Dynamic shared memory unsupported') if nodedesc.start_offset != 0: raise NotImplementedError('Start offset unsupported for shared memory') - # ------------------- Declaration ------------------- array_ctype = f'{nodedesc.dtype.ctype} *' - declaration_stream.write( - f'__shared__ {nodedesc.dtype.ctype} {dataname}[{symbolic_to_cpp(arrsize)}];\n', - cfg, state_id, node - ) - - self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) + declaration_stream.write(f'__shared__ 
{nodedesc.dtype.ctype} {dataname}[{symbolic_to_cpp(arrsize)}];\n', cfg, + state_id, node) + self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) # ------------------- Initialization ------------------- if node.setzero: allocation_stream.write( f'dace::ResetShared<{nodedesc.dtype.ctype}, {", ".join(symbolic_to_cpp(self._current_kernel_spec.block_dims))}, {symbolic_to_cpp(arrsize)}, ' - f'1, false>::Reset({dataname});\n', - cfg, state_id, node - ) + f'1, false>::Reset({dataname});\n', cfg, state_id, node) def _prepare_Register_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, - node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, - declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): + node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, + declaration_stream: CodeIOStream, allocation_stream: CodeIOStream): dataname = ptr(node.data, nodedesc, sdfg, self._frame) @@ -883,24 +823,20 @@ def _prepare_Register_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: State raise ValueError('Dynamic allocation of registers not allowed') if nodedesc.start_offset != 0: raise NotImplementedError('Start offset unsupported for registers') - # ------------------- Declaration & Initialization ------------------- arrsize = nodedesc.total_size array_ctype = '{nodedesc.dtype.ctype} *' init_clause = ' = {0}' if node.setzero else '' - declaration_stream.write( - f'{nodedesc.dtype.ctype} {dataname}[{symbolic_to_cpp(arrsize)}]{init_clause};\n', - cfg, state_id, node - ) - + declaration_stream.write(f'{nodedesc.dtype.ctype} {dataname}[{symbolic_to_cpp(arrsize)}]{init_clause};\n', cfg, + state_id, node) + self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - dataname = ptr(node.data, nodedesc, sdfg, self._frame) @@ -917,32 +853,26 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap ) self._dispatcher.declared_arrays.remove(dataname, is_global=is_global) - # Special case: Stream if isinstance(nodedesc, dace.data.Stream): raise NotImplementedError('stream code is not implemented in ExperimentalCUDACodeGen (yet)') - + # Special case: View - no deallocation if isinstance(nodedesc, dace.data.View): return - # Main deallocation logic by storage type if nodedesc.storage == dtypes.StorageType.GPU_Global: if not nodedesc.pool: # If pooled, will be freed somewhere else - callsite_stream.write( - f'DACE_GPU_CHECK({self.backend}Free({dataname}));\n', - cfg, state_id, node - ) + callsite_stream.write(f'DACE_GPU_CHECK({self.backend}Free({dataname}));\n', cfg, state_id, node) elif nodedesc.storage == dtypes.StorageType.CPU_Pinned: - callsite_stream.write( - f'DACE_GPU_CHECK({self.backend}FreeHost({dataname}));\n', cfg, state_id, node) - + callsite_stream.write(f'DACE_GPU_CHECK({self.backend}FreeHost({dataname}));\n', cfg, state_id, node) + elif nodedesc.storage in {dtypes.StorageType.GPU_Shared, dtypes.StorageType.Register}: # No deallocation needed return - + else: raise NotImplementedError(f'Deallocation not implemented for storage type: {nodedesc.storage.name}') @@ -971,7 +901,6 @@ def get_generated_codeobjects(self): exitcode.write(codeblock_to_cpp(sd.exit_code['cuda']), sd) exitcode.write(self._exitcode.getvalue()) - # My comment: Uses GPU 
backend (NVIDIA or AMD) to get correct header files if self.backend == 'cuda': backend_header = 'cuda_runtime.h' @@ -1144,7 +1073,7 @@ def cmake_options(): host_compiler = make_absolute(Config.get("compiler", "cpu", "executable")) options.append("-DCUDA_HOST_COMPILER=\"{}\"".format(host_compiler)) - return options + return options ####################################################################### # Callback to CPU codegen @@ -1173,7 +1102,7 @@ class KernelSpec: def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int): - + # Get kernel entry/exit nodes and current state kernel_entry_node = dfg_scope.source_nodes()[0] kernel_exit_node = dfg_scope.sink_nodes()[0] @@ -1196,7 +1125,9 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro # Typed arguments and argument access as input self._args_typed: list[str] = [adata.as_arg(name=aname) for aname, adata in self._args.items()] - self._args_as_input: list[str] = [ptr(aname, adata, sdfg, cudaCodeGen._frame) for aname, adata in self._args.items()] + self._args_as_input: list[str] = [ + ptr(aname, adata, sdfg, cudaCodeGen._frame) for aname, adata in self._args.items() + ] # Used for the kernel wrapper function, be careful: a change in the name __state will probably lead to compilation errors state_param: list[str] = [f'{mangle_dace_state_struct_name(cudaCodeGen._global_sdfg)} *__state'] @@ -1213,11 +1144,10 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro if cudaCodeGen.backend not in ['cuda', 'hip']: raise ValueError(f"Unsupported backend '{cudaCodeGen.backend}' in ExperimentalCUDACodeGen. " "Only 'cuda' and 'hip' are supported.") - + warp_size_key = 'cuda_warp_size' if cudaCodeGen.backend == 'cuda' else 'hip_warp_size' self._warpSize = Config.get('compiler', 'cuda', warp_size_key) - def get_gpu_index_ctype(self, config_key='gpu_index_type') -> str: """ Retrieves the GPU index data type as a C type string (for thread, block, warp indices) @@ -1227,8 +1157,8 @@ def get_gpu_index_ctype(self, config_key='gpu_index_type') -> str: ValueError: If the configured type does not match a DaCe data type. Returns: - str: - The C type string corresponding to the configured GPU index type. + str: + The C type string corresponding to the configured GPU index type. Used for defining thread, block, and warp indices in the generated code. """ type_name = Config.get('compiler', 'cuda', config_key) @@ -1237,8 +1167,7 @@ def get_gpu_index_ctype(self, config_key='gpu_index_type') -> str: raise ValueError( f'Invalid {config_key} "{type_name}" configured (used for thread, block, and warp indices): ' 'no matching DaCe data type found.\n' - 'Please use a valid type from dace.dtypes (e.g., "int32", "uint64").' - ) + 'Please use a valid type from dace.dtypes (e.g., "int32", "uint64").') return dtype.ctype @property @@ -1253,16 +1182,16 @@ def kernel_entry_node(self) -> nodes.MapEntry: scheduled with dace.dtypes.ScheduleType.GPU_Device. """ return self._kernel_entry_node - + @property def kernel_map(self) -> nodes.Map: """Returns the kernel's map node.""" return self._kernel_entry_node.map - + @property def args_as_input(self) -> list[str]: """ - Returns the kernel function arguments formatted for use as inputs + Returns the kernel function arguments formatted for use as inputs when calling the kernel function. 
""" return self._args_as_input @@ -1270,11 +1199,11 @@ def args_as_input(self) -> list[str]: @property def args_typed(self) -> list[str]: """ - Returns the typed kernel function arguments suitable for declaring + Returns the typed kernel function arguments suitable for declaring the kernel function. Each argument includes its corresponding data type. """ return self._args_typed - + @property def kernel_wrapper_args_as_input(self) -> list[str]: """ diff --git a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py index 6932ea992b..3dcf29cd9f 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py @@ -7,7 +7,6 @@ from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen, GPUStreamManager, KernelSpec from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import product, symbolic_to_cpp, emit_sync_debug_checks - from dace.codegen.prettycode import CodeIOStream from dace.sdfg import SDFG, nodes from dace.sdfg.nodes import Node @@ -30,10 +29,11 @@ class CopyContext: what values are needed for code generation and why. This improves readability, simplifies copy emission logic, and makes future extensions easier. """ + def __init__(self, codegen: ExperimentalCUDACodeGen, gpu_stream_manager: GPUStreamManager, state_id: int, - src_node: Node, dst_node: Node, edge: Tuple[Node, str, Node, str, Memlet], sdfg: SDFG, + src_node: Node, dst_node: Node, edge: Tuple[Node, str, Node, str, Memlet], sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, callsite_stream: CodeIOStream): - + # Store general context information for the copy operation, such as: # - which code generator is responsible, # - which edge and SDFG/state context related to the copy, @@ -47,7 +47,7 @@ def __init__(self, codegen: ExperimentalCUDACodeGen, gpu_stream_manager: GPUStre self.cfg = cfg self.dfg = dfg self.callsite_stream = callsite_stream - + # Additional information frequently needed self.backend = codegen.backend self.state_dfg = cfg.state(state_id) @@ -55,17 +55,15 @@ def __init__(self, codegen: ExperimentalCUDACodeGen, gpu_stream_manager: GPUStre self.src_storage = self.get_storage_type(src_node) self.dst_storage = self.get_storage_type(dst_node) - if isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode): copy_shape, src_strides, dst_strides, src_expr, dst_expr = memlet_copy_to_absolute_strides( - codegen._dispatcher, sdfg, self.state_dfg, edge, src_node, dst_node, codegen._cpu_codegen._packed_types) + codegen._dispatcher, sdfg, self.state_dfg, edge, src_node, dst_node, codegen._cpu_codegen._packed_types) else: _, _, _, _, memlet = edge copy_shape = [symbolic.overapproximate(s) for s in memlet.subset.bounding_box_size()] - - # if src and dst node are not AccessNodes, these are undefined + + # if src and dst node are not AccessNodes, these are undefined src_strides = dst_strides = src_expr = dst_expr = None - self.copy_shape = copy_shape self.src_strides = src_strides @@ -76,21 +74,21 @@ def __init__(self, codegen: ExperimentalCUDACodeGen, gpu_stream_manager: GPUStre self.num_dims = len(copy_shape) def get_storage_type(self, node: Node): - + if isinstance(node, nodes.Tasklet): storage_type = StorageType.Register else: storage_type = node.desc(self.sdfg).storage - + return storage_type - + def get_copy_call_parameters(self) -> Tuple[str, str, str, str, str, str, any]: """ Returns all essential 
parameters required to emit a backend memory copy call. - This method determines both structural and backend-specific information - needed to perform a memory copy, including memory locations, pointer - expressions, and data types. In cases where either the source or + This method determines both structural and backend-specific information + needed to perform a memory copy, including memory locations, pointer + expressions, and data types. In cases where either the source or destination is not a data access node, pointer expressions may be unavailable. Returns @@ -112,13 +110,11 @@ def get_copy_call_parameters(self) -> Tuple[str, str, str, str, str, str, any]: ctype_src = self.src_node.desc(self.sdfg).dtype.ctype ctype_dst = self.dst_node.desc(self.sdfg).dtype.ctype ctype = ctype_dst - assert ctype_src == ctype_dst, ( - f"Source and destination data types must match for the memory copy: " - f"{ctype_src} != {ctype_dst}" - ) + assert ctype_src == ctype_dst, (f"Source and destination data types must match for the memory copy: " + f"{ctype_src} != {ctype_dst}") return self.backend, self.src_expr, self.dst_expr, src_location, dst_location, self.cudastream, ctype - + def get_transfer_layout(self) -> Tuple[list, list, list]: """ Returns layout information required for emitting a memory copy. @@ -186,10 +182,10 @@ def generate_copy(self, copy_context: CopyContext) -> None: Generates the copy code for the supported pattern. """ raise NotImplementedError('Abstract class') - + class OutOfKernelCopyStrategy(CopyStrategy): - + def applicable(self, copy_context: CopyContext) -> bool: """ Determines whether the data movement is a host<->device memory copy. @@ -201,37 +197,26 @@ def applicable(self, copy_context: CopyContext) -> bool: This check is used to detect and handle transfers between host and device memory spaces. 
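The storage-type part of this predicate can be restated as a small self-contained check; the full condition above additionally requires access nodes on both ends and that generation is not currently inside kernel code.

from dace.dtypes import StorageType

def looks_like_host_device_copy(src: StorageType, dst: StorageType) -> bool:
    # Involves GPU global or pinned memory, but is not a plain CPU-to-CPU copy.
    cpu = {StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned}
    gpu_or_pinned = {StorageType.GPU_Global, StorageType.CPU_Pinned}
    involves_gpu_or_pinned = src in gpu_or_pinned or dst in gpu_or_pinned
    cpu_to_cpu = src in cpu and dst in cpu
    return involves_gpu_or_pinned and not cpu_to_cpu

assert looks_like_host_device_copy(StorageType.CPU_Heap, StorageType.GPU_Global)
assert not looks_like_host_device_copy(StorageType.CPU_Heap, StorageType.CPU_Pinned)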
""" - + # TODO: I don't understand why all of these conditions are needed, look into it cpu_storage_types = [StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned] not_in_kernel_code = not ExperimentalCUDACodeGen._in_kernel_code - is_between_access_nodes = ( - isinstance(copy_context.src_node, nodes.AccessNode) and - isinstance(copy_context.dst_node, nodes.AccessNode) - ) - - - involves_gpu_or_pinned = ( - copy_context.src_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned) or - copy_context.dst_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned) - ) - - is_not_cpu_to_cpu = not ( - copy_context.src_storage in cpu_storage_types and - copy_context.dst_storage in cpu_storage_types - ) - - is_gpu_host_copy = ( - not_in_kernel_code and - is_between_access_nodes and - involves_gpu_or_pinned and - is_not_cpu_to_cpu - ) + is_between_access_nodes = (isinstance(copy_context.src_node, nodes.AccessNode) + and isinstance(copy_context.dst_node, nodes.AccessNode)) + + involves_gpu_or_pinned = (copy_context.src_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned) + or copy_context.dst_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned)) + + is_not_cpu_to_cpu = not (copy_context.src_storage in cpu_storage_types + and copy_context.dst_storage in cpu_storage_types) + + is_gpu_host_copy = (not_in_kernel_code and is_between_access_nodes and involves_gpu_or_pinned + and is_not_cpu_to_cpu) return is_gpu_host_copy - + def generate_copy(self, copy_context: CopyContext) -> None: """Execute host-device copy with CUDA memory operations""" @@ -240,7 +225,7 @@ def generate_copy(self, copy_context: CopyContext) -> None: if memlet.wcr is not None: src_location, dst_location = copy_context.get_memory_location() raise NotImplementedError(f'Accumulate {src_location} to {dst_location} not implemented') - + # call corresponding helper function num_dims = copy_context.num_dims if num_dims == 1: @@ -254,16 +239,16 @@ def generate_copy(self, copy_context: CopyContext) -> None: # We use library calls thus for debugging we provide sync option emit_sync_debug_checks(copy_context.backend, copy_context.callsite_stream) - + def _generate_1d_copy(self, copy_context: CopyContext) -> None: """ Emits code for a 1D memory copy between host and device using GPU backend. Uses {backend}MemcpyAsync for contiguous memory and uses {backend}Memcpy2DAsync for strided memory copies. 
""" - + # ----------- Extract relevant copy parameters -------------- - copy_shape, src_strides, dst_strides= copy_context.get_transfer_layout() + copy_shape, src_strides, dst_strides = copy_context.get_transfer_layout() backend, src_expr, dst_expr, src_location, dst_location, cudastream, ctype = \ copy_context.get_copy_call_parameters() @@ -272,16 +257,16 @@ def _generate_1d_copy(self, copy_context: CopyContext) -> None: if copy_context.is_contiguous_copy(): # Memory is linear: can use {backend}MemcpyAsync copysize = ' * '.join(symbolic_to_cpp(copy_shape)) - copysize += f' * sizeof({ctype})' + copysize += f' * sizeof({ctype})' kind = f'{backend}Memcpy{src_location}To{dst_location}' call = f'DACE_GPU_CHECK({backend}MemcpyAsync({dst_expr}, {src_expr}, {copysize}, {kind}, {cudastream}));\n' - + else: # Memory is strided: use {backend}Memcpy2DAsync with dpitch/spitch - # This allows copying a strided 1D region + # This allows copying a strided 1D region dpitch = f'{dst_strides[0]} * sizeof({ctype})' spitch = f'{src_strides[0]} * sizeof({ctype})' - width = f'sizeof({ctype})' + width = f'sizeof({ctype})' height = copy_shape[0] kind = f'{backend}Memcpy{src_location}To{dst_location}' @@ -295,12 +280,11 @@ def _generate_2d_copy(self, copy_context: CopyContext) -> None: """Generates code for a 2D copy, falling back to 1D flattening if applicable.""" # ----------- Extract relevant copy parameters -------------- - copy_shape, src_strides, dst_strides= copy_context.get_transfer_layout() + copy_shape, src_strides, dst_strides = copy_context.get_transfer_layout() backend, src_expr, dst_expr, src_location, dst_location, cudastream, ctype = \ copy_context.get_copy_call_parameters() - # ----------------- Generate backend call if supported -------------------- if copy_context.is_contiguous_copy(): @@ -311,11 +295,11 @@ def _generate_2d_copy(self, copy_context: CopyContext) -> None: kind = f'{backend}Memcpy{src_location}To{dst_location}' call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {cudastream}));\n' - - elif src_strides[-1] != 1 or dst_strides[-1] != 1: + + elif src_strides[-1] != 1 or dst_strides[-1] != 1: # TODO: Checks this, I am not sure but the old code and its description - # seems to be more complicated here than necessary.. - # But worth to mention: we essentiall flatten + # seems to be more complicated here than necessary.. + # But worth to mention: we essentiall flatten # NOTE: Special case of continuous copy # Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J] @@ -323,18 +307,17 @@ def _generate_2d_copy(self, copy_context: CopyContext) -> None: dpitch = f'{dst_strides[1]} * sizeof({ctype})' spitch = f'{src_strides[1]} * sizeof({ctype})' - width = f'sizeof({ctype})' + width = f'sizeof({ctype})' height = copy_shape[0] * copy_shape[1] kind = f'{backend}Memcpy{src_location}To{dst_location}' call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {cudastream}));\n' - + else: raise NotImplementedError( f"Unsupported 2D memory copy: shape={copy_shape}, src_strides={src_strides}, dst_strides={dst_strides}." " Please implement this case if it is valid, or raise a more descriptive error if this path should not be taken." 
) - # ----------------- Write copy call to code stream -------------------- callsite_stream, cfg, state_id, src_node, dst_node = copy_context.get_write_context() @@ -349,11 +332,10 @@ def _generate_nd_copy(self, copy_context: CopyContext) -> None: f" Source node: {copy_context.src_node} (storage: {copy_context.src_storage})\n" f" Destination node: {copy_context.dst_node} (storage: {copy_context.dst_storage})\n" f" Source strides: {copy_context.src_strides}\n" - f" Destination strides: {copy_context.dst_strides}\n" - ) - + f" Destination strides: {copy_context.dst_strides}\n") + # ----------- Extract relevant copy parameters -------------- - copy_shape, src_strides, dst_strides= copy_context.get_transfer_layout() + copy_shape, src_strides, dst_strides = copy_context.get_transfer_layout() backend, src_expr, dst_expr, src_location, dst_location, cudastream, ctype = \ copy_context.get_copy_call_parameters() @@ -367,17 +349,17 @@ def _generate_nd_copy(self, copy_context: CopyContext) -> None: for dim in range(num_dims - 2): callsite_stream.write( f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{") - + # Write Memcopy2DAsync offset_src = ' + '.join(f'(__copyidx{d} * ({s}))' for d, s in enumerate(src_strides[:-2])) offset_dst = ' + '.join(f'(__copyidx{d} * ({s}))' for d, s in enumerate(dst_strides[:-2])) - src = f'{src_expr} + {offset_src}' - dst = f'{dst_expr} + {offset_dst}' + src = f'{src_expr} + {offset_src}' + dst = f'{dst_expr} + {offset_dst}' dpitch = f'{dst_strides[-2]} + sizeof({ctype})' spitch = f'{src_strides[-2]} + sizeof({ctype})' - width = f'{copy_shape[-1]} + sizeof({ctype})' + width = f'{copy_shape[-1]} + sizeof({ctype})' height = copy_shape[-2] kind = f'{backend}Memcpy{src_location}To{dst_location}' @@ -387,14 +369,15 @@ def _generate_nd_copy(self, copy_context: CopyContext) -> None: # Write for-loop footers for dim in range(num_dims - 2): - callsite_stream.write("}") + callsite_stream.write("}") ################ TODO, Might need to modified further ############# -# Below: Does collaborative copy + +# Below: Does collaborative copy class SyncCollaboritveGPUCopyStrategy(CopyStrategy): - + def applicable(self, copy_context: CopyContext) -> bool: """ Checks if the copy is eligible for a collaborative GPU-to-GPU copy. @@ -408,18 +391,13 @@ def applicable(self, copy_context: CopyContext) -> bool: # --- Condition 1: GPU to GPU memory transfer --- gpu_storages = {dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared} - if not (copy_context.src_storage in gpu_storages and - copy_context.dst_storage in gpu_storages): + if not (copy_context.src_storage in gpu_storages and copy_context.dst_storage in gpu_storages): return False - - dst_node = copy_context.dst_node if isinstance(dst_node, nodes.AccessNode) and dst_node.async_copy: return False - - # --- Condition 2: Inside a GPU_Device map scope --- state = copy_context.state_dfg scope_dict = state.scope_dict() @@ -431,9 +409,9 @@ def applicable(self, copy_context: CopyContext) -> bool: # Determine the schedule type of the innermost non-sequential map. # If no such map exists, use the default schedule. 
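+        # (Illustrative: a copy nested as GPU_Device -> Sequential -> <copy> skips the Sequential
+        #  map in the walk below and reports GPU_Device, satisfying this scope condition.)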
current_node = deeper_scope_node - while (current_node is None or not isinstance(current_node, nodes.MapEntry) or - current_node.map.schedule == dtypes.ScheduleType.Sequential): - + while (current_node is None or not isinstance(current_node, nodes.MapEntry) + or current_node.map.schedule == dtypes.ScheduleType.Sequential): + parent = helpers.get_parent_map(state, current_node) if parent is None: current_node = None @@ -446,13 +424,11 @@ def applicable(self, copy_context: CopyContext) -> bool: schedule_type = current_node.map.schedule return schedule_type == dtypes.ScheduleType.GPU_Device - def generate_copy(self, copy_context: CopyContext) -> None: from dace.frontend import operations - # Get required copy information copy_shape, src_strides, dst_strides = copy_context.get_transfer_layout() src_expr, dst_expr = copy_context.src_expr, copy_context.dst_expr @@ -484,7 +460,7 @@ def generate_copy(self, copy_context: CopyContext) -> None: else: custom_reduction = [unparse_cr(sdfg, wcr, dtype)] reduction_template = "" - + accum = f"::template Accum{reduction_template}" # Dispatch to the correct backend copy template based on copy characteristics @@ -503,82 +479,56 @@ def generate_copy(self, copy_context: CopyContext) -> None: synchronized = "true" if any(symbolic.issymbolic(s, copy_context.sdfg.constants) for s in copy_shape): - args_list = ( - [src_expr] - + src_strides - + [dst_expr] - + custom_reduction - + dst_strides - + copy_shape - ) + args_list = ([src_expr] + src_strides + [dst_expr] + custom_reduction + dst_strides + copy_shape) args = ", ".join(symbolic_to_cpp(args_list)) - callsite_stream.write(f"{function_name}Dynamic<{ctype}, {block_dims}, {synchronized}>{accum}({args});", - cfg, state_id, [src_node, dst_node]) - + callsite_stream.write(f"{function_name}Dynamic<{ctype}, {block_dims}, {synchronized}>{accum}({args});", cfg, + state_id, [src_node, dst_node]) elif function_name == "dace::SharedToGlobal1D": # special case: use a new template struct that provides functions for copy and reduction copy_size = ', '.join(symbolic_to_cpp(copy_shape)) accum = accum or '::Copy' - args_list = ( - [src_expr] - + src_strides - + [dst_expr] - + dst_strides - + custom_reduction - ) + args_list = ([src_expr] + src_strides + [dst_expr] + dst_strides + custom_reduction) args = ", ".join(symbolic_to_cpp(args_list)) - callsite_stream.write(f"{function_name}<{ctype}, {block_dims}, {copy_size}, {synchronized}>{accum}({args});", - cfg, state_id, [src_node, dst_node]) - + callsite_stream.write( + f"{function_name}<{ctype}, {block_dims}, {copy_size}, {synchronized}>{accum}({args});", cfg, state_id, + [src_node, dst_node]) + else: copy_size = ', '.join(symbolic_to_cpp(copy_shape)) - args_list = ( - [src_expr] - + src_strides - + [dst_expr] - + custom_reduction - ) + args_list = ([src_expr] + src_strides + [dst_expr] + custom_reduction) args = ", ".join(symbolic_to_cpp(args_list)) dst_strides_unpacked = ", ".join(symbolic_to_cpp(dst_strides)) - callsite_stream.write(f"{function_name}<{ctype}, {block_dims}, {copy_size}, {dst_strides_unpacked}, {synchronized}>{accum}({args});", - cfg, state_id, [src_node, dst_node]) - - - + callsite_stream.write( + f"{function_name}<{ctype}, {block_dims}, {copy_size}, {dst_strides_unpacked}, {synchronized}>{accum}({args});", + cfg, state_id, [src_node, dst_node]) def _get_storagename(self, storage: dtypes.StorageType): - """ + """ Returns a string containing the name of the storage location. - Example: dtypes.StorageType.GPU_Shared will return "Shared". 
+ Example: dtypes.StorageType.GPU_Shared will return "Shared". """ storage_name = str(storage) return storage_name[storage_name.rindex('_') + 1:] - - class AsyncCollaboritveGPUCopyStrategy(CopyStrategy): - def applicable(self, copy_context: CopyContext)-> bool: + def applicable(self, copy_context: CopyContext) -> bool: from dace.sdfg import scope_contains_scope from dace.transformation import helpers # --- Condition 1: GPU to GPU memory transfer --- gpu_storages = {dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared} - if not (copy_context.src_storage in gpu_storages and - copy_context.dst_storage in gpu_storages): + if not (copy_context.src_storage in gpu_storages and copy_context.dst_storage in gpu_storages): return False - - dst_node = copy_context.dst_node if not (isinstance(dst_node, nodes.AccessNode) and dst_node.async_copy): return False - # --- Condition 2: Inside a GPU_Device map scope --- state = copy_context.state_dfg scope_dict = state.scope_dict() @@ -590,9 +540,9 @@ def applicable(self, copy_context: CopyContext)-> bool: # Determine the schedule type of the innermost non-sequential map. # If no such map exists, use the default schedule. current_node = deeper_scope_node - while (current_node is None or not isinstance(current_node, nodes.MapEntry) or - current_node.map.schedule == dtypes.ScheduleType.Sequential): - + while (current_node is None or not isinstance(current_node, nodes.MapEntry) + or current_node.map.schedule == dtypes.ScheduleType.Sequential): + parent = helpers.get_parent_map(state, current_node) if parent is None: current_node = None @@ -605,17 +555,14 @@ def applicable(self, copy_context: CopyContext)-> bool: schedule_type = current_node.map.schedule return schedule_type == dtypes.ScheduleType.GPU_Device - - def generate_copy(self, copy_context: CopyContext): - - # Show Yakup: + + # Show Yakup: # Asynchronous memory copies are only allowed if they are contiguous if not copy_context.is_contiguous_copy(): raise NotImplementedError("Asynchronous memory copies are not supported for not contigous memory copies") - # Get required copy information copy_shape, src_strides, dst_strides = copy_context.get_transfer_layout() src_expr, dst_expr = copy_context.src_expr, copy_context.dst_expr @@ -623,7 +570,7 @@ def generate_copy(self, copy_context: CopyContext): sdfg = copy_context.sdfg dtype = copy_context.src_node.desc(sdfg).dtype ctype = dtype.ctype - + # Get write context: callsite_stream, cfg, state_id, src_node, dst_node = copy_context.get_write_context() # copy dimension @@ -632,7 +579,8 @@ def generate_copy(self, copy_context: CopyContext): if num_dims == 1: pipeline = dst_node.async_pipeline size = f'{product(copy_shape)} *sizeof({ctype})' - callsite_stream.write(f"cuda::memcpy_async(block, {dst_expr}, {src_expr}, {size}, {pipeline});\n", cfg, state_id, [src_node, dst_node]) + callsite_stream.write(f"cuda::memcpy_async(block, {dst_expr}, {src_expr}, {size}, {pipeline});\n", cfg, + state_id, [src_node, dst_node]) elif num_dims > 1: @@ -644,36 +592,33 @@ def generate_copy(self, copy_context: CopyContext): callsite_stream.write( f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{") - offset_src = ' + '.join(f'(__copyidx{d} * ({s}))' for d, s in enumerate(src_strides[:-1])) offset_dst = ' + '.join(f'(__copyidx{d} * ({s}))' for d, s in enumerate(dst_strides[:-1])) size = f'{copy_shape[-1]} *sizeof({ctype})' - src = f'{src_expr} + {offset_src}' - dst = f'{dst_expr} + {offset_dst}' + src = f'{src_expr} + {offset_src}' + dst = 
f'{dst_expr} + {offset_dst}' - callsite_stream.write(f"cuda::memcpy_async(block, {dst}, {src}, {size}, {pipeline});\n", cfg, state_id, [src_node, dst_node]) + callsite_stream.write(f"cuda::memcpy_async(block, {dst}, {src}, {size}, {pipeline});\n", cfg, state_id, + [src_node, dst_node]) # Write for-loop footers for dim in range(num_dims - 2): - callsite_stream.write("}") - + callsite_stream.write("}") else: # Should not be possible- otherwise, doing nothing is also okay # because a empty copy shape means we don't copy anything pass - emit_sync_debug_checks(copy_context.backend, copy_context.callsite_stream) - class FallBackGPUCopyStrategy(CopyStrategy): - def applicable(self, copy_context: CopyContext)-> bool: + def applicable(self, copy_context: CopyContext) -> bool: return True - + def generate_copy(self, copy_context: CopyContext): callsite_stream, cfg, state_id, src_node, dst_node = copy_context.get_write_context() sdfg = copy_context.sdfg @@ -681,4 +626,3 @@ def generate_copy(self, copy_context: CopyContext): edge = copy_context.edge cpu_codegen = copy_context.codegen._cpu_codegen cpu_codegen.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream) - \ No newline at end of file diff --git a/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py b/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py index 959b70f573..6582395027 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py +++ b/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py @@ -1,6 +1,7 @@ from typing import Dict, Union from dace import SDFG, nodes + class GPUStreamManager: """ Manages GPU backend streams (e.g., CUDA or HIP streams) for nodes in an SDFG. @@ -32,7 +33,7 @@ def get_stream_node(self, node: nodes.Node) -> str: if node in self.assigned_streams and self.assigned_streams[node] != "nullptr": return self.stream_access_template.format(gpu_stream=self.assigned_streams[node]) return "nullptr" - + def get_stream_edge(self, src_node: nodes.Node, dst_node: nodes.Node) -> str: """ Returns the stream access expression for an edge based on either the @@ -47,4 +48,4 @@ def get_stream_edge(self, src_node: nodes.Node, dst_node: nodes.Node) -> str: stream_id = self.assigned_streams[dst_node] return self.stream_access_template.format(gpu_stream=stream_id) else: - return "nullptr" \ No newline at end of file + return "nullptr" diff --git a/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py index 48cf5b662c..3ae2f3a347 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py +++ b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py @@ -1,10 +1,10 @@ -import functools -import sympy +import functools +import sympy from typing import List from dace import Config, symbolic from dace.codegen import cppunparse -from dace.codegen.prettycode import CodeIOStream +from dace.codegen.prettycode import CodeIOStream from dace.sdfg import nodes @@ -14,12 +14,14 @@ def symbolic_to_cpp(arr): return cppunparse.pyexpr2cpp(symbolic.symstr(arr, cpp_mode=True)) return [cppunparse.pyexpr2cpp(symbolic.symstr(d, cpp_mode=True)) for d in arr] + def get_cuda_dim(idx): """ Converts 0 to x, 1 to y, 2 to z, or raises an exception. """ if idx < 0 or idx > 2: raise ValueError(f'idx must be between 0 and 2, got {idx}') return ('x', 'y', 'z')[idx] + def product(iterable): """ Computes the symbolic product of elements in the iterable using sympy.Mul. 
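+
+    Illustrative example: product([2, n, 4]) evaluates to the sympy expression 8*n.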
@@ -30,12 +32,13 @@ def product(iterable): """ return functools.reduce(sympy.Mul, iterable, 1) + def to_3d_dims(dim_sizes: List) -> List: """ Converts a list of dimension sizes to a 3D format. - If the list has more than three dimensions, all dimensions beyond the second are - collapsed into the third (via multiplication). If the list has fewer than three + If the list has more than three dimensions, all dimensions beyond the second are + collapsed into the third (via multiplication). If the list has fewer than three entries, it is padded with 1s to ensure a fixed length of three. Examples: @@ -44,7 +47,7 @@ def to_3d_dims(dim_sizes: List) -> List: [x, y, z] → [x, y, z] [x, y, z, u, v] → [x, y, z * u * v] """ - + if len(dim_sizes) > 3: # multiply everything from the 3rd onward into d[2] dim_sizes[2] = product(dim_sizes[2:]) @@ -55,6 +58,7 @@ def to_3d_dims(dim_sizes: List) -> List: return dim_sizes + def validate_block_size_limits(kernel_map_entry: nodes.MapEntry, block_size: List): """ Validates that the given block size for a kernel does not exceed typical CUDA hardware limits. @@ -77,31 +81,30 @@ def validate_block_size_limits(kernel_map_entry: nodes.MapEntry, block_size: Lis total_block_size = product(block_size) limit = Config.get('compiler', 'cuda', 'block_size_limit') lastdim_limit = Config.get('compiler', 'cuda', 'block_size_lastdim_limit') - + if (total_block_size > limit) == True: raise ValueError(f'Block size for kernel "{kernel_map_label}" ({block_size}) ' - f'is larger than the possible number of threads per block ({limit}). ' - 'The kernel will potentially not run, please reduce the thread-block size. ' - 'To increase this limit, modify the `compiler.cuda.block_size_limit` ' - 'configuration entry.') - + f'is larger than the possible number of threads per block ({limit}). ' + 'The kernel will potentially not run, please reduce the thread-block size. ' + 'To increase this limit, modify the `compiler.cuda.block_size_limit` ' + 'configuration entry.') + if (block_size[-1] > lastdim_limit) == True: raise ValueError(f'Last block size dimension for kernel "{kernel_map_label}" ({block_size}) ' - 'is larger than the possible number of threads in the last block dimension ' - f'({lastdim_limit}). The kernel will potentially not run, please reduce the ' - 'thread-block size. To increase this limit, modify the ' - '`compiler.cuda.block_size_lastdim_limit` configuration entry.') + 'is larger than the possible number of threads in the last block dimension ' + f'({lastdim_limit}). The kernel will potentially not run, please reduce the ' + 'thread-block size. To increase this limit, modify the ' + '`compiler.cuda.block_size_lastdim_limit` configuration entry.') + def emit_sync_debug_checks(backend: str, codestream: CodeIOStream): """ Emit backend sync and error-check calls if synchronous debugging is enabled. - + Args: backend (str): Backend API prefix (e.g., 'cuda'). codestream (CodeIOStream): Stream to write code to. 
""" if Config.get_bool('compiler', 'cuda', 'syncdebug'): - codestream.write( - f"DACE_GPU_CHECK({backend}GetLastError());\n" - f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n" - ) \ No newline at end of file + codestream.write(f"DACE_GPU_CHECK({backend}GetLastError());\n" + f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n") diff --git a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py index 9610651d20..e0af55adb9 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py @@ -22,44 +22,40 @@ # Experimental CUDA imports from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen, KernelSpec -from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import ( - symbolic_to_cpp, - get_cuda_dim, - product -) - +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import (symbolic_to_cpp, get_cuda_dim, product) #---------------------------------------------------------------------------------- # GPU Scope Generation Strategies #---------------------------------------------------------------------------------- + class ScopeGenerationStrategy(ABC): """Base strategy for generating GPU scope code""" - + def __init__(self, codegen: ExperimentalCUDACodeGen): self.codegen: ExperimentalCUDACodeGen = codegen self._dispatcher: TargetDispatcher = codegen._dispatcher self._current_kernel_spec: KernelSpec = codegen._current_kernel_spec - + @abstractmethod - def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, - state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: + def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: raise NotImplementedError('Abstract class') - + @abstractmethod - def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, - state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: raise NotImplementedError('Abstract class') class KernelScopeGenerator(ScopeGenerationStrategy): - + def __init__(self, codegen: ExperimentalCUDACodeGen): super().__init__(codegen) - def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, - state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: - + def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: + node = dfg_scope.source_nodes()[0] schedule_type = node.map.schedule @@ -67,50 +63,56 @@ def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgrap # the outermost (first) GPU schedule is of type GPU_Device. 
applicable = schedule_type == dtypes.ScheduleType.GPU_Device return applicable - - def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, - state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream): - + + def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream): # Generate kernel function signature self._generate_kernel_signature(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) # Generate kernel body - with ScopeManager(frame_codegen=self.codegen._frame, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, - function_stream=function_stream, callsite_stream=callsite_stream, comment="Kernel scope") as scope_manager: - + with ScopeManager(frame_codegen=self.codegen._frame, + sdfg=sdfg, + cfg=cfg, + dfg_scope=dfg_scope, + state_id=state_id, + function_stream=function_stream, + callsite_stream=callsite_stream, + comment="Kernel scope") as scope_manager: # ----------------- Initialize Kernel Scope Constructs ----------------------- self._generate_kernel_initialization(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) - + # ----------------- Retrieve kernel configuration ----------------------- kernel_spec = self._current_kernel_spec - kernel_entry_node = kernel_spec._kernel_entry_node # == dfg_scope.source_nodes()[0] + kernel_entry_node = kernel_spec._kernel_entry_node # == dfg_scope.source_nodes()[0] kernel_map = kernel_spec.kernel_map # ----------------- Kernel/Map Range Preprocessing ----------------------- - reversed_kernel_range = kernel_map.range[::-1] # also reverse it + reversed_kernel_range = kernel_map.range[::-1] # also reverse it kernel_range = subsets.Range(reversed_kernel_range) kernel_dimensions = len(kernel_range) kernel_dim_sizes = kernel_range.size() # ----------------- Set up symbolic index expressions ----------------------- - symbolic_indices = [ symbolic.symbol(f'__SYM_IDX{dim}', nonnegative=True, integer=True) for dim in range(kernel_dimensions)] + symbolic_indices = [ + symbolic.symbol(f'__SYM_IDX{dim}', nonnegative=True, integer=True) for dim in range(kernel_dimensions) + ] symbolic_coordinates = kernel_range.coord_at(symbolic_indices) # ----------------- Generate Thread or Block index Definitions ----------------------- - thread_id_ctype = kernel_spec.gpu_index_ctype # Data type of CUDA thread/block indices + thread_id_ctype = kernel_spec.gpu_index_ctype # Data type of CUDA thread/block indices # In case there is no ThreadBlock map used in a submap, the map variables will # be mapped to thread IDs instead of block IDs for dim in range(kernel_dimensions): - var_name = kernel_map.params[-dim - 1] # also reverse it here! + var_name = kernel_map.params[-dim - 1] # also reverse it here! 
# Compute index expressions for up to 3 dimensions (x, y, z) if dim < 3: @@ -123,21 +125,28 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV else: # Handle dimensions beyond the third (delinearize and modulo) index_expr = f'blockIdx.z' tail_prod = product(kernel_dim_sizes[dim + 1:]) - index_expr = (f"(({index_expr} / ({symbolic_to_cpp(tail_prod)})) % ({symbolic_to_cpp(kernel_dim_sizes[dim])}))") + index_expr = ( + f"(({index_expr} / ({symbolic_to_cpp(tail_prod)})) % ({symbolic_to_cpp(kernel_dim_sizes[dim])}))" + ) # Define thread/Block index var_def = symbolic_to_cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', index_expr) callsite_stream.write(f'{thread_id_ctype} {var_name} = {var_def};', cfg, state_id, kernel_entry_node) - self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, thread_id_ctype) + self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, thread_id_ctype) # ----------------- Dispatch Subgraph code generation ----------------------- - self._dispatcher.dispatch_subgraph(sdfg, cfg, dfg_scope, state_id, function_stream, - callsite_stream, skip_entry_node=True) + self._dispatcher.dispatch_subgraph(sdfg, + cfg, + dfg_scope, + state_id, + function_stream, + callsite_stream, + skip_entry_node=True) - def _generate_kernel_signature(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + def _generate_kernel_signature(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream): - + kernel_name = self._current_kernel_spec.kernel_name kernel_args = self._current_kernel_spec.args_typed block_dims = self._current_kernel_spec.block_dims @@ -152,16 +161,12 @@ def _generate_kernel_signature(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco else: launch_bounds = f'__launch_bounds__({node.gpu_launch_bounds})' - # Emit kernel function signature - callsite_stream.write( - f'__global__ void {launch_bounds} {kernel_name}({", ".join(kernel_args)}) ', - cfg, state_id, node - ) + callsite_stream.write(f'__global__ void {launch_bounds} {kernel_name}({", ".join(kernel_args)}) ', cfg, + state_id, node) - def _generate_kernel_initialization(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + def _generate_kernel_initialization(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream): - """ NOTE: Under construction Tell yakup: @@ -174,7 +179,7 @@ def _generate_kernel_initialization(self, sdfg: SDFG, cfg: ControlFlowRegion, df metadata = sdfg.metadata if metadata == None: return - + node = dfg_scope.source_nodes()[0] callsite_stream.write(f"\n", cfg, state_id, node) @@ -184,15 +189,13 @@ def _generate_kernel_initialization(self, sdfg: SDFG, cfg: ControlFlowRegion, df callsite_stream.write(f"{tblock_obj_ctype} {tblock_obj_name} = cg::this_thread_block();\n", cfg, state_id, node) self._dispatcher.defined_vars.add(tblock_obj_name, DefinedType.Object, tblock_obj_ctype) - # initialize pipeline + # initialize pipeline pipelines = dict() for node_guid, node_meta in metadata.items(): pipelines = node_meta.get("pipelines", {}) for pipeline_name, pipeline_info in pipelines.items(): pipelines[pipeline_name] = pipeline_info["pipeline_depth"] - - for pipeline_name, pipeline_depth in pipelines.items(): callsite_stream.write(f"\n", cfg, state_id, node) # initialize pipeline depth scalar @@ -201,7 +204,7 @@ def 
_generate_kernel_initialization(self, sdfg: SDFG, cfg: ControlFlowRegion, df callsite_stream.write(f"{depth_ctype} {depth_name} = {pipeline_depth};\n", cfg, state_id, node) self._dispatcher.defined_vars.add(depth_name, DefinedType.Scalar, depth_ctype) - # allocate shared pipeline state + # allocate shared pipeline state shared_state_name = f"shared_state_{pipeline_name}" shared_state_ctype = f"cuda::pipeline_shared_state" callsite_stream.write(f" __shared__ {shared_state_ctype} {shared_state_name};\n") @@ -209,36 +212,43 @@ def _generate_kernel_initialization(self, sdfg: SDFG, cfg: ControlFlowRegion, df # intialize the pipeline pipeline_ctype = "auto" - callsite_stream.write(f"{pipeline_ctype} {pipeline_name} = cuda::make_pipeline({tblock_obj_name}, &{shared_state_name});\n", cfg, state_id, node) + callsite_stream.write( + f"{pipeline_ctype} {pipeline_name} = cuda::make_pipeline({tblock_obj_name}, &{shared_state_name});\n", + cfg, state_id, node) self._dispatcher.defined_vars.add(pipeline_name, DefinedType.Object, pipeline_ctype) - + callsite_stream.write(f"\n", cfg, state_id, node) class ThreadBlockScopeGenerator(ScopeGenerationStrategy): - + def __init__(self, codegen: ExperimentalCUDACodeGen): super().__init__(codegen) - - def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, - state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: - + + def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: + node = dfg_scope.source_nodes()[0] applicable = node.map.schedule == dtypes.ScheduleType.GPU_ThreadBlock return applicable - - def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, - state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream): + + def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream): # NOTE: not my code, but my insights. Approval for commenting this needed - with ScopeManager(frame_codegen=self.codegen._frame, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, - function_stream=function_stream, callsite_stream=callsite_stream, comment="ThreadBlock Scope") as scope_manager: - + with ScopeManager(frame_codegen=self.codegen._frame, + sdfg=sdfg, + cfg=cfg, + dfg_scope=dfg_scope, + state_id=state_id, + function_stream=function_stream, + callsite_stream=callsite_stream, + comment="ThreadBlock Scope") as scope_manager: + node = dfg_scope.source_nodes()[0] scope_map = node.map - # ----------------- Map Range Preprocessing ----------------------- # Reverse range for better performance (e.g. 
memory coalescing) @@ -249,21 +259,24 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV kernel_block_dims = self._current_kernel_spec.block_dims - # ----------------- Symbolic Index Expressions ----------------------- - symbolic_indices = [ symbolic.symbol(f'__SYM_IDX{dim}', nonnegative=True, integer=True) for dim in range(map_dimensions)] - symbolic_index_bounds = [idx + (block_dim * rng[2]) - 1 for idx, block_dim, rng in zip(symbolic_indices, kernel_block_dims, map_range)] + symbolic_indices = [ + symbolic.symbol(f'__SYM_IDX{dim}', nonnegative=True, integer=True) for dim in range(map_dimensions) + ] + symbolic_index_bounds = [ + idx + (block_dim * rng[2]) - 1 + for idx, block_dim, rng in zip(symbolic_indices, kernel_block_dims, map_range) + ] symbolic_coordinates = map_range.coord_at(symbolic_indices) - # ----------------- Generate Index Variable Definitions ----------------------- # Get the block's index dace data type block_id_ctype = self._current_kernel_spec.gpu_index_ctype for dim in range(map_dimensions): - var_name = scope_map.params[-dim - 1] # also reverse it here! + var_name = scope_map.params[-dim - 1] # also reverse it here! if dim < 3: # First three dimensions: direct mapping or partial delinearization @@ -275,13 +288,12 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV else: # Dimensions beyond the third: full delinearization tail_prod = product(map_dim_sizes[dim + 1:]) - base_expr = (f"((threadIdx.z / ({symbolic_to_cpp(tail_prod)})) % ({symbolic_to_cpp(map_dim_sizes[dim])}))") - + base_expr = ( + f"((threadIdx.z / ({symbolic_to_cpp(tail_prod)})) % ({symbolic_to_cpp(map_dim_sizes[dim])}))") var_def = symbolic_to_cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', base_expr) callsite_stream.write(f'{block_id_ctype} {var_name} = {var_def};', cfg, state_id, node) - self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, block_id_ctype) - + self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, block_id_ctype) # ----------------- Guard Conditions for Block Execution ----------------------- @@ -298,7 +310,7 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV # Block range start if dim >= 3 or (symbolic_indices[dim] >= start) != True: - condition += f'{var_name} >= {symbolic_to_cpp(start)}' + condition += f'{var_name} >= {symbolic_to_cpp(start)}' # Special case: block size is exactly the range of the map (0:b) if dim >= 3: @@ -316,32 +328,41 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV if len(condition) > 0: scope_manager.open(condition=condition) - # ----------------- Dispatch Subgraph code generation ----------------------- - self._dispatcher.dispatch_subgraph(sdfg, cfg, dfg_scope, state_id, function_stream, - callsite_stream, skip_entry_node=True) + self._dispatcher.dispatch_subgraph(sdfg, + cfg, + dfg_scope, + state_id, + function_stream, + callsite_stream, + skip_entry_node=True) class WarpScopeGenerator(ScopeGenerationStrategy): - + def __init__(self, codegen: ExperimentalCUDACodeGen): super().__init__(codegen) - - def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, - state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: - + + def applicable(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> bool: + node = dfg_scope.source_nodes()[0] applicable = 
node.map.schedule == dtypes.ScheduleType.GPU_Warp return applicable - - def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, - state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream): - with ScopeManager(frame_codegen=self.codegen._frame, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id, - function_stream=function_stream, callsite_stream=callsite_stream, comment="WarpLevel Scope") as scope_manager: + def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream): + with ScopeManager(frame_codegen=self.codegen._frame, + sdfg=sdfg, + cfg=cfg, + dfg_scope=dfg_scope, + state_id=state_id, + function_stream=function_stream, + callsite_stream=callsite_stream, + comment="WarpLevel Scope") as scope_manager: # Get kernel specifications kernel_spec = self._current_kernel_spec @@ -354,24 +375,20 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV map_range = subsets.Range(scope_map.range[::-1]) # Reversed for potential better performance warp_dim = len(map_range) - + # The following sizes and bounds are be symbolic - num_threads_in_block = product(block_dims) + num_threads_in_block = product(block_dims) warp_dim_bounds = [max_elem + 1 for max_elem in map_range.max_element()] num_warps = product(warp_dim_bounds) - # The C type used to define the (flat) threadId and warpId variables ids_ctype = kernel_spec.gpu_index_ctype # ----------------- Guard checks ----------------------- - # handles checks either at compile time or runtime (i.e. checks in the generated code) self._handle_GPU_Warp_scope_guards(state_dfg, node, map_range, warp_dim, num_threads_in_block, num_warps, callsite_stream, scope_manager) - - # ----------------- Define (flat) Thread ID within Block ----------------------- @@ -387,17 +404,16 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV idx_expr = " * ".join(stride + [f"threadIdx.{get_cuda_dim(i)}"]) if stride else f"threadIdx.{dim}" flattened_terms.append(idx_expr) - joined_terms = " + ".join(flattened_terms) flat_thread_idx_expr = f"({joined_terms})" if len(flattened_terms) > 1 else joined_terms - threadID_name = 'ThreadId_%s_%d_%d_%d' % (scope_map.label, cfg.cfg_id, state_dfg.block_id, state_dfg.node_id(node)) + threadID_name = 'ThreadId_%s_%d_%d_%d' % (scope_map.label, cfg.cfg_id, state_dfg.block_id, + state_dfg.node_id(node)) - callsite_stream.write(f"{ids_ctype} {threadID_name} = ({flat_thread_idx_expr}) / {warpSize};", cfg, state_id, node) + callsite_stream.write(f"{ids_ctype} {threadID_name} = ({flat_thread_idx_expr}) / {warpSize};", cfg, + state_id, node) self._dispatcher.defined_vars.add(threadID_name, DefinedType.Scalar, ids_ctype) - - # ----------------- Compute Map indices (= Warp indices) ----------------------- for i in range(warp_dim): @@ -413,11 +429,8 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV callsite_stream.write(f"{ids_ctype} {var_name} = {expr};", cfg, state_id, node) self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, ids_ctype) - - # ----------------- Guard Conditions for Warp Execution ----------------------- - if num_warps * warpSize != num_threads_in_block: condition = f'{threadID_name} < {num_warps}' scope_manager.open(condition) @@ -425,38 +438,39 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV warp_range = [(start, end + 1, stride) for start, 
end, stride in map_range.ranges] for dim, (var_name, (start, _, stride)) in enumerate(zip(scope_map.params[::-1], warp_range)): - + condition_terms = [] - + if start != 0: condition_terms.append(f"{var_name} >= {start}") - + if stride != 1: expr = var_name if start == 0 else f"({var_name} - {start})" - condition_terms.append(f'{expr} % {stride} == 0' ) - + condition_terms.append(f'{expr} % {stride} == 0') + if condition_terms: condition = " && ".join(condition_terms) scope_manager.open(condition) - # ----------------- Dispatch Subgraph code generation ----------------------- - - self._dispatcher.dispatch_subgraph( - sdfg, cfg, dfg_scope, state_id, function_stream, - callsite_stream, skip_entry_node=True - ) + self._dispatcher.dispatch_subgraph(sdfg, + cfg, + dfg_scope, + state_id, + function_stream, + callsite_stream, + skip_entry_node=True) def _handle_GPU_Warp_scope_guards(self, state_dfg: SDFGState, node: nodes.MapEntry, map_range: subsets.Range, warp_dim: int, num_threads_in_block, num_warps, kernel_stream: CodeIOStream, scope_manager: 'ScopeManager'): - + #TODO: Move them to sdfg validation as well if possible # Get warpSize from the kernel specification warpSize = self._current_kernel_spec.warpSize - + parent_map, _ = helpers.get_parent_map(state_dfg, node) if parent_map.schedule != dtypes.ScheduleType.GPU_ThreadBlock: raise ValueError("GPU_Warp map must be nested within a GPU_ThreadBlock map.") @@ -464,17 +478,14 @@ def _handle_GPU_Warp_scope_guards(self, state_dfg: SDFGState, node: nodes.MapEnt if warp_dim > 3: raise NotImplementedError("GPU_Warp maps are limited to 3 dimensions.") - # Guard against invalid thread/block configurations. # - For concrete (compile-time) values, raise Python errors early. # - For symbolic values, insert runtime CUDA checks (guards) into the generated kernel. # These will emit meaningful error messages and abort execution if violated. 
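+        #   (Illustrative numbers, assuming warpSize = 32: a block of 128 threads can hold at
+        #    most 4 warps, so a map requesting 5 warps trips the num_warps * warpSize check.)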
if isinstance(num_threads_in_block, symbolic.symbol): - condition = ( - f"{num_threads_in_block} % {warpSize} != 0 || " - f"{num_threads_in_block} > 1024 || " - f"{num_warps} * {warpSize} > {num_threads_in_block}" - ) + condition = (f"{num_threads_in_block} % {warpSize} != 0 || " + f"{num_threads_in_block} > 1024 || " + f"{num_warps} * {warpSize} > {num_threads_in_block}") kernel_stream.write(f"""\ if ({condition}) {{ printf("CUDA error:\\n" @@ -492,22 +503,22 @@ def _handle_GPU_Warp_scope_guards(self, state_dfg: SDFGState, node: nodes.MapEnt elif num_warps * warpSize > num_threads_in_block: raise ValueError(f"Invalid configuration: {num_warps} warps x {warpSize} threads exceed " - f"{num_threads_in_block} threads in the block.") + f"{num_threads_in_block} threads in the block.") if num_threads_in_block % warpSize != 0: raise ValueError(f"Block must be a multiple of {warpSize} threads for GPU_Warp scheduling " - f"(got {num_threads_in_block}).") + f"(got {num_threads_in_block}).") if num_threads_in_block > 1024: raise ValueError("CUDA does not support more than 1024 threads per block (hardware limit).") - - + for min_element in map_range.min_element(): if isinstance(min_element, symbolic.symbol): - kernel_stream.write(f'if ({min_element} < 0) {{\n' - f' printf("Runtime error: Warp ID symbol {min_element} must be non-negative.\\n");\n' - f' asm("trap;");\n' - f'}}\n') + kernel_stream.write( + f'if ({min_element} < 0) {{\n' + f' printf("Runtime error: Warp ID symbol {min_element} must be non-negative.\\n");\n' + f' asm("trap;");\n' + f'}}\n') elif min_element < 0: raise ValueError(f"Warp ID value {min_element} must be non-negative.") @@ -516,6 +527,7 @@ def _handle_GPU_Warp_scope_guards(self, state_dfg: SDFGState, node: nodes.MapEnt # Scope Manager, handling brackets and allocation/deallocation of arrays in Scopes #---------------------------------------------------------------------------------- + class ScopeManager: """ A helper class to manage opening and closing brackets in a structured way using the 'with' statement. @@ -524,10 +536,16 @@ class ScopeManager: the code structure. """ - def __init__(self, frame_codegen: DaCeCodeGenerator, sdfg: SDFG, - cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, - function_stream: CodeIOStream, callsite_stream: CodeIOStream, comment: str = None, - debug: bool = False): + def __init__(self, + frame_codegen: DaCeCodeGenerator, + sdfg: SDFG, + cfg: ControlFlowRegion, + dfg_scope: ScopeSubgraphView, + state_id: int, + function_stream: CodeIOStream, + callsite_stream: CodeIOStream, + comment: str = None, + debug: bool = False): """ Initializes the KernelScopeManager. @@ -560,18 +578,16 @@ def __enter__(self): Writes the opening bracket to the stream and allocates arrays in scope. """ self.open() - self.frame_codegen.allocate_arrays_in_scope( - self.sdfg, self.cfg, self.entry_node, self.function_stream, self.callsite_stream - ) + self.frame_codegen.allocate_arrays_in_scope(self.sdfg, self.cfg, self.entry_node, self.function_stream, + self.callsite_stream) return self def __exit__(self, exc_type, exc_value, traceback): """ Deallocates arrays in scope and writes the closing brackets to the stream. 
""" - self.frame_codegen.deallocate_arrays_in_scope( - self.sdfg, self.cfg, self.entry_node, self.function_stream, self.callsite_stream - ) + self.frame_codegen.deallocate_arrays_in_scope(self.sdfg, self.cfg, self.entry_node, self.function_stream, + self.callsite_stream) for i in range(self._opened): line = "}" if self.debug: @@ -590,5 +606,3 @@ def open(self, condition: str = None): line += f" // {self.comment} (open {self._opened + 1})" self.callsite_stream.write(line, self.cfg, self.state_id, self.entry_node) self._opened += 1 - - diff --git a/dace/config_schema.yml b/dace/config_schema.yml index cad0dcbbe8..a817e42b37 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -457,7 +457,7 @@ required: Yakup Koray Budanaz for Berkays master-thesis. enum: [legacy, experimental] default: experimental - + gpu_index_type: type: str title: Thread/block/warp index data type @@ -467,7 +467,7 @@ required: The type is based on the type-classes in ``dace.dtypes``. For example, ``uint64`` is equivalent to ``dace.uint64``. Change this setting when large index types are needed to address memory offsets that are beyond the 32-bit - range, or to reduce memory usage. This replaces ``thread_id_type`` in + range, or to reduce memory usage. This replaces ``thread_id_type`` in ``ExperimentalCUDACodeGen`` , as the new name more accurately reflects its broader usage. @@ -476,7 +476,7 @@ required: title: CUDA warp size description: > Defines the warp size used during CUDA code generation. The default and current - standard value for CUDA is 32. This should only be changed if future CUDA + standard value for CUDA is 32. This should only be changed if future CUDA architectures explicitly alter the warp size. Modifying this value arbitrarily may result in incorrect or unknown behavior, and is therefore strongly discouraged. default: 32 @@ -486,18 +486,18 @@ required: title: HIP warp size description: > Specifies the warp size (also known as wavefront size) for HIP code generation. - The default value for AMD GPUs is typically 64. This setting should only be modified + The default value for AMD GPUs is typically 64. This setting should only be modified if you have a clear understanding of what you are doing. default: 64 - + auto_syncthreads_insertion: type: bool title: Insert Default __syncthreads() Tasklets description: > - If enabled, inserts default __syncthreads() tasklets during preprocessing - in ExperimentalCUDACodeGen to ensure shared memory is ready before access. - This is a simple safeguard for correctness—it may not be complete, but it - does the job for basic SDFGs. Disable if you handle synchronization manually + If enabled, inserts default __syncthreads() tasklets during preprocessing + in ExperimentalCUDACodeGen to ensure shared memory is ready before access. + This is a simple safeguard for correctness—it may not be complete, but it + does the job for basic SDFGs. Disable if you handle synchronization manually or use other mechanisms like async copies or pipelines. default: True @@ -507,8 +507,8 @@ required: description: > Specifies the name of the variable that holds the current thread block group, initialized using `cooperative_groups::this_thread_block()`. This is useful in - contexts like custom tasklets, where the variable is explicitly referenced - (e.g., `cooperative_groups::wait(block)`). Setting this allows users to customize the + contexts like custom tasklets, where the variable is explicitly referenced + (e.g., `cooperative_groups::wait(block)`). 
Setting this allows users to customize the variable name without modifying the source code or relying on a fixed name. default: block diff --git a/dace/dtypes.py b/dace/dtypes.py index 7d03c26f3c..7a2ade50a2 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -99,7 +99,6 @@ class ScheduleType(aenum.AutoNumberEnum): StorageType.GPU_Shared, ] - # A subset of CPU schedule types CPU_SCHEDULES = [ ScheduleType.CPU_Multicore, @@ -240,7 +239,7 @@ class TilingType(aenum.AutoNumberEnum): ScheduleType.SVE_Map: ScheduleType.Sequential, ScheduleType.Snitch: ScheduleType.Snitch, ScheduleType.Snitch_Multicore: ScheduleType.Snitch_Multicore, - ScheduleType.GPU_Warp: ScheduleType.Sequential, + ScheduleType.GPU_Warp: ScheduleType.Sequential, } # Maps from StorageType to a preferred ScheduleType for helping determine schedules. diff --git a/dace/registry.py b/dace/registry.py index de6634e29c..fe14b8a3ba 100644 --- a/dace/registry.py +++ b/dace/registry.py @@ -45,7 +45,6 @@ def autoregister(cls: Type, **kwargs): if Config.get('compiler', 'cuda', 'implementation') == 'legacy' and kwargs['name'] == 'experimental_cuda': return - registered = False for base in cls.__bases__: if hasattr(base, '_registry_') and hasattr(base, 'register'): diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index 06a8fae71d..24bd504016 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -283,15 +283,15 @@ class AccessNode(Node): default=dtypes.DataInstrumentationType.No_Instrumentation) instrument_condition = CodeProperty(desc="Condition under which to trigger the instrumentation", default=CodeBlock("1", language=dtypes.Language.CPP)) - + # Experimental-CUDA-specific properties async_copy = Property(dtype=bool, - desc="Marks the data copy to this node (if any) as asynchronous (CUDA-specific).", - default=False) + desc="Marks the data copy to this node (if any) as asynchronous (CUDA-specific).", + default=False) async_pipeline = Property(dtype=str, desc="Name of the CUDA pipeline responsible for synchronization. " - "Only relevant if async_copy is True. May be None.", + "Only relevant if async_copy is True. 
May be None.", allow_none=True) def __init__(self, data, debuginfo=None): diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index c9453fc80a..a10bf2505d 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -502,10 +502,7 @@ class SDFG(ControlFlowRegion): default=False, desc="Whether the SDFG contains explicit control flow constructs") - metadata = Property(dtype=dict, - desc="Metada attached to the SDFG", - default=None, - allow_none=True) + metadata = Property(dtype=dict, desc="Metada attached to the SDFG", default=None, allow_none=True) def __init__(self, name: str, diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index 18811b81e6..ab0d30e676 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -873,7 +873,8 @@ def validate_state(state: 'dace.sdfg.SDFGState', if e.data.is_empty() and isinstance(dst_node, nd.ExitNode): pass else: - if isinstance(dst_node, nd.Tasklet) and len(dst_node.in_connectors) == 0 and len(dst_node.out_connectors) == 0: + if isinstance(dst_node, nd.Tasklet) and len(dst_node.in_connectors) == 0 and len( + dst_node.out_connectors) == 0: # Tasklets with no input or output connector -> sync tasklet -> OK pass else: @@ -1159,4 +1160,4 @@ def validate_memlet_data(memlet_data: str, access_data: str) -> bool: access_tokens = access_data.split('.') memlet_tokens = memlet_data.split('.') mem_root = '.'.join(memlet_tokens[:len(access_tokens)]) - return mem_root == access_data \ No newline at end of file + return mem_root == access_data diff --git a/dace/symbolic.py b/dace/symbolic.py index 9e0eebfffd..67754b9be5 100644 --- a/dace/symbolic.py +++ b/dace/symbolic.py @@ -1674,4 +1674,4 @@ def symbols_in_code(code: str, potential_symbols: Set[str] = None, symbols_to_ig tokens &= potential_symbols if symbols_to_ignore is None: return tokens - return tokens - symbols_to_ignore \ No newline at end of file + return tokens - symbols_to_ignore diff --git a/dace/transformation/dataflow/add_threadblock_map.py b/dace/transformation/dataflow/add_threadblock_map.py index d1f0f09267..87274b0afa 100644 --- a/dace/transformation/dataflow/add_threadblock_map.py +++ b/dace/transformation/dataflow/add_threadblock_map.py @@ -13,6 +13,7 @@ from dace.transformation import helpers, transformation from dace.transformation.dataflow.tiling import MapTiling + @make_properties class AddThreadBlockMap(transformation.SingleStateTransformation): """ @@ -67,13 +68,12 @@ def preprocess_default_dims(self): f'Falling back to the configuration entry `compiler.cuda.default_block_size`: {default_block_size_config}. ' 'You can either specify the block size to use with the gpu_block_size property, ' 'or by adding nested `GPU_ThreadBlock` maps, which map work to individual threads. 
' - 'For more information, see https://spcldace.readthedocs.io/en/latest/optimization/gpu.html' - ) - + 'For more information, see https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') + # 2) Reject unsupported 'max' setting if default_block_size_config == 'max': - raise NotImplementedError('max dynamic block size unimplemented') - + raise NotImplementedError('max dynamic block size unimplemented') + # 3) Parse & normalize the default block size to 3D default_block_size = [int(x) for x in default_block_size_config.split(',')] default_block_size = gpu_utils.to_3d_dims(default_block_size) @@ -85,8 +85,7 @@ def preprocess_default_dims(self): # 5) If block has more "active" dims than the grid, collapse extras active_block_dims = max(1, sum(1 for b in default_block_size if b != 1)) - active_grid_dims = max(1, sum(1 for g in kernel_domain_size if g != 1)) - + active_grid_dims = max(1, sum(1 for g in kernel_domain_size if g != 1)) if active_block_dims > active_grid_dims: tail_product = gpu_utils.product(default_block_size[active_grid_dims:]) @@ -98,31 +97,33 @@ def preprocess_default_dims(self): else: block_size = default_block_size - # Validate that the block size does not exeed any limits gpu_utils.validate_block_size_limits(kernel_map_entry, block_size) # Note order is [blockDim.x, blockDim.y, blockDim.z] return block_size - + def can_be_applied(self, graph, expr_index, sdfg, permissive=False): """ Determines whether the transformation can be applied to the given map entry. The transformation only applies to maps with a GPU_Device schedule (i.e., kernel map entries). - It is not applicable if a nested GPU_ThreadBlock or GPU_ThreadBlock_Dynamic map exists + It is not applicable if a nested GPU_ThreadBlock or GPU_ThreadBlock_Dynamic map exists within the kernel scope, as that indicates the thread-block schedule is already defined. The same restriction applies in the case of dynamic parallelism (nested kernel launches). """ # Only applicable to GPU_Device maps if self.map_entry.map.schedule != dtypes.ScheduleType.GPU_Device: return False - + # Traverse inner scopes (ordered outer -> inner) for _, inner_entry in helpers.get_internal_scopes(graph, self.map_entry): schedule = inner_entry.map.schedule - if schedule in {dtypes.ScheduleType.GPU_ThreadBlock, dtypes.ScheduleType.GPU_ThreadBlock_Dynamic,}: + if schedule in { + dtypes.ScheduleType.GPU_ThreadBlock, + dtypes.ScheduleType.GPU_ThreadBlock_Dynamic, + }: # Already scheduled with thread block — cannot apply return False @@ -140,7 +141,7 @@ def apply(self, state: SDFGState, sdfg: SDFG): This is achieved by applying the `MapTiling` transformation to `self.map_entry`, using a computed block size. Essentially `self.map_entry` becomes the thread block map and - the new inserted parent map is the new kernel map. The schedules are set accordingly. + the new inserted parent map is the new kernel map. The schedules are set accordingly. A final consistency check verifies that the resulting thread block map's range fits into the computed block size. @@ -153,7 +154,7 @@ def apply(self, state: SDFGState, sdfg: SDFG): # Reverse for map tiling to prioritize later dimensions for better memory/performance reversed_block_size = gpu_block_size[::-1] - + # TODO: Update this once MapTiling accounts for existing strides when applying tile sizes. # The code below is a workaround that manually adjusts tile sizes to account for existing strides. 
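+        # (Illustrative: a map dimension with existing stride 2 and a block size of 64 along
+        #  that dimension gets tile size 2 * 64 = 128, so the tile still spans 64 iterations.)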
num_dims = len(kernel_map_entry.map.params) @@ -168,19 +169,17 @@ def apply(self, state: SDFGState, sdfg: SDFG): adjusted_block_size = reversed_block_size[-num_dims:] tile_sizes = [stride * block for stride, block in zip(existing_strides, adjusted_block_size)] - + # Apply map tiling transformation - MapTiling.apply_to( - sdfg=sdfg, - options={ - "prefix": "b", - "tile_sizes": tile_sizes, - "tile_trivial": True, - "skew": False - }, - map_entry=kernel_map_entry - ) - + MapTiling.apply_to(sdfg=sdfg, + options={ + "prefix": "b", + "tile_sizes": tile_sizes, + "tile_trivial": True, + "skew": False + }, + map_entry=kernel_map_entry) + # After tiling: kernel_map_entry is now the thread block map, configure its schedule thread_block_map_entry = kernel_map_entry thread_block_map_entry.map.schedule = dtypes.ScheduleType.GPU_ThreadBlock @@ -190,15 +189,14 @@ def apply(self, state: SDFGState, sdfg: SDFG): new_kernel_entry.map.gpu_block_size = gpu_block_size # Catch any unexpected mismatches of inserted threadblock map's block size and the used block size - tb_size = gpu_utils.to_3d_dims([symbolic.overapproximate(sz) for sz in thread_block_map_entry.map.range.size()[::-1]]) + tb_size = gpu_utils.to_3d_dims( + [symbolic.overapproximate(sz) for sz in thread_block_map_entry.map.range.size()[::-1]]) max_block_size = [sympy.Max(sz, bbsz) for sz, bbsz in zip(tb_size, gpu_block_size)] if max_block_size != gpu_block_size: - raise ValueError( - f"Block size mismatch: the overapproximated extent of the thread block map " - f"({tb_size}) is not enclosed by the derived block size ({gpu_block_size}). " - "They are expected to be equal or the derived block size to be larger." - ) + raise ValueError(f"Block size mismatch: the overapproximated extent of the thread block map " + f"({tb_size}) is not enclosed by the derived block size ({gpu_block_size}). " + "They are expected to be equal or the derived block size to be larger.") def update_names(): pass diff --git a/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py b/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py index f8ef54fb23..39eedc7d34 100644 --- a/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py +++ b/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py @@ -14,7 +14,7 @@ class InferGPUGridAndBlockSize(ppl.Pass): Infers the 3D CUDA launch configuration (grid and block sizes) for all GPU_Device map entries in the SDFG. This pass assumes the `AddThreadBlockMap` transformation has already been applied, ensuring that each kernel - either has an explicit thread block map. However it is applicable as long as each GPU_Device scheduled map + either has an explicit thread block map. However it is applicable as long as each GPU_Device scheduled map has an inner explicit GPU_ThreadBlock scheduled map. Block sizes are determined based on: @@ -30,7 +30,8 @@ class InferGPUGridAndBlockSize(ppl.Pass): such cases. """ - def apply_pass(self, sdfg: SDFG, kernels_with_added_tb_maps: Set[nodes.MapEntry]) -> Dict[nodes.MapEntry, Tuple[List, List]]: + def apply_pass(self, sdfg: SDFG, + kernels_with_added_tb_maps: Set[nodes.MapEntry]) -> Dict[nodes.MapEntry, Tuple[List, List]]: """ Analyzes the given SDFG to determine the 3D grid and block sizes for all GPU_Device map entries. @@ -38,15 +39,17 @@ def apply_pass(self, sdfg: SDFG, kernels_with_added_tb_maps: Set[nodes.MapEntry] A dictionary mapping each GPU_Device MapEntry node to a tuple (grid_dimensions, block_dimensions). 
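+
+            Illustrative example: a 1D kernel covering 32768 threads with a 128-thread block map
+            would map to ([256, 1, 1], [128, 1, 1]).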
""" # Collect all GPU_Device map entries across the SDFG - kernel_maps: Set[Tuple[nodes.MapEntry, SDFGState,]] = set() + kernel_maps: Set[Tuple[ + nodes.MapEntry, + SDFGState, + ]] = set() for node, state in sdfg.all_nodes_recursive(): if isinstance(node, nodes.MapEntry) and node.schedule == dtypes.ScheduleType.GPU_Device: kernel_maps.add((node, state)) - kernel_dimensions_map: Dict[nodes.MapEntry, Tuple[List, List]] = dict() for map_entry, state in kernel_maps: - # Compute grid size + # Compute grid size raw_grid = map_entry.map.range.size(True)[::-1] grid_size = gpu_utils.to_3d_dims(raw_grid) @@ -72,19 +75,17 @@ def _get_inserted_gpu_block_size(self, kernel_map_entry: nodes.MapEntry) -> List gpu_block_size = kernel_map_entry.map.gpu_block_size if gpu_block_size is None: - raise ValueError( - "Expected 'gpu_block_size' to be set. This kernel map entry should have been processed " - "by the AddThreadBlockMap transformation." - ) - + raise ValueError("Expected 'gpu_block_size' to be set. This kernel map entry should have been processed " + "by the AddThreadBlockMap transformation.") + return gpu_block_size - - def _infer_gpu_block_size(self, state:SDFGState, kernel_map_entry: nodes.MapEntry) -> List: + + def _infer_gpu_block_size(self, state: SDFGState, kernel_map_entry: nodes.MapEntry) -> List: """ Infers the GPU block size for a kernel map entry based on nested GPU_ThreadBlock maps. If the `gpu_block_size` attribute is set, it is assumed to be user-defined (not set by - a transformation like `AddThreadBlockMap`), and all nested thread-block maps must fit within it. + a transformation like `AddThreadBlockMap`), and all nested thread-block maps must fit within it. Otherwise, the block size is inferred by overapproximating the range sizes of all inner GPU_ThreadBlock maps of kernel_map_entry. @@ -105,12 +106,10 @@ def _infer_gpu_block_size(self, state:SDFGState, kernel_map_entry: nodes.MapEntr # guard check if not threadblock_maps: state.sdfg.save("failure.sdfg") - raise ValueError( - f"{self.__class__.__name__} expects at least one explicit nested GPU_ThreadBlock map, " - "as it assumes AddThreadBlockMap was applied beforehand.\n" - f"Check for issues in that transformation or ensure AddThreadBlockMap was applied." - ) - + raise ValueError(f"{self.__class__.__name__} expects at least one explicit nested GPU_ThreadBlock map, " + "as it assumes AddThreadBlockMap was applied beforehand.\n" + f"Check for issues in that transformation or ensure AddThreadBlockMap was applied.") + # Overapproximated block size enclosing all inner ThreadBlock maps block_size = kernel_map_entry.map.gpu_block_size detected_block_sizes = [block_size] if block_size is not None else [] @@ -128,7 +127,6 @@ def _infer_gpu_block_size(self, state:SDFGState, kernel_map_entry: nodes.MapEntr if block_size != tb_size or len(detected_block_sizes) == 0: detected_block_sizes.append(tb_size) - # Check for conflicting or multiple thread-block sizes # - If gpu_block_size is explicitly defined (by the user) and conflicts with detected map sizes, raise an error @@ -145,16 +143,17 @@ def _infer_gpu_block_size(self, state:SDFGState, kernel_map_entry: nodes.MapEntr '`__syncthreads`). Otherwise, use internal maps with the `GPU_Threadblock` or ' '`GPU_ThreadBlock_Dynamic` schedules. For more information, see ' 'https://spcldace.readthedocs.io/en/latest/optimization/gpu.html') - + else: warnings.warn('Multiple thread-block maps with different sizes detected for ' - f'kernel "{kernel_map_label}": {detected_block_sizes}. 
' - f'Over-approximating to block size {block_size}.\n' - 'If this was not the intent, try tiling one of the thread-block maps to match.') - + f'kernel "{kernel_map_label}": {detected_block_sizes}. ' + f'Over-approximating to block size {block_size}.\n' + 'If this was not the intent, try tiling one of the thread-block maps to match.') + return block_size - def _get_internal_threadblock_maps(self, state: SDFGState, kernel_map_entry: nodes.MapEntry) -> List[nodes.MapEntry]: + def _get_internal_threadblock_maps(self, state: SDFGState, + kernel_map_entry: nodes.MapEntry) -> List[nodes.MapEntry]: """ Returns GPU_ThreadBlock MapEntries nested within a given the GPU_Device scheduled kernel map (kernel_map_entry). @@ -169,4 +168,3 @@ def _get_internal_threadblock_maps(self, state: SDFGState, kernel_map_entry: nod threadblock_maps.append(scope) return threadblock_maps - \ No newline at end of file diff --git a/dace/transformation/passes/gpustream_scheduling.py b/dace/transformation/passes/gpustream_scheduling.py index 761f747b2e..f463ac8053 100644 --- a/dace/transformation/passes/gpustream_scheduling.py +++ b/dace/transformation/passes/gpustream_scheduling.py @@ -41,9 +41,9 @@ class NaiveGPUStreamScheduler(ppl.Pass): K6 would be scheduled as: - K1, K2 → stream 0 - K3, K4, K5 → stream 1 - K6 → stream 2 + K1, K2 → stream 0 + K3, K4, K5 → stream 1 + K6 → stream 2 (assuming no limit on the number of concurrent streams) @@ -57,7 +57,7 @@ def __init__(self): self._max_concurrent_streams = int(Config.get('compiler', 'cuda', 'max_concurrent_streams')) # needed to call correct backend synchronization functions - self._backend: str = common.get_gpu_backend() + self._backend: str = common.get_gpu_backend() # This is expected to be set by the calling backend code generator before applying the pass self._gpu_stream_access_template: str = "" @@ -73,8 +73,9 @@ def apply_pass(self, sdfg: SDFG, _) -> Dict[nodes.Node, Union[int, str]]: self._assign_gpu_streams_in_state(sdfg, False, state, assigned_nodes, 0) # 2. If only one stream is used set all assignments to "nullptr". - num_assigned_streams = max(assigned_nodes.values(), default=0) # self.max_concurrent_streams == -1 (default) also handled here - if num_assigned_streams == 0: + num_assigned_streams = max(assigned_nodes.values(), + default=0) # self.max_concurrent_streams == -1 (default) also handled here + if num_assigned_streams == 0: for k in assigned_nodes.keys(): assigned_nodes[k] = "nullptr" @@ -83,7 +84,8 @@ def apply_pass(self, sdfg: SDFG, _) -> Dict[nodes.Node, Union[int, str]]: return assigned_nodes - def _assign_gpu_streams_in_state(self, sdfg: SDFG, in_nested_sdfg: bool, state: SDFGState, assigned_nodes: Dict, gpu_stream:int): + def _assign_gpu_streams_in_state(self, sdfg: SDFG, in_nested_sdfg: bool, state: SDFGState, assigned_nodes: Dict, + gpu_stream: int): """ Processes connected components in a state, assigning each to a different GPU stream if not inside a nested SDFG. If inside a nested SDFG, components inherit the stream from the parent state/component. 
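
As a plain-Python illustration of the scheduling policy described in the class docstring (each weakly connected component moves to the next stream), consider the sketch below. `assign_streams` is a hypothetical stand-in for the pass, not its code; the stream-limit semantics (-1 default, 0 unlimited, >0 wrap around) follow the `max_concurrent_streams` description.

from typing import Dict, List

def assign_streams(components: List[List[str]], max_concurrent_streams: int) -> Dict[str, int]:
    assignment: Dict[str, int] = {}
    stream = 0
    for component in components:
        for node in component:
            # With the default setting (-1), everything stays on stream 0.
            assignment[node] = 0 if max_concurrent_streams == -1 else stream
        # Advance to the next stream only if this component received any nodes.
        if component and max_concurrent_streams != -1:
            stream = stream + 1 if max_concurrent_streams == 0 else (stream + 1) % max_concurrent_streams
    return assignment

# The docstring example: K1,K2 -> 0, K3,K4,K5 -> 1, K6 -> 2 (no stream limit).
print(assign_streams([["K1", "K2"], ["K3", "K4", "K5"], ["K6"]], 0))
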
@@ -97,14 +99,14 @@ def _assign_gpu_streams_in_state(self, sdfg: SDFG, in_nested_sdfg: bool, state: nodes_assigned_before = len(assigned_nodes) for node in component: - + if self._is_relevant_for_gpu_stream(node, sdfg, state): assigned_nodes[node] = gpu_stream - + if isinstance(node, nodes.NestedSDFG): for nested_state in node.sdfg.states(): self._assign_gpu_streams_in_state(node.sdfg, True, nested_state, assigned_nodes, gpu_stream) - + # Move to next stream if we assigned streams to any node in this component (careful: if nested, states are in same component) if not in_nested_sdfg and len(assigned_nodes) > nodes_assigned_before: gpu_stream = self._next_stream(gpu_stream) @@ -113,7 +115,7 @@ def _get_weakly_connected_nodes(self, graph: Graph) -> List[Set[NodeT]]: """ Returns all weakly connected components in the given directed graph. - A weakly connected component is a maximal group of nodes such that each pair + A weakly connected component is a maximal group of nodes such that each pair of nodes is connected by a path when ignoring edge directions. :param graph: A directed graph (Graph) instance. @@ -170,26 +172,23 @@ def _is_relevant_for_gpu_stream(self, node: nodes.Node, sdfg: SDFG, state: SDFGS for n in node_and_neighbors: # GPU global memory access nodes - if (isinstance(n, nodes.AccessNode) and - n.desc(sdfg).storage == dtypes.StorageType.GPU_Global): + if (isinstance(n, nodes.AccessNode) and n.desc(sdfg).storage == dtypes.StorageType.GPU_Global): return True - + # GPU-scheduled map entry/exit nodes (kernels) - if (isinstance(n, (nodes.EntryNode, nodes.ExitNode)) and - n.schedule in dtypes.GPU_SCHEDULES): + if (isinstance(n, (nodes.EntryNode, nodes.ExitNode)) and n.schedule in dtypes.GPU_SCHEDULES): return True - + # GPU-scheduled library nodes - if (isinstance(n, nodes.LibraryNode) and - n.schedule in dtypes.GPU_SCHEDULES): + if (isinstance(n, nodes.LibraryNode) and n.schedule in dtypes.GPU_SCHEDULES): return True return False - + def _next_stream(self, gpu_stream: int) -> int: """ Returns the next CUDA stream index based on the configured concurrency policy. 
- + - If max_concurrent_streams == 0: unlimited streams → increment stream index - If max_concurrent_streams == -1: default → always return 0 - Else: wrap around within the allowed number of streams @@ -228,16 +227,15 @@ def _insert_gpu_stream_sync_tasklet(self, sdfg: SDFG, assigned_nodes: Dict) -> N sync_code = "\n".join(sync_code_lines) - tasklet = state.add_tasklet( - name=f"gpu_stream_sync_{state}", inputs=set(), outputs=set(), - code=sync_code, - language=dtypes.Language.CPP - ) - + tasklet = state.add_tasklet(name=f"gpu_stream_sync_{state}", + inputs=set(), + outputs=set(), + code=sync_code, + language=dtypes.Language.CPP) + for sink_node in sink_nodes: state.add_edge(sink_node, None, tasklet, None, dace.Memlet()) - #----------------- Insert synchronization tasklets after specific nodes ----------------- for node, state in sync_node.items(): @@ -250,26 +248,28 @@ def _insert_gpu_stream_sync_tasklet(self, sdfg: SDFG, assigned_nodes: Dict) -> N gpu_stream_access_expr = self._gpu_stream_access_template.format(gpu_stream=stream) tasklet = state.add_tasklet( - name=f"gpu_stream_sync_{stream}", inputs=set(), outputs=set(), + name=f"gpu_stream_sync_{stream}", + inputs=set(), + outputs=set(), code=f"DACE_GPU_CHECK({self._backend}StreamSynchronize({gpu_stream_access_expr}));\n", - language=dtypes.Language.CPP - ) - + language=dtypes.Language.CPP) + # important: First get the successors, then add the tasklet successors = list(state.successors(node)) state.add_edge(node, None, tasklet, None, dace.Memlet()) - for succ in successors : + for succ in successors: state.add_edge(tasklet, None, succ, None, dace.Memlet()) - - def _identify_sync_locations(self, sdfg: SDFG, assigned_nodes: Dict) -> Tuple[Dict[SDFGState, Set[str]], Dict[nodes.Node, SDFGState]]: + + def _identify_sync_locations(self, sdfg: SDFG, + assigned_nodes: Dict) -> Tuple[Dict[SDFGState, Set[str]], Dict[nodes.Node, SDFGState]]: """ Heuristically identifies GPU stream synchronization points in an SDFG. Synchronization is needed: - At the end of a state, if we copy to/from GPU AccessNodes. - Immediately after a node, if data leaves GPU memory and is further used. - + Returns: - sync_state: Maps each SDFGState to a set of stream IDs to sync at the end of the state. - sync_node: Maps individual nodes to the state where a sync is required after the node. 
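
The synchronization tasklets assembled above are plain C++ strings built from the backend name and the caller-provided stream-access template. A minimal sketch of that string assembly, assuming a CUDA backend and an illustrative template string (the real template is injected by the code generator through `set_gpu_stream_access_template`):

backend = "cuda"  # would be "hip" for the ROCm backend
gpu_stream_access_template = "__state->gpu_context->streams[{gpu_stream}]"  # illustrative only

for stream in (0, 1):
    expr = gpu_stream_access_template.format(gpu_stream=stream)
    print(f"DACE_GPU_CHECK({backend}StreamSynchronize({expr}));")
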
@@ -278,14 +278,16 @@ def _identify_sync_locations(self, sdfg: SDFG, assigned_nodes: Dict) -> Tuple[Di # ------------------ Helper predicates ----------------------------- def is_gpu_accessnode(node, state): - return isinstance(node, nodes.AccessNode) and node.desc(state.parent).storage == dtypes.StorageType.GPU_Global + return isinstance(node, nodes.AccessNode) and node.desc( + state.parent).storage == dtypes.StorageType.GPU_Global def is_nongpu_accessnode(node, state): - return isinstance(node, nodes.AccessNode) and node.desc(state.parent).storage not in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN - + return isinstance(node, nodes.AccessNode) and node.desc( + state.parent).storage not in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN + def is_kernel_exit(node): return isinstance(node, nodes.ExitNode) and node.schedule == dtypes.ScheduleType.GPU_Device - + def is_sink_node(node, state): return state.out_degree(node) == 0 @@ -317,11 +319,11 @@ def is_sink_node(node, state): else: continue - + # Check that state is indeed a SDFGState when added to the dictionary, to be on the safe side if not isinstance(state, SDFGState): raise NotImplementedError(f"Unexpected parent type '{type(state).__name__}' for edge '{edge}'. " - "Expected 'SDFGState'. Please handle this case explicitly.") + "Expected 'SDFGState'. Please handle this case explicitly.") # Remove states with no syncs sync_state = {state: streams for state, streams in sync_state.items() if len(streams) > 0} @@ -336,4 +338,4 @@ def set_gpu_stream_access_template(self, expr_template: str): """ if "{gpu_stream}" not in expr_template: raise ValueError("self._gpu_stream_access_template must include '{gpu_stream}' placeholder.") - self._gpu_stream_access_template = expr_template \ No newline at end of file + self._gpu_stream_access_template = expr_template diff --git a/dace/transformation/passes/shared_memory_synchronization.py b/dace/transformation/passes/shared_memory_synchronization.py index 0b50a0eba3..8a45993bb5 100644 --- a/dace/transformation/passes/shared_memory_synchronization.py +++ b/dace/transformation/passes/shared_memory_synchronization.py @@ -17,45 +17,43 @@ from dace.transformation.passes import analysis as ap + @properties.make_properties @transformation.explicit_cf_compatible class DefaultSharedMemorySync(ppl.Pass): """ A DaCe transformation pass that automatically inserts GPU synchronization barriers (__syncthreads()) for shared memory access patterns. - + This pass ensures proper synchronization in two scenarios: 1. Pre-synchronization: Before consuming shared memory data (AccessNode -> CodeNode/MapEntry) 2. Post-synchronization: After shared memory reuse in sequential loops/maps within GPU kernels - + The pass traverses the SDFG hierarchy and identifies shared memory access patterns that require synchronization to prevent race conditions in GPU code. - + NOTE: This implementation handles commonly observed patterns. Unsupported cases - raise NotImplementedError with context for extending the implementation once comming across + raise NotImplementedError with context for extending the implementation once comming across another constellation which was not observed in the used common examples. 
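
To make scenario 1 concrete, the snippet below builds by hand the pre-synchronization pattern this pass inserts: a connector-less `__syncthreads();` tasklet ordered between the shared-memory access node and its consumer via empty memlets. The array and node names are invented for the example, and the state is not embedded in a GPU kernel map, so this is a structural sketch only.

import dace
from dace import dtypes

sdfg = dace.SDFG("pre_sync_sketch")
sdfg.add_array("smem", [32], dace.float64, storage=dtypes.StorageType.GPU_Shared, transient=True)
state = sdfg.add_state("main")

smem_node = state.add_access("smem")
consumer = state.add_tasklet("consumer", {"inp"}, set(), "pass")
barrier = state.add_tasklet("pre_sync_barrier", set(), set(),
                            "__syncthreads();\n", language=dtypes.Language.CPP)

# Shared memory -> barrier -> consumer; the empty memlets only impose ordering.
state.add_edge(smem_node, None, barrier, None, dace.Memlet())
state.add_edge(barrier, None, consumer, None, dace.Memlet())
state.add_edge(smem_node, None, consumer, "inp", dace.Memlet("smem[0]"))
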
""" - def __init__(self): """Initialize the synchronization pass.""" # Track which scopes (sequential maps and Loops) have already been # synchronized to avoid duplicate barriers self._synchronized_scopes: Set[Union[MapExit, LoopRegion]] = set() - + # Map from MapExit nodes to their containing states for post-synchronization self._map_exit_to_state: Dict[MapExit, SDFGState] = dict() # Keep track of processed nested sdfgs self._processed_nsdfg = set() - - def apply_pass(self, sdfg: SDFG, _) -> None: """ Apply the synchronization pass to the entire SDFG. - + Args: sdfg: The SDFG to process (expected to be top-level) _: Unused pass pipeline argument @@ -65,12 +63,10 @@ def apply_pass(self, sdfg: SDFG, _) -> None: enclosing_scopes = [] self._process_sdfg(sdfg, enclosing_scopes) - - def _process_sdfg(self, sdfg: SDFG, enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: """ Recursively traverse all nodes in an SDFG, handling different node types. - + Args: sdfg: The SDFG to traverse enclosing_scopes: Stack of execution scopes (e.g., maps, loops) enclosing the SDFG as a whole. @@ -78,8 +74,8 @@ def _process_sdfg(self, sdfg: SDFG, enclosing_scopes: list[Union[MapExit, LoopRe for sdfg_elem in sdfg.nodes(): self._process_sdfg_element(sdfg, sdfg_elem, enclosing_scopes) - - def _process_sdfg_element(self, sdfg: SDFG, element: any, enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: + def _process_sdfg_element(self, sdfg: SDFG, element: any, enclosing_scopes: list[Union[MapExit, + LoopRegion]]) -> None: """ Identifies the type of the SDFG element and processes it using the corresponding handler. @@ -99,14 +95,13 @@ def _process_sdfg_element(self, sdfg: SDFG, element: any, enclosing_scopes: lis else: raise NotImplementedError( f"{self.__class__.__name__}: Unsupported node type '{type(element).__name__}' " - f"encountered during SDFG traversal. Please extend the implementation to handle this case." - ) + f"encountered during SDFG traversal. Please extend the implementation to handle this case.") - def _process_loop_region(self, sdfg: SDFG, loop_region: LoopRegion, - enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: + def _process_loop_region(self, sdfg: SDFG, loop_region: LoopRegion, + enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: """ Process a loop region by adding it to the scope stack and traversing its contents. - + Args: sdfg: The containing SDFG loop_region: The loop region to process @@ -114,25 +109,21 @@ def _process_loop_region(self, sdfg: SDFG, loop_region: LoopRegion, """ # Create a new scope stack with this loop region added nested_scopes = enclosing_scopes.copy() - nested_scopes.insert(0, loop_region) # Not append! :) careful + nested_scopes.insert(0, loop_region) # Not append! :) careful # Process all states within the loop region for node in loop_region.nodes(): if isinstance(node, SDFGState): self._process_state(sdfg, node, nested_scopes) else: - raise NotImplementedError( - f"{self.__class__.__name__}: Unexpected node type '{type(node).__name__}' " - f"found inside LoopRegion. SDFGState nodes were expected. Extend if you think" - "the node type is also valid" - ) - + raise NotImplementedError(f"{self.__class__.__name__}: Unexpected node type '{type(node).__name__}' " + f"found inside LoopRegion. SDFGState nodes were expected. 
Extend if you think" + "the node type is also valid") - def _process_state(self, sdfg: SDFG, state: SDFGState, - enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: + def _process_state(self, sdfg: SDFG, state: SDFGState, enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: """ Process a single SDFG state, analyzing edges for shared memory access patterns. - + Args: sdfg: The containing SDFG state: The state to process @@ -151,12 +142,11 @@ def _process_state(self, sdfg: SDFG, state: SDFGState, if not self._is_shared_memory_access_node(sdfg, source_node) or isinstance(dest_node, nodes.AccessNode): continue - # Handle different types of shared memory consumers if isinstance(dest_node, (nodes.CodeNode, nodes.MapEntry)): # Direct consumption by computation or map entry self._insert_pre_synchronization_barrier(source_node, dest_node, state, nodes_with_sync) - + elif isinstance(dest_node, nodes.NestedSDFG): # Consumption by nested SDFG - synchronize and recurse # NOTE: For nesting, we append all scopes which wrap around the nestedSDFG @@ -167,13 +157,11 @@ def _process_state(self, sdfg: SDFG, state: SDFGState, else: raise NotImplementedError( f"{self.__class__.__name__}: Unsupported destination node type '{type(dest_node).__name__}' " - f"for shared memory access. Currently supported: CodeNode, MapEntry, AccessNode, NestedSDFG." - ) + f"for shared memory access. Currently supported: CodeNode, MapEntry, AccessNode, NestedSDFG.") # Check if post-synchronization is needed and apply shared self._handle_shared_memory_post_synchronization(state, source_node, enclosing_scopes) - # It may be the case that nestedSDFG were not recursed previously. Process them in that case for node in state.nodes(): @@ -188,8 +176,7 @@ def _process_state(self, sdfg: SDFG, state: SDFGState, self._process_sdfg(node.sdfg, nested_scopes) self._processed_nsdfg.add(node) - - def _process_conditionalBlock(self, sdfg: SDFG, cond_block: ConditionalBlock, + def _process_conditionalBlock(self, sdfg: SDFG, cond_block: ConditionalBlock, enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: """ Processes a ConditionalBlock by visiting each clause body and its elements. @@ -200,37 +187,30 @@ def _process_conditionalBlock(self, sdfg: SDFG, cond_block: ConditionalBlock, enclosing_scopes: Stack of execution scopes (e.g., maps, loops) enclosing the SDFG as a whole. """ clause_bodies: list[ControlFlowBlock] = cond_block.nodes() - + for body in clause_bodies: for sdfg_elem in body.nodes(): self._process_sdfg_element(sdfg, sdfg_elem, enclosing_scopes) - - def _is_shared_memory_access_node(self, sdfg: SDFG, node: nodes.Node) -> bool: """ Check if a node represents a GPU shared memory access. - + Args: sdfg: The containing SDFG node: The node to check - + Returns: True if the node is an AccessNode with GPU_Shared storage """ - return ( - isinstance(node, nodes.AccessNode) - and node.desc(sdfg).storage == dtypes.StorageType.GPU_Shared - ) - + return (isinstance(node, nodes.AccessNode) and node.desc(sdfg).storage == dtypes.StorageType.GPU_Shared) - - def _insert_pre_synchronization_barrier(self, source_node: nodes.Node, dest_node: nodes.Node, - state: SDFGState, nodes_with_sync: Dict[nodes.Node, nodes.Tasklet]) -> None: + def _insert_pre_synchronization_barrier(self, source_node: nodes.Node, dest_node: nodes.Node, state: SDFGState, + nodes_with_sync: Dict[nodes.Node, nodes.Tasklet]) -> None: """ Insert a __syncthreads() barrier before shared memory consumption. 
Reuses existing barriers when multiple shared memory sources feed the same destination. - + Args: source_node: The shared memory AccessNode dest_node: The consuming node @@ -243,31 +223,30 @@ def _insert_pre_synchronization_barrier(self, source_node: nodes.Node, dest_node state.add_edge(source_node, None, existing_barrier, None, dace.Memlet()) else: # Create a new synchronization barrier - sync_barrier = state.add_tasklet( - name="pre_sync_barrier", - inputs=set(), - outputs=set(), - code="__syncthreads();\n", - language=dtypes.Language.CPP - ) + sync_barrier = state.add_tasklet(name="pre_sync_barrier", + inputs=set(), + outputs=set(), + code="__syncthreads();\n", + language=dtypes.Language.CPP) # Connect: shared_memory -> sync_barrier -> consumer state.add_edge(source_node, None, sync_barrier, None, dace.Memlet()) state.add_edge(sync_barrier, None, dest_node, None, dace.Memlet()) nodes_with_sync[dest_node] = sync_barrier - def _build_nested_scope_stack(self, state: SDFGState, nested_sdfg_node: nodes.NestedSDFG, - enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> list[Union[MapExit, LoopRegion]]: + def _build_nested_scope_stack( + self, state: SDFGState, nested_sdfg_node: nodes.NestedSDFG, + enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> list[Union[MapExit, LoopRegion]]: """ Copy the 'enclosing_scopes' stack and extend it with all maps in 'state' that enclose 'nested_sdfg_node'. It is assumed that the 'enclosing_scopes' stack contains all maps and loops that wrap around 'state', but not individual nodes within 'state'. - + Args: state: The state containing the nested SDFG nested_sdfg_node: The NestedSDFG node enclosing_scopes: Current scope stack - + Returns: Updated scope stack including maps enclosing the nested SDFG """ @@ -286,21 +265,20 @@ def _build_nested_scope_stack(self, state: SDFGState, nested_sdfg_node: nodes.Ne # add the current state in which the map_exit is contained, # needed for potential post synchronization barriers self._map_exit_to_state[map_exit] = state - + # move up in the nested map hierarchy current_map = scope_dict[current_map] return updated_scopes - def _handle_shared_memory_post_synchronization(self, state: SDFGState, shared_mem_node: nodes.Node, - enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: + enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: """ Handle post-synchronization for shared memory reuse in sequential execution contexts. - + When shared memory is reused across iterations in a for loop or sequential map within a GPU kernel, we need post-synchronization barriers to prevent race conditions. - + Args: state: The state containing the shared memory access shared_mem_node: The shared memory AccessNode @@ -357,16 +335,13 @@ def _handle_shared_memory_post_synchronization(self, state: SDFGState, shared_me # Validate that shared memory is used within GPU kernel context if not inside_gpu_kernel: - raise ValueError( - "Shared memory usage detected outside GPU kernel context. " - "GPU shared memory is only valid within GPU_Device scheduled maps." - ) + raise ValueError("Shared memory usage detected outside GPU kernel context. 
" + "GPU shared memory is only valid within GPU_Device scheduled maps.") # No post synchronization needed if there's no sequential iteration context if innermost_sequential_scope is None: return - # Apply appropriate post-synchronization based on scope type if isinstance(innermost_sequential_scope, MapExit): self._add_post_sync_for_sequential_map(innermost_sequential_scope) @@ -375,30 +350,26 @@ def _handle_shared_memory_post_synchronization(self, state: SDFGState, shared_me self._add_post_sync_tasklets_for_loop_region(innermost_sequential_scope) # self._add_post_sync_state_for_loop_region(innermost_sequential_scope) - - def _add_post_sync_for_sequential_map(self, seq_map_exit: MapExit) -> None: """ Add post-synchronization barrier after a sequential map that may reuse shared memory. - + Args: seq_map_exit: The MapExit node of the sequential map """ # Avoid duplicate synchronization if seq_map_exit in self._synchronized_scopes: return - + # Find the state containing this map containing_state = self._map_exit_to_state[seq_map_exit] - + # Create post-synchronization barrier - post_sync_barrier = containing_state.add_tasklet( - name="post_sync_barrier", - inputs=set(), - outputs=set(), - code="__syncthreads();\n", - language=dtypes.Language.CPP - ) + post_sync_barrier = containing_state.add_tasklet(name="post_sync_barrier", + inputs=set(), + outputs=set(), + code="__syncthreads();\n", + language=dtypes.Language.CPP) # Insert barrier before the map exit and all other predecessors incoming_edges = containing_state.in_edges(seq_map_exit) @@ -407,7 +378,6 @@ def _add_post_sync_for_sequential_map(self, seq_map_exit: MapExit) -> None: predecessor = edge.src containing_state.add_edge(predecessor, None, post_sync_barrier, None, dace.Memlet()) containing_state.add_edge(post_sync_barrier, None, seq_map_exit, None, dace.Memlet()) - # Mark as synchronized self._synchronized_scopes.add(seq_map_exit) @@ -417,7 +387,7 @@ def _add_post_sync_state_for_loop_region(self, loop_region: LoopRegion) -> None: Add post-synchronization barrier for a loop region that reuses shared memory arrays. It adds a new state, which contains only a synchronization tasklet that connects to all sink blocks of the loop region. - + Args: loop_region: The LoopRegion that needs post-synchronization """ @@ -431,17 +401,14 @@ def _add_post_sync_state_for_loop_region(self, loop_region: LoopRegion) -> None: # No sync needed if len(sink_blocks) < 0: return - + # Add new state which synchronizates all sink nodes of the loop syn_block = loop_region.add_state("sync_state") - syn_block.add_tasklet( - name="post_sync_barrier", - inputs=set(), - outputs=set(), - code="__syncthreads();\n", - language=dtypes.Language.CPP - ) - + syn_block.add_tasklet(name="post_sync_barrier", + inputs=set(), + outputs=set(), + code="__syncthreads();\n", + language=dtypes.Language.CPP) for block in sink_blocks: loop_region.add_edge(block, syn_block, InterstateEdge()) @@ -449,13 +416,12 @@ def _add_post_sync_state_for_loop_region(self, loop_region: LoopRegion) -> None: # Mark as synchronized self._synchronized_scopes.add(loop_region) - def _add_post_sync_tasklets_for_loop_region(self, loop_region: LoopRegion) -> None: """ Add post-synchronization barrier for a loop region that reuses shared memory arrays. Determines all sink blocks in the LoopRegion, and then, for each sink block, adds a new synchronization tasklet that connects to all sink nodes within that sink block. 
- + Args: loop_region: The LoopRegion that needs post-synchronization """ @@ -464,35 +430,31 @@ def _add_post_sync_tasklets_for_loop_region(self, loop_region: LoopRegion) -> No for block in loop_region.nodes(): if not isinstance(block, SDFGState): - raise NotImplementedError(f"Block {block} is expected to be an SDFG state. But it is of type {type(block)}. " - "Extend use case if this should be valid." - ) - + raise NotImplementedError( + f"Block {block} is expected to be an SDFG state. But it is of type {type(block)}. " + "Extend use case if this should be valid.") + if loop_region.out_degree(block) == 0: sink_blocks.append(block) # No sync needed if len(sink_blocks) < 0: return - # For each sink block, synchronize at the end for block in sink_blocks: - + sink_nodes: list[nodes.Node] = block.sink_nodes() # All sink nodes in the same block (= state) get the same sync tasklet - post_sync_barrier = block.add_tasklet( - name="post_sync_barrier", - inputs=set(), - outputs=set(), - code="__syncthreads();\n", - language=dtypes.Language.CPP - ) + post_sync_barrier = block.add_tasklet(name="post_sync_barrier", + inputs=set(), + outputs=set(), + code="__syncthreads();\n", + language=dtypes.Language.CPP) for snode in sink_nodes: block.add_edge(snode, None, post_sync_barrier, None, dace.Memlet()) - # Mark as synchronized - self._synchronized_scopes.add(loop_region) \ No newline at end of file + self._synchronized_scopes.add(loop_region) From a7ccb31cd4ffd62256753807136d052102976466 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Tue, 8 Jul 2025 20:56:26 +0200 Subject: [PATCH 57/94] Improve API --- dace/sdfg/utils.py | 73 ++++++++++++++++++++++++++++----- tests/const_utilities_test.py | 76 ++++++++++++++++++++++++++++++----- 2 files changed, 128 insertions(+), 21 deletions(-) diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index dd3d4a7a9b..db85958ddc 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -2118,12 +2118,14 @@ def set_nested_sdfg_parent_references(sdfg: SDFG): set_nested_sdfg_parent_references(node.sdfg) -def get_used_data(scope: Union[ControlFlowRegion, SDFGState, nd.MapEntry, nd.NestedSDFG]) -> Set[str]: +def get_used_data(scope: Union[ControlFlowRegion, SDFGState, nd.MapEntry, nd.NestedSDFG], + parent_state: Union[SDFGState, None] = None) -> Set[str]: """ Returns a set of all data names that are used in the given control flow region, state, map entry or nested SDFG node. - Data is considered used if there is an access node + Data is considered used if there is an access node within the scope to data or it appears in an interstate edge. :param cfg: The control flow region, state, or map entry node to check. + :param parent_state: The parent state of the scope, used only for MapEntry nodes. Can't be None if scope is a MapEntry. :return: A set of used data names. 
""" if isinstance(scope, SDFGState) or isinstance(scope, ControlFlowRegion): @@ -2133,8 +2135,8 @@ def get_used_data(scope: Union[ControlFlowRegion, SDFGState, nd.MapEntry, nd.Nes read_data, write_data = scope.sdfg.read_and_write_sets() return read_data.union(write_data) elif isinstance(scope, nd.MapEntry): - state: SDFGState = scope.parent_graph - + assert parent_state is not None, "parent_state must be provided for MapEntry nodes" + state: SDFGState = parent_state # How can data be accessed in an SDFG?: # Read interstate edges or access nodes using memlets # Written to access nodes using memlets @@ -2164,7 +2166,7 @@ def get_used_data(scope: Union[ControlFlowRegion, SDFGState, nd.MapEntry, nd.Nes def get_constant_data(scope: Union[ControlFlowRegion, SDFGState, nd.NestedSDFG, nd.MapEntry], - parent_state: SDFGState = None) -> Set[str]: + parent_state: Union[SDFGState, None] = None) -> Set[str]: """ Returns a set of all constant data in the given control flow region, state, or with the map scope. Data is considered constant if there is any incoming edge to an access node of the data. @@ -2214,8 +2216,44 @@ def _incoming_memlet(state: SDFGState, node: nd.AccessNode) -> bool: raise Exception("Unsupported scope type for get_constant_data: {}".format(type(scope))) +def get_used_symbols( + scope: Union[SDFG, ControlFlowRegion, SDFGState, nd.MapEntry, nd.NestedSDFG], + parent_state: Union[SDFGState, None] = None, + include_symbols_for_offset_calculations: bool = False, +) -> Set[str]: + """ + Returns a set of all used symbols in the given control flow region, state, or with the map scope. + + :param cfg: The control flow region, state or a map entry node to check. + :param parent_state: The parent graph of the scope, used only for MapEntry nodes. + :return: A set of symbol names. + """ + return _get_used_symbols_impl(scope=scope, + constant_syms_only=True, + parent_state=parent_state, + include_symbols_for_offset_calculations=include_symbols_for_offset_calculations) + + def get_constant_symbols(scope: Union[SDFG, ControlFlowRegion, SDFGState, nd.MapEntry, nd.NestedSDFG], - parent_state: SDFGState = None) -> Set[str]: + parent_state: Union[SDFGState, None] = None, + include_symbols_for_offset_calculations: bool = False) -> Set[str]: + """ + Returns a set of all constant symbols in the given control flow region, state, or with the map scope. + A symbol is considered constant if no interstate edge within the scope writes to it. + + :param cfg: The control flow region, state or a map entry node to check. + :param parent_state: The parent graph of the scope, used only for MapEntry nodes. + :return: A set of constant symbol names. + """ + return _get_used_symbols_impl(scope=scope, + constant_syms_only=True, + parent_state=parent_state, + include_symbols_for_offset_calculations=include_symbols_for_offset_calculations) + + +def _get_used_symbols_impl(scope: Union[SDFG, ControlFlowRegion, SDFGState, nd.MapEntry, + nd.NestedSDFG], constant_syms_only: bool, parent_state: Union[SDFGState, None], + include_symbols_for_offset_calculations: bool) -> Set[str]: """ Returns a set of all constant symbols in the given control flow region, state, or with the map scope. A symbol is considered constant if no interstate edge writes to it. 
@@ -2232,23 +2270,38 @@ def _get_assignments(cfg: Union[ControlFlowRegion, SDFG]) -> Set[str]: written_symbols = written_symbols.union(edge.data.assignments.keys()) return written_symbols + offset_symbols = set() + if include_symbols_for_offset_calculations: + used_data = get_used_data(scope=scope, parent_state=parent_state) + for data in used_data: + parent_graph = parent_state if isinstance(scope, nd.MapEntry) else scope + if data in parent_graph.sdfg.arrays: + desc = parent_graph.sdfg.arrays[data] + offset_symbols.update(str(sym) for sym in desc.free_symbols) + if isinstance(scope, SDFGState): symbols = scope.used_symbols(all_symbols=False) # Since no symbol can change within a state we are good to go - return symbols + return offset_symbols | symbols elif isinstance(scope, (SDFG, ControlFlowRegion)): # Need to get all used symbols within the SDFG or CFG used_symbols = scope.used_symbols(all_symbols=False) # Get all symbols that are written to written_symbols = _get_assignments(scope) - return used_symbols - written_symbols + if constant_syms_only: + return (offset_symbols | used_symbols) - written_symbols + else: + return offset_symbols | used_symbols elif isinstance(scope, nd.NestedSDFG): used_symbols = scope.sdfg.used_symbols(all_symbols=True) # Can't pass them as const if they are written to in the nested SDFG written_symbols = _get_assignments(scope.sdfg) - return used_symbols - written_symbols + if constant_syms_only: + return (offset_symbols | used_symbols) - written_symbols + else: + return offset_symbols | used_symbols elif isinstance(scope, nd.MapEntry): used_symbols = scope.used_symbols_within_scope(parent_state=parent_state) - return used_symbols + return offset_symbols | used_symbols else: raise Exception("Unsupported scope type for get_constant_data: {}".format(type(scope))) diff --git a/tests/const_utilities_test.py b/tests/const_utilities_test.py index 92f2b667a5..6d70630045 100644 --- a/tests/const_utilities_test.py +++ b/tests/const_utilities_test.py @@ -80,13 +80,26 @@ def _add_shared_memory(sdfg: dace.SDFG, add_src_access_node: bool = False): state.remove_edge(edge) -def _check_map_entries(state, schedule, expected_data, expected_symbols): +def _check_map_entries(state, include_symbols_for_offset_calculation, const_only, schedule, expected_data, + expected_symbols): map_entries = [n for n in state.nodes() if isinstance(n, dace.sdfg.nodes.MapEntry) and n.map.schedule == schedule] for me in map_entries: - const_data = sdutils.get_constant_data(me, state) - const_symbols = sdutils.get_constant_symbols(me, state) - assert expected_data == const_data - assert expected_symbols == const_symbols + if const_only: + const_data = sdutils.get_constant_data(scope=me, parent_state=state) + const_symbols = sdutils.get_constant_symbols( + scope=me, + parent_state=state, + include_symbols_for_offset_calculations=include_symbols_for_offset_calculation) + assert expected_data == const_data, f"(Const Data) Expected {expected_data}, got {const_data} in map {me.label}" + assert expected_symbols == const_symbols, f"(Const Symbols) Expected {expected_symbols}, got {const_symbols} in map {me.label}" + else: + used_data = sdutils.get_used_data(scope=me, parent_state=state) + used_symbols = sdutils.get_used_symbols( + scope=me, + parent_state=state, + include_symbols_for_offset_calculations=include_symbols_for_offset_calculation) + assert expected_data == used_data, f"(Used Data) Expected {expected_data}, got {used_data} in map {me.label}" + assert expected_symbols == used_symbols, f"(Used 
Symbols) Expected {expected_symbols}, got {used_symbols} in map {me.label}" def _gen_sdfg_with_symbol_use_in_nsdfg(write_only: bool = True) -> dace.SDFG: @@ -172,16 +185,57 @@ def kernel( # Test cases original_state = next(iter(original_sdfg.all_states())) transformed_state = next(iter(transformed_sdfg.all_states())) + assert original_state is not None + assert transformed_state is not None + + all_data_names = set(node.data for node in original_state.data_nodes()) + transformed_sdfg_tmp_names = set(node.data for node in transformed_state.data_nodes() + if transformed_sdfg.arrays[node.data].transient) + original_sdfg_tmp_names = set(node.data for node in original_state.data_nodes() + if original_sdfg.arrays[node.data].transient) + + # Original state tests + _check_map_entries(original_state, True, False, dace.dtypes.ScheduleType.GPU_Device, all_data_names - {"C"}, + {"i", "N"}) + _check_map_entries(original_state, True, False, dace.dtypes.ScheduleType.Sequential, all_data_names - {"C"}, + {"i", "k", "N"}) + _check_map_entries(original_state, True, False, dace.dtypes.ScheduleType.GPU_ThreadBlock, all_data_names - {"C"}, + {"i", "j", "k", "N"}) + + # Transformed state tests + _check_map_entries(transformed_state, True, False, dace.dtypes.ScheduleType.GPU_Device, + all_data_names - {"C"} | {"shr_A", "shr_B"}, {"i", "N"}) + _check_map_entries(transformed_state, True, False, dace.dtypes.ScheduleType.Sequential, + all_data_names - {"C"} | {"shr_A", "shr_B"}, {"i", "k", "N"}) + # Using only shr_a and shr_b means no need of N + _check_map_entries(transformed_state, True, False, dace.dtypes.ScheduleType.GPU_ThreadBlock, + {"shr_A", "shr_B"} | transformed_sdfg_tmp_names, {"i", "j", "k"}) + + # Original state tests + _check_map_entries(original_state, True, True, dace.dtypes.ScheduleType.GPU_Device, {"A", "B"}, {"i", "N"}) + _check_map_entries(original_state, True, True, dace.dtypes.ScheduleType.Sequential, {"A", "B"}, {"i", "k", "N"}) + _check_map_entries(original_state, True, True, dace.dtypes.ScheduleType.GPU_ThreadBlock, {"A", "B"}, + {"i", "j", "k", "N"}) + + # Transformed state tests + _check_map_entries(transformed_state, True, True, dace.dtypes.ScheduleType.GPU_Device, set(), {"i", "N"}) + _check_map_entries(transformed_state, True, True, dace.dtypes.ScheduleType.Sequential, set(), {"i", "k", "N"}) + # Using only shr_a and shr_b means no need of N + _check_map_entries(transformed_state, True, True, dace.dtypes.ScheduleType.GPU_ThreadBlock, {"shr_A", "shr_B"}, + {"i", "j", "k"}) # Original state tests - _check_map_entries(original_state, dace.dtypes.ScheduleType.GPU_Device, {"A", "B"}, {"i"}) - _check_map_entries(original_state, dace.dtypes.ScheduleType.Sequential, {"A", "B"}, {"i", "k"}) - _check_map_entries(original_state, dace.dtypes.ScheduleType.GPU_ThreadBlock, {"A", "B"}, {"i", "j", "k"}) + _check_map_entries(original_state, False, True, dace.dtypes.ScheduleType.GPU_Device, {"A", "B"}, {"i"}) + _check_map_entries(original_state, False, True, dace.dtypes.ScheduleType.Sequential, {"A", "B"}, {"i", "k"}) + _check_map_entries(original_state, False, True, dace.dtypes.ScheduleType.GPU_ThreadBlock, {"A", "B"}, + {"i", "j", "k"}) # Transformed state tests - _check_map_entries(transformed_state, dace.dtypes.ScheduleType.GPU_Device, set(), {"i"}) - _check_map_entries(transformed_state, dace.dtypes.ScheduleType.Sequential, set(), {"i", "k"}) - _check_map_entries(transformed_state, dace.dtypes.ScheduleType.GPU_ThreadBlock, {"shr_A", "shr_B"}, {"i", "j", "k"}) + _check_map_entries(transformed_state, 
False, True, dace.dtypes.ScheduleType.GPU_Device, set(), {"i"}) + _check_map_entries(transformed_state, False, True, dace.dtypes.ScheduleType.Sequential, set(), {"i", "k"}) + # Using only shr_a and shr_b means no need of N + _check_map_entries(transformed_state, False, True, dace.dtypes.ScheduleType.GPU_ThreadBlock, {"shr_A", "shr_B"}, + {"i", "j", "k"}) def test_const_utilities_case_write_only_free_symbol_in_nsdfg(): From 27007067807e245092fd27ab26dab57d42a9ede5 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Thu, 10 Jul 2025 14:07:41 +0200 Subject: [PATCH 58/94] removing workspace folder for PR and from the repository --- berkay_workpace/reports/important_notes.txt | 13 - berkay_workpace/reports/notes.txt | 34 - berkay_workpace/reports/report.py | 23 - berkay_workpace/reports/report_1.md | 56 - .../scratch/async_copy/async_copy.ipynb | 123 - .../scratch/async_copy/testbed.ipynb | 853 --- berkay_workpace/scratch/constantArgs.ipynb | 1057 ---- berkay_workpace/scratch/cudastreamPass.ipynb | 331 - .../scalarMultiplication1.ipynb | 509 -- .../scalarMultiplication2.ipynb | 286 - .../scratch/smemPassAndCopy/simpleCopy.ipynb | 866 --- .../thesis_related/const_check_fails.ipynb | 179 - .../threadblockPass/simple1dExamplye.ipynb | 129 - .../threadblockPass/simple2dExample.ipynb | 142 - .../threadblockPass/simple4dExample.ipynb | 143 - .../dbuff_related/double_buffering_async.sdfg | 5302 ----------------- .../original_sdfg_with_shared_memory.sdfg | 1278 ---- .../2d_blocktiled_gemm_with_custom_copy.sdfg | 4165 ------------- .../smem_related/generate_sdfgs.ipynb | 656 -- .../nice_global_to_shared_copy.sdfg | 1278 ---- .../smem_related/weird_global_to_global.sdfg | 1404 ----- .../weird_shared_to_shared_copy.sdfg | 896 --- .../warp_level_test.py | 445 -- .../tests/gpu_map_tests/device_map_test.py | 153 - .../tests/gpu_map_tests/threadBlock_test.py | 95 - .../out_of_kernel_memcpy_test.py | 280 - .../tests/reusable_tests/cuda_block_test.py | 204 - .../cuda_highdim_kernel_test.py | 213 - .../tests/reusable_tests/cuda_smem2d_test.py | 177 - .../tests/reusable_tests/cuda_smem_test.py | 62 - .../reusable_tests/gpu_launch_bounds_test.py | 70 - .../tests/reusable_tests/halfvec_cudatest.py | 160 - .../reusable_tests/multiprogram_cudatest.py | 57 - .../multistream_copy_cudatest.py | 93 - .../multistream_kernel_cudatest.py | 79 - .../smem_tests/default_smem_sync_pass_test.py | 337 -- berkay_workpace/tests/smem_tests/gemm_test.py | 33 - .../smem_tests/special_sync_pass_test.py | 37 - 38 files changed, 22218 deletions(-) delete mode 100644 berkay_workpace/reports/important_notes.txt delete mode 100644 berkay_workpace/reports/notes.txt delete mode 100644 berkay_workpace/reports/report.py delete mode 100644 berkay_workpace/reports/report_1.md delete mode 100644 berkay_workpace/scratch/async_copy/async_copy.ipynb delete mode 100644 berkay_workpace/scratch/async_copy/testbed.ipynb delete mode 100644 berkay_workpace/scratch/constantArgs.ipynb delete mode 100644 berkay_workpace/scratch/cudastreamPass.ipynb delete mode 100644 berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication1.ipynb delete mode 100644 berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication2.ipynb delete mode 100644 berkay_workpace/scratch/smemPassAndCopy/simpleCopy.ipynb delete mode 100644 berkay_workpace/scratch/thesis_related/const_check_fails.ipynb delete mode 100644 berkay_workpace/scratch/threadblockPass/simple1dExamplye.ipynb delete mode 100644 berkay_workpace/scratch/threadblockPass/simple2dExample.ipynb delete mode 100644 
berkay_workpace/scratch/threadblockPass/simple4dExample.ipynb delete mode 100644 berkay_workpace/scratch/yakups_examples/dbuff_related/double_buffering_async.sdfg delete mode 100644 berkay_workpace/scratch/yakups_examples/dbuff_related/original_sdfg_with_shared_memory.sdfg delete mode 100644 berkay_workpace/scratch/yakups_examples/smem_related/2d_blocktiled_gemm_with_custom_copy.sdfg delete mode 100644 berkay_workpace/scratch/yakups_examples/smem_related/generate_sdfgs.ipynb delete mode 100644 berkay_workpace/scratch/yakups_examples/smem_related/nice_global_to_shared_copy.sdfg delete mode 100644 berkay_workpace/scratch/yakups_examples/smem_related/weird_global_to_global.sdfg delete mode 100644 berkay_workpace/scratch/yakups_examples/smem_related/weird_shared_to_shared_copy.sdfg delete mode 100644 berkay_workpace/tests/experimental_features_tests/warp_level_test.py delete mode 100644 berkay_workpace/tests/gpu_map_tests/device_map_test.py delete mode 100644 berkay_workpace/tests/gpu_map_tests/threadBlock_test.py delete mode 100644 berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py delete mode 100644 berkay_workpace/tests/reusable_tests/cuda_block_test.py delete mode 100644 berkay_workpace/tests/reusable_tests/cuda_highdim_kernel_test.py delete mode 100644 berkay_workpace/tests/reusable_tests/cuda_smem2d_test.py delete mode 100644 berkay_workpace/tests/reusable_tests/cuda_smem_test.py delete mode 100644 berkay_workpace/tests/reusable_tests/gpu_launch_bounds_test.py delete mode 100644 berkay_workpace/tests/reusable_tests/halfvec_cudatest.py delete mode 100644 berkay_workpace/tests/reusable_tests/multiprogram_cudatest.py delete mode 100644 berkay_workpace/tests/reusable_tests/multistream_copy_cudatest.py delete mode 100644 berkay_workpace/tests/reusable_tests/multistream_kernel_cudatest.py delete mode 100644 berkay_workpace/tests/smem_tests/default_smem_sync_pass_test.py delete mode 100644 berkay_workpace/tests/smem_tests/gemm_test.py delete mode 100644 berkay_workpace/tests/smem_tests/special_sync_pass_test.py diff --git a/berkay_workpace/reports/important_notes.txt b/berkay_workpace/reports/important_notes.txt deleted file mode 100644 index 1726c8e8f7..0000000000 --- a/berkay_workpace/reports/important_notes.txt +++ /dev/null @@ -1,13 +0,0 @@ -1. CudaEvents are a bit a mix in the codegen. I left it as an attribute in the GPUStreamManager class, but I do not work with CudaEvents. - I left it there because it may be implemented in future and more importantly, the GPU code template (in get_generated_codeobject()) - depends on it. Instead of removing it, I decided to let it be and just say 0 CudaEvents are created and used. - Generally: The CudaStreamManager assumes that the NaiveGPUScheduler pass was called before. - Also, the CudaStreamManager should define the functions "get_stream_edge" (and maybe "get_stream_node"), since the the copystrategies might - depend on it - -2. I think we should rename ExperimentalCUDACodegen to ExperimentalGPUCodegen (or similarly, rename CUDACodeGen to GPUCodeGen), since it - is also intended to handle HIP code. However, this should be verified for HIP first. Otherwise, it might be better to build two separate - codegens — this is confusing as it stands. - -3. "Struct" memory copies in old codegen are hacks. These are omitted in the new ExperimentalCUDACodegen, because they should be implemented - in a planned and structured way, which is out of scope for my Master's Thesis. 
diff --git a/berkay_workpace/reports/notes.txt b/berkay_workpace/reports/notes.txt deleted file mode 100644 index 07714dc1d3..0000000000 --- a/berkay_workpace/reports/notes.txt +++ /dev/null @@ -1,34 +0,0 @@ -What was bad: - - -- Also dead code at "copy_memory", the first "dst_schedule" in the src_node if-else case it dead code. - In Fact, "dst_schedule" is not even needed at all. So we have "double dead code", once a computation - which gets overwritten, and once we compute smth we don't need. - -- Damn, even the copy_memory input named "memlet" is wrong.. this should be an edge type, not a memlet type. - -- Also, in "_emit_copy" there is a code snippet that "determines directionality", but part which may be useful - is raising an error in the else case. Again, dead code- setting variables that are never used. - - -- Again dead code: Computes "copy_shape" twice, first definition seems wrong and is not even used. - -- Stream handling in CudaCodeGen check is just random- streams are not handled by the codegen. - -- again, define local variables but then not use it. In this case: dtype - - -- yeah I realize that in the codegen the same function is implemented twice (one locally, - once in a utility file) - - -Tell Yakup: - -- I removed any logic that should handle cudaStream synchronization since I am not responsible for it. - In order to help to extend it in the future, I have two options. - 1. I can add function calls (which are empty i.e. do nothing) that signal what should be implemented once - there is a good solution of handling cuda streams in DaCe's new version - 2. Document it and say that several streams are not supported (people could come up with completely new - approaches to handle streams maybe) - 3. We got smth wrong. CopyToMap handles only GPU<->GPU code cases. - 4. I tried to handle "special case" as I understood... maybe worth to look at it closer with you diff --git a/berkay_workpace/reports/report.py b/berkay_workpace/reports/report.py deleted file mode 100644 index c144fd74c9..0000000000 --- a/berkay_workpace/reports/report.py +++ /dev/null @@ -1,23 +0,0 @@ -# TODO: GENERAL, discuss with Yakup -# 1. Modularity for Deallocate? -# 2. KernelScopeManager: What I like: -# - simple, easy to understand, modular and clean -# what I dont like: -# - Kind of messes with _generate_exit and how dace generates code -# Your opinion? do or dont? -# 3. __syncthread example ? Or better: General examples? -# 3.5 See below -# 4. GPU streams- now or wait? -# 5. Config for thread_id - why is this even a config? -# 6. Used no instrumentation because I have no clue what it is - -# I think the rest can wait before getting refactored (I don't need to reinvent the wheel) -# New tasks for now? - -# My personal TODO's -# TODO: when tired -# include constant expressions -# 4 dimensional example - -# TODO: depending on what happens next -# change in_device_code to maybe in_kernel_code? diff --git a/berkay_workpace/reports/report_1.md b/berkay_workpace/reports/report_1.md deleted file mode 100644 index d2c4cb65f5..0000000000 --- a/berkay_workpace/reports/report_1.md +++ /dev/null @@ -1,56 +0,0 @@ -# Master's Thesis Report - -**Thesis Title:** Code-generation for Modern GPUs in DaCe -**Student:** Berkay Aydogdu -**Supervisor:** Yakup Koray Budanaz -**Date:** 2025-05-23 -**Short description:** The objectives of this Master's thesis are to refactor the CUDA code generator in DaCe and to extend it with new features. 
The refactoring focuses on improving the structure, readability, and maintainability of the code. - -## Progress Overview - -By inspecting the source code of the CUDA code generator, we identified several poor coding -practices. These included, among others, intertwined functionality, non-descriptive variable -and function names, and numerous code fragments that appeared more like quick fixes or hacks -than thoughtfully designed solutions. - -To address these issues, we implemented a new CUDA code generator class `ExperimentalCUDACodeGen`, which can be enabled via configuration settings. We began by -running simple programs using the new generator, reusing parts of the existing code to get -minimal examples working. - -We deliberately chose not to build a completely new generator from scratch, as improving code -quality is only one part of the overall goal. Moreover, the existing implementation contains -well-designed components that are worth preserving—there is no need to reinvent the wheel. - -The following section highlights the notable aspects of the new implementation: - -- Only simple features are supported for now, in order to eliminate the complexity introduced - by rarely used features such as dynamic parallelism. -- The generation of scopes — specifically GPU maps— has been almost completely reworked. - In the existing CUDA code generator, this component has major issues, with several hundred - lines of dense code packed into just a few functions, even though it could be logically - split. For example, the generation of different map types (based on schedule types), the - kernel launch, and the kernel wrapper function are now implemented in separate functions. - We also improved naming throughout the code by replacing vague variable names with more - meaningful ones. -- The existing CUDA code generator opens and closes brackets in inconsistent - locations—sometimes even at another file. This is not only error-prone, but also makes - the code appear more complex than necessary. To address this, we implemented a Python - class (`KernelScopeManager`) that uses the `with` construct to clearly define when scopes - are entered and exited, making bracket management more structured and easier to control. -- In our view, the existing CUDA code generator class relies on too many attributes, some of - which are specific to individual kernels—such as inputs, block and grid dimensions. These - are currently derived ad hoc and stored directly on the generator, leading to clutter and - reduced clarity. To address this, we introduced a `KernelSpec` class that encapsulates all - kernel-specific information. This allows such attributes to be accessed cleanly from a - KernelSpec instance, reducing the number of attributes in the code generator and improving - structure and maintainability. -- We also implemented a first extension, namely the support of WarpLevel schedules, by - introducing a new GPU schedule type called `GPU_Warp`. With this, the we can specify which - warps are selected to perform a task. - - -## Next Steps - -The next steps include enabling asynchronous memory copies and continuing to refactor the -remaining parts of the code generator. This will require support for shared memory and -further discussions around key design decisions. 
diff --git a/berkay_workpace/scratch/async_copy/async_copy.ipynb b/berkay_workpace/scratch/async_copy/async_copy.ipynb deleted file mode 100644 index f86a432725..0000000000 --- a/berkay_workpace/scratch/async_copy/async_copy.ipynb +++ /dev/null @@ -1,123 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "ebf929d8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import dace\n", - "import cupy as cp\n", - "import numpy as np\n", - "from IPython.display import Code\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "024e65c9", - "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "Memlet.__init__() got an unexpected keyword argument 'is_asynchronous'", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 20\u001b[39m\n\u001b[32m 16\u001b[39m tb_map_entry, tb_map_exit = state.add_map(\u001b[33m\"\u001b[39m\u001b[33mtb_map\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28mdict\u001b[39m(tid=\u001b[33m\"\u001b[39m\u001b[33m0:128\u001b[39m\u001b[33m\"\u001b[39m), schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock)\n\u001b[32m 18\u001b[39m \u001b[38;5;66;03m# Edges with proper data flow\u001b[39;00m\n\u001b[32m 19\u001b[39m \u001b[38;5;66;03m# Global to device scope\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m20\u001b[39m state.add_edge(a_acc, \u001b[38;5;28;01mNone\u001b[39;00m, gpu_map_entry, \u001b[38;5;28;01mNone\u001b[39;00m, \u001b[43mdace\u001b[49m\u001b[43m.\u001b[49m\u001b[43mMemlet\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mA[0:128]\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_asynchronous\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m)\n\u001b[32m 21\u001b[39m \u001b[38;5;66;03m# Device scope to thread-block scope\u001b[39;00m\n\u001b[32m 22\u001b[39m state.add_edge(gpu_map_entry, \u001b[38;5;28;01mNone\u001b[39;00m, s_acc, \u001b[38;5;28;01mNone\u001b[39;00m, dace.Memlet(\u001b[33m\"\u001b[39m\u001b[33mA[0:128]->S[0:128]\u001b[39m\u001b[33m\"\u001b[39m, is_asynchronous=\u001b[38;5;28;01mTrue\u001b[39;00m))\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/master-thesis/dace/dace/properties.py:337\u001b[39m, in \u001b[36mmake_properties..initialize_properties\u001b[39m\u001b[34m(obj, *args, **kwargs)\u001b[39m\n\u001b[32m 335\u001b[39m \u001b[38;5;28msetattr\u001b[39m(obj, name, prop.default)\n\u001b[32m 336\u001b[39m \u001b[38;5;66;03m# Now call vanilla __init__, which can initialize members\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m337\u001b[39m \u001b[43minit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 338\u001b[39m \u001b[38;5;66;03m# Assert that all properties have been set\u001b[39;00m\n\u001b[32m 339\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m name, prop \u001b[38;5;129;01min\u001b[39;00m properties.items():\n", - "\u001b[31mTypeError\u001b[39m: Memlet.__init__() got 
an unexpected keyword argument 'is_asynchronous'" - ] - } - ], - "source": [ - "\n", - "# SDFG and the main state\n", - "sdfg = dace.SDFG(\"asyn_cpy_sdfg\")\n", - "state = sdfg.add_state(\"main\")\n", - "\n", - "# Arrays and access nodes\n", - "sdfg.add_array(\"A\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", - "sdfg.add_array(\"B\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", - "sdfg.add_array(\"S\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True)\n", - "\n", - "a_acc = state.add_read(\"A\")\n", - "b_acc = state.add_access(\"B\")\n", - "s_acc = state.add_access(\"S\")\n", - "\n", - "# Device and thread-block maps\n", - "gpu_map_entry, gpu_map_exit = state.add_map(\"gpu_map\", dict(bid=\"0:128:128\"), schedule=dace.dtypes.ScheduleType.GPU_Device)\n", - "tb_map_entry, tb_map_exit = state.add_map(\"tb_map\", dict(tid=\"0:128\"), schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock)\n", - "\n", - "# Edges with proper data flow\n", - "# Global to device scope\n", - "state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet(\"A[0:128]\", is_asynchronous=True))\n", - "# Device scope to thread-block scope\n", - "state.add_edge(gpu_map_entry, None, s_acc, None, dace.Memlet(\"A[0:128]->S[0:128]\", is_asynchronous=True))\n", - "state.add_edge(s_acc, None, tb_map_entry, None, dace.Memlet(\"S[0:128]\", is_asynchronous=True))\n", - "\n", - "assign_tasklet = state.add_tasklet(\n", - " \"assign\", inputs={\"__in_S\"}, outputs={\"__out_S\"},\n", - " code=\"__out_S = __in_S;\",\n", - " language=dace.dtypes.Language.CPP\n", - ")\n", - "\n", - "state.add_edge(tb_map_entry, None, assign_tasklet, \"__in_S\", dace.Memlet(\"S[tid]\", is_asynchronous=True))\n", - "state.add_edge(assign_tasklet, \"__out_S\", tb_map_exit, None, dace.Memlet(\"B[tid]\", is_asynchronous=True))\n", - "state.add_edge(tb_map_exit, None, gpu_map_exit, None, dace.Memlet(\"B[0:128]\", is_asynchronous=True))\n", - "state.add_edge(gpu_map_exit, None, b_acc, None, dace.Memlet(\"B[0:128]\", is_asynchronous=True))\n", - "\n", - "# Fill scope connectors\n", - "state.fill_scope_connectors()\n", - "\n", - "\n", - "# Display the SDFG\n", - "sdfg" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "03fef73b", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "dace_env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/berkay_workpace/scratch/async_copy/testbed.ipynb b/berkay_workpace/scratch/async_copy/testbed.ipynb deleted file mode 100644 index 2b0a520529..0000000000 --- a/berkay_workpace/scratch/async_copy/testbed.ipynb +++ /dev/null @@ -1,853 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "a5aeb1f5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import dace\n", - "import cupy as cp\n", - "import numpy as np\n", - "from IPython.display import Code\n", - "from typing import Optional\n", - "\n", - "from dace import SDFG, properties\n", - "from dace.config import Config\n", - "from dace.transformation import 
pass_pipeline as ppl, transformation\n", - "from dace.sdfg import nodes\n", - "from dace import dtypes\n", - "from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync\n", - "from dace.sdfg.state import LoopRegion, ConditionalBlock\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "2f891963", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (asyn_cpy_sdfg)" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\n", - "# SDFG and the main state\n", - "sdfg = dace.SDFG(\"asyn_cpy_sdfg\")\n", - "state = sdfg.add_state(\"main\")\n", - "\n", - "# Arrays and access nodes\n", - "sdfg.add_array(\"A\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", - "sdfg.add_array(\"B\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", - "sdfg.add_array(\"S\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True)\n", - "\n", - "a_acc = state.add_read(\"A\")\n", - "b_acc = state.add_access(\"B\")\n", - "s_acc = state.add_access(\"S\")\n", - "\n", - "\n", - "\n", - "\n", - "# Device and thread-block maps\n", - "gpu_map_entry, gpu_map_exit = state.add_map(\"gpu_map\", dict(bid=\"0:128:128\"), schedule=dace.dtypes.ScheduleType.GPU_Device)\n", - "tb_map_entry, tb_map_exit = state.add_map(\"tb_map\", dict(tid=\"0:128\"), schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock)\n", - "\n", - "# Edges with proper data flow\n", - "# Global to device scope\n", - "state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet(\"A[0:128]\"))\n", - "# Device scope to thread-block scope\n", - "state.add_edge(gpu_map_entry, None, s_acc, None, dace.Memlet(\"A[0:128]->S[0:128]\"))\n", - "state.add_edge(s_acc, None, tb_map_entry, None, dace.Memlet(\"S[0:128]\"))\n", - "\n", - "assign_tasklet = state.add_tasklet(\n", - " \"assign\", inputs={\"__in_S\"}, outputs={\"__out_S\"},\n", - " code=\"__out_S = __in_S;\",\n", - " language=dace.dtypes.Language.CPP\n", - ")\n", - "\n", - "\n", - "state.add_edge(tb_map_entry, None, assign_tasklet, \"__in_S\", dace.Memlet(\"S[tid]\"))\n", - "state.add_edge(assign_tasklet, \"__out_S\", tb_map_exit, None, dace.Memlet(\"B[tid]\"))\n", - "state.add_edge(tb_map_exit, None, gpu_map_exit, None, dace.Memlet(\"B[0:128]\"))\n", - "state.add_edge(gpu_map_exit, None, b_acc, None, dace.Memlet(\"B[0:128]\"))\n", - "\n", - "\n", - "# pipeline and async related \n", - "\n", - "pipeline_name = \"pipeline\"\n", - "s_acc.async_copy = True\n", - "s_acc.async_pipeline = pipeline_name\n", - "sdfg.metadata = {\n", - " s_acc.guid: {\n", - " \"pipelines\": {\n", - " pipeline_name: {\n", - " \"pipeline_depth\" : 1\n", - " }\n", - " }\n", - " }\n", - "}\n", - "\n", - "\n", - "acquire_pipeline_tasklet = state.add_tasklet(\n", - " \"acquire\", inputs={}, outputs={},\n", - " code=f\"{pipeline_name}.producer_acquire();\",\n", - " language=dace.dtypes.Language.CPP\n", - ")\n", - "\n", - "commit_pipeline_tasklet = state.add_tasklet(\n", - " \"commit\", inputs={}, outputs={},\n", - " code=f\"{pipeline_name}.producer_commit();\",\n", - " language=dace.dtypes.Language.CPP\n", - ")\n", - "\n", - "wait_pipeline_tasklet = state.add_tasklet(\n", - " \"wait\", inputs={}, outputs={},\n", - " code=f\"{pipeline_name}.consumer_wait();\",\n", - " language=dace.dtypes.Language.CPP\n", - ")\n", - "\n", - "release_pipeline_tasklet = state.add_tasklet(\n", - " \"release\", inputs={}, outputs={},\n", - " code=f\"{pipeline_name}.consumer_release();\",\n", - " language=dace.dtypes.Language.CPP\n", - ")\n", - "\n", - "\n", - "\n", - "state.add_edge(gpu_map_entry, None, acquire_pipeline_tasklet, None, dace.Memlet())\n", - "state.add_edge(acquire_pipeline_tasklet, None, s_acc, None, dace.Memlet())\n", - "\n", - "state.add_edge(s_acc, None, commit_pipeline_tasklet, None, dace.Memlet())\n", - 
"state.add_edge(commit_pipeline_tasklet, None, wait_pipeline_tasklet, None, dace.Memlet())\n", - "state.add_edge(wait_pipeline_tasklet, None, tb_map_entry, None, dace.Memlet())\n", - "\n", - "state.add_edge(tb_map_exit, None, release_pipeline_tasklet, None, dace.Memlet())\n", - "state.add_edge(release_pipeline_tasklet, None, gpu_map_exit, None, dace.Memlet())\n", - "\n", - "\n", - "\n", - "\n", - "# Fill scope connectors\n", - "state.fill_scope_connectors()\n", - "\n", - "\n", - "# Display the SDFG\n", - "sdfg" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "c905cb3f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
#include <cuda_runtime.h>\n",
-       "#include <dace/dace.h>\n",
-       "\n",
-       "// New, cooperative groups and asnyc copy\n",
-       "#include <cooperative_groups/memcpy_async.h>\n",
-       "#include <cuda/pipeline>\n",
-       "\n",
-       "namespace cg = cooperative_groups;\n",
-       "\n",
-       "\n",
-       "struct asyn_cpy_sdfg_state_t {\n",
-       "    dace::cuda::Context *gpu_context;\n",
-       "};\n",
-       "\n",
-       "\n",
-       "\n",
-       "DACE_EXPORTED int __dace_init_experimental_cuda(asyn_cpy_sdfg_state_t *__state);\n",
-       "DACE_EXPORTED int __dace_exit_experimental_cuda(asyn_cpy_sdfg_state_t *__state);\n",
-       "\n",
-       "\n",
-       "\n",
-       "int __dace_init_experimental_cuda(asyn_cpy_sdfg_state_t *__state) {\n",
-       "    int count;\n",
-       "\n",
-       "    // Check that we are able to run cuda code\n",
-       "    if (cudaGetDeviceCount(&count) != cudaSuccess)\n",
-       "    {\n",
-       "        printf("ERROR: GPU drivers are not configured or cuda-capable device "\n",
-       "               "not found\\n");\n",
-       "        return 1;\n",
-       "    }\n",
-       "    if (count == 0)\n",
-       "    {\n",
-       "        printf("ERROR: No cuda-capable devices found\\n");\n",
-       "        return 2;\n",
-       "    }\n",
-       "\n",
-       "    // Initialize cuda before we run the application\n",
-       "    float *dev_X;\n",
-       "    DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n",
-       "    DACE_GPU_CHECK(cudaFree(dev_X));\n",
-       "\n",
-       "    \n",
-       "\n",
-       "    __state->gpu_context = new dace::cuda::Context(0, 0);\n",
-       "\n",
-       "    // Create cuda streams and events\n",
-       "    for(int i = 0; i < 0; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n",
-       "        __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n",
-       "    }\n",
-       "    for(int i = 0; i < 0; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n",
-       "    }\n",
-       "\n",
-       "    \n",
-       "\n",
-       "    return 0;\n",
-       "}\n",
-       "\n",
-       "int __dace_exit_experimental_cuda(asyn_cpy_sdfg_state_t *__state) {\n",
-       "    \n",
-       "\n",
-       "    // Synchronize and check for CUDA errors\n",
-       "    int __err = static_cast<int>(__state->gpu_context->lasterror);\n",
-       "    if (__err == 0)\n",
-       "        __err = static_cast<int>(cudaDeviceSynchronize());\n",
-       "\n",
-       "    // Destroy cuda streams and events\n",
-       "    for(int i = 0; i < 0; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n",
-       "    }\n",
-       "    for(int i = 0; i < 0; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n",
-       "    }\n",
-       "\n",
-       "    delete __state->gpu_context;\n",
-       "    return __err;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED bool __dace_gpu_set_stream(asyn_cpy_sdfg_state_t *__state, int streamid, gpuStream_t stream)\n",
-       "{\n",
-       "    if (streamid < 0 || streamid >= 0)\n",
-       "        return false;\n",
-       "\n",
-       "    __state->gpu_context->streams[streamid] = stream;\n",
-       "\n",
-       "    return true;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED void __dace_gpu_set_all_streams(asyn_cpy_sdfg_state_t *__state, gpuStream_t stream)\n",
-       "{\n",
-       "    for (int i = 0; i < 0; ++i)\n",
-       "        __state->gpu_context->streams[i] = stream;\n",
-       "}\n",
-       "\n",
-       "__global__ void __launch_bounds__(128) gpu_map_0_0_3(dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n",
-       "{\n",
-       "    __shared__ dace::uint S[128];\n",
-       "\n",
-       "    auto block = cg::this_thread_block();\n",
-       "\n",
-       "    const uint pipeline_depth_pipeline = 1;\n",
-       "    __shared__ cuda::pipeline_shared_state<cuda::thread_scope::thread_scope_block, pipeline_depth_pipeline> shared_state_pipeline;\n",
-       "    auto pipeline = cuda::make_pipeline(block, &shared_state_pipeline);\n",
-       "\n",
-       "    int bid = (128 * blockIdx.x);\n",
-       "    {\n",
-       "\n",
-       "        ///////////////////\n",
-       "        pipeline.producer_acquire();\n",
-       "        ///////////////////\n",
-       "\n",
-       "    }\n",
-       "    cuda::memcpy_async(block, S, A, 128 *sizeof(dace::uint), pipeline);\n",
-       "    {\n",
-       "\n",
-       "        ///////////////////\n",
-       "        pipeline.producer_commit();\n",
-       "        ///////////////////\n",
-       "\n",
-       "    }\n",
-       "    {\n",
-       "\n",
-       "        ///////////////////\n",
-       "        pipeline.consumer_wait();\n",
-       "        ///////////////////\n",
-       "\n",
-       "    }\n",
-       "    {\n",
-       "        int tid = threadIdx.x;\n",
-       "        {\n",
-       "            dace::uint __in_S = S[tid];\n",
-       "            dace::uint __out_S;\n",
-       "\n",
-       "            ///////////////////\n",
-       "            __out_S = __in_S;\n",
-       "            ///////////////////\n",
-       "\n",
-       "            B[tid] = __out_S;\n",
-       "        }\n",
-       "    }\n",
-       "    {\n",
-       "\n",
-       "        ///////////////////\n",
-       "        pipeline.consumer_release();\n",
-       "        ///////////////////\n",
-       "\n",
-       "    }\n",
-       "}\n",
-       "\n",
-       "\n",
-       "DACE_EXPORTED void __dace_runkernel_gpu_map_0_0_3(asyn_cpy_sdfg_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n",
-       "void __dace_runkernel_gpu_map_0_0_3(asyn_cpy_sdfg_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n",
-       "{\n",
-       "\n",
-       "\n",
-       "    void  *gpu_map_0_0_3_args[] = { (void *)&A, (void *)&B };\n",
-       "    gpuError_t __err = cudaLaunchKernel( (void*)gpu_map_0_0_3, dim3(1, 1, 1), dim3(128, 1, 1), gpu_map_0_0_3_args, 0, nullptr\n",
-       "    );\n",
-       "\n",
-       "    DACE_KERNEL_LAUNCH_CHECK(__err, "gpu_map_0_0_3", 1, 1, 1, 128, 1, 1);\n",
-       "}\n",
-       "
\n" - ], - "text/latex": [ - "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", - "\\PY{n}{\\PYZsh{}include}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{n}{cuda\\PYZus{}runtime}\\PY{p}{.}\\PY{n}{h}\\PY{o}{\\PYZgt{}}\n", - "\\PY{n}{\\PYZsh{}include}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{n}{dace}\\PY{o}{/}\\PY{n}{dace}\\PY{p}{.}\\PY{n}{h}\\PY{o}{\\PYZgt{}}\n", - "\n", - "\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{New}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cooperative}\\PY{+w}{ }\\PY{n}{groups}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{n}{asnyc}\\PY{+w}{ }\\PY{n}{copy}\n", - "\\PY{n}{\\PYZsh{}include}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{n}{cooperative\\PYZus{}groups}\\PY{o}{/}\\PY{n}{memcpy\\PYZus{}async}\\PY{p}{.}\\PY{n}{h}\\PY{o}{\\PYZgt{}}\n", - "\\PY{n}{\\PYZsh{}include}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{n}{cuda}\\PY{o}{/}\\PY{n}{pipeline}\\PY{o}{\\PYZgt{}}\n", - "\n", - "\\PY{n}{namespace}\\PY{+w}{ }\\PY{n}{cg}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cooperative\\PYZus{}groups}\\PY{p}{;}\n", - "\n", - "\n", - "\\PY{n}{struct}\\PY{+w}{ }\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", - "\\PY{err}{\\PYZcb{}}\\PY{p}{;}\n", - "\n", - "\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\n", - "\n", - "\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n+nf}{count}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Check}\\PY{+w}{ }\\PY{n}{that}\\PY{+w}{ }\\PY{n}{we}\\PY{+w}{ }\\PY{k}{are}\\PY{+w}{ }\\PY{n}{able}\\PY{+w}{ }\\PY{k}{to}\\PY{+w}{ }\\PY{n}{run}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{code}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{cudaGetDeviceCount}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n+nf}{count}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{!=}\\PY{+w}{ }\\PY{n}{cudaSuccess}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s+ss}{\\PYZdq{}ERROR: GPU drivers are not configured or cuda\\PYZhy{}capable device \\PYZdq{}}\n", - "\\PY{+w}{ }\\PY{l+s+ss}{\\PYZdq{}not found\\PYZbs{}n\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n+nf}{count}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s+ss}{\\PYZdq{}ERROR: No cuda\\PYZhy{}capable devices found\\PYZbs{}n\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{;}\n", - 
"\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Initialize}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{k}{before}\\PY{+w}{ }\\PY{n}{we}\\PY{+w}{ }\\PY{n}{run}\\PY{+w}{ }\\PY{n}{the}\\PY{+w}{ }\\PY{n}{application}\n", - "\\PY{+w}{ }\\PY{n+nc}{float}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{dev\\PYZus{}X}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMalloc}\\PY{p}{(}\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{dev\\PYZus{}X}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaFree}\\PY{p}{(}\\PY{n}{dev\\PYZus{}X}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{Context}\\PY{p}{(}\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Create}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{streams}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{n}{events}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaStreamNonBlocking}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Allow}\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{n}{externals}\\PY{+w}{ }\\PY{k}{to}\\PY{+w}{ }\\PY{k}{modify}\\PY{+w}{ }\\PY{n}{streams}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaEventDisableTiming}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", - 
"\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Synchronize}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{k}{check}\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{n}{CUDA}\\PY{+w}{ }\\PY{n}{errors}\n", - "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{n+nc}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{lasterror}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{n+nc}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{cudaDeviceSynchronize}\\PY{p}{(}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Destroy}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{streams}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{n}{events}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", - "\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{bool}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}stream}\\PY{p}{(}\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{streamid}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", - "\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ 
}\\PY{p}{(}\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{|}\\PY{+w}{ }\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZgt{}=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{false}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{streamid}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{true}\\PY{p}{;}\n", - "\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}all\\PYZus{}streams}\\PY{p}{(}\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", - "\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", - "\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{\\PYZus{}\\PYZus{}global\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}launch\\PYZus{}bounds\\PYZus{}\\PYZus{}}\\PY{p}{(}\\PY{l+m+mi}{128}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", - "\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}shared\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{S}\\PY{o}{[}\\PY{n}{128}\\PY{o}{]}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{auto}\\PY{+w}{ }\\PY{n}{block}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n+nl}{cg}\\PY{p}{:}\\PY{err}{:}\\PY{n}{this\\PYZus{}thread\\PYZus{}block}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{const}\\PY{+w}{ }\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{pipeline\\PYZus{}depth\\PYZus{}pipeline}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}shared\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{pipeline\\PYZus{}shared\\PYZus{}state}\\PY{o}{\\PYZlt{}}\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n+nl}{thread\\PYZus{}scope}\\PY{p}{:}\\PY{err}{:}\\PY{n}{thread\\PYZus{}scope\\PYZus{}block}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{pipeline\\PYZus{}depth\\PYZus{}pipeline}\\PY{o}{\\PYZgt{}}\\PY{+w}{ }\\PY{n}{shared\\PYZus{}state\\PYZus{}pipeline}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{auto}\\PY{+w}{ }\\PY{n}{pipeline}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{make\\PYZus{}pipeline}\\PY{p}{(}\\PY{n}{block}\\PY{p}{,}\\PY{+w}{ 
}\\PY{o}{\\PYZam{}}\\PY{n}{shared\\PYZus{}state\\PYZus{}pipeline}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{bid}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{128}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{blockIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\\PY{+w}{ }\\PY{n}{pipeline}\\PY{p}{.}\\PY{n}{producer\\PYZus{}acquire}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{memcpy\\PYZus{}async}\\PY{p}{(}\\PY{n}{block}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{S}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{128}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{sizeof}\\PY{p}{(}\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{pipeline}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\\PY{+w}{ }\\PY{n}{pipeline}\\PY{p}{.}\\PY{n}{producer\\PYZus{}commit}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\\PY{+w}{ }\\PY{n}{pipeline}\\PY{p}{.}\\PY{n}{consumer\\PYZus{}wait}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{tid}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}in\\PYZus{}S}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{S}\\PY{o}{[}\\PY{n}{tid}\\PY{o}{]}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out\\PYZus{}S}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out\\PYZus{}S}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}in\\PYZus{}S}\\PY{p}{;}\n", - "\\PY{+w}{ 
}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{B}\\PY{o}{[}\\PY{n}{tid}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out\\PYZus{}S}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\\PY{+w}{ }\\PY{n}{pipeline}\\PY{p}{.}\\PY{n}{consumer\\PYZus{}release}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{err}{\\PYZcb{}}\n", - "\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n}{asyn\\PYZus{}cpy\\PYZus{}sdfg\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", - "\\PY{err}{\\PYZob{}}\n", - "\n", - "\n", - "\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3\\PYZus{}args}\\PY{err}{[}\\PY{err}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{B}\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{gpuError\\PYZus{}t}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaLaunchKernel}\\PY{p}{(}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{o}{*}\\PY{p}{)}\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{128}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3\\PYZus{}args}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ 
}\\PY{n}{nullptr}\n", - "\\PY{+w}{ }\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}KERNEL\\PYZus{}LAUNCH\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{,}\\PY{+w}{ }\\PY{l+s+ss}{\\PYZdq{}gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3\\PYZdq{}}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{128}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{err}{\\PYZcb{}}\n", - "\\end{Verbatim}\n" - ], - "text/plain": [ - "\n", - "#include \n", - "#include \n", - "\n", - "// New, cooperative groups and asnyc copy\n", - "#include \n", - "#include \n", - "\n", - "namespace cg = cooperative_groups;\n", - "\n", - "\n", - "struct asyn_cpy_sdfg_state_t {\n", - " dace::cuda::Context *gpu_context;\n", - "};\n", - "\n", - "\n", - "\n", - "DACE_EXPORTED int __dace_init_experimental_cuda(asyn_cpy_sdfg_state_t *__state);\n", - "DACE_EXPORTED int __dace_exit_experimental_cuda(asyn_cpy_sdfg_state_t *__state);\n", - "\n", - "\n", - "\n", - "int __dace_init_experimental_cuda(asyn_cpy_sdfg_state_t *__state) {\n", - " int count;\n", - "\n", - " // Check that we are able to run cuda code\n", - " if (cudaGetDeviceCount(&count) != cudaSuccess)\n", - " {\n", - " printf(\"ERROR: GPU drivers are not configured or cuda-capable device \"\n", - " \"not found\\n\");\n", - " return 1;\n", - " }\n", - " if (count == 0)\n", - " {\n", - " printf(\"ERROR: No cuda-capable devices found\\n\");\n", - " return 2;\n", - " }\n", - "\n", - " // Initialize cuda before we run the application\n", - " float *dev_X;\n", - " DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n", - " DACE_GPU_CHECK(cudaFree(dev_X));\n", - "\n", - " \n", - "\n", - " __state->gpu_context = new dace::cuda::Context(0, 0);\n", - "\n", - " // Create cuda streams and events\n", - " for(int i = 0; i < 0; ++i) {\n", - " DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n", - " __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n", - " }\n", - " for(int i = 0; i < 0; ++i) {\n", - " DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n", - " }\n", - "\n", - " \n", - "\n", - " return 0;\n", - "}\n", - "\n", - "int __dace_exit_experimental_cuda(asyn_cpy_sdfg_state_t *__state) {\n", - " \n", - "\n", - " // Synchronize and check for CUDA errors\n", - " int __err = static_cast(__state->gpu_context->lasterror);\n", - " if (__err == 0)\n", - " __err = static_cast(cudaDeviceSynchronize());\n", - "\n", - " // Destroy cuda streams and events\n", - " for(int i = 0; i < 0; ++i) {\n", - " DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n", - " }\n", - " for(int i = 0; i < 0; ++i) {\n", - " DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n", - " }\n", - "\n", - " delete __state->gpu_context;\n", - " return __err;\n", - "}\n", - "\n", - "DACE_EXPORTED bool __dace_gpu_set_stream(asyn_cpy_sdfg_state_t *__state, int streamid, gpuStream_t stream)\n", - "{\n", - " if (streamid < 0 || streamid >= 0)\n", - " return false;\n", - "\n", - " __state->gpu_context->streams[streamid] = stream;\n", - "\n", - " return true;\n", - "}\n", - "\n", - "DACE_EXPORTED void __dace_gpu_set_all_streams(asyn_cpy_sdfg_state_t *__state, gpuStream_t stream)\n", - "{\n", - " for (int i = 0; i < 0; ++i)\n", - " 
__state->gpu_context->streams[i] = stream;\n", - "}\n", - "\n", - "__global__ void __launch_bounds__(128) gpu_map_0_0_3(dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", - "{\n", - " __shared__ dace::uint S[128];\n", - "\n", - " auto block = cg::this_thread_block();\n", - "\n", - " const uint pipeline_depth_pipeline = 1;\n", - " __shared__ cuda::pipeline_shared_state shared_state_pipeline;\n", - " auto pipeline = cuda::make_pipeline(block, &shared_state_pipeline);\n", - "\n", - " int bid = (128 * blockIdx.x);\n", - " {\n", - "\n", - " ///////////////////\n", - " pipeline.producer_acquire();\n", - " ///////////////////\n", - "\n", - " }\n", - " cuda::memcpy_async(block, S, A, 128 *sizeof(dace::uint), pipeline);\n", - " {\n", - "\n", - " ///////////////////\n", - " pipeline.producer_commit();\n", - " ///////////////////\n", - "\n", - " }\n", - " {\n", - "\n", - " ///////////////////\n", - " pipeline.consumer_wait();\n", - " ///////////////////\n", - "\n", - " }\n", - " {\n", - " int tid = threadIdx.x;\n", - " {\n", - " dace::uint __in_S = S[tid];\n", - " dace::uint __out_S;\n", - "\n", - " ///////////////////\n", - " __out_S = __in_S;\n", - " ///////////////////\n", - "\n", - " B[tid] = __out_S;\n", - " }\n", - " }\n", - " {\n", - "\n", - " ///////////////////\n", - " pipeline.consumer_release();\n", - " ///////////////////\n", - "\n", - " }\n", - "}\n", - "\n", - "\n", - "DACE_EXPORTED void __dace_runkernel_gpu_map_0_0_3(asyn_cpy_sdfg_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n", - "void __dace_runkernel_gpu_map_0_0_3(asyn_cpy_sdfg_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", - "{\n", - "\n", - "\n", - " void *gpu_map_0_0_3_args[] = { (void *)&A, (void *)&B };\n", - " gpuError_t __err = cudaLaunchKernel( (void*)gpu_map_0_0_3, dim3(1, 1, 1), dim3(128, 1, 1), gpu_map_0_0_3_args, 0, nullptr\n", - " );\n", - "\n", - " DACE_KERNEL_LAUNCH_CHECK(__err, \"gpu_map_0_0_3\", 1, 1, 1, 128, 1, 1);\n", - "}\n" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\n", - "Code(sdfg.generate_code()[1].clean_code)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "18bbca39", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "A before:\n", - "[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", - " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", - " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", - " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]\n", - "B before:\n", - "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", - " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", - " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", - " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", - "A after:\n", - "[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", - " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", - " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", - " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]\n", - "B after:\n", - "[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", - " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", - " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", - " 1 1 1 1 1 1 1 1 1 1 1 1 
1 1 1 1 1]\n" - ] - } - ], - "source": [ - "\n", - "A = cp.ones((128,), dtype=cp.uint32)\n", - "B = cp.zeros((128,), dtype=cp.uint32)\n", - "\n", - "print(f\"A before:\\n{A}\")\n", - "print(f\"B before:\\n{B}\")\n", - "\n", - "sdfg(A=A, B=B)\n", - "\n", - "print(f\"A after:\\n{A}\")\n", - "print(f\"B after:\\n{B}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1ce1ef33", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9aa368f1", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "dace_env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/berkay_workpace/scratch/constantArgs.ipynb b/berkay_workpace/scratch/constantArgs.ipynb deleted file mode 100644 index ceac5ff6d0..0000000000 --- a/berkay_workpace/scratch/constantArgs.ipynb +++ /dev/null @@ -1,1057 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "88ef6b75", - "metadata": {}, - "source": [ - "# CUDA Codegen Testing Playground\n", - "\n", - "A \"playfield\" for exploring whether the **experimental CUDA codegen** correctly identifies **constant (`const`) arguments** or whether its behavior is at least \n", - "similar to the **legacy codegen**.\n", - "\n", - "**Why does this matters?** Using `const` in CUDA can lead (and usually does lead) to **better performance** by enabling compiler optimizations. \n", - "\n", - "This notebook helps verify that constant arguments are being properly recognized." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "20625e0d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# import all modules needed at once\n", - "import dace\n", - "import cupy as cp\n", - "import numpy as np\n", - "from IPython.display import Code\n", - "from typing import Optional\n", - "\n", - "from dace import SDFG, properties\n", - "from dace.config import Config\n", - "from dace.transformation import pass_pipeline as ppl, transformation\n", - "from dace.sdfg import nodes\n", - "from dace import dtypes\n", - "from dace.transformation.passes.gpustream_scheduling import NaiveGPUStreamScheduler" - ] - }, - { - "cell_type": "markdown", - "id": "48b4b2ee", - "metadata": {}, - "source": [ - "Look which codegen we are currently using (legay or experimental):" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "cf68a501", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'experimental'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "usedCodegen = Config.get('compiler', 'cuda', 'implementation')\n", - "usedCodegen" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "b9d10c4b", - "metadata": {}, - "outputs": [], - "source": [ - "N = dace.symbol('N')\n", - "\n", - "@dace.program\n", - "def vector_copy_dyn_sizes(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global):\n", - " for i in dace.map[0:N] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " A[i] = B[i]\n", - "\n", - "sdfg = vector_copy_dyn_sizes.to_sdfg()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "3f759a90", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/berkay/master-thesis/dace/dace/codegen/targets/experimental_cuda.py:1323: UserWarning: No `gpu_block_size` property specified on map \"vector_copy_dyn_sizes_5\". Falling back to the configuration entry `compiler.cuda.default_block_size`: 32,1,1. You can either specify the block size to use with the gpu_block_size property, or by adding nested `GPU_ThreadBlock` maps, which map work to individual threads. For more information, see https://spcldace.readthedocs.io/en/latest/optimization/gpu.html\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/html": [ - "
/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n",
-       "#include <dace/dace.h>\n",
-       "#include "../../include/hash.h"\n",
-       "\n",
-       "struct vector_copy_dyn_sizes_state_t {\n",
-       "    dace::cuda::Context *gpu_context;\n",
-       "};\n",
-       "\n",
-       "DACE_EXPORTED void __dace_runkernel_vector_copy_dyn_sizes_5_0_0_0(vector_copy_dyn_sizes_state_t *__state, double * __restrict__ A, const double * __restrict__ B, int N);\n",
-       "void __program_vector_copy_dyn_sizes_internal(vector_copy_dyn_sizes_state_t*__state, double * __restrict__ A, double * __restrict__ B, int N)\n",
-       "{\n",
-       "\n",
-       "    {\n",
-       "\n",
-       "        __dace_runkernel_vector_copy_dyn_sizes_5_0_0_0(__state, A, B, N);\n",
-       "        {\n",
-       "\n",
-       "            ///////////////////\n",
-       "            DACE_GPU_CHECK(cudaStreamSynchronize(nullptr));\n",
-       "            ///////////////////\n",
-       "\n",
-       "        }\n",
-       "\n",
-       "    }\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED void __program_vector_copy_dyn_sizes(vector_copy_dyn_sizes_state_t *__state, double * __restrict__ A, double * __restrict__ B, int N)\n",
-       "{\n",
-       "    __program_vector_copy_dyn_sizes_internal(__state, A, B, N);\n",
-       "}\n",
-       "DACE_EXPORTED int __dace_init_experimental_cuda(vector_copy_dyn_sizes_state_t *__state, int N);\n",
-       "DACE_EXPORTED int __dace_exit_experimental_cuda(vector_copy_dyn_sizes_state_t *__state);\n",
-       "\n",
-       "DACE_EXPORTED vector_copy_dyn_sizes_state_t *__dace_init_vector_copy_dyn_sizes(int N)\n",
-       "{\n",
-       "    int __result = 0;\n",
-       "    vector_copy_dyn_sizes_state_t *__state = new vector_copy_dyn_sizes_state_t;\n",
-       "\n",
-       "\n",
-       "    __result |= __dace_init_experimental_cuda(__state, N);\n",
-       "\n",
-       "    if (__result) {\n",
-       "        delete __state;\n",
-       "        return nullptr;\n",
-       "    }\n",
-       "    return __state;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED int __dace_exit_vector_copy_dyn_sizes(vector_copy_dyn_sizes_state_t *__state)\n",
-       "{\n",
-       "    int __err = 0;\n",
-       "\n",
-       "    int __err_experimental_cuda = __dace_exit_experimental_cuda(__state);\n",
-       "    if (__err_experimental_cuda) {\n",
-       "        __err = __err_experimental_cuda;\n",
-       "    }\n",
-       "    delete __state;\n",
-       "    return __err;\n",
-       "}\n",
-       "
\n" - ], - "text/latex": [ - "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", - "\\PY{c+cm}{/* DaCe AUTO\\PYZhy{}GENERATED FILE. DO NOT MODIFY */}\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}dace/dace.h\\PYZgt{}}\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}../../include/hash.h\\PYZdq{}}\n", - "\n", - "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}program\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}internal}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamSynchronize}\\PY{p}{(}\\PY{k}{nullptr}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ 
}\\PY{n}{N}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}internal}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{p}{;}\n", - "\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}result}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{nullptr}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ 
}\\PY{n}{\\PYZus{}\\PYZus{}err\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\\end{Verbatim}\n" - ], - "text/plain": [ - "/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n", - "#include \n", - "#include \"../../include/hash.h\"\n", - "\n", - "struct vector_copy_dyn_sizes_state_t {\n", - " dace::cuda::Context *gpu_context;\n", - "};\n", - "\n", - "DACE_EXPORTED void __dace_runkernel_vector_copy_dyn_sizes_5_0_0_0(vector_copy_dyn_sizes_state_t *__state, double * __restrict__ A, const double * __restrict__ B, int N);\n", - "void __program_vector_copy_dyn_sizes_internal(vector_copy_dyn_sizes_state_t*__state, double * __restrict__ A, double * __restrict__ B, int N)\n", - "{\n", - "\n", - " {\n", - "\n", - " __dace_runkernel_vector_copy_dyn_sizes_5_0_0_0(__state, A, B, N);\n", - " {\n", - "\n", - " ///////////////////\n", - " DACE_GPU_CHECK(cudaStreamSynchronize(nullptr));\n", - " ///////////////////\n", - "\n", - " }\n", - "\n", - " }\n", - "}\n", - "\n", - "DACE_EXPORTED void __program_vector_copy_dyn_sizes(vector_copy_dyn_sizes_state_t *__state, double * __restrict__ A, double * __restrict__ B, int N)\n", - "{\n", - " __program_vector_copy_dyn_sizes_internal(__state, A, B, N);\n", - "}\n", - "DACE_EXPORTED int __dace_init_experimental_cuda(vector_copy_dyn_sizes_state_t *__state, int N);\n", - "DACE_EXPORTED int __dace_exit_experimental_cuda(vector_copy_dyn_sizes_state_t *__state);\n", - "\n", - "DACE_EXPORTED vector_copy_dyn_sizes_state_t *__dace_init_vector_copy_dyn_sizes(int N)\n", - "{\n", - " int __result = 0;\n", - " vector_copy_dyn_sizes_state_t *__state = new vector_copy_dyn_sizes_state_t;\n", - "\n", - "\n", - " __result |= __dace_init_experimental_cuda(__state, N);\n", - "\n", - " if (__result) {\n", - " delete __state;\n", - " return nullptr;\n", - " }\n", - " return __state;\n", - "}\n", - "\n", - "DACE_EXPORTED int __dace_exit_vector_copy_dyn_sizes(vector_copy_dyn_sizes_state_t *__state)\n", - "{\n", - " int __err = 0;\n", - "\n", - " int __err_experimental_cuda = __dace_exit_experimental_cuda(__state);\n", - " if (__err_experimental_cuda) {\n", - " __err = __err_experimental_cuda;\n", - " }\n", - " delete __state;\n", - " return __err;\n", - "}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Code(sdfg.generate_code()[0].clean_code, language='cpp')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "31580e6d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
#include <cuda_runtime.h>\n",
-       "#include <dace/dace.h>\n",
-       "\n",
-       "\n",
-       "struct vector_copy_dyn_sizes_state_t {\n",
-       "    dace::cuda::Context *gpu_context;\n",
-       "};\n",
-       "\n",
-       "\n",
-       "\n",
-       "DACE_EXPORTED int __dace_init_experimental_cuda(vector_copy_dyn_sizes_state_t *__state, int N);\n",
-       "DACE_EXPORTED int __dace_exit_experimental_cuda(vector_copy_dyn_sizes_state_t *__state);\n",
-       "\n",
-       "\n",
-       "\n",
-       "int __dace_init_experimental_cuda(vector_copy_dyn_sizes_state_t *__state, int N) {\n",
-       "    int count;\n",
-       "\n",
-       "    // Check that we are able to run cuda code\n",
-       "    if (cudaGetDeviceCount(&count) != cudaSuccess)\n",
-       "    {\n",
-       "        printf("ERROR: GPU drivers are not configured or cuda-capable device "\n",
-       "               "not found\\n");\n",
-       "        return 1;\n",
-       "    }\n",
-       "    if (count == 0)\n",
-       "    {\n",
-       "        printf("ERROR: No cuda-capable devices found\\n");\n",
-       "        return 2;\n",
-       "    }\n",
-       "\n",
-       "    // Initialize cuda before we run the application\n",
-       "    float *dev_X;\n",
-       "    DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n",
-       "    DACE_GPU_CHECK(cudaFree(dev_X));\n",
-       "\n",
-       "    \n",
-       "\n",
-       "    __state->gpu_context = new dace::cuda::Context(0, 0);\n",
-       "\n",
-       "    // Create cuda streams and events\n",
-       "    for(int i = 0; i < 0; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n",
-       "        __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n",
-       "    }\n",
-       "    for(int i = 0; i < 0; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n",
-       "    }\n",
-       "\n",
-       "    \n",
-       "\n",
-       "    return 0;\n",
-       "}\n",
-       "\n",
-       "int __dace_exit_experimental_cuda(vector_copy_dyn_sizes_state_t *__state) {\n",
-       "    \n",
-       "\n",
-       "    // Synchronize and check for CUDA errors\n",
-       "    int __err = static_cast<int>(__state->gpu_context->lasterror);\n",
-       "    if (__err == 0)\n",
-       "        __err = static_cast<int>(cudaDeviceSynchronize());\n",
-       "\n",
-       "    // Destroy cuda streams and events\n",
-       "    for(int i = 0; i < 0; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n",
-       "    }\n",
-       "    for(int i = 0; i < 0; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n",
-       "    }\n",
-       "\n",
-       "    delete __state->gpu_context;\n",
-       "    return __err;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED bool __dace_gpu_set_stream(vector_copy_dyn_sizes_state_t *__state, int streamid, gpuStream_t stream)\n",
-       "{\n",
-       "    if (streamid < 0 || streamid >= 0)\n",
-       "        return false;\n",
-       "\n",
-       "    __state->gpu_context->streams[streamid] = stream;\n",
-       "\n",
-       "    return true;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED void __dace_gpu_set_all_streams(vector_copy_dyn_sizes_state_t *__state, gpuStream_t stream)\n",
-       "{\n",
-       "    for (int i = 0; i < 0; ++i)\n",
-       "        __state->gpu_context->streams[i] = stream;\n",
-       "}\n",
-       "\n",
-       "__global__ void __launch_bounds__(32) vector_copy_dyn_sizes_5_0_0_0(double * __restrict__ A, const double * __restrict__ B, int N)\n",
-       "{\n",
-       "    int i = (blockIdx.x * 32 + threadIdx.x);\n",
-       "    if (i < N) {\n",
-       "        {\n",
-       "            double __inp = B[i];\n",
-       "            double __out;\n",
-       "\n",
-       "            ///////////////////\n",
-       "            // Tasklet code (assign_6_12)\n",
-       "            __out = __inp;\n",
-       "            ///////////////////\n",
-       "\n",
-       "            A[i] = __out;\n",
-       "        }\n",
-       "    }\n",
-       "}\n",
-       "\n",
-       "\n",
-       "DACE_EXPORTED void __dace_runkernel_vector_copy_dyn_sizes_5_0_0_0(vector_copy_dyn_sizes_state_t *__state, double * __restrict__ A, const double * __restrict__ B, int N);\n",
-       "void __dace_runkernel_vector_copy_dyn_sizes_5_0_0_0(vector_copy_dyn_sizes_state_t *__state, double * __restrict__ A, const double * __restrict__ B, int N)\n",
-       "{\n",
-       "\n",
-       "\n",
-       "    if (((int_ceil(int_ceil(N, 1), 32)) <= 0)) {\n",
-       "\n",
-       "        return;\n",
-       "    }\n",
-       "\n",
-       "    void  *vector_copy_dyn_sizes_5_0_0_0_args[] = { (void *)&A, (void *)&B, (void *)&N };\n",
-       "    gpuError_t __err = cudaLaunchKernel( (void*)vector_copy_dyn_sizes_5_0_0_0, dim3(int_ceil(int_ceil(N, 1), 32), 1, 1), dim3(32, 1, 1), vector_copy_dyn_sizes_5_0_0_0_args, 0, nullptr\n",
-       "    );\n",
-       "\n",
-       "    DACE_KERNEL_LAUNCH_CHECK(__err, "vector_copy_dyn_sizes_5_0_0_0", int_ceil(int_ceil(N, 1), 32), 1, 1, 32, 1, 1);\n",
-       "}\n",
-       "
\n" - ], - "text/latex": [ - "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}cuda\\PYZus{}runtime.h\\PYZgt{}}\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}dace/dace.h\\PYZgt{}}\n", - "\n", - "\n", - "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", - "\n", - "\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\n", - "\n", - "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{count}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Check that we are able to run cuda code}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{cudaGetDeviceCount}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{count}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{!}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaSuccess}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{ERROR: GPU drivers are not configured or cuda\\PYZhy{}capable device }\\PY{l+s}{\\PYZdq{}}\n", - "\\PY{+w}{ }\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{not found}\\PY{l+s+se}{\\PYZbs{}n}\\PY{l+s}{\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{count}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{ERROR: No cuda\\PYZhy{}capable devices found}\\PY{l+s+se}{\\PYZbs{}n}\\PY{l+s}{\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Initialize cuda before we run the application}\n", - "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{dev\\PYZus{}X}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMalloc}\\PY{p}{(}\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{dev\\PYZus{}X}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ 
}\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaFree}\\PY{p}{(}\\PY{n}{dev\\PYZus{}X}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n}{dace}\\PY{o}{:}\\PY{o}{:}\\PY{n}{cuda}\\PY{o}{:}\\PY{o}{:}\\PY{n}{Context}\\PY{p}{(}\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Create cuda streams and events}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaStreamNonBlocking}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{;}\\PY{+w}{ }\\PY{c+c1}{// Allow for externals to modify streams}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaEventDisableTiming}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Synchronize and check for CUDA errors}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{k+kt}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{lasterror}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ 
}\\PY{l+m+mi}{0}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{k+kt}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{cudaDeviceSynchronize}\\PY{p}{(}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{// Destroy cuda streams and events}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{bool}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}stream}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{streamid}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{|}\\PY{+w}{ }\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZgt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{false}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{streamid}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n+nb}{true}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}all\\PYZus{}streams}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ 
}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{\\PYZus{}\\PYZus{}global\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}launch\\PYZus{}bounds\\PYZus{}\\PYZus{}}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{blockIdx}\\PY{p}{.}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\\PY{+w}{ }\\PY{c+c1}{// Tasklet code (assign\\PYZus{}6\\PYZus{}12)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{c+c1}{///////////////////}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{A}\\PY{p}{[}\\PY{n}{i}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{k+kt}{void}\\PY{+w}{ }\\PY{n+nf}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}state\\PYZus{}t}\\PY{+w}{ 
}\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\n", - "\\PY{p}{\\PYZob{}}\n", - "\n", - "\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{N}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZus{}args}\\PY{p}{[}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{N}\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{gpuError\\PYZus{}t}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaLaunchKernel}\\PY{p}{(}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{void}\\PY{o}{*}\\PY{p}{)}\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{N}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZus{}args}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{nullptr}\n", - "\\PY{+w}{ }\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}KERNEL\\PYZus{}LAUNCH\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{,}\\PY{+w}{ }\\PY{l+s}{\\PYZdq{}}\\PY{l+s}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes\\PYZus{}5\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{l+s}{\\PYZdq{}}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{N}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\\end{Verbatim}\n" - ], - "text/plain": [ - "\n", - "#include \n", - "#include \n", - "\n", - "\n", - "struct vector_copy_dyn_sizes_state_t {\n", - " dace::cuda::Context *gpu_context;\n", - "};\n", - "\n", - "\n", - "\n", - "DACE_EXPORTED int __dace_init_experimental_cuda(vector_copy_dyn_sizes_state_t *__state, int N);\n", - "DACE_EXPORTED int 
__dace_exit_experimental_cuda(vector_copy_dyn_sizes_state_t *__state);\n", - "\n", - "\n", - "\n", - "int __dace_init_experimental_cuda(vector_copy_dyn_sizes_state_t *__state, int N) {\n", - " int count;\n", - "\n", - " // Check that we are able to run cuda code\n", - " if (cudaGetDeviceCount(&count) != cudaSuccess)\n", - " {\n", - " printf(\"ERROR: GPU drivers are not configured or cuda-capable device \"\n", - " \"not found\\n\");\n", - " return 1;\n", - " }\n", - " if (count == 0)\n", - " {\n", - " printf(\"ERROR: No cuda-capable devices found\\n\");\n", - " return 2;\n", - " }\n", - "\n", - " // Initialize cuda before we run the application\n", - " float *dev_X;\n", - " DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n", - " DACE_GPU_CHECK(cudaFree(dev_X));\n", - "\n", - " \n", - "\n", - " __state->gpu_context = new dace::cuda::Context(0, 0);\n", - "\n", - " // Create cuda streams and events\n", - " for(int i = 0; i < 0; ++i) {\n", - " DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n", - " __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n", - " }\n", - " for(int i = 0; i < 0; ++i) {\n", - " DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n", - " }\n", - "\n", - " \n", - "\n", - " return 0;\n", - "}\n", - "\n", - "int __dace_exit_experimental_cuda(vector_copy_dyn_sizes_state_t *__state) {\n", - " \n", - "\n", - " // Synchronize and check for CUDA errors\n", - " int __err = static_cast(__state->gpu_context->lasterror);\n", - " if (__err == 0)\n", - " __err = static_cast(cudaDeviceSynchronize());\n", - "\n", - " // Destroy cuda streams and events\n", - " for(int i = 0; i < 0; ++i) {\n", - " DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n", - " }\n", - " for(int i = 0; i < 0; ++i) {\n", - " DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n", - " }\n", - "\n", - " delete __state->gpu_context;\n", - " return __err;\n", - "}\n", - "\n", - "DACE_EXPORTED bool __dace_gpu_set_stream(vector_copy_dyn_sizes_state_t *__state, int streamid, gpuStream_t stream)\n", - "{\n", - " if (streamid < 0 || streamid >= 0)\n", - " return false;\n", - "\n", - " __state->gpu_context->streams[streamid] = stream;\n", - "\n", - " return true;\n", - "}\n", - "\n", - "DACE_EXPORTED void __dace_gpu_set_all_streams(vector_copy_dyn_sizes_state_t *__state, gpuStream_t stream)\n", - "{\n", - " for (int i = 0; i < 0; ++i)\n", - " __state->gpu_context->streams[i] = stream;\n", - "}\n", - "\n", - "__global__ void __launch_bounds__(32) vector_copy_dyn_sizes_5_0_0_0(double * __restrict__ A, const double * __restrict__ B, int N)\n", - "{\n", - " int i = (blockIdx.x * 32 + threadIdx.x);\n", - " if (i < N) {\n", - " {\n", - " double __inp = B[i];\n", - " double __out;\n", - "\n", - " ///////////////////\n", - " // Tasklet code (assign_6_12)\n", - " __out = __inp;\n", - " ///////////////////\n", - "\n", - " A[i] = __out;\n", - " }\n", - " }\n", - "}\n", - "\n", - "\n", - "DACE_EXPORTED void __dace_runkernel_vector_copy_dyn_sizes_5_0_0_0(vector_copy_dyn_sizes_state_t *__state, double * __restrict__ A, const double * __restrict__ B, int N);\n", - "void __dace_runkernel_vector_copy_dyn_sizes_5_0_0_0(vector_copy_dyn_sizes_state_t *__state, double * __restrict__ A, const double * __restrict__ B, int N)\n", - "{\n", - "\n", - "\n", - " if (((int_ceil(int_ceil(N, 1), 32)) <= 0)) {\n", - "\n", - " 
return;\n", - " }\n", - "\n", - " void *vector_copy_dyn_sizes_5_0_0_0_args[] = { (void *)&A, (void *)&B, (void *)&N };\n", - " gpuError_t __err = cudaLaunchKernel( (void*)vector_copy_dyn_sizes_5_0_0_0, dim3(int_ceil(int_ceil(N, 1), 32), 1, 1), dim3(32, 1, 1), vector_copy_dyn_sizes_5_0_0_0_args, 0, nullptr\n", - " );\n", - "\n", - " DACE_KERNEL_LAUNCH_CHECK(__err, \"vector_copy_dyn_sizes_5_0_0_0\", int_ceil(int_ceil(N, 1), 32), 1, 1, 32, 1, 1);\n", - "}\n" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Code(sdfg.generate_code()[1].clean_code, language='cpp')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "b1be294d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
#include <cstdlib>\n",
-       "#include "../include/vector_copy_dyn_sizes.h"\n",
-       "\n",
-       "int main(int argc, char **argv) {\n",
-       "    vector_copy_dyn_sizesHandle_t handle;\n",
-       "    int err;\n",
-       "    int N = 42;\n",
-       "    double * __restrict__ A = (double*) calloc(N, sizeof(double));\n",
-       "    double * __restrict__ B = (double*) calloc(N, sizeof(double));\n",
-       "\n",
-       "\n",
-       "    handle = __dace_init_vector_copy_dyn_sizes(N);\n",
-       "    __program_vector_copy_dyn_sizes(handle, A, B, N);\n",
-       "    err = __dace_exit_vector_copy_dyn_sizes(handle);\n",
-       "\n",
-       "    free(A);\n",
-       "    free(B);\n",
-       "\n",
-       "\n",
-       "    return err;\n",
-       "}\n",
-       "
\n" - ], - "text/latex": [ - "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZlt{}cstdlib\\PYZgt{}}\n", - "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}../include/vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes.h\\PYZdq{}}\n", - "\n", - "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{main}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{argc}\\PY{p}{,}\\PY{+w}{ }\\PY{k+kt}{char}\\PY{+w}{ }\\PY{o}{*}\\PY{o}{*}\\PY{n}{argv}\\PY{p}{)}\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizesHandle\\PYZus{}t}\\PY{+w}{ }\\PY{n}{handle}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{err}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{42}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{double}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{calloc}\\PY{p}{(}\\PY{n}{N}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{sizeof}\\PY{p}{(}\\PY{k+kt}{double}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{double}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{calloc}\\PY{p}{(}\\PY{n}{N}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{sizeof}\\PY{p}{(}\\PY{k+kt}{double}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\n", - "\\PY{+w}{ }\\PY{n}{handle}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes}\\PY{p}{(}\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}program\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes}\\PY{p}{(}\\PY{n}{handle}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}vector\\PYZus{}copy\\PYZus{}dyn\\PYZus{}sizes}\\PY{p}{(}\\PY{n}{handle}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{free}\\PY{p}{(}\\PY{n}{A}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{free}\\PY{p}{(}\\PY{n}{B}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{err}\\PY{p}{;}\n", - "\\PY{p}{\\PYZcb{}}\n", - "\\end{Verbatim}\n" - ], - "text/plain": [ - "#include \n", - "#include \"../include/vector_copy_dyn_sizes.h\"\n", - "\n", - "int main(int argc, char **argv) {\n", - " vector_copy_dyn_sizesHandle_t handle;\n", - " int err;\n", - " int N = 42;\n", - " double * __restrict__ A = (double*) calloc(N, sizeof(double));\n", - " double * __restrict__ B = (double*) calloc(N, sizeof(double));\n", - "\n", - "\n", - " handle = __dace_init_vector_copy_dyn_sizes(N);\n", - " __program_vector_copy_dyn_sizes(handle, A, B, N);\n", - " err = __dace_exit_vector_copy_dyn_sizes(handle);\n", - "\n", - " free(A);\n", - " free(B);\n", - "\n", - "\n", - " return err;\n", - "}" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Code(sdfg.generate_code()[3].clean_code, language='cpp')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "33d74b5c", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "dace_env", - 
"language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/berkay_workpace/scratch/cudastreamPass.ipynb b/berkay_workpace/scratch/cudastreamPass.ipynb deleted file mode 100644 index c48362fff1..0000000000 --- a/berkay_workpace/scratch/cudastreamPass.ipynb +++ /dev/null @@ -1,331 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "f4d111db", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import dace\n", - "from IPython.display import Code\n", - "from dace.transformation import pass_pipeline\n" - ] - }, - { - "cell_type": "markdown", - "id": "7bdf4ea6", - "metadata": {}, - "source": [ - "Here you can choose any of the 3 following programs to see how the sdfg assigns streams and how it adds synchronization tasklets if required.\n", - "You can, if you wish, also change e.g. the StorageType of one input- as long as you don't choose a strategy where the GPU is not used (e.g. a direct CPU\n", - "to CPU copy), a synchronization tasklet should be added. \n", - "\n", - "Note: test1 is a special case - where we have only one connected component. I thought it would be cool if we just use the default nullptr in this case instead of \n", - "creating a stream." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3dfa8ad3", - "metadata": {}, - "outputs": [], - "source": [ - "@dace.program\n", - "def test1(A: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", - " B: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global\n", - " ):\n", - " A[:] = B[:]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "ad689f3d", - "metadata": {}, - "outputs": [], - "source": [ - "@dace.program\n", - "def test2(A: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", - " B: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", - " C: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", - " D: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global\n", - " ):\n", - " \n", - " for i in dace.map[0:10] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " A[i] = B[i]\n", - " \n", - " for j in dace.map[0:10] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " C[j] = D[j]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "104de517", - "metadata": {}, - "outputs": [], - "source": [ - "@dace.program\n", - "def test3(A: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", - " B: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", - " C: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global,\n", - " D: dace.uint32[10] @ dace.dtypes.StorageType.GPU_Global\n", - " ):\n", - " \n", - " A[:] = B[:]\n", - " \n", - " for i in dace.map[0:3] @ dace.dtypes.ScheduleType.Sequential:\n", - " for j in dace.map[0:10] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " C[j] = D[j]" - ] - }, - { - "cell_type": "markdown", - "id": "3ce0f3af", - "metadata": {}, - "source": [ - "Choose which program you want to select for generating the sdfg below. It will give you the sdfg, without any snychronization tasklets.\n", - "The old codegen, would figure out where synchronization has to occur. 
We will make this explicit, as you wanted :)." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "5ba1505c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (test3)" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Choose\n", - "# sdfg = test1.to_sdfg()\n", - "# sdfg = test2.to_sdfg()\n", - "sdfg = test3.to_sdfg()\n", - "sdfg" - ] - }, - { - "cell_type": "markdown", - "id": "16c0f318", - "metadata": {}, - "source": [ - "Now we apply the pass to see the change:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "c9152955", - "metadata": {}, - "outputs": [], - "source": [ - "# import the pass\n", - "from dace.transformation.passes.gpustream_scheduling import NaiveGPUStreamScheduler\n", - "\n", - "# Define backend stream access expression, which is used as below. \n", - "# (I do this explicitly such that any change in the access expression can be detected easier in future)\n", - "gpu_stream_access_template = \"__state->gpu_context->streams[{gpu_stream}]\" \n", - "\n", - "# Initialize and configure GPU stream scheduling pass\n", - "gpu_stream_pass = NaiveGPUStreamScheduler()\n", - "gpu_stream_pass.set_gpu_stream_access_template(gpu_stream_access_template)\n", - "assigned_streams = gpu_stream_pass.apply_pass(sdfg, None)\n" - ] - }, - { - "cell_type": "markdown", - "id": "415675f7", - "metadata": {}, - "source": [ - "Look at which nodes get assigned to which streams - as expected, right?" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "964ac157", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{AccessNode (B): 0,\n", - " AccessNode (A): 0,\n", - " AccessNode (D): 1,\n", - " MapEntry (test3_10[i=0:3]): 1,\n", - " MapEntry (test3_10_4_11[j=0:10]): 1,\n", - " Tasklet (assign_12_12): 1,\n", - " MapExit (test3_10_4_11[j=0:10]): 1,\n", - " MapExit (test3_10[i=0:3]): 1,\n", - " AccessNode (C): 1}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "assigned_streams" - ] - }, - { - "cell_type": "markdown", - "id": "69b5a1c0", - "metadata": {}, - "source": [ - "Look at the extended sdfg, now the synchronization is explicit and not the job of the codegen to figure out and implement." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "f0cbcd1f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (test3)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sdfg " - ] - }, - { - "cell_type": "markdown", - "id": "804d8436", - "metadata": {}, - "source": [ - "And you can also inspect the corresponding code. Just ensure that you are using the experimental codegen:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "60d817de", - "metadata": {}, - "outputs": [ - { - "ename": "AssertionError", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mAssertionError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mdace\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mconfig\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Config\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m Config.get(\u001b[33m'\u001b[39m\u001b[33mcompiler\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mcuda\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mimplementation\u001b[39m\u001b[33m'\u001b[39m) == \u001b[33m\"\u001b[39m\u001b[33mexperimental\u001b[39m\u001b[33m\"\u001b[39m\n", - "\u001b[31mAssertionError\u001b[39m: " - ] - } - ], - "source": [ - "from dace.config import Config\n", - "\n", - "assert Config.get('compiler', 'cuda', 'implementation') == \"experimental\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bf7c6836", - "metadata": {}, - "outputs": [], - "source": [ - "Code(sdfg.generate_code()[0].clean_code, language='cpp')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a029d5a5", - "metadata": {}, - "outputs": [], - "source": [ - "Code(sdfg.generate_code()[1].clean_code, language='cpp')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "dace_env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication1.ipynb b/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication1.ipynb deleted file mode 100644 index c6a566ca36..0000000000 --- a/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication1.ipynb +++ /dev/null @@ -1,509 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "15cd9104", - "metadata": {}, - "source": [ - "# Scalar Multiplication 1\n", - "\n", - "In this notebook, we will explore how the **`DefaultSharedMemorySync` pass** inserts `__syncthreads()` tasklets. We will specifically observe its behavior when **reusing shared memory** during a scalar multiplication. Our example involves multiplying a scalar by a long vector; we will import a consecutive subset of the vector into shared memory, perform the multiplication, and then restore it.\n", - "Only one threadblock is used and it gets each consecutive chunc using a **sequential map**. **Scalar Multiplication 2** does the same but uses a **for loop** instead. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "1e6f5b43", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# needed modules, nothing interesting :)\n", - "import dace\n", - "from IPython.display import Code\n", - "from dace.transformation import pass_pipeline\n", - "from dace.transformation.auto import auto_optimize\n", - "from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync" - ] - }, - { - "cell_type": "markdown", - "id": "a370147d", - "metadata": {}, - "source": [ - "### Inspiration\n", - "\n", - "Here are some example SDFGs using the Python frontend that perform scalar multiplication. These served as inspiration to implement the same operation—this time using shared memory instead of a temporary local variable.\n", - "\n", - "Why not use shared memory in the Python frontend? Because we want more control over the program and prefer to focus on the concept itself, rather than the capabilities provided by the Python frontend.\n", - "\n", - "Note that we have several similar examples. They differ in where the sequential map is placed within the nested map. A sequential map **outside** the kernel (i.e., outside GPU schedules) does **not** require synchronization after the sequential iteration, as we simply launch the kernel again and do not reuse shared memory.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "66ef7e5f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (scalarMultiplication3)" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "@dace.program\n", - "def scalarMultiplication1(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):\n", - " for k in dace.map[0:4] @ dace.dtypes.ScheduleType.Sequential:\n", - " for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", - " tmp = A[k * 32 + j]\n", - " A[k * 32 + j] = scalar * tmp\n", - "\n", - "@dace.program\n", - "def scalarMultiplication2(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):\n", - " for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " for k in dace.map[0:4] @ dace.dtypes.ScheduleType.Sequential:\n", - " for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", - " tmp = A[k * 32 + j]\n", - " A[k * 32 + j] = scalar * tmp\n", - "\n", - "@dace.program\n", - "def scalarMultiplication3(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):\n", - " for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", - " for k in dace.map[0:4] @ dace.dtypes.ScheduleType.Sequential:\n", - " tmp = A[k * 32 + j]\n", - " A[k * 32 + j] = scalar * tmp\n", - "\n", - "\n", - "# Choose the sdfg you want so inspect below\n", - "sdfg_inspiration = scalarMultiplication3.to_sdfg()\n", - "sdfg_inspiration" - ] - }, - { - "cell_type": "markdown", - "id": "c6d4c63a", - "metadata": {}, - "source": [ - "Tipp: collapse the functions and only focus one at a time below. They are quite similar, only difference is where the sequential map occurs.\n", - "Select it and the observe whether the post-synchronization happens if required and whether it is omitted if unnecessary." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "9abdaf19", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (scalarMultiplication2_smem)" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Here we should have NO post synchronization, since seq map is OUTSIDE of the kernel. \n", - "def scalarMultiplication1_smem():\n", - " # Create SDFG and state\n", - " sdfg = dace.SDFG(\"scalarMultiplication1_smem\")\n", - " state = sdfg.add_state(\"main\")\n", - "\n", - " # Add arrays\n", - " sdfg.add_array(\"A\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", - " sdfg.add_scalar(\"scalar\", dace.uint32)\n", - " sdfg.add_array(\"S\", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True, lifetime=dace.dtypes.AllocationLifetime.Scope)\n", - "\n", - " # Add access nodes\n", - " a_acc = state.add_read(\"A\")\n", - " a_store = state.add_write(\"A\")\n", - " scalar_acc = state.add_access(\"scalar\")\n", - " s_acc= state.add_access(\"S\")\n", - "\n", - " # Sequential map (outermost) \n", - " seq_map_entry, seq_map_exit = state.add_map(\n", - " \"seq_map\",\n", - " dict(k=\"0:4\"),\n", - " schedule=dace.dtypes.ScheduleType.Sequential,\n", - " )\n", - "\n", - "\n", - " # GPU Device map\n", - " gpu_map_entry, gpu_map_exit = state.add_map(\n", - " \"gpu_map\",\n", - " dict(i=\"0:32:32\"),\n", - " schedule=dace.dtypes.ScheduleType.GPU_Device,\n", - " )\n", - "\n", - " # GPU TB map\n", - " tb_map_entry, tb_map_exit = state.add_map(\n", - " \"tb\",\n", - " dict(j=\"0:32\"),\n", - " schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock,\n", - " )\n", - "\n", - " # Add tasklets for A -> S -> B\n", - " tasklet1 = state.add_tasklet(\n", - " \"addMult\",\n", - " inputs={\"__inp_A\", \"__inp_scalar\"},\n", - " outputs={\"__out\"},\n", - " code=\"__out = __inp_A * __inp_scalar;\",\n", - " language=dace.dtypes.Language.CPP\n", - " )\n", - "\n", - " tasklet2 = state.add_tasklet(\n", - " \"store_to_global\",\n", - " inputs={\"__inp\"},\n", - " outputs={\"__out\"},\n", - " code=\"__out = __inp;\",\n", - " language=dace.dtypes.Language.CPP\n", - " )\n", - "\n", - " # Edges\n", - "\n", - " # A and scalar to first map\n", - " state.add_edge(a_acc, None, seq_map_entry, None, dace.Memlet(\"A[0:128]\"))\n", - " state.add_edge(scalar_acc, None, seq_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", - "\n", - " # Add both down to last map, the threadblock map\n", - " state.add_edge(seq_map_entry, None, gpu_map_entry, None, dace.Memlet(\"A[32 * k: 32 * (k+1)]\"))\n", - " state.add_edge(seq_map_entry, None, gpu_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", - "\n", - " state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet(\"A[32 * k: 32 * (k+1)]\"))\n", - " state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", - "\n", - " # connect to tasklets\n", - " state.add_edge(tb_map_entry, None, tasklet1, \"__inp_A\", dace.Memlet(\"A[j + 32* k]\"))\n", - " state.add_edge(tb_map_entry, None, tasklet1, \"__inp_scalar\", dace.Memlet(\"scalar[0]\"))\n", - "\n", - " state.add_edge(tasklet1, \"__out\", s_acc, None, dace.Memlet(\"S[j]\"))\n", - "\n", - " state.add_edge(s_acc, None, tasklet2, \"__inp\", dace.Memlet(\"S[j]\"))\n", - "\n", - " # connect to all map exit nodes and then back to A to store back\n", - " state.add_edge(tasklet2, \"__out\", tb_map_exit, None, dace.Memlet(\"A[j + 32* k]\"))\n", - " state.add_edge(tb_map_exit, None, gpu_map_exit, None, dace.Memlet(\"A[32 * k: 32 * (k+1)]\"))\n", - " state.add_edge(gpu_map_exit, None, seq_map_exit, None, 
dace.Memlet(\"A[32 * k: 32 * (k+1)]\"))\n", - " state.add_edge(seq_map_exit, None, a_store, None, dace.Memlet(\"A[0:128]\"))\n", - " \n", - " \n", - " sdfg.fill_scope_connectors()\n", - " return sdfg\n", - "\n", - "\n", - "# Here we should have post synchronization\n", - "def scalarMultiplication2_smem():\n", - " # Create SDFG and state\n", - " sdfg = dace.SDFG(\"scalarMultiplication2_smem\")\n", - " state = sdfg.add_state(\"main\")\n", - "\n", - " # Add arrays\n", - " sdfg.add_array(\"A\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", - " sdfg.add_scalar(\"scalar\", dace.uint32)\n", - " sdfg.add_array(\"S\", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True, lifetime=dace.dtypes.AllocationLifetime.Scope)\n", - "\n", - " # Add access nodes\n", - " a_acc = state.add_read(\"A\")\n", - " a_store = state.add_write(\"A\")\n", - " scalar_acc = state.add_access(\"scalar\")\n", - " s_acc= state.add_access(\"S\")\n", - "\n", - " # Sequential map (outermost) \n", - " seq_map_entry, seq_map_exit = state.add_map(\n", - " \"seq_map\",\n", - " dict(k=\"0:4\"),\n", - " schedule=dace.dtypes.ScheduleType.Sequential,\n", - " )\n", - "\n", - "\n", - " # GPU Device map\n", - " gpu_map_entry, gpu_map_exit = state.add_map(\n", - " \"gpu_map\",\n", - " dict(i=\"0:32:32\"),\n", - " schedule=dace.dtypes.ScheduleType.GPU_Device,\n", - " )\n", - "\n", - " # GPU TB map\n", - " tb_map_entry, tb_map_exit = state.add_map(\n", - " \"tb\",\n", - " dict(j=\"0:32\"),\n", - " schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock,\n", - " )\n", - "\n", - " # Add tasklets for A -> S -> B\n", - " tasklet1 = state.add_tasklet(\n", - " \"addMult\",\n", - " inputs={\"__inp_A\", \"__inp_scalar\"},\n", - " outputs={\"__out\"},\n", - " code=\"__out = __inp_A * __inp_scalar;\",\n", - " language=dace.dtypes.Language.CPP\n", - " )\n", - "\n", - " tasklet2 = state.add_tasklet(\n", - " \"store_to_global\",\n", - " inputs={\"__inp\"},\n", - " outputs={\"__out\"},\n", - " code=\"__out = __inp;\",\n", - " language=dace.dtypes.Language.CPP\n", - " )\n", - "\n", - " # Edges\n", - "\n", - " # A and scalar to first map\n", - " state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet(\"A[0:128]\"))\n", - " state.add_edge(scalar_acc, None, gpu_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", - "\n", - " # Add both down to last map, the threadblock map\n", - " state.add_edge(gpu_map_entry, None, seq_map_entry, None, dace.Memlet(\"A[0:128]\"))\n", - " state.add_edge(gpu_map_entry, None, seq_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", - "\n", - " state.add_edge(seq_map_entry, None, tb_map_entry, None, dace.Memlet(\"A[32 * k: 32 * (k+1)]\"))\n", - " state.add_edge(seq_map_entry, None, tb_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", - "\n", - " # connect to tasklets\n", - " state.add_edge(tb_map_entry, None, tasklet1, \"__inp_A\", dace.Memlet(\"A[j + 32* k]\"))\n", - " state.add_edge(tb_map_entry, None, tasklet1, \"__inp_scalar\", dace.Memlet(\"scalar[0]\"))\n", - "\n", - " state.add_edge(tasklet1, \"__out\", s_acc, None, dace.Memlet(\"S[j]\"))\n", - "\n", - " state.add_edge(s_acc, None, tasklet2, \"__inp\", dace.Memlet(\"S[j]\"))\n", - "\n", - " # connect to all map exit nodes and then back to A to store back\n", - " state.add_edge(tasklet2, \"__out\", tb_map_exit, None, dace.Memlet(\"A[j + 32* k]\"))\n", - " state.add_edge(tb_map_exit, None, seq_map_exit, None, dace.Memlet(\"A[32 * k: 32 * (k+1)]\"))\n", - " state.add_edge(seq_map_exit, None, gpu_map_exit, None, 
dace.Memlet(\"A[0:128]\"))\n", - " state.add_edge(gpu_map_exit, None, a_store, None, dace.Memlet(\"A[0:128]\"))\n", - " \n", - " \n", - " sdfg.fill_scope_connectors()\n", - " return sdfg\n", - "\n", - "\n", - "# As before, Here we should have post synchronization\n", - "def scalarMultiplication3_smem():\n", - " # Create SDFG and state\n", - " sdfg = dace.SDFG(\"scalarMultiplication3_smem\")\n", - " state = sdfg.add_state(\"main\")\n", - "\n", - " # Add arrays\n", - " sdfg.add_array(\"A\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", - " sdfg.add_scalar(\"scalar\", dace.uint32)\n", - " sdfg.add_array(\"S\", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True, lifetime=dace.dtypes.AllocationLifetime.Scope)\n", - "\n", - " # Add access nodes\n", - " a_acc = state.add_read(\"A\")\n", - " a_store = state.add_write(\"A\")\n", - " scalar_acc = state.add_access(\"scalar\")\n", - " s_acc= state.add_access(\"S\")\n", - "\n", - " # Sequential map (outermost) \n", - " seq_map_entry, seq_map_exit = state.add_map(\n", - " \"seq_map\",\n", - " dict(k=\"0:4\"),\n", - " schedule=dace.dtypes.ScheduleType.Sequential,\n", - " )\n", - "\n", - "\n", - " # GPU Device map\n", - " gpu_map_entry, gpu_map_exit = state.add_map(\n", - " \"gpu_map\",\n", - " dict(i=\"0:32:32\"),\n", - " schedule=dace.dtypes.ScheduleType.GPU_Device,\n", - " )\n", - "\n", - " # GPU TB map\n", - " tb_map_entry, tb_map_exit = state.add_map(\n", - " \"tb\",\n", - " dict(j=\"0:32\"),\n", - " schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock,\n", - " )\n", - "\n", - " # Add tasklets for A -> S -> B\n", - " tasklet1 = state.add_tasklet(\n", - " \"addMult\",\n", - " inputs={\"__inp_A\", \"__inp_scalar\"},\n", - " outputs={\"__out\"},\n", - " code=\"__out = __inp_A * __inp_scalar;\",\n", - " language=dace.dtypes.Language.CPP\n", - " )\n", - "\n", - " tasklet2 = state.add_tasklet(\n", - " \"store_to_global\",\n", - " inputs={\"__inp\"},\n", - " outputs={\"__out\"},\n", - " code=\"__out = __inp;\",\n", - " language=dace.dtypes.Language.CPP\n", - " )\n", - "\n", - " # Edges\n", - "\n", - " # A and scalar to first map\n", - " state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet(\"A[0:128]\"))\n", - " state.add_edge(scalar_acc, None, gpu_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", - "\n", - " # Add both down to last map, the threadblock map\n", - " state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet(\"A[0:128]\"))\n", - " state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", - "\n", - " state.add_edge(tb_map_entry, None, seq_map_entry, None, dace.Memlet(\"A[j: j + 4]\")) # weird, but it is like this in the inspiration\n", - " state.add_edge(tb_map_entry, None, seq_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", - "\n", - " # connect to tasklets\n", - " state.add_edge(seq_map_entry, None, tasklet1, \"__inp_A\", dace.Memlet(\"A[j + 32* k]\"))\n", - " state.add_edge(seq_map_entry, None, tasklet1, \"__inp_scalar\", dace.Memlet(\"scalar[0]\"))\n", - "\n", - " state.add_edge(tasklet1, \"__out\", s_acc, None, dace.Memlet(\"S[j]\"))\n", - "\n", - " state.add_edge(s_acc, None, tasklet2, \"__inp\", dace.Memlet(\"S[j]\"))\n", - "\n", - " # connect to all map exit nodes and then back to A to store back\n", - " state.add_edge(tasklet2, \"__out\", seq_map_exit, None, dace.Memlet(\"A[j + 32* k]\"))\n", - " state.add_edge(seq_map_exit, None, tb_map_exit, None, dace.Memlet(\"A[j: j + 4]\"))\n", - " state.add_edge(tb_map_exit, None, gpu_map_exit, None, 
dace.Memlet(\"A[0:128]\"))\n", - " state.add_edge(gpu_map_exit, None, a_store, None, dace.Memlet(\"A[0:128]\"))\n", - " \n", - " \n", - " sdfg.fill_scope_connectors()\n", - " return sdfg\n", - "\n", - "\n", - "# choose which of the three versions should be applied to the pass\n", - "sdfg = scalarMultiplication2_smem()\n", - "sdfg" - ] - }, - { - "cell_type": "markdown", - "id": "2cddaa1c", - "metadata": {}, - "source": [ - "Observe how the pass inserts the synchronization barriers correctly:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "6c8921a7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (scalarMultiplication2_smem)" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# insert synchronization barriers\n", - "DefaultSharedMemorySync().apply_pass(sdfg, None)\n", - "sdfg" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "149b48c5", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "dace_env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication2.ipynb b/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication2.ipynb deleted file mode 100644 index c86a07a8ab..0000000000 --- a/berkay_workpace/scratch/smemPassAndCopy/scalarMultiplication2.ipynb +++ /dev/null @@ -1,286 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "012177f0", - "metadata": {}, - "source": [ - "# Scalar Multiplication 2\n", - "\n", - "This notebook is quite similar to **Scalar Multiplication 1**, but instead of reusing shared memory due to a sequential scheduled map, we reuse shared memory since it is in the body of a **for loop**.\n", - "This notebook is shorter and does not explain everything all over again in detail." - ] - }, - { - "cell_type": "markdown", - "id": "1fb5f12b", - "metadata": {}, - "source": [ - "Needed imports:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "e607a9c6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import dace\n", - "from dace.sdfg.state import LoopRegion\n", - "from IPython.display import Code\n", - "from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync" - ] - }, - { - "cell_type": "markdown", - "id": "c12a68d3", - "metadata": {}, - "source": [ - "## Insipration\n", - "\n", - "As in **Scalar Multiplication 1**, the frontend DaCe program that was used as an inspiration. I omit the different positions where the for loop can be, I just assume it is the innermost \"scope\"." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "2769e30c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", -    "\n", -    "" -      ], -      "text/plain": [ -       "SDFG (scalarMultiplication)" -      ] -     }, -     "execution_count": 2, -     "metadata": {}, -     "output_type": "execute_result" -    } -   ], -   "source": [ -    "# To next file\n", -    "@dace.program\n", -    "def scalarMultiplication(A: dace.int32[128] @ dace.dtypes.StorageType.GPU_Global, scalar: dace.int32):\n", -    "    for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", -    "        for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", -    "            for k in range(4):\n", -    "                tmp = A[k * 32 + j]\n", -    "                A[k * 32 + j] = scalar * tmp\n", -    "            \n", -    "\n", -    "sdfg = scalarMultiplication.to_sdfg()\n", -    "sdfg" -   ] -  }, -  { -   "cell_type": "markdown", -   "id": "21bc45e6", -   "metadata": {}, -   "source": [ -    "The SDFG we use, which uses shared memory instead of a temporary local variable:" -   ] -  }, -  { -   "cell_type": "code", -   "execution_count": 3, -   "id": "f0609dff", -   "metadata": {}, -   "outputs": [ -    { -     "data": { -      "text/html": [ -       "\n", -       "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (scalarMultiplication_smem)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def scalarMultiplication_smem():\n", - " sdfg = dace.SDFG(\"scalarMultiplication_smem\")\n", - " state = sdfg.add_state(\"main\")\n", - "\n", - " # Arrays and access nodes\n", - " sdfg.add_array(\"A\", (128,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", - " sdfg.add_scalar(\"scalar\", dace.uint32)\n", - " a_acc = state.add_read(\"A\")\n", - " a_store = state.add_write(\"A\")\n", - " scalar_acc = state.add_access(\"scalar\")\n", - "\n", - " # Device and thread-block maps\n", - " gpu_map_entry, gpu_map_exit = state.add_map(\n", - " \"gpu_map\", dict(i=\"0:32:32\"), schedule=dace.dtypes.ScheduleType.GPU_Device\n", - " )\n", - " tb_map_entry, tb_map_exit = state.add_map(\n", - " \"tb\", dict(j=\"0:32\"), schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock\n", - " )\n", - "\n", - " # Nested SDFG setup\n", - " inner_sdfg = dace.SDFG('nested_sdfg')\n", - " nested = state.add_nested_sdfg(inner_sdfg, sdfg, inputs={'__inp_A', '__inp_scalar'}, outputs={'tmp_ret'})\n", - "\n", - " loopreg = LoopRegion(\"loop\", \"k < 4\", \"k\", \"k = 0\", \"k = (k + 1)\", False, inner_sdfg)\n", - " inner_sdfg.add_node(loopreg)\n", - " inner_state = loopreg.add_state(\"use_smem\")\n", - "\n", - " # Shared memory and result\n", - " inner_sdfg.add_array(\"S\", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True)\n", - " inner_sdfg.add_scalar(\"tmp_ret\", dace.uint32)\n", - " s_acc = inner_state.add_access(\"S\")\n", - " ret = inner_state.add_write(\"tmp_ret\")\n", - "\n", - " # Tasklets\n", - " tasklet1 = inner_state.add_tasklet(\n", - " \"assign_to_smem\", inputs={}, outputs={\"__out1\"},\n", - " code=\"__out1 = __inp_A[j + 32 * k]\",\n", - " language=dace.dtypes.Language.CPP\n", - " )\n", - " tasklet2 = inner_state.add_tasklet(\n", - " \"addMult\", inputs={\"__inp2\"}, outputs={\"__out2\"},\n", - " code=\"__out2 = __inp2 * __inp_scalar;\",\n", - " language=dace.dtypes.Language.CPP\n", - " )\n", - "\n", - " # Main SDFG edges\n", - " state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet(\"A[0:128]\"))\n", - " state.add_edge(scalar_acc, None, gpu_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", - " state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet(\"A[0:128]\"))\n", - " state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet(\"scalar[0]\"))\n", - " state.add_edge(tb_map_entry, None, nested, \"__inp_A\", dace.Memlet(\"A[j : j + 97 : 32]\"))\n", - " state.add_edge(tb_map_entry, None, nested, \"__inp_scalar\", dace.Memlet(\"scalar[0]\"))\n", - " state.add_edge(nested, \"tmp_ret\", tb_map_exit, None, dace.Memlet(\"A[j : j + 97 : 32]\"))\n", - " state.add_edge(tb_map_exit, None, gpu_map_exit, None, dace.Memlet(\"A[0:128]\"))\n", - " state.add_edge(gpu_map_exit, None, a_store, None, dace.Memlet(\"A[0:128]\"))\n", - "\n", - " # Inner SDFG edges\n", - " inner_state.add_edge(tasklet1, \"__out1\", s_acc, None, dace.Memlet(\"S[j]\"))\n", - " inner_state.add_edge(s_acc, None, tasklet2, \"__inp2\", dace.Memlet(\"S[j]\"))\n", - " inner_state.add_edge(tasklet2, \"__out2\", ret, None, dace.Memlet(\"S[j]\"))\n", - "\n", - " sdfg.fill_scope_connectors()\n", - " return sdfg\n", - "\n", - "\n", - "sdfg = scalarMultiplication_smem()\n", - "sdfg\n" - ] - }, - { - "cell_type": "markdown", - "id": "0e7e27ca", - "metadata": {}, - "source": [ - "Observe how the synchronization 
tasklets are inserted using the DefaultSharedMemorySync pass:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "3fac943d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", -    "\n", -    "" -      ], -      "text/plain": [ -       "SDFG (scalarMultiplication_smem)" -      ] -     }, -     "execution_count": 4, -     "metadata": {}, -     "output_type": "execute_result" -    } -   ], -   "source": [ -    "# insert synchronization barriers\n", -    "DefaultSharedMemorySync().apply_pass(sdfg, None)\n", -    "sdfg" -   ] -  } - ], - "metadata": { -  "kernelspec": { -   "display_name": "dace_env", -   "language": "python", -   "name": "python3" -  }, -  "language_info": { -   "codemirror_mode": { -    "name": "ipython", -    "version": 3 -   }, -   "file_extension": ".py", -   "mimetype": "text/x-python", -   "name": "python", -   "nbconvert_exporter": "python", -   "pygments_lexer": "ipython3", -   "version": "3.12.3" -  } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/berkay_workpace/scratch/smemPassAndCopy/simpleCopy.ipynb b/berkay_workpace/scratch/smemPassAndCopy/simpleCopy.ipynb deleted file mode 100644 index 90f60b0a69..0000000000 --- a/berkay_workpace/scratch/smemPassAndCopy/simpleCopy.ipynb +++ /dev/null @@ -1,866 +0,0 @@ -{ - "cells": [ -  { -   "cell_type": "markdown", -   "id": "a819effe", -   "metadata": {}, -   "source": [ -    "# Simple Memory Copy\n", -    "\n", -    "This example demonstrates a basic memory copy operation where **shared memory** is used as an intermediate buffer. \n", -    "It serves as the simplest possible scenario to test whether the `DefaultSharedMemorySync()` pass correctly inserts synchronization.\n", -    "\n", -    "The goal is to observe shared memory behavior in a minimal setting.\n" -   ] -  }, -  { -   "cell_type": "markdown", -   "id": "df0fbf69", -   "metadata": {}, -   "source": [ -    "First, we import the needed modules:" -   ] -  }, -  { -   "cell_type": "code", -   "execution_count": 1, -   "id": "e7f52766", -   "metadata": {}, -   "outputs": [ -    { -     "data": { -      "text/html": [ -       "\n" -      ], -      "text/plain": [ -       "" -      ] -     }, -     "metadata": {}, -     "output_type": "display_data" -    } -   ], -   "source": [ -    "import dace\n", -    "from IPython.display import Code\n", -    "from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync" -   ] -  }, -  { -   "cell_type": "markdown", -   "id": "4215bbff", -   "metadata": {}, -   "source": [ -    "## Inspiration\n", -    "\n", -    "Below is the SDFG I used for inspiration. The goal is to replace 'k' with a shared memory array later." -   ] -  }, -  { -   "cell_type": "code", -   "execution_count": 2, -   "id": "3f225145", -   "metadata": {}, -   "outputs": [ -    { -     "data": { -      "text/html": [ -       "\n", -       "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (simpleCopy)" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "@dace.program\n", - "def simpleCopy(A: dace.float64[32] @ dace.dtypes.StorageType.GPU_Global, B: dace.float64[32] @ dace.dtypes.StorageType.GPU_Global, C: dace.float64[32] @ dace.dtypes.StorageType.GPU_Global):\n", - " for i in dace.map[0:32:32] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " for j in dace.map[0:32] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:\n", - " k = A[j]\n", - " B[j] = k\n", - "\n", - "simpleCopy.to_sdfg()\n" - ] - }, - { - "cell_type": "markdown", - "id": "f6382749", - "metadata": {}, - "source": [ - "A DaCe program built using the sdfg API, corresponding to a simple memory copy using shared memory as a buffer:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "e7b22e0c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", -    "\n", -    "" -      ], -      "text/plain": [ -       "SDFG (simpleCopy_smem)" -      ] -     }, -     "execution_count": 3, -     "metadata": {}, -     "output_type": "execute_result" -    } -   ], -   "source": [ -    "def simpleCopy_smem():\n", -    "    # Create SDFG and state\n", -    "    sdfg = dace.SDFG(\"simpleCopy_smem\")\n", -    "    state = sdfg.add_state(\"main\")\n", -    "\n", -    "    # Add arrays\n", -    "    sdfg.add_array(\"A\", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", -    "    sdfg.add_array(\"B\", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global)\n", -    "    sdfg.add_array(\"S\", (32,), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True, lifetime=dace.dtypes.AllocationLifetime.Scope)\n", -    "\n", -    "    # Add access nodes\n", -    "    a_acc = state.add_access(\"A\")\n", -    "    b_acc = state.add_access(\"B\")\n", -    "    s_acc= state.add_access(\"S\")\n", -    "\n", -    "    # GPU Device map\n", -    "    gpu_map_entry, gpu_map_exit = state.add_map(\n", -    "        \"gpu_map\",\n", -    "        dict(i=\"0:32:32\"),\n", -    "        schedule=dace.dtypes.ScheduleType.GPU_Device,\n", -    "    )\n", -    "\n", -    "    # GPU TB map\n", -    "    tb_map_entry, tb_map_exit = state.add_map(\n", -    "        \"tb\",\n", -    "        dict(j=\"0:32\"),\n", -    "        schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock,\n", -    "    )\n", -    "\n", -    "    # Add tasklets for A -> S -> B\n", -    "    tasklet1 = state.add_tasklet(\n", -    "        \"copy_to_shared\",\n", -    "        inputs={\"__inp\"},\n", -    "        outputs={\"__out\"},\n", -    "        code=\"__out = __inp;\",\n", -    "        language=dace.dtypes.Language.CPP\n", -    "    )\n", -    "\n", -    "    tasklet2 = state.add_tasklet(\n", -    "        \"copy_to_global\",\n", -    "        inputs={\"__inp\"},\n", -    "        outputs={\"__out\"},\n", -    "        code=\"__out = __inp;\",\n", -    "        language=dace.dtypes.Language.CPP\n", -    "    )\n", -    "\n", -    "\n", -    "    # Edges\n", -    "    state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet(\"A[0:32]\"))\n", -    "    state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet(\"A[0:32]\"))\n", -    "    state.add_edge(tb_map_entry, None, tasklet1, \"__inp\", dace.Memlet(\"A[j]\"))\n", -    "    state.add_edge(tasklet1, \"__out\", s_acc, None, dace.Memlet(\"S[j]\"))\n", -    "    state.add_edge(s_acc, None, tasklet2, \"__inp\", dace.Memlet(\"S[j]\"))\n", -    "    state.add_edge(tasklet2, \"__out\", tb_map_exit, None, dace.Memlet(\"B[j]\"))\n", -    "    state.add_edge(tb_map_exit, None, gpu_map_exit, None, dace.Memlet(\"B[0:32]\"))\n", -    "    state.add_edge(gpu_map_exit, None, b_acc, None, dace.Memlet(\"B[0:32]\"))\n", -    "\n", -    "    sdfg.fill_scope_connectors()\n", -    "    return sdfg\n", -    "\n", -    "sdfg = simpleCopy_smem()\n", -    "sdfg" -   ] -  }, -  { -   "cell_type": "markdown", -   "id": "ab08683d", -   "metadata": {}, -   "source": [ -    "## Adding Synchronization Barriers\n", -    "\n", -    "A simple pass is used to insert the synchronization tasklets correctly. We observe that the synchronization tasklet is inserted after \n", -    "the shared memory write and before the tasklet that reads it, ensuring that the threads wait until all data is in shared memory before\n", -    "using it. (Note that in this case synchronization would not strictly be necessary, since each thread accesses the same position in shared memory\n", -    "that it writes to. We only care about the correct insertion after a shared memory access node is used.)" -   ] -  }, -  { -   "cell_type": "code", -   "execution_count": 4, -   "id": "15d8af45", -   "metadata": {}, -   "outputs": [ -    { -     "data": { -      "text/html": [ -       "\n", -       "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (simpleCopy_smem)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "DefaultSharedMemorySync().apply_pass(sdfg, None)\n", - "sdfg" - ] - }, - { - "cell_type": "markdown", - "id": "93950b7b", - "metadata": {}, - "source": [ - "The generated code where the \"__syncthreads();\" tasklet is correctly placed:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "2318db8f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
#include <cuda_runtime.h>\n",
-       "#include <dace/dace.h>\n",
-       "\n",
-       "\n",
-       "struct simpleCopy_smem_state_t {\n",
-       "    dace::cuda::Context *gpu_context;\n",
-       "};\n",
-       "\n",
-       "\n",
-       "\n",
-       "DACE_EXPORTED int __dace_init_experimental_cuda(simpleCopy_smem_state_t *__state);\n",
-       "DACE_EXPORTED int __dace_exit_experimental_cuda(simpleCopy_smem_state_t *__state);\n",
-       "\n",
-       "\n",
-       "\n",
-       "int __dace_init_experimental_cuda(simpleCopy_smem_state_t *__state) {\n",
-       "    int count;\n",
-       "\n",
-       "    // Check that we are able to run cuda code\n",
-       "    if (cudaGetDeviceCount(&count) != cudaSuccess)\n",
-       "    {\n",
-       "        printf("ERROR: GPU drivers are not configured or cuda-capable device "\n",
-       "               "not found\\n");\n",
-       "        return 1;\n",
-       "    }\n",
-       "    if (count == 0)\n",
-       "    {\n",
-       "        printf("ERROR: No cuda-capable devices found\\n");\n",
-       "        return 2;\n",
-       "    }\n",
-       "\n",
-       "    // Initialize cuda before we run the application\n",
-       "    float *dev_X;\n",
-       "    DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n",
-       "    DACE_GPU_CHECK(cudaFree(dev_X));\n",
-       "\n",
-       "    \n",
-       "\n",
-       "    __state->gpu_context = new dace::cuda::Context(0, 0);\n",
-       "\n",
-       "    // Create cuda streams and events\n",
-       "    for(int i = 0; i < 0; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n",
-       "        __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n",
-       "    }\n",
-       "    for(int i = 0; i < 0; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n",
-       "    }\n",
-       "\n",
-       "    \n",
-       "\n",
-       "    return 0;\n",
-       "}\n",
-       "\n",
-       "int __dace_exit_experimental_cuda(simpleCopy_smem_state_t *__state) {\n",
-       "    \n",
-       "\n",
-       "    // Synchronize and check for CUDA errors\n",
-       "    int __err = static_cast<int>(__state->gpu_context->lasterror);\n",
-       "    if (__err == 0)\n",
-       "        __err = static_cast<int>(cudaDeviceSynchronize());\n",
-       "\n",
-       "    // Destroy cuda streams and events\n",
-       "    for(int i = 0; i < 0; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n",
-       "    }\n",
-       "    for(int i = 0; i < 0; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n",
-       "    }\n",
-       "\n",
-       "    delete __state->gpu_context;\n",
-       "    return __err;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED bool __dace_gpu_set_stream(simpleCopy_smem_state_t *__state, int streamid, gpuStream_t stream)\n",
-       "{\n",
-       "    if (streamid < 0 || streamid >= 0)\n",
-       "        return false;\n",
-       "\n",
-       "    __state->gpu_context->streams[streamid] = stream;\n",
-       "\n",
-       "    return true;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED void __dace_gpu_set_all_streams(simpleCopy_smem_state_t *__state, gpuStream_t stream)\n",
-       "{\n",
-       "    for (int i = 0; i < 0; ++i)\n",
-       "        __state->gpu_context->streams[i] = stream;\n",
-       "}\n",
-       "\n",
-       "__global__ void __launch_bounds__(32) gpu_map_0_0_3(dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n",
-       "{\n",
-       "    int i = (32 * blockIdx.x);\n",
-       "    {\n",
-       "        __shared__ dace::uint S[32];\n",
-       "        int j = threadIdx.x;\n",
-       "        {\n",
-       "            dace::uint __inp = A[j];\n",
-       "            dace::uint __out;\n",
-       "\n",
-       "            ///////////////////\n",
-       "            __out = __inp;\n",
-       "            ///////////////////\n",
-       "\n",
-       "            S[j] = __out;\n",
-       "        }\n",
-       "        {\n",
-       "\n",
-       "            ///////////////////\n",
-       "            __syncthreads();\n",
-       "            ///////////////////\n",
-       "\n",
-       "        }\n",
-       "        {\n",
-       "\n",
-       "            ///////////////////\n",
-       "            __syncthreads();\n",
-       "            ///////////////////\n",
-       "\n",
-       "        }\n",
-       "        {\n",
-       "\n",
-       "            ///////////////////\n",
-       "            __syncthreads();\n",
-       "            ///////////////////\n",
-       "\n",
-       "        }\n",
-       "        {\n",
-       "            dace::uint __inp = S[j];\n",
-       "            dace::uint __out;\n",
-       "\n",
-       "            ///////////////////\n",
-       "            __out = __inp;\n",
-       "            ///////////////////\n",
-       "\n",
-       "            B[j] = __out;\n",
-       "        }\n",
-       "    }\n",
-       "}\n",
-       "\n",
-       "\n",
-       "DACE_EXPORTED void __dace_runkernel_gpu_map_0_0_3(simpleCopy_smem_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n",
-       "void __dace_runkernel_gpu_map_0_0_3(simpleCopy_smem_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n",
-       "{\n",
-       "\n",
-       "\n",
-       "    void  *gpu_map_0_0_3_args[] = { (void *)&A, (void *)&B };\n",
-       "    gpuError_t __err = cudaLaunchKernel( (void*)gpu_map_0_0_3, dim3(1, 1, 1), dim3(32, 1, 1), gpu_map_0_0_3_args, 0, nullptr\n",
-       "    );\n",
-       "\n",
-       "    DACE_KERNEL_LAUNCH_CHECK(__err, "gpu_map_0_0_3", 1, 1, 1, 32, 1, 1);\n",
-       "}\n",
-       "
\n" - ], - "text/latex": [ - "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", - "\\PY{n}{\\PYZsh{}include}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{n}{cuda\\PYZus{}runtime}\\PY{p}{.}\\PY{n}{h}\\PY{o}{\\PYZgt{}}\n", - "\\PY{n}{\\PYZsh{}include}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{n}{dace}\\PY{o}{/}\\PY{n}{dace}\\PY{p}{.}\\PY{n}{h}\\PY{o}{\\PYZgt{}}\n", - "\n", - "\n", - "\\PY{n}{struct}\\PY{+w}{ }\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", - "\\PY{err}{\\PYZcb{}}\\PY{p}{;}\n", - "\n", - "\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\n", - "\n", - "\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n+nf}{count}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Check}\\PY{+w}{ }\\PY{n}{that}\\PY{+w}{ }\\PY{n}{we}\\PY{+w}{ }\\PY{k}{are}\\PY{+w}{ }\\PY{n}{able}\\PY{+w}{ }\\PY{k}{to}\\PY{+w}{ }\\PY{n}{run}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{code}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{cudaGetDeviceCount}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n+nf}{count}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{!=}\\PY{+w}{ }\\PY{n}{cudaSuccess}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s+ss}{\\PYZdq{}ERROR: GPU drivers are not configured or cuda\\PYZhy{}capable device \\PYZdq{}}\n", - "\\PY{+w}{ }\\PY{l+s+ss}{\\PYZdq{}not found\\PYZbs{}n\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n+nf}{count}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s+ss}{\\PYZdq{}ERROR: No cuda\\PYZhy{}capable devices found\\PYZbs{}n\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Initialize}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{k}{before}\\PY{+w}{ }\\PY{n}{we}\\PY{+w}{ }\\PY{n}{run}\\PY{+w}{ }\\PY{n}{the}\\PY{+w}{ }\\PY{n}{application}\n", - "\\PY{+w}{ }\\PY{n+nc}{float}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{dev\\PYZus{}X}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMalloc}\\PY{p}{(}\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{dev\\PYZus{}X}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ 
}\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaFree}\\PY{p}{(}\\PY{n}{dev\\PYZus{}X}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{Context}\\PY{p}{(}\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Create}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{streams}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{n}{events}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaStreamNonBlocking}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Allow}\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{n}{externals}\\PY{+w}{ }\\PY{k}{to}\\PY{+w}{ }\\PY{k}{modify}\\PY{+w}{ }\\PY{n}{streams}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaEventDisableTiming}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", - "\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}experimental\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Synchronize}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{k}{check}\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{n}{CUDA}\\PY{+w}{ }\\PY{n}{errors}\n", - "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ 
}\\PY{n}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{n+nc}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{lasterror}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{n+nc}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{cudaDeviceSynchronize}\\PY{p}{(}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Destroy}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{streams}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{n}{events}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", - "\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{bool}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}stream}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{streamid}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", - "\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{|}\\PY{+w}{ }\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZgt{}=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{false}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{streamid}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{true}\\PY{p}{;}\n", - 
"\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}all\\PYZus{}streams}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", - "\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", - "\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{\\PYZus{}\\PYZus{}global\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}launch\\PYZus{}bounds\\PYZus{}\\PYZus{}}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", - "\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{blockIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}shared\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{S}\\PY{o}{[}\\PY{n}{32}\\PY{o}{]}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{j}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{A}\\PY{o}{[}\\PY{n}{j}\\PY{o}{]}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{S}\\PY{o}{[}\\PY{n}{j}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\\PY{+w}{ 
}\\PY{n}{\\PYZus{}\\PYZus{}syncthreads}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}syncthreads}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}syncthreads}\\PY{p}{(}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{S}\\PY{o}{[}\\PY{n}{j}\\PY{o}{]}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{B}\\PY{o}{[}\\PY{n}{j}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{err}{\\PYZcb{}}\n", - "\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{n}{void}\\PY{+w}{ 
}\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{(}\\PY{n}{simpleCopy\\PYZus{}smem\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{uint}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{)}\n", - "\\PY{err}{\\PYZob{}}\n", - "\n", - "\n", - "\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3\\PYZus{}args}\\PY{err}{[}\\PY{err}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{B}\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{gpuError\\PYZus{}t}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaLaunchKernel}\\PY{p}{(}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{o}{*}\\PY{p}{)}\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3\\PYZus{}args}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{nullptr}\n", - "\\PY{+w}{ }\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}KERNEL\\PYZus{}LAUNCH\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{,}\\PY{+w}{ }\\PY{l+s+ss}{\\PYZdq{}gpu\\PYZus{}map\\PYZus{}0\\PYZus{}0\\PYZus{}3\\PYZdq{}}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{err}{\\PYZcb{}}\n", - "\\end{Verbatim}\n" - ], - "text/plain": [ - "\n", - "#include \n", - "#include \n", - "\n", - "\n", - "struct simpleCopy_smem_state_t {\n", - " dace::cuda::Context *gpu_context;\n", - "};\n", - "\n", - "\n", - "\n", - "DACE_EXPORTED int __dace_init_experimental_cuda(simpleCopy_smem_state_t *__state);\n", - "DACE_EXPORTED int __dace_exit_experimental_cuda(simpleCopy_smem_state_t *__state);\n", - "\n", - "\n", - "\n", - "int __dace_init_experimental_cuda(simpleCopy_smem_state_t *__state) {\n", - " int count;\n", - "\n", - " // Check that we are able to run cuda code\n", - " if (cudaGetDeviceCount(&count) != cudaSuccess)\n", - " {\n", - " printf(\"ERROR: GPU drivers are not configured or cuda-capable device \"\n", - " \"not found\\n\");\n", - " return 1;\n", - " }\n", - " if (count == 0)\n", - " {\n", - " printf(\"ERROR: No cuda-capable devices found\\n\");\n", - " return 2;\n", - " }\n", - "\n", - " // Initialize cuda before we run the application\n", - " float *dev_X;\n", - " DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n", - " DACE_GPU_CHECK(cudaFree(dev_X));\n", - "\n", - " \n", - "\n", - " __state->gpu_context = new dace::cuda::Context(0, 0);\n", - "\n", - " // Create cuda streams and events\n", - " for(int i = 0; i < 0; 
++i) {\n", - " DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n", - " __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n", - " }\n", - " for(int i = 0; i < 0; ++i) {\n", - " DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n", - " }\n", - "\n", - " \n", - "\n", - " return 0;\n", - "}\n", - "\n", - "int __dace_exit_experimental_cuda(simpleCopy_smem_state_t *__state) {\n", - " \n", - "\n", - " // Synchronize and check for CUDA errors\n", - " int __err = static_cast(__state->gpu_context->lasterror);\n", - " if (__err == 0)\n", - " __err = static_cast(cudaDeviceSynchronize());\n", - "\n", - " // Destroy cuda streams and events\n", - " for(int i = 0; i < 0; ++i) {\n", - " DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n", - " }\n", - " for(int i = 0; i < 0; ++i) {\n", - " DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n", - " }\n", - "\n", - " delete __state->gpu_context;\n", - " return __err;\n", - "}\n", - "\n", - "DACE_EXPORTED bool __dace_gpu_set_stream(simpleCopy_smem_state_t *__state, int streamid, gpuStream_t stream)\n", - "{\n", - " if (streamid < 0 || streamid >= 0)\n", - " return false;\n", - "\n", - " __state->gpu_context->streams[streamid] = stream;\n", - "\n", - " return true;\n", - "}\n", - "\n", - "DACE_EXPORTED void __dace_gpu_set_all_streams(simpleCopy_smem_state_t *__state, gpuStream_t stream)\n", - "{\n", - " for (int i = 0; i < 0; ++i)\n", - " __state->gpu_context->streams[i] = stream;\n", - "}\n", - "\n", - "__global__ void __launch_bounds__(32) gpu_map_0_0_3(dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", - "{\n", - " int i = (32 * blockIdx.x);\n", - " {\n", - " __shared__ dace::uint S[32];\n", - " int j = threadIdx.x;\n", - " {\n", - " dace::uint __inp = A[j];\n", - " dace::uint __out;\n", - "\n", - " ///////////////////\n", - " __out = __inp;\n", - " ///////////////////\n", - "\n", - " S[j] = __out;\n", - " }\n", - " {\n", - "\n", - " ///////////////////\n", - " __syncthreads();\n", - " ///////////////////\n", - "\n", - " }\n", - " {\n", - "\n", - " ///////////////////\n", - " __syncthreads();\n", - " ///////////////////\n", - "\n", - " }\n", - " {\n", - "\n", - " ///////////////////\n", - " __syncthreads();\n", - " ///////////////////\n", - "\n", - " }\n", - " {\n", - " dace::uint __inp = S[j];\n", - " dace::uint __out;\n", - "\n", - " ///////////////////\n", - " __out = __inp;\n", - " ///////////////////\n", - "\n", - " B[j] = __out;\n", - " }\n", - " }\n", - "}\n", - "\n", - "\n", - "DACE_EXPORTED void __dace_runkernel_gpu_map_0_0_3(simpleCopy_smem_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B);\n", - "void __dace_runkernel_gpu_map_0_0_3(simpleCopy_smem_state_t *__state, dace::uint * __restrict__ A, dace::uint * __restrict__ B)\n", - "{\n", - "\n", - "\n", - " void *gpu_map_0_0_3_args[] = { (void *)&A, (void *)&B };\n", - " gpuError_t __err = cudaLaunchKernel( (void*)gpu_map_0_0_3, dim3(1, 1, 1), dim3(32, 1, 1), gpu_map_0_0_3_args, 0, nullptr\n", - " );\n", - "\n", - " DACE_KERNEL_LAUNCH_CHECK(__err, \"gpu_map_0_0_3\", 1, 1, 1, 32, 1, 1);\n", - "}\n" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Code(sdfg.generate_code()[1].clean_code)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "61a16931", - "metadata": 
{}, -   "outputs": [], -   "source": [ -    "#Code(sdfg.generate_code()[0].clean_code)" -   ] -  }, -  { -   "cell_type": "code", -   "execution_count": null, -   "id": "b5c5e3be", -   "metadata": {}, -   "outputs": [], -   "source": [] -  } - ], - "metadata": { -  "kernelspec": { -   "display_name": "dace_env", -   "language": "python", -   "name": "python3" -  }, -  "language_info": { -   "codemirror_mode": { -    "name": "ipython", -    "version": 3 -   }, -   "file_extension": ".py", -   "mimetype": "text/x-python", -   "name": "python", -   "nbconvert_exporter": "python", -   "pygments_lexer": "ipython3", -   "version": "3.12.3" -  } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/berkay_workpace/scratch/thesis_related/const_check_fails.ipynb b/berkay_workpace/scratch/thesis_related/const_check_fails.ipynb deleted file mode 100644 index 80250c2662..0000000000 --- a/berkay_workpace/scratch/thesis_related/const_check_fails.ipynb +++ /dev/null @@ -1,179 +0,0 @@ -{ - "cells": [ -  { -   "cell_type": "markdown", -   "id": "2362eee8", -   "metadata": {}, -   "source": [ -    "# Const Check Fail\n", -    "\n", -    "Here is an SDFG that leads to a compilation error with the legacy CUDACodeGen. Below is the SDFG." -   ] -  }, -  { -   "cell_type": "code", -   "execution_count": 1, -   "id": "f77627b8", -   "metadata": {}, -   "outputs": [ -    { -     "data": { -      "text/html": [ -       "\n" -      ], -      "text/plain": [ -       "" -      ] -     }, -     "metadata": {}, -     "output_type": "display_data" -    }, -    { -     "data": { -      "text/html": [ -       "\n", -       "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (kernel)" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import dace\n", - "\n", - "sdfg = dace.SDFG.from_file(\"../yakups_examples/weird_global_to_global.sdfg\")\n", - "sdfg" - ] - }, - { - "cell_type": "markdown", - "id": "ae442fd0", - "metadata": {}, - "source": [ - "Ensure you are using the legay CUDACodegen" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ce960a1f", - "metadata": {}, - "outputs": [], - "source": [ - "from dace.config import Config\n", - "\n", - "assert Config.get('compiler', 'cuda', 'implementation') == 'legacy', \"Select legacy CUDACodgen in config.yml & restart jupyter notebook\"" - ] - }, - { - "cell_type": "markdown", - "id": "d8e04ba1", - "metadata": {}, - "source": [ - "Now let's try to compile the sdfg and observe how it fails:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "177184e4", - "metadata": {}, - "outputs": [ - { - "ename": "CompilationError", - "evalue": "Compiler failure:\n[ 16%] \u001b[32mBuilding CXX object CMakeFiles/kernel.dir/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cpu/kernel.cpp.o\u001b[0m\nIn file included from /home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/dace.h:14,\n from /home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cpu/kernel.cpp:2:\n/home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/types.h: In constructor ‘dace::half::half(float)’:\n/home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/types.h:101:28: warning: dereferencing type-punned pointer will break strict-aliasing rules [-Wstrict-aliasing]\n 101 | uint32_t x = *((uint32_t*)&f);\n | ~^~~~~~~~~~~~~~\n[ 33%] \u001b[32mBuilding CUDA object CMakeFiles/kernel.dir/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu.o\u001b[0m\n/home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/../../../external/moodycamel/concurrentqueue.h(3599): warning #68-D: integer conversion resulted in a change of sign\nRemark: The warnings can be suppressed with \"-diag-suppress \"\n\n/home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/../../../external/moodycamel/concurrentqueue.h(3607): warning #68-D: integer conversion resulted in a change of sign\n\n/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu(97): error: no instance of function template \"dace::GlobalToGlobal1D\" matches the argument list\n argument types are: (const double *, int, const double *__restrict__)\n\n/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu(99): error: no instance of function template \"dace::GlobalToGlobal1D\" matches the argument list\n argument types are: (const double *, int, const double *__restrict__)\n\n2 errors detected in the compilation of \"/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu\".\ngmake[2]: *** [CMakeFiles/kernel.dir/build.make:94: CMakeFiles/kernel.dir/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu.o] Error 2\ngmake[1]: *** [CMakeFiles/Makefile2:90: CMakeFiles/kernel.dir/all] Error 2\ngmake: *** [Makefile:91: all] Error 2\n", - "output_type": "error", - 
"traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mCalledProcessError\u001b[39m Traceback (most recent call last)", - "\u001b[36mFile \u001b[39m\u001b[32m~/master-thesis/dace/dace/codegen/compiler.py:245\u001b[39m, in \u001b[36mconfigure_and_compile\u001b[39m\u001b[34m(program_folder, program_name, output_stream)\u001b[39m\n\u001b[32m 244\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m245\u001b[39m \u001b[43m_run_liveoutput\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcmake --build . --config \u001b[39;49m\u001b[38;5;132;43;01m%s\u001b[39;49;00m\u001b[33;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[43m%\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mConfig\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mcompiler\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mbuild_type\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 246\u001b[39m \u001b[43m \u001b[49m\u001b[43mshell\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 247\u001b[39m \u001b[43m \u001b[49m\u001b[43mcwd\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbuild_folder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 248\u001b[39m \u001b[43m \u001b[49m\u001b[43moutput_stream\u001b[49m\u001b[43m=\u001b[49m\u001b[43moutput_stream\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 249\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m subprocess.CalledProcessError \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[32m 250\u001b[39m \u001b[38;5;66;03m# If unsuccessful, print results\u001b[39;00m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/master-thesis/dace/dace/codegen/compiler.py:429\u001b[39m, in \u001b[36m_run_liveoutput\u001b[39m\u001b[34m(command, output_stream, **kwargs)\u001b[39m\n\u001b[32m 428\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m process.returncode != \u001b[32m0\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m429\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m subprocess.CalledProcessError(process.returncode, command, output.getvalue())\n", - "\u001b[31mCalledProcessError\u001b[39m: Command 'cmake --build . 
--config RelWithDebInfo' returned non-zero exit status 2.", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[31mCompilationError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43msdfg\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcompile\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/master-thesis/dace/dace/sdfg/sdfg.py:2396\u001b[39m, in \u001b[36mSDFG.compile\u001b[39m\u001b[34m(self, output_file, validate, return_program_handle)\u001b[39m\n\u001b[32m 2393\u001b[39m sdfg = \u001b[38;5;28mself\u001b[39m\n\u001b[32m 2395\u001b[39m \u001b[38;5;66;03m# Compile the code and get the shared library path\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m2396\u001b[39m shared_library = \u001b[43mcompiler\u001b[49m\u001b[43m.\u001b[49m\u001b[43mconfigure_and_compile\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprogram_folder\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msdfg\u001b[49m\u001b[43m.\u001b[49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2398\u001b[39m \u001b[38;5;66;03m# If provided, save output to path or filename\u001b[39;00m\n\u001b[32m 2399\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m output_file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/master-thesis/dace/dace/codegen/compiler.py:254\u001b[39m, in \u001b[36mconfigure_and_compile\u001b[39m\u001b[34m(program_folder, program_name, output_stream)\u001b[39m\n\u001b[32m 252\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m cgx.CompilationError(\u001b[33m'\u001b[39m\u001b[33mCompiler failure\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m 253\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m254\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m cgx.CompilationError(\u001b[33m'\u001b[39m\u001b[33mCompiler failure:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m'\u001b[39m + ex.output)\n\u001b[32m 256\u001b[39m shared_library_path = os.path.join(build_folder, \u001b[33m\"\u001b[39m\u001b[33mlib\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[33m.\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[33m\"\u001b[39m.format(program_name,\n\u001b[32m 257\u001b[39m Config.get(\u001b[33m'\u001b[39m\u001b[33mcompiler\u001b[39m\u001b[33m'\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mlibrary_extension\u001b[39m\u001b[33m'\u001b[39m)))\n\u001b[32m 259\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m shared_library_path\n", - "\u001b[31mCompilationError\u001b[39m: Compiler failure:\n[ 16%] \u001b[32mBuilding CXX object CMakeFiles/kernel.dir/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cpu/kernel.cpp.o\u001b[0m\nIn file included from /home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/dace.h:14,\n from /home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cpu/kernel.cpp:2:\n/home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/types.h: In constructor ‘dace::half::half(float)’:\n/home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/types.h:101:28: warning: dereferencing type-punned pointer will break strict-aliasing rules [-Wstrict-aliasing]\n 101 | uint32_t x = *((uint32_t*)&f);\n | ~^~~~~~~~~~~~~~\n[ 33%] \u001b[32mBuilding CUDA object 
CMakeFiles/kernel.dir/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu.o\u001b[0m\n/home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/../../../external/moodycamel/concurrentqueue.h(3599): warning #68-D: integer conversion resulted in a change of sign\nRemark: The warnings can be suppressed with \"-diag-suppress \"\n\n/home/berkay/master-thesis/dace/dace/codegen/../runtime/include/dace/../../../external/moodycamel/concurrentqueue.h(3607): warning #68-D: integer conversion resulted in a change of sign\n\n/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu(97): error: no instance of function template \"dace::GlobalToGlobal1D\" matches the argument list\n argument types are: (const double *, int, const double *__restrict__)\n\n/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu(99): error: no instance of function template \"dace::GlobalToGlobal1D\" matches the argument list\n argument types are: (const double *, int, const double *__restrict__)\n\n2 errors detected in the compilation of \"/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu\".\ngmake[2]: *** [CMakeFiles/kernel.dir/build.make:94: CMakeFiles/kernel.dir/home/berkay/master-thesis/dace/berkay_workpace/scratch/thesis_related/.dacecache/kernel/src/cuda/kernel_cuda.cu.o] Error 2\ngmake[1]: *** [CMakeFiles/Makefile2:90: CMakeFiles/kernel.dir/all] Error 2\ngmake: *** [Makefile:91: all] Error 2\n" - ] - } - ], - "source": [ - "sdfg.compile()" - ] - }, - { - "cell_type": "markdown", - "id": "95ba7c6b", - "metadata": {}, - "source": [ - "### ❓ Why does this fail?\n", - "\n", - "The error is a **compilation failure** when calling the function template `dace::GlobalToGlobal1D`, due to a mismatch in argument types.\n", - "\n", - "The function expects:\n", - "\n", - "```cpp\n", - "(const T* src, int src_xstride, T* dst)\n", - "```\n", - "\n", - "But in the following example:\n", - "\n", - "```cpp\n", - "dace::GlobalToGlobal1D(A + i, 1, A);\n", - "```\n", - "\n", - "both the **source** (`A + i`) and **destination** (`A`) point to the **same array**, just at different locations.\n", - "\n", - "---\n", - "\n", - "### 🧠 Why is this a problem?\n", - "\n", - "1. **Wrong `const` deduction** \n", - " The old code generator mistakenly marks `A` as `const`, even though it is used as the **destination**.\n", - "\n", - "2. **Missing overloads** \n", - " The template should allow for cases where the destination is not `const`. 
Overloading should be used to handle this properly. Even if the current behavior is not \n", -    "   strictly an error, supporting a non-const destination is good practice.\n" -   ] -  } - ], - "metadata": { -  "kernelspec": { -   "display_name": "dace_env", -   "language": "python", -   "name": "python3" -  }, -  "language_info": { -   "codemirror_mode": { -    "name": "ipython", -    "version": 3 -   }, -   "file_extension": ".py", -   "mimetype": "text/x-python", -   "name": "python", -   "nbconvert_exporter": "python", -   "pygments_lexer": "ipython3", -   "version": "3.12.3" -  } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/berkay_workpace/scratch/threadblockPass/simple1dExamplye.ipynb b/berkay_workpace/scratch/threadblockPass/simple1dExamplye.ipynb deleted file mode 100644 index eaa35d1491..0000000000 --- a/berkay_workpace/scratch/threadblockPass/simple1dExamplye.ipynb +++ /dev/null @@ -1,129 +0,0 @@ -{ - "cells": [ -  { -   "cell_type": "code", -   "execution_count": 1, -   "id": "029f8a65", -   "metadata": {}, -   "outputs": [ -    { -     "data": { -      "text/html": [ -       "\n" -      ], -      "text/plain": [ -       "" -      ] -     }, -     "metadata": {}, -     "output_type": "display_data" -    } -   ], -   "source": [ -    "import dace\n", -    "import cupy as cp" -   ] -  }, -  { -   "cell_type": "code", -   "execution_count": 2, -   "id": "6a04e64f", -   "metadata": {}, -   "outputs": [ -    { -     "data": { -      "text/html": [ -       "\n", -       "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (vector_copy_strides1d)" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "N = dace.symbol('N')\n", - "s = 33\n", - "@dace.program\n", - "def vector_copy_strides1d(A: dace.uint32[N] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[N] @ dace.dtypes.StorageType.GPU_Global):\n", - " for i in dace.map[0:N:s] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " A[i] = B[i]\n", - "\n", - "sdfg = vector_copy_strides1d.to_sdfg()\n", - "sdfg\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "8bbd2799", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "SUCCESS: A matches the expected result.\n" - ] - } - ], - "source": [ - "# Example input \n", - "n = 69\n", - "A = cp.zeros((n,), dtype=cp.uint32)\n", - "B = cp.arange(0, n, dtype=cp.uint32)\n", - "\n", - "\n", - "# Strided copy from B to A\n", - "sdfg(A=A, B=B, N=n)\n", - "\n", - "# Verify correctness\n", - "expected = cp.zeros((n,), dtype=cp.uint32)\n", - "expected[::s] = cp.arange(0, n, dtype=cp.uint32)[::s]\n", - "if cp.array_equal(A, expected):\n", - " print(\"\\n\\nSUCCESS: A matches the expected result.\")\n", - "else:\n", - " print(\"\\n\\nERROR: A does not match the expected result.\")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "dace_env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/berkay_workpace/scratch/threadblockPass/simple2dExample.ipynb b/berkay_workpace/scratch/threadblockPass/simple2dExample.ipynb deleted file mode 100644 index 510f81e74e..0000000000 --- a/berkay_workpace/scratch/threadblockPass/simple2dExample.ipynb +++ /dev/null @@ -1,142 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "f11daa92", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import dace\n", - "import cupy as cp" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "827c1a5d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (vector_copy_strides2d)" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "N = dace.symbol('N')\n", - "M = dace.symbol('M')\n", - "sN = 33\n", - "sM = 21\n", - "\n", - "@dace.program\n", - "def vector_copy_strides2d(A: dace.uint32[N, M] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[N, M] @ dace.dtypes.StorageType.GPU_Global):\n", - " for i, j in dace.map[0:N:sN, 0:M:sM] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " A[i, j] = B[i, j]\n", - "\n", - "sdfg = vector_copy_strides2d.to_sdfg()\n", - "sdfg" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "20d36b0b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "SUCCESS: A matches the expected result.\n" - ] - } - ], - "source": [ - "# Example input \n", - "n = 35\n", - "m = 43\n", - "A = cp.zeros((n, m,), dtype=cp.uint32)\n", - "B = cp.ones((n, m,), dtype=cp.uint32)\n", - "\n", - "\n", - "# Strided copy from B to A\n", - "sdfg(A=A, B=B, N=n, M=m)\n", - "\n", - "\n", - "# Verify correctness for 2D strided copy\n", - "expected = cp.zeros((n, m), dtype=cp.uint32)\n", - "expected[::sN, ::sM] = cp.ones((n, m,), dtype=cp.uint32)[::sN, ::sM]\n", - "if (A == expected).all():\n", - " print(\"\\n\\nSUCCESS: A matches the expected result.\")\n", - "else:\n", - " print(\"\\n\\nERROR: A does not match the expected result.\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "700d0007", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "dace_env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/berkay_workpace/scratch/threadblockPass/simple4dExample.ipynb b/berkay_workpace/scratch/threadblockPass/simple4dExample.ipynb deleted file mode 100644 index 78ce88ed03..0000000000 --- a/berkay_workpace/scratch/threadblockPass/simple4dExample.ipynb +++ /dev/null @@ -1,143 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "f5ba4b8b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import dace\n", - "import cupy as cp" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "fca45bfa", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "" - ], - "text/plain": [ - "SDFG (vector_copy_strides4d)" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "N = dace.symbol(\"N\")\n", - "M = dace.symbol(\"M\")\n", - "J = dace.symbol(\"J\")\n", - "K = dace.symbol(\"K\")\n", - "\n", - "sN = 7\n", - "sM = 2\n", - "sJ = 5\n", - "sK = 8\n", - "\n", - "@dace.program\n", - "def vector_copy_strides4d(A: dace.uint32[N, M, J, K] @ dace.dtypes.StorageType.GPU_Global, B: dace.uint32[N, M, J, K] @ dace.dtypes.StorageType.GPU_Global):\n", - " for a, b, c, d in dace.map[0:N:sN, 0:M:sM, 0:J:sJ, 0:K:sK] @ dace.dtypes.ScheduleType.GPU_Device:\n", - " A[a, b, c, d] = B[a, b, c, d]\n", - "\n", - "sdfg = vector_copy_strides4d.to_sdfg()\n", - "sdfg\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "2678e814", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "SUCCESS: A matches the expected result.\n" - ] - } - ], - "source": [ - "# Example input \n", - "n = 12\n", - "m = 14\n", - "j = 11\n", - "k = 23\n", - "\n", - "B = cp.ones((n, m, j, k, ), dtype=cp.uint32)\n", - "A = cp.zeros((n, m, j, k, ), dtype=cp.uint32)\n", - "\n", - "\n", - "# Strided copy from B to A\n", - "sdfg(A=A, B=B, N=n, M=m, J=j, K=k)\n", - "\n", - "\n", - "\n", - "# Verify correctness for 2D strided copy\n", - "expected = cp.zeros((n, m, j, k, ), dtype=cp.uint32)\n", - "expected[::sN, ::sM, ::sJ, ::sK] = 1\n", - "if (A == expected).all():\n", - " print(\"\\n\\nSUCCESS: A matches the expected result.\")\n", - "else:\n", - " print(\"\\n\\nERROR: A does not match the expected result.\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "dace_env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/berkay_workpace/scratch/yakups_examples/dbuff_related/double_buffering_async.sdfg b/berkay_workpace/scratch/yakups_examples/dbuff_related/double_buffering_async.sdfg deleted file mode 100644 index bbbd88132c..0000000000 --- a/berkay_workpace/scratch/yakups_examples/dbuff_related/double_buffering_async.sdfg +++ /dev/null @@ -1,5302 +0,0 @@ -{ - "type": "SDFG", - "attributes": { - "name": "kernel_double_buffered_async", - "arg_names": [ - "A", - "B", - "C" - ], - "_arrays": { - "A": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "B": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "C": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "__tmp3": { - "type": "Scalar", - "attributes": { - "dtype": "float64", - "shape": [ - "1" - ], - "transient": true, - "debuginfo": null - } - }, - "shr_B": { - "type": "Array", - "attributes": { - "strides": [ - "256", - "1" - ], - 
"total_size": "512", - "offset": [ - "0", - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "2", - "256" - ], - "transient": true, - "storage": "GPU_Shared", - "debuginfo": null - } - }, - "shr_A": { - "type": "Array", - "attributes": { - "strides": [ - "256", - "1" - ], - "total_size": "512", - "offset": [ - "0", - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "2", - "256" - ], - "transient": true, - "storage": "GPU_Shared", - "debuginfo": null - } - } - }, - "symbols": { - "N": "int64" - }, - "global_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "init_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "exit_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "orig_sdfg": { - "type": "SDFG", - "attributes": { - "name": "kernel_double_buffered_async", - "arg_names": [ - "A", - "B", - "C" - ], - "_arrays": { - "A": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "B": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "C": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "__tmp3": { - "type": "Scalar", - "attributes": { - "dtype": "float64", - "shape": [ - "1" - ], - "transient": true, - "debuginfo": null - } - }, - "shr_A": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "256", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "256" - ], - "transient": true, - "storage": "GPU_Shared", - "debuginfo": null - } - }, - "shr_B": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "256", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "256" - ], - "transient": true, - "storage": "GPU_Shared", - "debuginfo": null - } - } - }, - "symbols": { - "N": "int64" - }, - "global_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "init_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "exit_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 89, - "end_line": 98, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "using_explicit_control_flow": true, - "guid": "d86432fb-dd83-4d68-b1a3-5443e218087b" - }, - "nodes": [ - { - "type": "SDFGState", - "label": "MapState", - "id": 0, - "collapsed": false, - "scope_dict": { - "-1": [ - 0, - 2, - 3, - 4 - ], - "0": [ - 1, - 5 - ], - "5": [ - 6, - 7, - 12, - 13 - ], - "7": [ - 8, - 9, - 10, - 11 - ] - }, - "nodes": [ - { - "type": "MapEntry", - "label": "kernel_95[i=0:N:512]", - "attributes": { - "label": "kernel_95", - "params": [ - "i" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "512", - "tile": "1" - } - ] - }, - "schedule": "GPU_Device", - "debuginfo": { - "type": "DebugInfo", - "start_line": 95, - "end_line": 95, - 
"start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "in_connectors": { - "IN_A": null, - "IN_B": null - }, - "out_connectors": { - "OUT_A": null, - "OUT_B": null - }, - "guid": "4748988f-5a8b-4bb9-979c-22cba4912dc0" - }, - "id": 0, - "scope_entry": null, - "scope_exit": "1" - }, - { - "type": "MapExit", - "label": "kernel_95[i=0:N:512]", - "attributes": { - "in_connectors": { - "IN_C": null - }, - "out_connectors": { - "OUT_C": null - }, - "guid": "6b03359b-afbe-4cc4-92de-ebedc84f829b" - }, - "id": 1, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "AccessNode", - "label": "A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 95, - "end_line": 95, - "start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "data": "A", - "guid": "10bb78f3-f3ce-46ff-abd1-b5d1e8e19e17" - }, - "id": 2, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 95, - "end_line": 95, - "start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "data": "B", - "guid": "b1b04e6f-26af-4e3a-bb90-ed435f77a0df" - }, - "id": 3, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "C", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 95, - "end_line": 95, - "start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "data": "C", - "guid": "c93c561b-1a7f-42b5-af9d-6bdb9cbb586b" - }, - "id": 4, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "MapEntry", - "label": "kernel_95_4_96[k=0:2]", - "attributes": { - "label": "kernel_95_4_96", - "params": [ - "k" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "1", - "step": "1", - "tile": "1" - } - ] - }, - "schedule": "Sequential", - "debuginfo": { - "type": "DebugInfo", - "start_line": 96, - "end_line": 96, - "start_column": 8, - "end_column": 8, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "in_connectors": { - "IN___tmp_98_37_r_in_from_1_0_in_from_1_0": null, - "IN___tmp_98_58_r_in_from_1_0_in_from_1_0": null - }, - "out_connectors": { - "OUT___tmp_98_37_r_in_from_1_0_in_from_1_0": null, - "OUT___tmp_98_58_r_in_from_1_0_in_from_1_0": null - }, - "guid": "93ff2336-bf88-4205-9416-d8d5ada0fa43" - }, - "id": 5, - "scope_entry": "0", - "scope_exit": "6" - }, - { - "type": "MapExit", - "label": "kernel_95_4_96[k=0:2]", - "attributes": { - "in_connectors": { - "IN___tmp_98_16_w_out_of_1_1_out_of_1_1": null - }, - "out_connectors": { - "OUT___tmp_98_16_w_out_of_1_1_out_of_1_1": null - }, - "guid": "d36fa430-b62a-4ccf-8aab-69ea2476dff9" - }, - "id": 6, - "scope_entry": "5", - "scope_exit": "6" - }, - { - "type": "MapEntry", - "label": "kernel_95_4_96_8_97[j=0:256]", - "attributes": { - "label": "kernel_95_4_96_8_97", - "params": [ - "j" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "schedule": "GPU_ThreadBlock", - "debuginfo": { - "type": "DebugInfo", - "start_line": 97, - "end_line": 97, - "start_column": 12, - "end_column": 12, - "filename": 
"/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "in_connectors": { - "IN___tmp_98_37_r_in_from_1_0": null, - "IN___tmp_98_58_r_in_from_1_0": null - }, - "out_connectors": { - "OUT___tmp_98_37_r_in_from_1_0": null, - "OUT___tmp_98_58_r_in_from_1_0": null - }, - "guid": "cbaaa2f3-f301-4716-a836-6eb53ee8698f" - }, - "id": 7, - "scope_entry": "5", - "scope_exit": "8" - }, - { - "type": "MapExit", - "label": "kernel_95_4_96_8_97[j=0:256]", - "attributes": { - "in_connectors": { - "IN___tmp_98_16_w_out_of_1_1": null - }, - "out_connectors": { - "OUT___tmp_98_16_w_out_of_1_1": null - }, - "guid": "935f0df2-f1ce-4886-9afe-4eaef71e5b09" - }, - "id": 8, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "Tasklet", - "label": "_Add_", - "attributes": { - "code": { - "string_data": "__out = (__in1 + __in2)", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 98, - "end_line": 98, - "start_column": 72, - "end_column": 72, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "label": "_Add_", - "in_connectors": { - "__in1": null, - "__in2": null - }, - "out_connectors": { - "__out": null - }, - "guid": "92adee05-eb8b-4d37-8aa3-85e480d724e9" - }, - "id": 9, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "AccessNode", - "label": "__tmp3", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 98, - "end_line": 98, - "start_column": 72, - "end_column": 72, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "data": "__tmp3", - "guid": "9bcc8c35-eea5-4862-a6d8-3228fca200f2" - }, - "id": 10, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "Tasklet", - "label": "assign_98_16", - "attributes": { - "code": { - "string_data": "__out = __inp", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 98, - "end_line": 98, - "start_column": 30, - "end_column": 30, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "label": "assign_98_16", - "in_connectors": { - "__inp": null - }, - "out_connectors": { - "__out": null - }, - "guid": "64433f94-2825-4b47-a308-361d508d77f6" - }, - "id": 11, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "AccessNode", - "label": "shr_A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 54, - "end_line": 54, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "data": "shr_A", - "guid": "c7684bbd-ea58-48d4-a3e4-59e81016df47" - }, - "id": 12, - "scope_entry": "5", - "scope_exit": "6" - }, - { - "type": "AccessNode", - "label": "shr_B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 54, - "end_line": 54, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "data": "shr_B", - "guid": "de996230-3c3d-4c05-8527-257ceb9dd045" - }, - "id": 13, - "scope_entry": "5", - "scope_exit": "6" - } - ], - "edges": [ - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512*ceiling(N/512)", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "65758726-87ca-42be-89ba-a9bb75ddebe7", - "src_subset": { - "type": 
"Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "512*ceiling(N/512)" - } - } - }, - "src": "2", - "dst": "0", - "dst_connector": "IN_A", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512*ceiling(N/512)", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "17dcfd79-4fcc-4314-a7c1-b845be9c77c0", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "512*ceiling(N/512)" - } - } - }, - "src": "3", - "dst": "0", - "dst_connector": "IN_B", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "123eace6-f607-4d9c-8b7a-b1d91d7cd033", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "12", - "dst": "7", - "dst_connector": "IN___tmp_98_37_r_in_from_1_0", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_B", - "debuginfo": null, - "guid": "197f512b-c4e2-4b15-a7fa-f31a8f1f3863", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "13", - "dst": "7", - "dst_connector": "IN___tmp_98_58_r_in_from_1_0", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp3", - "debuginfo": null, - "guid": "d97379d3-04f1-4d25-93e4-29e3953be24d", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "10", - "dst": "11", - "dst_connector": "__inp", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "9ac193e2-3c72-48f7-9ef0-f50495152775", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "512" - } - } - }, - "src": "0", - "dst": "5", - "dst_connector": "IN___tmp_98_37_r_in_from_1_0_in_from_1_0", - "src_connector": 
"OUT_A" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "30da7eee-07be-4d97-9fe7-6385057477b6", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "512" - } - } - }, - "src": "0", - "dst": "5", - "dst_connector": "IN___tmp_98_58_r_in_from_1_0_in_from_1_0", - "src_connector": "OUT_B" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512*ceiling(N/512)", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "1f0ccad2-3828-4366-9a82-2847c3398b35", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "512*ceiling(N/512)" - } - } - }, - "src": "1", - "dst": "4", - "dst_connector": null, - "src_connector": "OUT_C" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "a92efac1-2678-4303-aa04-5be51f196e25", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "256" - } - } - }, - "src": "8", - "dst": "6", - "dst_connector": "IN___tmp_98_16_w_out_of_1_1_out_of_1_1", - "src_connector": "OUT___tmp_98_16_w_out_of_1_1" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "bf7f1dc2-7ac0-4507-8aef-a67684cc5c44", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "512" - } - } - }, - "src": "6", - "dst": "1", - "dst_connector": "IN_C", - "src_connector": "OUT___tmp_98_16_w_out_of_1_1_out_of_1_1" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "5428f419-c4e3-4cf5-9a13-d42edd7f7fb2", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "7", - "dst": "9", - "dst_connector": "__in1", - "src_connector": "OUT___tmp_98_37_r_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - 
"type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "other_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "eb9ff732-544b-4c5b-80cd-aa0b9dc83fa1", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "5", - "dst": "12", - "dst_connector": null, - "src_connector": "OUT___tmp_98_37_r_in_from_1_0_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_B", - "debuginfo": null, - "guid": "3478573b-f7f4-4441-a7a3-bf3ebf93e64b", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "7", - "dst": "9", - "dst_connector": "__in2", - "src_connector": "OUT___tmp_98_58_r_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "other_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "0ce32b4d-333f-459c-a576-2d2e79303500", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "5", - "dst": "13", - "dst_connector": null, - "src_connector": "OUT___tmp_98_58_r_in_from_1_0_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp3", - "debuginfo": null, - "guid": "0c0703e8-1a00-4cf7-98d6-7047af804d38", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "9", - "dst": "10", - "dst_connector": null, - "src_connector": "__out" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i + j + 256*k", - "end": "i + j + 256*k", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "176a1ba4-31ef-4819-8186-173e826e42ac", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i + j + 256*k", - "end": "i + j + 256*k", - "step": "1", - "tile": "1" - } - ] 
- }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "11", - "dst": "8", - "dst_connector": "IN___tmp_98_16_w_out_of_1_1", - "src_connector": "__out" - } - ], - "attributes": { - "guid": "8d9c9dc6-b7fe-4305-9b0e-f344e6cb96bc", - "executions": "1", - "dynamic_executions": false - } - } - ], - "edges": [], - "collapsed": false, - "label": "", - "id": null, - "cfg_list_id": 0, - "start_block": 0, - "dace_version": "1.0.0" - }, - "transformation_hist": [ - { - "type": "PatternTransformation", - "transformation": "MultipleBuffering", - "prefill_cfg_id": 1, - "prefetch_cfg_id": 1, - "synchronous": false, - "_subgraph": { - "0": 0 - } - } - ], - "debuginfo": { - "type": "DebugInfo", - "start_line": 89, - "end_line": 98, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "using_explicit_control_flow": true, - "guid": "d86432fb-dd83-4d68-b1a3-5443e218087b", - "hash": "6272783aa43803c69147bfe8f7b1459cb863365e3e0452f06b3881ab8a0997ee" - }, - "nodes": [ - { - "type": "SDFGState", - "label": "MapState", - "id": 0, - "collapsed": false, - "scope_dict": { - "-1": [ - 0, - 2, - 3, - 4 - ], - "0": [ - 1, - 5, - 12, - 13, - 14 - ], - "5": [ - 6, - 7, - 15, - 16, - 17, - 18, - 19 - ], - "7": [ - 8, - 9, - 10, - 11 - ] - }, - "nodes": [ - { - "type": "MapEntry", - "label": "kernel_95[i=0:N:512]", - "attributes": { - "label": "kernel_95", - "params": [ - "i" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "512", - "tile": "1" - } - ] - }, - "schedule": "GPU_Device", - "debuginfo": { - "type": "DebugInfo", - "start_line": 95, - "end_line": 95, - "start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "in_connectors": { - "IN_A": null, - "IN_B": null - }, - "out_connectors": { - "OUT_A": null, - "OUT_B": null - }, - "guid": "4748988f-5a8b-4bb9-979c-22cba4912dc0" - }, - "id": 0, - "scope_entry": null, - "scope_exit": "1" - }, - { - "type": "MapExit", - "label": "kernel_95[i=0:N:512]", - "attributes": { - "in_connectors": { - "IN_C": null - }, - "out_connectors": { - "OUT_C": null - }, - "guid": "6b03359b-afbe-4cc4-92de-ebedc84f829b" - }, - "id": 1, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "AccessNode", - "label": "A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 95, - "end_line": 95, - "start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "data": "A", - "guid": "e2fa79c1-e57c-4267-8019-43579c1c1c09" - }, - "id": 2, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 95, - "end_line": 95, - "start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "data": "B", - "guid": "68049d0f-8940-43a2-8509-f9a028be8f46" - }, - "id": 3, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "C", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 95, - "end_line": 95, - "start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "data": "C", - "guid": "bd228903-54ad-4b1c-b473-2eed986516fd" - }, - "id": 4, - "scope_entry": null, - "scope_exit": null - 
}, - { - "type": "MapEntry", - "label": "kernel_95_4_96[k=0:2]", - "attributes": { - "label": "kernel_95_4_96", - "params": [ - "k" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "1", - "step": "1", - "tile": "1" - } - ] - }, - "schedule": "Sequential", - "debuginfo": { - "type": "DebugInfo", - "start_line": 96, - "end_line": 96, - "start_column": 8, - "end_column": 8, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "in_connectors": { - "IN___tmp_98_37_r_in_from_1_0_in_from_1_0": null, - "IN___tmp_98_58_r_in_from_1_0_in_from_1_0": null, - "IN_prefetch_A": null, - "IN_prefetch_B": null - }, - "out_connectors": { - "OUT___tmp_98_37_r_in_from_1_0_in_from_1_0": null, - "OUT___tmp_98_58_r_in_from_1_0_in_from_1_0": null, - "OUT_prefetch_A": null, - "OUT_prefetch_B": null - }, - "guid": "93ff2336-bf88-4205-9416-d8d5ada0fa43" - }, - "id": 5, - "scope_entry": "0", - "scope_exit": "6" - }, - { - "type": "MapExit", - "label": "kernel_95_4_96[k=0:2]", - "attributes": { - "in_connectors": { - "IN___tmp_98_16_w_out_of_1_1_out_of_1_1": null - }, - "out_connectors": { - "OUT___tmp_98_16_w_out_of_1_1_out_of_1_1": null - }, - "guid": "d36fa430-b62a-4ccf-8aab-69ea2476dff9" - }, - "id": 6, - "scope_entry": "5", - "scope_exit": "6" - }, - { - "type": "MapEntry", - "label": "kernel_95_4_96_8_97[j=0:256]", - "attributes": { - "label": "kernel_95_4_96_8_97", - "params": [ - "j" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "schedule": "GPU_ThreadBlock", - "debuginfo": { - "type": "DebugInfo", - "start_line": 97, - "end_line": 97, - "start_column": 12, - "end_column": 12, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "in_connectors": { - "IN___tmp_98_37_r_in_from_1_0": null, - "IN___tmp_98_58_r_in_from_1_0": null - }, - "out_connectors": { - "OUT___tmp_98_37_r_in_from_1_0": null, - "OUT___tmp_98_58_r_in_from_1_0": null - }, - "guid": "cbaaa2f3-f301-4716-a836-6eb53ee8698f" - }, - "id": 7, - "scope_entry": "5", - "scope_exit": "8" - }, - { - "type": "MapExit", - "label": "kernel_95_4_96_8_97[j=0:256]", - "attributes": { - "in_connectors": { - "IN___tmp_98_16_w_out_of_1_1": null - }, - "out_connectors": { - "OUT___tmp_98_16_w_out_of_1_1": null - }, - "guid": "935f0df2-f1ce-4886-9afe-4eaef71e5b09" - }, - "id": 8, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "Tasklet", - "label": "_Add_", - "attributes": { - "code": { - "string_data": "__out = (__in1 + __in2)", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 98, - "end_line": 98, - "start_column": 72, - "end_column": 72, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "label": "_Add_", - "in_connectors": { - "__in1": null, - "__in2": null - }, - "out_connectors": { - "__out": null - }, - "guid": "92adee05-eb8b-4d37-8aa3-85e480d724e9" - }, - "id": 9, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "AccessNode", - "label": "__tmp3", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 98, - "end_line": 98, - "start_column": 72, - "end_column": 72, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "data": "__tmp3", - "guid": "480998aa-e861-4092-a043-5639a355c629" - }, - "id": 10, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "Tasklet", - "label": "assign_98_16", - 
"attributes": { - "code": { - "string_data": "__out = __inp", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 98, - "end_line": 98, - "start_column": 30, - "end_column": 30, - "filename": "/home/primrose/Work/dace/tests/transformations/multiple_buffering_test.py" - }, - "label": "assign_98_16", - "in_connectors": { - "__inp": null - }, - "out_connectors": { - "__out": null - }, - "guid": "64433f94-2825-4b47-a308-361d508d77f6" - }, - "id": 11, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "NestedSDFG", - "label": "pipeline_prefill_nsdfg_0", - "attributes": { - "sdfg": { - "type": "SDFG", - "attributes": { - "name": "pipeline_prefill_main_sdfg_0", - "_arrays": { - "B": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "shr_B": { - "type": "Array", - "attributes": { - "strides": [ - "256", - "1" - ], - "total_size": "512", - "offset": [ - "0", - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "2", - "256" - ], - "storage": "GPU_Shared", - "debuginfo": null - } - }, - "A": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "shr_A": { - "type": "Array", - "attributes": { - "strides": [ - "256", - "1" - ], - "total_size": "512", - "offset": [ - "0", - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "2", - "256" - ], - "storage": "GPU_Shared", - "debuginfo": null - } - } - }, - "symbols": { - "N": "int64", - "i": "int64" - }, - "global_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "init_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "exit_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "guid": "f94e5217-c4f1-46a4-afdc-da0185f8f45b" - }, - "nodes": [ - { - "type": "LoopRegion", - "attributes": { - "update_statement": { - "string_data": "pipe_stage = (pipe_stage + 1)", - "language": "Python" - }, - "init_statement": { - "string_data": "pipe_stage = 0", - "language": "Python" - }, - "loop_condition": { - "string_data": "(pipe_stage < 1)", - "language": "Python" - }, - "loop_variable": "pipe_stage", - "guid": "1953b28b-8f55-45ac-9f33-168fb08dca3e" - }, - "nodes": [ - { - "type": "ConditionalBlock", - "attributes": { - "guid": "15bc7f27-eb3b-4991-ab35-89220f06f406" - }, - "nodes": [ - { - "type": "ControlFlowRegion", - "attributes": { - "guid": "0c174dbd-474e-4634-b190-491e0c5ab8b2" - }, - "nodes": [ - { - "type": "SDFGState", - "label": "pipeline_prefill_state_0", - "id": 0, - "collapsed": false, - "scope_dict": { - "-1": [ - 0, - 1, - 2, - 3, - 4, - 5 - ] - }, - "nodes": [ - { - "type": "AccessNode", - "label": "B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 856, - "end_line": 856, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "B", - "guid": "7a229696-1b0e-4cc3-bfba-b0d5526ebb0d" - }, - "id": 0, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "shr_B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 857, - "end_line": 857, - "start_column": 0, - "end_column": 0, - "filename": 
"/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "shr_B", - "guid": "948f2521-89e6-463d-b326-4f39fc82c7a0" - }, - "id": 1, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 856, - "end_line": 856, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "A", - "guid": "5bdcde52-e626-4979-a389-b73ba6e6f927" - }, - "id": 2, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "shr_A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 857, - "end_line": 857, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "shr_A", - "guid": "355f3366-55d5-4d29-a305-dfba28639f89" - }, - "id": 3, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "Tasklet", - "label": "producer_acquire_pipe", - "attributes": { - "code": { - "string_data": "pipe.consumer_acquire();", - "language": "CPP" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 380, - "end_line": 380, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "side_effects": true, - "label": "producer_acquire_pipe", - "guid": "c478ba2a-9f56-45f7-942f-6a6c9153a1d0" - }, - "id": 4, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "Tasklet", - "label": "producer_commit_pipe", - "attributes": { - "code": { - "string_data": "pipe.consumer_commit();", - "language": "CPP" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 390, - "end_line": 390, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "side_effects": true, - "label": "producer_commit_pipe", - "guid": "fb90138b-abb2-4752-9bc4-d9d55324886c" - }, - "id": 5, - "scope_entry": null, - "scope_exit": null - } - ], - "edges": [ - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "256*pipe_stage + i", - "end": "256*pipe_stage + i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "other_subset": { - "type": "Range", - "ranges": [ - { - "start": "pipe_stage", - "end": "pipe_stage", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "07276b3c-739f-46ae-bb20-390a445ca75c", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "256*pipe_stage + i", - "end": "256*pipe_stage + i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "pipe_stage", - "end": "pipe_stage", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "0", - "dst": "1", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "256*pipe_stage + i", - "end": "256*pipe_stage + i + 255", - "step": "1", - "tile": "1" - } - ] - 
}, - "other_subset": { - "type": "Range", - "ranges": [ - { - "start": "pipe_stage", - "end": "pipe_stage", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "07d7acea-e0db-45d9-8c2f-246296f71ffa", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "256*pipe_stage + i", - "end": "256*pipe_stage + i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "pipe_stage", - "end": "pipe_stage", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "2", - "dst": "3", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "0f9c8337-a0df-470c-b984-6ffe36730ec9", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "4", - "dst": "0", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "c57cf2c0-8238-443e-b743-e264dc9723db", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "1", - "dst": "5", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "a0f2b6f9-dbdd-4d37-9b0e-e75a74f4621b", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "4", - "dst": "2", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "ada740d2-5199-4f0d-a75e-e9327ea681f7", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "3", - "dst": "5", - "dst_connector": null, - "src_connector": null - } - ], - "attributes": { - "guid": "3a623ac0-f8e9-4b15-9f7a-eecd1ccfadb3" - } - } - ], - "edges": [], - "collapsed": false, - "label": "pipeline_prefill_test_0", - "id": 0, - "cfg_list_id": 4, - "start_block": 0 - } - ], - "edges": [], - "collapsed": false, - "label": "pipeline_prefill_cond_0", - "id": 0, - "cfg_list_id": 3, - "start_block": null, - "branches": [ - [ - { - "string_data": "(pipe_stage <= 1)", - "language": "Python" - }, - { - "type": "ControlFlowRegion", - "attributes": { - "guid": "0c174dbd-474e-4634-b190-491e0c5ab8b2" - }, - "nodes": [ - { - "type": "SDFGState", - "label": "pipeline_prefill_state_0", - "id": 0, - "collapsed": false, - "scope_dict": { - "-1": [ - 0, - 1, - 2, - 3, - 4, - 5 - ] - }, - "nodes": [ - { - "type": "AccessNode", - "label": "B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 856, - "end_line": 856, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "B", - "guid": "7a229696-1b0e-4cc3-bfba-b0d5526ebb0d" - }, - "id": 0, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "shr_B", - "attributes": { - 
"debuginfo": { - "type": "DebugInfo", - "start_line": 857, - "end_line": 857, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "shr_B", - "guid": "948f2521-89e6-463d-b326-4f39fc82c7a0" - }, - "id": 1, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 856, - "end_line": 856, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "A", - "guid": "5bdcde52-e626-4979-a389-b73ba6e6f927" - }, - "id": 2, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "shr_A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 857, - "end_line": 857, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "shr_A", - "guid": "355f3366-55d5-4d29-a305-dfba28639f89" - }, - "id": 3, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "Tasklet", - "label": "producer_acquire_pipe", - "attributes": { - "code": { - "string_data": "pipe.consumer_acquire();", - "language": "CPP" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 380, - "end_line": 380, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "side_effects": true, - "label": "producer_acquire_pipe", - "guid": "c478ba2a-9f56-45f7-942f-6a6c9153a1d0" - }, - "id": 4, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "Tasklet", - "label": "producer_commit_pipe", - "attributes": { - "code": { - "string_data": "pipe.consumer_commit();", - "language": "CPP" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 390, - "end_line": 390, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "side_effects": true, - "label": "producer_commit_pipe", - "guid": "fb90138b-abb2-4752-9bc4-d9d55324886c" - }, - "id": 5, - "scope_entry": null, - "scope_exit": null - } - ], - "edges": [ - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "256*pipe_stage + i", - "end": "256*pipe_stage + i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "other_subset": { - "type": "Range", - "ranges": [ - { - "start": "pipe_stage", - "end": "pipe_stage", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "07276b3c-739f-46ae-bb20-390a445ca75c", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "256*pipe_stage + i", - "end": "256*pipe_stage + i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "pipe_stage", - "end": "pipe_stage", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "0", - "dst": "1", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": 
"Range", - "ranges": [ - { - "start": "256*pipe_stage + i", - "end": "256*pipe_stage + i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "other_subset": { - "type": "Range", - "ranges": [ - { - "start": "pipe_stage", - "end": "pipe_stage", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "07d7acea-e0db-45d9-8c2f-246296f71ffa", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "256*pipe_stage + i", - "end": "256*pipe_stage + i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "pipe_stage", - "end": "pipe_stage", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "2", - "dst": "3", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "0f9c8337-a0df-470c-b984-6ffe36730ec9", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "4", - "dst": "0", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "c57cf2c0-8238-443e-b743-e264dc9723db", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "1", - "dst": "5", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "a0f2b6f9-dbdd-4d37-9b0e-e75a74f4621b", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "4", - "dst": "2", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "ada740d2-5199-4f0d-a75e-e9327ea681f7", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "3", - "dst": "5", - "dst_connector": null, - "src_connector": null - } - ], - "attributes": { - "guid": "3a623ac0-f8e9-4b15-9f7a-eecd1ccfadb3" - } - } - ], - "edges": [], - "collapsed": false, - "label": "pipeline_prefill_test_0", - "id": null, - "cfg_list_id": 4, - "start_block": 0 - } - ] - ] - } - ], - "edges": [], - "collapsed": false, - "label": "pipeline_prefill_loop_0", - "id": 0, - "cfg_list_id": 2, - "start_block": 0 - } - ], - "edges": [], - "collapsed": false, - "label": "", - "id": null, - "cfg_list_id": 1, - "start_block": 0 - }, - "schedule": "Sequential", - "symbol_mapping": { - "N": "N", - "i": "i" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 846, - "end_line": 846, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "label": "pipeline_prefill_nsdfg_0", - "in_connectors": { - "A": null, - "B": null - }, - "out_connectors": { - "shr_A": null, - "shr_B": null - }, - "guid": "4b7bb879-2f87-4b27-ab91-5bff3773b75f" - }, - "id": 12, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "AccessNode", - "label": 
"shr_B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 446, - "end_line": 446, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "shr_B", - "guid": "18dada66-b1ce-4caa-8c94-a57d620d853c" - }, - "id": 13, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "AccessNode", - "label": "shr_A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 446, - "end_line": 446, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "shr_A", - "guid": "a9a91038-4a36-450f-8e1c-199ce6279bb9" - }, - "id": 14, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "NestedSDFG", - "label": "pipeline_prefetch_nsdfg_0", - "attributes": { - "sdfg": { - "type": "SDFG", - "attributes": { - "name": "pipeline_prefetch_main_sdfg_0", - "_arrays": { - "B": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "shr_B": { - "type": "Array", - "attributes": { - "strides": [ - "256", - "1" - ], - "total_size": "512", - "offset": [ - "0", - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "2", - "256" - ], - "storage": "GPU_Shared", - "debuginfo": null - } - }, - "A": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "shr_A": { - "type": "Array", - "attributes": { - "strides": [ - "256", - "1" - ], - "total_size": "512", - "offset": [ - "0", - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "2", - "256" - ], - "storage": "GPU_Shared", - "debuginfo": null - } - } - }, - "symbols": { - "N": "int64", - "i": "int64", - "k": "int32" - }, - "global_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "init_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "exit_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "guid": "1e62ca8f-efd0-4b2b-bd41-f3b62ee3155a" - }, - "nodes": [ - { - "type": "ConditionalBlock", - "attributes": { - "guid": "c702c15e-b328-4ff6-a444-290cf8272a16" - }, - "nodes": [ - { - "type": "ControlFlowRegion", - "attributes": { - "guid": "49120be2-b132-4968-9b1b-e214c4e4a23b" - }, - "nodes": [ - { - "type": "SDFGState", - "label": "pipeline_prefetch_state_0", - "id": 0, - "collapsed": false, - "scope_dict": { - "-1": [ - 0, - 1, - 2, - 3, - 4, - 5 - ] - }, - "nodes": [ - { - "type": "AccessNode", - "label": "B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 924, - "end_line": 924, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "B", - "guid": "c428e17a-c2db-456c-9db3-2e2242a08f43" - }, - "id": 0, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "shr_B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 925, - "end_line": 925, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "shr_B", - "guid": 
"e071dd77-90ce-4f0e-9a5e-64695ac2d70c" - }, - "id": 1, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 924, - "end_line": 924, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "A", - "guid": "278ca990-59cf-4e1b-968a-485487fb0fe2" - }, - "id": 2, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "shr_A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 925, - "end_line": 925, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "shr_A", - "guid": "a5cc31c2-b856-430e-bc9f-0b8f93022b50" - }, - "id": 3, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "Tasklet", - "label": "producer_acquire_pipe", - "attributes": { - "code": { - "string_data": "pipe.consumer_acquire();", - "language": "CPP" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 486, - "end_line": 486, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "side_effects": true, - "label": "producer_acquire_pipe", - "guid": "694dbb32-9d8f-408e-88cd-de8062f963e5" - }, - "id": 4, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "Tasklet", - "label": "producer_commit_pipe", - "attributes": { - "code": { - "string_data": "pipe.consumer_commit();", - "language": "CPP" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 496, - "end_line": 496, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "side_effects": true, - "label": "producer_commit_pipe", - "guid": "ec98c77f-a979-4684-9bd7-744aa5d4bd04" - }, - "id": 5, - "scope_entry": null, - "scope_exit": null - } - ], - "edges": [ - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "256*k + i + 256", - "end": "256*k + i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "other_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k + 1, 2)", - "end": "Mod(k + 1, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "1780aa25-071a-4d0f-bd6b-7f93339acdf3", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "256*k + i + 256", - "end": "256*k + i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k + 1, 2)", - "end": "Mod(k + 1, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "0", - "dst": "1", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "256*k + i + 256", - "end": "256*k + i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "other_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k + 1, 2)", - "end": "Mod(k + 1, 2)", - "step": "1", - 
"tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "1e78b324-bfac-40aa-9418-ee8e10e6a76b", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "256*k + i + 256", - "end": "256*k + i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k + 1, 2)", - "end": "Mod(k + 1, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "2", - "dst": "3", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "9293a358-75c3-47fb-8ebc-73d67cd79f2c", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "4", - "dst": "0", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "35a5a5a9-3beb-4393-8361-9776e95af37f", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "1", - "dst": "5", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "db651f05-1390-4998-bf43-2e7441d9798b", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "4", - "dst": "2", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "5a6b79a5-f34f-4d4f-baca-5f234b66b1ef", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "3", - "dst": "5", - "dst_connector": null, - "src_connector": null - } - ], - "attributes": { - "guid": "842bf787-acb7-47e5-b7ab-b52500c61cdd" - } - } - ], - "edges": [], - "collapsed": false, - "label": "pipeline_prefetch_test_0", - "id": 0, - "cfg_list_id": 7, - "start_block": 0 - } - ], - "edges": [], - "collapsed": false, - "label": "pipeline_prefetch_cond_0", - "id": 0, - "cfg_list_id": 6, - "start_block": null, - "branches": [ - [ - { - "string_data": "((k + 1) <= 1)", - "language": "Python" - }, - { - "type": "ControlFlowRegion", - "attributes": { - "guid": "49120be2-b132-4968-9b1b-e214c4e4a23b" - }, - "nodes": [ - { - "type": "SDFGState", - "label": "pipeline_prefetch_state_0", - "id": 0, - "collapsed": false, - "scope_dict": { - "-1": [ - 0, - 1, - 2, - 3, - 4, - 5 - ] - }, - "nodes": [ - { - "type": "AccessNode", - "label": "B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 924, - "end_line": 924, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "B", - "guid": "c428e17a-c2db-456c-9db3-2e2242a08f43" - }, - "id": 0, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "shr_B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 925, - "end_line": 925, - "start_column": 0, - "end_column": 0, - "filename": 
"/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "shr_B", - "guid": "e071dd77-90ce-4f0e-9a5e-64695ac2d70c" - }, - "id": 1, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 924, - "end_line": 924, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "A", - "guid": "278ca990-59cf-4e1b-968a-485487fb0fe2" - }, - "id": 2, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "shr_A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 925, - "end_line": 925, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "shr_A", - "guid": "a5cc31c2-b856-430e-bc9f-0b8f93022b50" - }, - "id": 3, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "Tasklet", - "label": "producer_acquire_pipe", - "attributes": { - "code": { - "string_data": "pipe.consumer_acquire();", - "language": "CPP" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 486, - "end_line": 486, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "side_effects": true, - "label": "producer_acquire_pipe", - "guid": "694dbb32-9d8f-408e-88cd-de8062f963e5" - }, - "id": 4, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "Tasklet", - "label": "producer_commit_pipe", - "attributes": { - "code": { - "string_data": "pipe.consumer_commit();", - "language": "CPP" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 496, - "end_line": 496, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "side_effects": true, - "label": "producer_commit_pipe", - "guid": "ec98c77f-a979-4684-9bd7-744aa5d4bd04" - }, - "id": 5, - "scope_entry": null, - "scope_exit": null - } - ], - "edges": [ - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "256*k + i + 256", - "end": "256*k + i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "other_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k + 1, 2)", - "end": "Mod(k + 1, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "1780aa25-071a-4d0f-bd6b-7f93339acdf3", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "256*k + i + 256", - "end": "256*k + i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k + 1, 2)", - "end": "Mod(k + 1, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "0", - "dst": "1", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "256*k + i + 256", - "end": "256*k + i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "other_subset": { - 
"type": "Range", - "ranges": [ - { - "start": "Mod(k + 1, 2)", - "end": "Mod(k + 1, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "1e78b324-bfac-40aa-9418-ee8e10e6a76b", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "256*k + i + 256", - "end": "256*k + i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k + 1, 2)", - "end": "Mod(k + 1, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "2", - "dst": "3", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "9293a358-75c3-47fb-8ebc-73d67cd79f2c", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "4", - "dst": "0", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "35a5a5a9-3beb-4393-8361-9776e95af37f", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "1", - "dst": "5", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "db651f05-1390-4998-bf43-2e7441d9798b", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "4", - "dst": "2", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "5a6b79a5-f34f-4d4f-baca-5f234b66b1ef", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "3", - "dst": "5", - "dst_connector": null, - "src_connector": null - } - ], - "attributes": { - "guid": "842bf787-acb7-47e5-b7ab-b52500c61cdd" - } - } - ], - "edges": [], - "collapsed": false, - "label": "pipeline_prefetch_test_0", - "id": null, - "cfg_list_id": 7, - "start_block": 0 - } - ] - ] - } - ], - "edges": [], - "collapsed": false, - "label": "", - "id": null, - "cfg_list_id": 5, - "start_block": 0 - }, - "schedule": "Sequential", - "symbol_mapping": { - "N": "N", - "i": "i", - "k": "k" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 914, - "end_line": 914, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "label": "pipeline_prefetch_nsdfg_0", - "in_connectors": { - "A": null, - "B": null - }, - "out_connectors": { - "shr_A": null, - "shr_B": null - }, - "guid": "4aa1eab4-6385-41ba-9f63-630f122cf942" - }, - "id": 15, - "scope_entry": "5", - "scope_exit": "6" - }, - { - "type": "AccessNode", - "label": "shr_B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 578, - "end_line": 578, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "shr_B", - "guid": 
"a1070f22-8d28-42f2-b21b-3bbef82280d3" - }, - "id": 16, - "scope_entry": "5", - "scope_exit": "6" - }, - { - "type": "Tasklet", - "label": "acquire_pipe", - "attributes": { - "code": { - "string_data": "pipe.consumer_wait();", - "language": "CPP" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 630, - "end_line": 630, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "side_effects": true, - "label": "acquire_pipe", - "in_connectors": { - "_in_shr_A": null, - "_in_shr_B": null - }, - "out_connectors": { - "_out_shr_A": null, - "_out_shr_B": null - }, - "guid": "8e7f4f84-04aa-4f85-8e05-9ad31ac3de3e" - }, - "id": 17, - "scope_entry": "5", - "scope_exit": "6" - }, - { - "type": "AccessNode", - "label": "shr_A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 578, - "end_line": 578, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "data": "shr_A", - "guid": "95674815-1eab-4bac-885e-59957aff6161" - }, - "id": 18, - "scope_entry": "5", - "scope_exit": "6" - }, - { - "type": "Tasklet", - "label": "consumer_release_pipe", - "attributes": { - "code": { - "string_data": "pipe.consumer_release();", - "language": "CPP" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 652, - "end_line": 652, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/dace/dace/transformation/dataflow/multiple_buffering.py" - }, - "side_effects": true, - "label": "consumer_release_pipe", - "guid": "8b55ad07-19c9-49bf-98af-017959caef88" - }, - "id": 19, - "scope_entry": "5", - "scope_exit": "6" - } - ], - "edges": [ - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "c5fc20f0-ec85-4953-8979-239785765110", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "16", - "dst": "7", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "013e641a-ac55-49f1-babe-4e32a7f2b8aa", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "18", - "dst": "7", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "ed562ca6-4b10-4d27-ab12-938e49350c95", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "8", - "dst": "19", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "91a43c11-181e-4692-9cac-51de9dd6e251", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "19", - "dst": "6", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "6f824266-bfb1-4be1-863c-a2e22af2543f", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } 
- }, - "src": "8", - "dst": "19", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "b78539e0-3fd7-4e62-a51e-f4a473d90901", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "19", - "dst": "6", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512*ceiling(N/512)", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "9b3c23b5-e386-4edf-a8a1-e7274aac54ae", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "512*ceiling(N/512)" - } - } - }, - "src": "2", - "dst": "0", - "dst_connector": "IN_A", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512*ceiling(N/512)", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "dda3a79b-8c79-4417-a7aa-c4a9fcfa0352", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "512*ceiling(N/512)" - } - } - }, - "src": "3", - "dst": "0", - "dst_connector": "IN_B", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "ad716fc5-69a5-4bfe-ad9b-f2bbb16d9d3e", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "512" - } - } - }, - "src": "14", - "dst": "5", - "dst_connector": "IN___tmp_98_37_r_in_from_1_0_in_from_1_0", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_B", - "debuginfo": null, - "guid": "6c0c9af6-e806-4031-9270-c9031886819a", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "512" - } - } - }, - "src": "13", - "dst": "5", - "dst_connector": "IN___tmp_98_58_r_in_from_1_0_in_from_1_0", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": 
"Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp3", - "debuginfo": null, - "guid": "ef635686-7699-4f7e-9a4b-cd529006d9cf", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "10", - "dst": "11", - "dst_connector": "__inp", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "N", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "245d76b7-91a4-44fd-8a6d-6af0a8e44f4d", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "N" - } - } - }, - "src": "0", - "dst": "12", - "dst_connector": "A", - "src_connector": "OUT_A" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "N", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "a136c9c6-2449-417c-8408-2cba9cc8b13f", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "N" - } - } - }, - "src": "0", - "dst": "5", - "dst_connector": "IN_prefetch_A", - "src_connector": "OUT_A" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "N", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "0f7d6f6f-67a1-4e2d-98a7-558e53741dd4", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "N" - } - } - }, - "src": "0", - "dst": "12", - "dst_connector": "B", - "src_connector": "OUT_B" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "N", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "88623b99-236d-4786-8e32-811bea940b62", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "N" - } - } - }, - "src": "0", - "dst": "5", - "dst_connector": "IN_prefetch_B", - "src_connector": "OUT_B" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512*ceiling(N/512)", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "fd0d5304-9689-47cf-8c8c-9c105dad6c06", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, 
- "num_accesses": "512*ceiling(N/512)" - } - } - }, - "src": "1", - "dst": "4", - "dst_connector": null, - "src_connector": "OUT_C" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "b0173626-eff3-4525-aa85-b96e45b5400e", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "256" - } - } - }, - "src": "8", - "dst": "6", - "dst_connector": "IN___tmp_98_16_w_out_of_1_1_out_of_1_1", - "src_connector": "OUT___tmp_98_16_w_out_of_1_1" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "961a4232-d88a-460b-9b0b-3ee74068670e", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "512" - } - } - }, - "src": "6", - "dst": "1", - "dst_connector": "IN_C", - "src_connector": "OUT___tmp_98_16_w_out_of_1_1_out_of_1_1" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k, 2)", - "end": "Mod(k, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "32b45c8a-21ff-4d9c-98da-d79ecda534b8", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k, 2)", - "end": "Mod(k, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "7", - "dst": "9", - "dst_connector": "__in1", - "src_connector": "OUT___tmp_98_37_r_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k, 2)", - "end": "Mod(k, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "dd942060-3ca5-4375-a2e6-24692b4b54c3", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k, 2)", - "end": "Mod(k, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "5", - "dst": "17", - "dst_connector": "_in_shr_A", - "src_connector": "OUT___tmp_98_37_r_in_from_1_0_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k, 2)", - "end": "Mod(k, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "data": 
"shr_B", - "debuginfo": null, - "guid": "b1380382-374e-4cd8-85b4-0775615b298e", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k, 2)", - "end": "Mod(k, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "7", - "dst": "9", - "dst_connector": "__in2", - "src_connector": "OUT___tmp_98_58_r_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k, 2)", - "end": "Mod(k, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_B", - "debuginfo": null, - "guid": "b8eb6fb0-9e21-467e-be63-2588e4c02c4a", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k, 2)", - "end": "Mod(k, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "5", - "dst": "17", - "dst_connector": "_in_shr_B", - "src_connector": "OUT___tmp_98_58_r_in_from_1_0_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "N", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "b9b72928-b6cd-4e23-b898-3c0d55df4550", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "N" - } - } - }, - "src": "5", - "dst": "15", - "dst_connector": "A", - "src_connector": "OUT_prefetch_A" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "N", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "c96b2013-4ba5-493e-9125-c3ea8005baac", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "N" - } - } - }, - "src": "5", - "dst": "15", - "dst_connector": "B", - "src_connector": "OUT_prefetch_B" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp3", - "debuginfo": null, - "guid": "bab10d3a-bad3-4ccd-bf52-b7a83b41304d", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "9", - "dst": "10", - "dst_connector": null, - "src_connector": "__out" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i + j + 256*k", - "end": "i + j + 256*k", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": 
"1490e37f-693d-4c9f-80be-37a03bfda736", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i + j + 256*k", - "end": "i + j + 256*k", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "11", - "dst": "8", - "dst_connector": "IN___tmp_98_16_w_out_of_1_1", - "src_connector": "__out" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k, 2)", - "end": "Mod(k, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "980b5189-720c-47e2-939b-1977b0b8d0a1", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k, 2)", - "end": "Mod(k, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "256" - } - } - }, - "src": "17", - "dst": "7", - "dst_connector": "IN___tmp_98_37_r_in_from_1_0", - "src_connector": "_out_shr_A" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k, 2)", - "end": "Mod(k, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_B", - "debuginfo": null, - "guid": "f6cb964f-52e7-4801-9ef5-1ca3eb6a1ba4", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(k, 2)", - "end": "Mod(k, 2)", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "256" - } - } - }, - "src": "17", - "dst": "7", - "dst_connector": "IN___tmp_98_58_r_in_from_1_0", - "src_connector": "_out_shr_B" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "dc10acf1-66f1-4e1b-8ea3-f01f0586f952", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "512" - } - } - }, - "src": "12", - "dst": "14", - "dst_connector": null, - "src_connector": "shr_A" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "c4e69a5c-f961-4972-a015-753ab9322fb5", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "512" - } - } - }, - "src": "15", - "dst": "18", - 
"dst_connector": null, - "src_connector": "shr_A" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_B", - "debuginfo": null, - "guid": "e2cc2f76-58e9-4f34-bd4c-cb9d5294e2b0", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "512" - } - } - }, - "src": "12", - "dst": "13", - "dst_connector": null, - "src_connector": "shr_B" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_B", - "debuginfo": null, - "guid": "e4b2e651-174c-47b5-bdfe-a0ed6469e29b", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "512" - } - } - }, - "src": "15", - "dst": "16", - "dst_connector": null, - "src_connector": "shr_B" - } - ], - "attributes": { - "guid": "8d9c9dc6-b7fe-4305-9b0e-f344e6cb96bc", - "executions": "1", - "dynamic_executions": false - } - } - ], - "edges": [], - "collapsed": false, - "label": "", - "id": null, - "cfg_list_id": 0, - "start_block": 0, - "dace_version": "1.0.0" -} diff --git a/berkay_workpace/scratch/yakups_examples/dbuff_related/original_sdfg_with_shared_memory.sdfg b/berkay_workpace/scratch/yakups_examples/dbuff_related/original_sdfg_with_shared_memory.sdfg deleted file mode 100644 index 2f0132c264..0000000000 --- a/berkay_workpace/scratch/yakups_examples/dbuff_related/original_sdfg_with_shared_memory.sdfg +++ /dev/null @@ -1,1278 +0,0 @@ -{ - "type": "SDFG", - "attributes": { - "name": "kernel_double_buffered", - "arg_names": [ - "A", - "B", - "C" - ], - "_arrays": { - "A": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "B": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "C": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "__tmp3": { - "type": "Scalar", - "attributes": { - "dtype": "float64", - "shape": [ - "1" - ], - "transient": true, - "debuginfo": null - } - }, - "shr_A": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "256", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "256" - ], - "transient": true, - "storage": "GPU_Shared", - "debuginfo": null - } - }, - "shr_B": { - "type": 
"Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "256", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "256" - ], - "transient": true, - "storage": "GPU_Shared", - "debuginfo": null - } - } - }, - "symbols": { - "N": "int64" - }, - "global_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "init_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "exit_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 21, - "end_line": 30, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" - }, - "using_explicit_control_flow": true, - "guid": "3876c3d3-22e3-48a5-a227-0e6227e60775", - "hash": "15d16bc3e33636c2aa74e39db44a85f7abeb0ad003b99d3195b94e51c7c687d9" - }, - "nodes": [ - { - "type": "SDFGState", - "label": "MapState", - "id": 0, - "collapsed": false, - "scope_dict": { - "-1": [ - 0, - 2, - 3, - 4 - ], - "0": [ - 1, - 5 - ], - "5": [ - 6, - 7, - 12, - 13 - ], - "7": [ - 8, - 9, - 10, - 11 - ] - }, - "nodes": [ - { - "type": "MapEntry", - "label": "kernel_27[i=0:N:512]", - "attributes": { - "label": "kernel_27", - "params": [ - "i" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "512", - "tile": "1" - } - ] - }, - "schedule": "GPU_Device", - "debuginfo": { - "type": "DebugInfo", - "start_line": 27, - "end_line": 27, - "start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" - }, - "in_connectors": { - "IN_A": null, - "IN_B": null - }, - "out_connectors": { - "OUT_A": null, - "OUT_B": null - }, - "guid": "59ba5c8f-2679-4811-9d5c-6612cee18053" - }, - "id": 0, - "scope_entry": null, - "scope_exit": "1" - }, - { - "type": "MapExit", - "label": "kernel_27[i=0:N:512]", - "attributes": { - "in_connectors": { - "IN_C": null - }, - "out_connectors": { - "OUT_C": null - }, - "guid": "937530c2-b4a3-4e83-81d3-ee5db5735ae8" - }, - "id": 1, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "AccessNode", - "label": "A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 27, - "end_line": 27, - "start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" - }, - "data": "A", - "guid": "06bd864e-06f7-41e8-b701-4907600053ea" - }, - "id": 2, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 27, - "end_line": 27, - "start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" - }, - "data": "B", - "guid": "000290da-2594-43de-ae95-9a25c7b8c9d0" - }, - "id": 3, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "C", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 27, - "end_line": 27, - "start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" - }, - "data": "C", - "guid": 
"997761eb-8cfa-4d27-a7d5-cc4bb5005f93" - }, - "id": 4, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "MapEntry", - "label": "kernel_27_4_28[k=0:2]", - "attributes": { - "label": "kernel_27_4_28", - "params": [ - "k" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "1", - "step": "1", - "tile": "1" - } - ] - }, - "schedule": "Sequential", - "debuginfo": { - "type": "DebugInfo", - "start_line": 28, - "end_line": 28, - "start_column": 8, - "end_column": 8, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" - }, - "in_connectors": { - "IN___tmp_30_37_r_in_from_1_0_in_from_1_0": null, - "IN___tmp_30_58_r_in_from_1_0_in_from_1_0": null - }, - "out_connectors": { - "OUT___tmp_30_37_r_in_from_1_0_in_from_1_0": null, - "OUT___tmp_30_58_r_in_from_1_0_in_from_1_0": null - }, - "guid": "460bbd40-7325-4651-a422-fb41394d8752" - }, - "id": 5, - "scope_entry": "0", - "scope_exit": "6" - }, - { - "type": "MapExit", - "label": "kernel_27_4_28[k=0:2]", - "attributes": { - "in_connectors": { - "IN___tmp_30_16_w_out_of_1_1_out_of_1_1": null - }, - "out_connectors": { - "OUT___tmp_30_16_w_out_of_1_1_out_of_1_1": null - }, - "guid": "fa7eaf22-39d3-4216-b93a-36061c5bb53e" - }, - "id": 6, - "scope_entry": "5", - "scope_exit": "6" - }, - { - "type": "MapEntry", - "label": "kernel_27_4_28_8_29[j=0:256]", - "attributes": { - "label": "kernel_27_4_28_8_29", - "params": [ - "j" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "schedule": "GPU_ThreadBlock", - "debuginfo": { - "type": "DebugInfo", - "start_line": 29, - "end_line": 29, - "start_column": 12, - "end_column": 12, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" - }, - "in_connectors": { - "IN___tmp_30_37_r_in_from_1_0": null, - "IN___tmp_30_58_r_in_from_1_0": null - }, - "out_connectors": { - "OUT___tmp_30_37_r_in_from_1_0": null, - "OUT___tmp_30_58_r_in_from_1_0": null - }, - "guid": "0f6e50ab-156b-4197-8f75-fcc83bd993ef" - }, - "id": 7, - "scope_entry": "5", - "scope_exit": "8" - }, - { - "type": "MapExit", - "label": "kernel_27_4_28_8_29[j=0:256]", - "attributes": { - "in_connectors": { - "IN___tmp_30_16_w_out_of_1_1": null - }, - "out_connectors": { - "OUT___tmp_30_16_w_out_of_1_1": null - }, - "guid": "67bf3a8a-7f84-4717-a6c6-443d181e0703" - }, - "id": 8, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "Tasklet", - "label": "_Add_", - "attributes": { - "code": { - "string_data": "__out = (__in1 + __in2)", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 30, - "end_line": 30, - "start_column": 72, - "end_column": 72, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" - }, - "label": "_Add_", - "in_connectors": { - "__in1": null, - "__in2": null - }, - "out_connectors": { - "__out": null - }, - "guid": "467c5541-6c78-466f-9cd0-59b52ab5f3bb" - }, - "id": 9, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "AccessNode", - "label": "__tmp3", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 30, - "end_line": 30, - "start_column": 72, - "end_column": 72, - "filename": 
"/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" - }, - "data": "__tmp3", - "guid": "e1635af8-7f3b-44e5-90af-05c10b4d0beb" - }, - "id": 10, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "Tasklet", - "label": "assign_30_16", - "attributes": { - "code": { - "string_data": "__out = __inp", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 30, - "end_line": 30, - "start_column": 30, - "end_column": 30, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/multiple_buffering_test.py" - }, - "label": "assign_30_16", - "in_connectors": { - "__inp": null - }, - "out_connectors": { - "__out": null - }, - "guid": "94a47063-816a-420f-9bcd-e507b8e52932" - }, - "id": 11, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "AccessNode", - "label": "shr_A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 48, - "end_line": 48, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" - }, - "data": "shr_A", - "guid": "a0583fa5-f8c8-450a-822d-c06bc504e3d8" - }, - "id": 12, - "scope_entry": "5", - "scope_exit": "6" - }, - { - "type": "AccessNode", - "label": "shr_B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 48, - "end_line": 48, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" - }, - "data": "shr_B", - "guid": "ddfc614b-a21e-4d47-a07f-3377589f5f1e" - }, - "id": 13, - "scope_entry": "5", - "scope_exit": "6" - } - ], - "edges": [ - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512*ceiling(N/512)", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "bbbfba59-0966-4964-ade6-951656e12f8b", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "512*ceiling(N/512)" - } - } - }, - "src": "2", - "dst": "0", - "dst_connector": "IN_A", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512*ceiling(N/512)", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "2cc2c126-f362-468c-9541-aafb9ceef5ed", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "512*ceiling(N/512)" - } - } - }, - "src": "3", - "dst": "0", - "dst_connector": "IN_B", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "dfa616c1-fd38-4d9a-90e5-cf9530f3d8d5", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", 
- "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "12", - "dst": "7", - "dst_connector": "IN___tmp_30_37_r_in_from_1_0", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_B", - "debuginfo": null, - "guid": "805f691f-2ad8-436f-9165-b939865e438a", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "13", - "dst": "7", - "dst_connector": "IN___tmp_30_58_r_in_from_1_0", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp3", - "debuginfo": null, - "guid": "24c8fcc0-12fb-4256-8c81-b1f06d6e0b24", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "10", - "dst": "11", - "dst_connector": "__inp", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "16438026-7243-4acd-b483-e50afcf2bae1", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "512" - } - } - }, - "src": "0", - "dst": "5", - "dst_connector": "IN___tmp_30_37_r_in_from_1_0_in_from_1_0", - "src_connector": "OUT_A" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "9f86156c-d84e-476a-b4fd-586781af92b3", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "512" - } - } - }, - "src": "0", - "dst": "5", - "dst_connector": "IN___tmp_30_58_r_in_from_1_0_in_from_1_0", - "src_connector": "OUT_B" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512*ceiling(N/512)", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "7de92b91-bf9e-4ae7-a3a5-45120338e67a", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "512*ceiling(N/512)" - } - } - }, - "src": "1", - "dst": "4", - "dst_connector": null, - "src_connector": "OUT_C" - }, - 
{ - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "fce63159-8480-4884-bc3c-a4e4f3af4ba5", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "256" - } - } - }, - "src": "8", - "dst": "6", - "dst_connector": "IN___tmp_30_16_w_out_of_1_1_out_of_1_1", - "src_connector": "OUT___tmp_30_16_w_out_of_1_1" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "512", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "b6addcac-3b8b-44ff-977a-d16564b8689c", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 511", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "512" - } - } - }, - "src": "6", - "dst": "1", - "dst_connector": "IN_C", - "src_connector": "OUT___tmp_30_16_w_out_of_1_1_out_of_1_1" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "1b3b0248-b13f-4cc9-a417-69fd0d81f95d", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "7", - "dst": "9", - "dst_connector": "__in1", - "src_connector": "OUT___tmp_30_37_r_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "other_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "0fd0f751-5a94-4181-b3a9-da66b015a93f", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "5", - "dst": "12", - "dst_connector": null, - "src_connector": "OUT___tmp_30_37_r_in_from_1_0_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_B", - "debuginfo": null, - "guid": "9f88305d-28a9-44c7-b87a-19473ddd2986", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": 
"7", - "dst": "9", - "dst_connector": "__in2", - "src_connector": "OUT___tmp_30_58_r_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "other_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "6b67d7f9-a952-4447-a5f4-ff00f9d26712", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "5", - "dst": "13", - "dst_connector": null, - "src_connector": "OUT___tmp_30_58_r_in_from_1_0_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp3", - "debuginfo": null, - "guid": "4cfe2901-4813-4424-9231-49dbab83199b", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "9", - "dst": "10", - "dst_connector": null, - "src_connector": "__out" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i + j + 256*k", - "end": "i + j + 256*k", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "62b2982c-4bc7-404b-8ccf-7c8c213034a9", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i + j + 256*k", - "end": "i + j + 256*k", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "11", - "dst": "8", - "dst_connector": "IN___tmp_30_16_w_out_of_1_1", - "src_connector": "__out" - } - ], - "attributes": { - "guid": "e529de92-f3c5-45b7-8313-ae1ae22a2a4c", - "executions": "1", - "dynamic_executions": false - } - } - ], - "edges": [], - "collapsed": false, - "label": "", - "id": null, - "cfg_list_id": 0, - "start_block": 0, - "dace_version": "1.0.0" -} diff --git a/berkay_workpace/scratch/yakups_examples/smem_related/2d_blocktiled_gemm_with_custom_copy.sdfg b/berkay_workpace/scratch/yakups_examples/smem_related/2d_blocktiled_gemm_with_custom_copy.sdfg deleted file mode 100644 index 087ca70efc..0000000000 --- a/berkay_workpace/scratch/yakups_examples/smem_related/2d_blocktiled_gemm_with_custom_copy.sdfg +++ /dev/null @@ -1,4165 +0,0 @@ -{ - "type": "SDFG", - "attributes": { - "name": "dace_naive_matmul", - "arg_names": [ - "A", - "B", - "C" - ], - "_arrays": { - "A": { - "type": "Array", - "attributes": { - "strides": [ - "K", - "1" - ], - "total_size": "K*M", - "offset": [ - "0", - "0" - ], - "optional": false, - "dtype": "float32", - "shape": [ - "M", - "K" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "B": { - "type": "Array", - "attributes": { - "strides": [ - "N", - "1" - ], - "total_size": "K*N", - "offset": [ - "0", - "0" 
- ], - "optional": false, - "dtype": "float32", - "shape": [ - "K", - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "C": { - "type": "Array", - "attributes": { - "strides": [ - "N", - "1" - ], - "total_size": "M*N", - "offset": [ - "0", - "0" - ], - "optional": false, - "dtype": "float32", - "shape": [ - "M", - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "__tmp3": { - "type": "Scalar", - "attributes": { - "dtype": "float32", - "shape": [ - "1" - ], - "transient": true, - "storage": "Register", - "debuginfo": null - } - }, - "__tmp4": { - "type": "Scalar", - "attributes": { - "dtype": "float32", - "shape": [ - "1" - ], - "transient": true, - "storage": "Register", - "debuginfo": null - } - }, - "tmp": { - "type": "Array", - "attributes": { - "strides": [ - "8", - "1" - ], - "total_size": "64", - "offset": [ - "0", - "0" - ], - "alignment": 16, - "optional": false, - "dtype": "float32", - "shape": [ - "8", - "8" - ], - "transient": true, - "storage": "Register", - "debuginfo": null - } - }, - "shrB": { - "type": "Array", - "attributes": { - "strides": [ - "128", - "1" - ], - "total_size": "2048", - "offset": [ - "0", - "0" - ], - "optional": false, - "dtype": "float32", - "shape": [ - "16", - "128" - ], - "transient": true, - "storage": "GPU_Shared", - "debuginfo": null - } - }, - "shrA": { - "type": "Array", - "attributes": { - "strides": [ - "16", - "1" - ], - "total_size": "2048", - "offset": [ - "0", - "0" - ], - "optional": false, - "dtype": "float32", - "shape": [ - "128", - "16" - ], - "transient": true, - "storage": "GPU_Shared", - "debuginfo": null - } - } - }, - "symbols": { - "K": "int32", - "M": "int32", - "N": "int32" - }, - "global_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "init_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "exit_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "orig_sdfg": { - "type": "SDFG", - "attributes": { - "name": "dace_naive_matmul", - "arg_names": [ - "A", - "B", - "C" - ], - "_arrays": { - "A": { - "type": "Array", - "attributes": { - "strides": [ - "K", - "1" - ], - "total_size": "K*M", - "offset": [ - "0", - "0" - ], - "optional": false, - "dtype": "float32", - "shape": [ - "M", - "K" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "B": { - "type": "Array", - "attributes": { - "strides": [ - "N", - "1" - ], - "total_size": "K*N", - "offset": [ - "0", - "0" - ], - "optional": false, - "dtype": "float32", - "shape": [ - "K", - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "C": { - "type": "Array", - "attributes": { - "strides": [ - "N", - "1" - ], - "total_size": "M*N", - "offset": [ - "0", - "0" - ], - "optional": false, - "dtype": "float32", - "shape": [ - "M", - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "tmp": { - "type": "Scalar", - "attributes": { - "dtype": "float32", - "shape": [ - "1" - ], - "transient": true, - "storage": "Register", - "debuginfo": null - } - }, - "__tmp3": { - "type": "Scalar", - "attributes": { - "dtype": "float32", - "shape": [ - "1" - ], - "transient": true, - "debuginfo": null - } - }, - "__tmp4": { - "type": "Scalar", - "attributes": { - "dtype": "float32", - "shape": [ - "1" - ], - "transient": true, - "debuginfo": null - } - } - }, - "symbols": { - "K": "int32", - "M": "int32", - "N": "int32" - }, - "global_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "init_code": { - "frame": { - "string_data": "", - 
"language": "CPP" - } - }, - "exit_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 254, - "end_line": 262, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "guid": "b58195d5-3a0f-46d0-ac04-c263d4e3772b" - }, - "nodes": [ - { - "type": "SDFGState", - "label": "MapState", - "id": 0, - "collapsed": false, - "scope_dict": { - "-1": [ - 0, - 2, - 3, - 4 - ], - "0": [ - 1, - 5, - 6, - 8, - 9 - ], - "6": [ - 7, - 10, - 11, - 12, - 13, - 14 - ] - }, - "nodes": [ - { - "type": "MapEntry", - "label": "dace_naive_matmul_258[i=0:M, j=0:N]", - "attributes": { - "label": "dace_naive_matmul_258", - "params": [ - "i", - "j" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "M - 1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "schedule": "GPU_Device", - "debuginfo": { - "type": "DebugInfo", - "start_line": 258, - "end_line": 258, - "start_column": 2, - "end_column": 2, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "in_connectors": { - "IN_A": null, - "IN_B": null - }, - "out_connectors": { - "OUT_A": null, - "OUT_B": null - }, - "guid": "3befdc17-85e0-4f77-8d69-28f32bfd3669" - }, - "id": 0, - "scope_entry": null, - "scope_exit": "1" - }, - { - "type": "MapExit", - "label": "dace_naive_matmul_258[i=0:M, j=0:N]", - "attributes": { - "in_connectors": { - "IN_C": null - }, - "out_connectors": { - "OUT_C": null - }, - "guid": "0778c935-cb9e-40f0-b04a-8c24d0f3fd9f" - }, - "id": 1, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "AccessNode", - "label": "A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 258, - "end_line": 258, - "start_column": 2, - "end_column": 2, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "data": "A", - "guid": "c63b7b81-d6b7-4d02-bb1a-08b16d50d082" - }, - "id": 2, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 258, - "end_line": 258, - "start_column": 2, - "end_column": 2, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "data": "B", - "guid": "0775dbf8-a80f-41bd-94df-85333c04942c" - }, - "id": 3, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "C", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 258, - "end_line": 258, - "start_column": 2, - "end_column": 2, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "data": "C", - "guid": "fd617edc-81aa-4499-bbe1-3f1183629c0f" - }, - "id": 4, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "tmp", - "attributes": { - "setzero": true, - "debuginfo": { - "type": "DebugInfo", - "start_line": 259, - "end_line": 259, - "start_column": 23, - "end_column": 23, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "data": "tmp", - "guid": "156c4b4e-6d95-4233-8a56-ae9a92225d9f" - }, - "id": 5, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "MapEntry", - "label": "dace_naive_matmul_258_2_260[k=0:K]", - "attributes": { - "label": "dace_naive_matmul_258_2_260", - "params": [ - "k" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - } - ] - }, - "schedule": 
"Sequential", - "debuginfo": { - "type": "DebugInfo", - "start_line": 260, - "end_line": 260, - "start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "in_connectors": { - "IN___tmp1": null, - "IN___tmp_261_18_r_in_from_2_0": null, - "IN___tmp_261_28_r_in_from_2_0": null - }, - "out_connectors": { - "OUT___tmp1": null, - "OUT___tmp_261_18_r_in_from_2_0": null, - "OUT___tmp_261_28_r_in_from_2_0": null - }, - "guid": "dbd98cec-d976-42db-a740-b8c99d25fd8d" - }, - "id": 6, - "scope_entry": "0", - "scope_exit": "7" - }, - { - "type": "MapExit", - "label": "dace_naive_matmul_258_2_260[k=0:K]", - "attributes": { - "in_connectors": { - "IN___tmp1": null - }, - "out_connectors": { - "OUT___tmp1": null - }, - "guid": "fcc83bb0-47e1-4eef-a4a8-3ce4d9596be2" - }, - "id": 7, - "scope_entry": "6", - "scope_exit": "7" - }, - { - "type": "AccessNode", - "label": "tmp", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 260, - "end_line": 260, - "start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "data": "tmp", - "guid": "7d953f40-4e3d-430f-9398-f394902d4cab" - }, - "id": 8, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "Tasklet", - "label": "assign_262_4", - "attributes": { - "code": { - "string_data": "__out = __inp", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 262, - "end_line": 262, - "start_column": 9, - "end_column": 9, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "label": "assign_262_4", - "in_connectors": { - "__inp": null - }, - "out_connectors": { - "__out": null - }, - "guid": "f55f6be2-ce3f-4b4d-b1e2-f349ded7943e" - }, - "id": 9, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "Tasklet", - "label": "_Mult_", - "attributes": { - "code": { - "string_data": "__out = (__in1 * __in2)", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 261, - "end_line": 261, - "start_column": 33, - "end_column": 33, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "label": "_Mult_", - "in_connectors": { - "__in1": null, - "__in2": null - }, - "out_connectors": { - "__out": null - }, - "guid": "73aecb99-070f-4e61-a316-3b2a7b40b929" - }, - "id": 10, - "scope_entry": "6", - "scope_exit": "7" - }, - { - "type": "AccessNode", - "label": "__tmp3", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 261, - "end_line": 261, - "start_column": 33, - "end_column": 33, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "data": "__tmp3", - "guid": "9a838b65-220e-482a-83bb-e0d554444eb4" - }, - "id": 11, - "scope_entry": "6", - "scope_exit": "7" - }, - { - "type": "Tasklet", - "label": "_Add_", - "attributes": { - "code": { - "string_data": "__out = (__in1 + __in2)", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 261, - "end_line": 261, - "start_column": 33, - "end_column": 33, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "label": "_Add_", - "in_connectors": { - "__in1": null, - "__in2": null - }, - "out_connectors": { - "__out": null - }, - "guid": "ae8acca8-acfa-42de-a0e2-dc45be188f59" - }, - "id": 12, - "scope_entry": "6", - "scope_exit": "7" - }, - { - "type": "AccessNode", - "label": "__tmp4", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 261, - "end_line": 261, - "start_column": 33, - "end_column": 33, - 
"filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "data": "__tmp4", - "guid": "c45f593f-64ec-4deb-9a7a-06c9028a359e" - }, - "id": 13, - "scope_entry": "6", - "scope_exit": "7" - }, - { - "type": "Tasklet", - "label": "assign_261_6", - "attributes": { - "code": { - "string_data": "__out = __inp", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 261, - "end_line": 261, - "start_column": 33, - "end_column": 33, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "label": "assign_261_6", - "in_connectors": { - "__inp": null - }, - "out_connectors": { - "__out": null - }, - "guid": "ee02b28c-b419-4cfe-87a0-5cb6f00e7728" - }, - "id": 14, - "scope_entry": "6", - "scope_exit": "7" - } - ], - "edges": [ - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "955a2c4f-712d-4f48-ba92-25f0630625d8", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "0", - "dst": "5", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "K*M*N", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "M - 1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "440297ed-13f9-441c-8590-54de034648b1", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "M - 1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "K*M*N" - } - } - }, - "src": "2", - "dst": "0", - "dst_connector": "IN_A", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "K*M*N", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "b5a04c2f-9139-4e91-b3fa-05e31fb685c1", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "K*M*N" - } - } - }, - "src": "3", - "dst": "0", - "dst_connector": "IN_B", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "K", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "tmp", - "debuginfo": null, - "guid": "b475ec44-1676-44fa-a2a0-475d09fd1f58", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "K" - } - } - }, - "src": "5", - "dst": "6", - "dst_connector": "IN___tmp1", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - 
"step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp3", - "debuginfo": null, - "guid": "4d74242e-cc00-4c30-aa79-af5208c734bc", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "11", - "dst": "12", - "dst_connector": "__in2", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "tmp", - "debuginfo": null, - "guid": "9294be17-8966-48f4-af5b-b94952ae2ba6", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "8", - "dst": "9", - "dst_connector": "__inp", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp4", - "debuginfo": null, - "guid": "879548dd-1ea6-4b6d-be90-636186f7af63", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "13", - "dst": "14", - "dst_connector": "__inp", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "K", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "c358fd7e-f83f-499e-baf6-634a74e0eb68", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "K" - } - } - }, - "src": "0", - "dst": "6", - "dst_connector": "IN___tmp_261_18_r_in_from_2_0", - "src_connector": "OUT_A" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "K", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "a01e117a-7338-40c0-a93d-98a99b5ba416", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "K" - } - } - }, - "src": "0", - "dst": "6", - "dst_connector": "IN___tmp_261_28_r_in_from_2_0", - "src_connector": "OUT_B" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "M*N", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "M - 1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "N - 1", - "step": 
"1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "8d793faf-6c88-41ef-b15f-73d6cc4769ab", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "M - 1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "M*N" - } - } - }, - "src": "1", - "dst": "4", - "dst_connector": null, - "src_connector": "OUT_C" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "K", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "tmp", - "debuginfo": null, - "guid": "a8a897dc-36b0-4558-92e6-337b9c4569eb", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "K" - } - } - }, - "src": "7", - "dst": "8", - "dst_connector": null, - "src_connector": "OUT___tmp1" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "tmp", - "debuginfo": null, - "guid": "b4b91eb5-6f11-4107-8eaa-f00684d09166", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "6", - "dst": "12", - "dst_connector": "__in1", - "src_connector": "OUT___tmp1" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - }, - { - "start": "k", - "end": "k", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "78e57126-1091-4d19-95ca-1a9baa8b2980", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - }, - { - "start": "k", - "end": "k", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "6", - "dst": "10", - "dst_connector": "__in1", - "src_connector": "OUT___tmp_261_18_r_in_from_2_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "k", - "end": "k", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "90d034c5-b25b-439e-aa00-5b36f40fa3c1", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "k", - "end": "k", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "6", - "dst": "10", - "dst_connector": "__in2", - "src_connector": "OUT___tmp_261_28_r_in_from_2_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": 
"1" - } - ] - }, - "data": "__tmp3", - "debuginfo": null, - "guid": "5c44d599-d384-49d0-9e03-023a46d05dd2", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "10", - "dst": "11", - "dst_connector": null, - "src_connector": "__out" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp4", - "debuginfo": null, - "guid": "7d8d1da4-4f02-48b8-980a-33a71b57c26c", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "12", - "dst": "13", - "dst_connector": null, - "src_connector": "__out" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "0fb8b2f9-f4eb-43c6-b28b-778145ad2cc5", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "9", - "dst": "1", - "dst_connector": "IN_C", - "src_connector": "__out" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "tmp", - "debuginfo": null, - "guid": "7590ecfd-7127-48af-a7e4-98906d48b08e", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "14", - "dst": "7", - "dst_connector": "IN___tmp1", - "src_connector": "__out" - } - ], - "attributes": { - "executions": "1", - "dynamic_executions": false, - "guid": "21d6aa0d-ccfb-4036-ab01-8654ab4417ed" - } - } - ], - "edges": [], - "collapsed": false, - "label": "", - "id": null, - "cfg_list_id": 0, - "start_block": 0, - "dace_version": "1.0.0" - }, - "transformation_hist": [ - { - "type": "PatternTransformation", - "transformation": "AddThreadBlockMap", - "_subgraph": { - "0": 0 - } - }, - { - "type": "PatternTransformation", - "transformation": "MapTiling", - "prefix": "b", - "tile_sizes": [ - "8", - "32" - ], - "divides_evenly": true, - "tile_trivial": true, - "skew": true, - "_subgraph": { - "0": 0 - } - }, - { - "type": "PatternTransformation", - "transformation": "ChangeThreadBlockMap", - "dim_size_x": "16", - "dim_size_y": "16", - "_subgraph": { - "0": 15, - "1": 0 - } - }, - { - "type": "PatternTransformation", - "transformation": "ThreadCoarsening", - "tile_size_x": "8", - "tile_size_y": "8", - "_subgraph": { - "0": 0, - "1": 15 - } - }, - { - "type": "PatternTransformation", - "transformation": "MapTiling", - "prefix": "d", - "tile_sizes": [ - "8", - "8" - ], - 
"divides_evenly": true, - "tile_trivial": true, - "skew": true, - "_subgraph": { - "0": 0 - } - }, - { - "type": "PatternTransformation", - "transformation": "BlockTiling", - "_subgraph": { - "0": 17, - "1": 6 - } - }, - { - "type": "PatternTransformation", - "transformation": "ExplicitMemoryMove", - "_subgraph": { - "0": 12, - "1": 14, - "2": 16 - } - }, - { - "type": "ExpandTransformation", - "transformation": "Expansion", - "classpath": "dace.libraries.standard.nodes.code.Expansion", - "_subgraph": { - "0": 20 - } - }, - { - "type": "ExpandTransformation", - "transformation": "Expansion", - "classpath": "dace.libraries.standard.nodes.code.Expansion", - "_subgraph": { - "0": 21 - } - } - ], - "debuginfo": { - "type": "DebugInfo", - "start_line": 254, - "end_line": 262, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "guid": "b58195d5-3a0f-46d0-ac04-c263d4e3772b", - "hash": "cdf5cd16d9a29f92023e8d4b01e385997cc6f91086fcfba358c900713434638b" - }, - "nodes": [ - { - "type": "SDFGState", - "label": "MapState", - "id": 0, - "collapsed": false, - "scope_dict": { - "-1": [ - 2, - 3, - 4, - 12 - ], - "0": [ - 1, - 5 - ], - "5": [ - 6, - 7, - 8, - 9, - 10, - 11 - ], - "12": [ - 13, - 14 - ], - "14": [ - 15, - 16, - 18, - 19 - ], - "16": [ - 0, - 17, - 20, - 21, - 22, - 23 - ] - }, - "nodes": [ - { - "type": "MapEntry", - "label": "ThreadCoarsenedMap[i=0:8, j=0:8]", - "attributes": { - "label": "ThreadCoarsenedMap", - "params": [ - "i", - "j" - ], - "param_types": { - "i": "int32", - "j": "int32" - }, - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "7", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "7", - "step": "1", - "tile": "1" - } - ] - }, - "schedule": "Sequential", - "unroll": true, - "debuginfo": { - "type": "DebugInfo", - "start_line": 258, - "end_line": 258, - "start_column": 2, - "end_column": 2, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "in_connectors": { - "IN_A": { - "type": "pointer", - "dtype": "float32" - }, - "IN_B": { - "type": "pointer", - "dtype": "float32" - }, - "IN___tmp1": { - "type": "pointer", - "dtype": "float32" - } - }, - "out_connectors": { - "OUT_A": { - "type": "pointer", - "dtype": "float32" - }, - "OUT_B": { - "type": "pointer", - "dtype": "float32" - }, - "OUT___tmp1": "float32" - }, - "guid": "3befdc17-85e0-4f77-8d69-28f32bfd3669" - }, - "id": 0, - "scope_entry": "16", - "scope_exit": "1" - }, - { - "type": "MapExit", - "label": "ThreadCoarsenedMap[i=0:8, j=0:8]", - "attributes": { - "in_connectors": { - "IN_C": "float32" - }, - "out_connectors": { - "OUT_C": { - "type": "pointer", - "dtype": "float32" - } - }, - "guid": "0778c935-cb9e-40f0-b04a-8c24d0f3fd9f" - }, - "id": 1, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "AccessNode", - "label": "A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 258, - "end_line": 258, - "start_column": 2, - "end_column": 2, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "data": "A", - "guid": "0ed9ab6a-02de-4951-9c25-86a43dc8edae" - }, - "id": 2, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 258, - "end_line": 258, - "start_column": 2, - "end_column": 2, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "data": "B", - "guid": "9a3d8a83-bfc2-4359-9eca-a0a42565bfd3" - }, - "id": 3, - 
"scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "C", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 258, - "end_line": 258, - "start_column": 2, - "end_column": 2, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "data": "C", - "guid": "fec6b795-a139-4a14-9e63-44d333192daa" - }, - "id": 4, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "MapEntry", - "label": "InnerWorkMapNo0[tk=0:16]", - "attributes": { - "label": "InnerWorkMapNo0", - "params": [ - "tk" - ], - "param_types": { - "tk": "int32" - }, - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "15", - "step": "1", - "tile": "1" - } - ] - }, - "schedule": "Sequential", - "unroll": true, - "debuginfo": { - "type": "DebugInfo", - "start_line": 260, - "end_line": 260, - "start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "in_connectors": { - "IN___tmp1": "float32", - "IN___tmp_261_18_r_in_from_2_0": { - "type": "pointer", - "dtype": "float32" - }, - "IN___tmp_261_28_r_in_from_2_0": { - "type": "pointer", - "dtype": "float32" - } - }, - "out_connectors": { - "OUT___tmp1": "float32", - "OUT___tmp_261_18_r_in_from_2_0": "float32", - "OUT___tmp_261_28_r_in_from_2_0": "float32" - }, - "guid": "dbd98cec-d976-42db-a740-b8c99d25fd8d" - }, - "id": 5, - "scope_entry": "0", - "scope_exit": "6" - }, - { - "type": "MapExit", - "label": "InnerWorkMapNo0[tk=0:16]", - "attributes": { - "in_connectors": { - "IN___tmp1": "float32" - }, - "out_connectors": { - "OUT___tmp1": "float32" - }, - "guid": "fcc83bb0-47e1-4eef-a4a8-3ce4d9596be2" - }, - "id": 6, - "scope_entry": "5", - "scope_exit": "6" - }, - { - "type": "Tasklet", - "label": "_Mult_", - "attributes": { - "code": { - "string_data": "__out = (__in1 * __in2)", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 261, - "end_line": 261, - "start_column": 33, - "end_column": 33, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "label": "_Mult_", - "in_connectors": { - "__in1": "float32", - "__in2": "float32" - }, - "out_connectors": { - "__out": "float32" - }, - "guid": "73aecb99-070f-4e61-a316-3b2a7b40b929" - }, - "id": 7, - "scope_entry": "5", - "scope_exit": "6" - }, - { - "type": "AccessNode", - "label": "__tmp3", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 261, - "end_line": 261, - "start_column": 33, - "end_column": 33, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "data": "__tmp3", - "guid": "3ab24690-9afa-4fdf-be28-532806a0d849" - }, - "id": 8, - "scope_entry": "5", - "scope_exit": "6" - }, - { - "type": "Tasklet", - "label": "_Add_", - "attributes": { - "code": { - "string_data": "__out = (__in1 + __in2)", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 261, - "end_line": 261, - "start_column": 33, - "end_column": 33, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "label": "_Add_", - "in_connectors": { - "__in1": "float32", - "__in2": "float32" - }, - "out_connectors": { - "__out": "float32" - }, - "guid": "ae8acca8-acfa-42de-a0e2-dc45be188f59" - }, - "id": 9, - "scope_entry": "5", - "scope_exit": "6" - }, - { - "type": "AccessNode", - "label": "__tmp4", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 261, - "end_line": 261, - "start_column": 33, - "end_column": 33, - "filename": 
"/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "data": "__tmp4", - "guid": "6e664a3e-01db-4577-b456-90aa0c41eaaf" - }, - "id": 10, - "scope_entry": "5", - "scope_exit": "6" - }, - { - "type": "Tasklet", - "label": "assign_261_6", - "attributes": { - "code": { - "string_data": "__out = __inp", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 261, - "end_line": 261, - "start_column": 33, - "end_column": 33, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "label": "assign_261_6", - "in_connectors": { - "__inp": "float32" - }, - "out_connectors": { - "__out": "float32" - }, - "guid": "ee02b28c-b419-4cfe-87a0-5cb6f00e7728" - }, - "id": 11, - "scope_entry": "5", - "scope_exit": "6" - }, - { - "type": "MapEntry", - "label": "KernelEntryMap[b_i=0:M:128, b_j=0:N:128]", - "attributes": { - "label": "KernelEntryMap", - "params": [ - "b_i", - "b_j" - ], - "param_types": { - "b_i": "int32", - "b_j": "int32" - }, - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "M - 1", - "step": "128", - "tile": "1" - }, - { - "start": "0", - "end": "N - 1", - "step": "128", - "tile": "1" - } - ] - }, - "schedule": "GPU_Device", - "debuginfo": null, - "in_connectors": { - "IN_A": { - "type": "pointer", - "dtype": "float32" - }, - "IN_B": { - "type": "pointer", - "dtype": "float32" - } - }, - "out_connectors": { - "OUT_A": { - "type": "pointer", - "dtype": "float32" - }, - "OUT_B": { - "type": "pointer", - "dtype": "float32" - } - }, - "guid": "2e19f5de-2731-48f4-bfb6-f406686efc57" - }, - "id": 12, - "scope_entry": null, - "scope_exit": "13" - }, - { - "type": "MapExit", - "label": "KernelEntryMap[b_i=0:M:128, b_j=0:N:128]", - "attributes": { - "in_connectors": { - "IN_C": { - "type": "pointer", - "dtype": "float32" - } - }, - "out_connectors": { - "OUT_C": { - "type": "pointer", - "dtype": "float32" - } - }, - "guid": "977a6c95-f6b2-405d-81b9-9f47afd3a213" - }, - "id": 13, - "scope_entry": "12", - "scope_exit": "13" - }, - { - "type": "MapEntry", - "label": "ThreadBlockMap[d_i=0:128:8, d_j=0:128:8]", - "attributes": { - "label": "ThreadBlockMap", - "params": [ - "d_i", - "d_j" - ], - "param_types": { - "d_i": "int32", - "d_j": "int32" - }, - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "127", - "step": "8", - "tile": "1" - }, - { - "start": "0", - "end": "127", - "step": "8", - "tile": "1" - } - ] - }, - "schedule": "GPU_ThreadBlock", - "debuginfo": null, - "in_connectors": { - "IN_A": { - "type": "pointer", - "dtype": "float32" - }, - "IN_B": { - "type": "pointer", - "dtype": "float32" - } - }, - "out_connectors": { - "OUT_A": { - "type": "pointer", - "dtype": "float32" - }, - "OUT_B": { - "type": "pointer", - "dtype": "float32" - } - }, - "guid": "6afb15dc-ce53-46a0-9605-bd36e62f9a23" - }, - "id": 14, - "scope_entry": "12", - "scope_exit": "15" - }, - { - "type": "MapExit", - "label": "ThreadBlockMap[d_i=0:128:8, d_j=0:128:8]", - "attributes": { - "in_connectors": { - "IN_C": { - "type": "pointer", - "dtype": "float32" - } - }, - "out_connectors": { - "OUT_C": { - "type": "pointer", - "dtype": "float32" - } - }, - "guid": "47fc83c9-18d4-44fb-a814-4f91e6347a1f" - }, - "id": 15, - "scope_entry": "14", - "scope_exit": "15" - }, - { - "type": "MapEntry", - "label": "OuterWorkMapNo0[k=0:K:16]", - "attributes": { - "label": "OuterWorkMapNo0", - "params": [ - "k" - ], - "param_types": { - "k": "int32" - }, - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "K - 1", - "step": 
"16", - "tile": "1" - } - ] - }, - "schedule": "Sequential", - "debuginfo": null, - "in_connectors": { - "IN_A": { - "type": "pointer", - "dtype": "float32" - }, - "IN_B": { - "type": "pointer", - "dtype": "float32" - }, - "IN___tmp1": { - "type": "pointer", - "dtype": "float32" - } - }, - "out_connectors": { - "OUT_A": { - "type": "pointer", - "dtype": "float32" - }, - "OUT_B": { - "type": "pointer", - "dtype": "float32" - }, - "OUT___tmp1": { - "type": "pointer", - "dtype": "float32" - } - }, - "guid": "58cc03e7-cb61-484b-a116-9e5198e3686d" - }, - "id": 16, - "scope_entry": "14", - "scope_exit": "17" - }, - { - "type": "MapExit", - "label": "OuterWorkMapNo0[k=0:K:16]", - "attributes": { - "in_connectors": { - "IN_C": { - "type": "pointer", - "dtype": "float32" - } - }, - "out_connectors": { - "OUT_C": { - "type": "pointer", - "dtype": "float32" - } - }, - "guid": "d6ccdaf9-3fb9-4801-96b4-796bb619219d" - }, - "id": 17, - "scope_entry": "16", - "scope_exit": "17" - }, - { - "type": "AccessNode", - "label": "tmp", - "attributes": { - "setzero": true, - "debuginfo": { - "type": "DebugInfo", - "start_line": 259, - "end_line": 259, - "start_column": 23, - "end_column": 23, - "filename": "/home/primrose/Work/DaCellerator/MatMul/matmul.py" - }, - "data": "tmp", - "guid": "2cbe0906-ffb6-480f-8918-5d403b2c2349" - }, - "id": 18, - "scope_entry": "14", - "scope_exit": "15" - }, - { - "type": "AccessNode", - "label": "tmp", - "attributes": { - "debuginfo": null, - "data": "tmp", - "guid": "bc41207b-4ce4-439f-9cdc-d375ba9a0580" - }, - "id": 19, - "scope_entry": "14", - "scope_exit": "15" - }, - { - "type": "AccessNode", - "label": "shrB", - "attributes": { - "debuginfo": null, - "data": "shrB", - "guid": "943f436f-6a65-4cad-a365-4f858ffd5dfa" - }, - "id": 20, - "scope_entry": "16", - "scope_exit": "17" - }, - { - "type": "AccessNode", - "label": "shrA", - "attributes": { - "debuginfo": null, - "data": "shrA", - "guid": "21a4393c-7e02-4b7b-b62c-68e902b3bf0e" - }, - "id": 21, - "scope_entry": "16", - "scope_exit": "17" - }, - { - "type": "Tasklet", - "label": "custom_code", - "attributes": { - "code": { - "string_data": "// B[K,N]\n// shrB[16,128]\n// Inner Loop Condition: k <= K - 16 && b_j <= N - 128\nconst int tid = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z;\n// Num Threads: 256, Line Length (max): 128\nif (k <= K - 16 && b_j <= N - 128) {\n// load multiple lines at a time 2\nconst int line_offset = tid % 128;\nconst int line_num = tid / 128;\n#pragma unroll\nfor (int i0 = 0; i0 < 16; i0 += 2) {\nshrB[line_num*128 + line_offset + ((i0) * 128)] = B[(N*(k))+(1*(b_j)) + line_num*N + line_offset + ((i0) * N)];\n}\n} else { \n// load multiple lines at a time 2\nconst int effective_line_len = Min(N - b_j, 128);\nconst int line_offset = tid % effective_line_len;\nconst int line_num = tid / effective_line_len;\nconst int effectivenum_threads = 2 * effective_line_len;\nif (tid < effectivenum_threads){\n#pragma unroll\nfor (int i0 = 0; i0 < Min(K - k, 16); i0 += 2) {\nif(line_offset < effective_line_len && line_num + i0 < Min(K - k, 16)){\nshrB[line_num*128 + line_offset + ((i0) * 128)] = B[(N*(k))+(1*(b_j)) + line_num*N + line_offset + ((i0) * N)];\n}\n}\n}\n}\n", - "language": "CPP" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 0, - "end_line": 0, - "start_column": 0, - "end_column": 0, - "filename": null - }, - "label": "custom_code", - "in_connectors": { - "IN_B": { - "type": "pointer", - "dtype": "float32" - } - }, - "out_connectors": { - "OUT_B": { - 
"type": "pointer", - "dtype": "float32" - } - }, - "guid": "ee6d7afd-583f-4121-9dcf-7ae5f6f60080" - }, - "id": 22, - "scope_entry": "16", - "scope_exit": "17" - }, - { - "type": "Tasklet", - "label": "custom_code", - "attributes": { - "code": { - "string_data": "// A[M,K]\n// shrA[128,16]\n// Inner Loop Condition: b_i <= M - 128 && k <= K - 16\nconst int tid = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z;\n// Num Threads: 256, Line Length (max): 16\nif (b_i <= M - 128 && k <= K - 16) {\n// load multiple lines at a time 16\nconst int line_offset = tid % 16;\nconst int line_num = tid / 16;\n#pragma unroll\nfor (int i0 = 0; i0 < 128; i0 += 16) {\nshrA[line_num*16 + line_offset + ((i0) * 16)] = A[(K*(b_i))+(1*(k)) + line_num*K + line_offset + ((i0) * K)];\n}\n} else { \n// load multiple lines at a time 16\nconst int effective_line_len = Min(K - k, 16);\nconst int line_offset = tid % effective_line_len;\nconst int line_num = tid / effective_line_len;\nconst int effectivenum_threads = 16 * effective_line_len;\nif (tid < effectivenum_threads){\n#pragma unroll\nfor (int i0 = 0; i0 < Min(M - b_i, 128); i0 += 16) {\nif(line_offset < effective_line_len && line_num + i0 < Min(M - b_i, 128)){\nshrA[line_num*16 + line_offset + ((i0) * 16)] = A[(K*(b_i))+(1*(k)) + line_num*K + line_offset + ((i0) * K)];\n}\n}\n}\n}\n", - "language": "CPP" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 0, - "end_line": 0, - "start_column": 0, - "end_column": 0, - "filename": null - }, - "label": "custom_code", - "in_connectors": { - "IN_A": { - "type": "pointer", - "dtype": "float32" - } - }, - "out_connectors": { - "OUT_A": { - "type": "pointer", - "dtype": "float32" - } - }, - "guid": "87c1e78b-08d7-4ad4-9df4-7c8307dfb4e1" - }, - "id": 23, - "scope_entry": "16", - "scope_exit": "17" - } - ], - "edges": [ - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "debuginfo": null, - "guid": "717e461d-b7de-4653-8912-6ebfd0b95b49", - "src_subset": null, - "dst_subset": null, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "14", - "dst": "18", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "16384*K*ceiling(M/128)*ceiling(N/128)", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "M - 1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "00b2c1ed-40a8-462b-b84b-e498f0e61cb0", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "M - 1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "16384*K*ceiling(M/128)*ceiling(N/128)" - } - } - }, - "src": "2", - "dst": "12", - "dst_connector": "IN_A", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "128", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "d_i", - "end": "d_i + 7", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "15", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shrA", - "debuginfo": null, - "guid": "4e20efeb-25e3-48b2-bd36-62f0e610f7a0", - "src_subset": { - "type": "Range", - "ranges": [ - { - 
"start": "d_i", - "end": "d_i + 7", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "15", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "128" - } - } - }, - "src": "21", - "dst": "0", - "dst_connector": "IN_A", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "16384*K*ceiling(M/128)*ceiling(N/128)", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "dfca0807-6b73-41d8-803f-bd9657893bd6", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "16384*K*ceiling(M/128)*ceiling(N/128)" - } - } - }, - "src": "3", - "dst": "12", - "dst_connector": "IN_B", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "128", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "15", - "step": "1", - "tile": "1" - }, - { - "start": "d_j", - "end": "d_j + 7", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shrB", - "debuginfo": null, - "guid": "0a152535-c69d-4240-b0de-5473188c98af", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "15", - "step": "1", - "tile": "1" - }, - { - "start": "d_j", - "end": "d_j + 7", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "128" - } - } - }, - "src": "20", - "dst": "0", - "dst_connector": "IN_B", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "64", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "b_i + d_i", - "end": "b_i + d_i + 7", - "step": "1", - "tile": "1" - }, - { - "start": "b_j + d_j", - "end": "b_j + d_j + 7", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "4ac6087d-0a43-4f88-bcfa-7d5188895035", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "b_i + d_i", - "end": "b_i + d_i + 7", - "step": "1", - "tile": "1" - }, - { - "start": "b_j + d_j", - "end": "b_j + d_j + 7", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "64" - } - } - }, - "src": "19", - "dst": "15", - "dst_connector": "IN_C", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "64", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "7", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "7", - "step": "1", - "tile": "1" - } - ] - }, - "data": "tmp", - "debuginfo": null, - "guid": "5686066b-de34-4a45-a4d1-be03e4d3ce70", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "7", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "7", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "64" - } - } - }, - "src": "18", - "dst": "16", - "dst_connector": "IN___tmp1", - "src_connector": null - }, - 
{ - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp3", - "debuginfo": null, - "guid": "e45b8649-9d7f-4505-8d96-cb03e2c5a2ec", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "8", - "dst": "9", - "dst_connector": "__in2", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp4", - "debuginfo": null, - "guid": "08c7701b-f572-41de-9386-a9bcd94fb418", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "10", - "dst": "11", - "dst_connector": "__inp", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "128", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "d_i", - "end": "d_i + 7", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "15", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shrA", - "debuginfo": null, - "guid": "89f85fcb-cbb5-49ac-b0bd-fcba411c131d", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "d_i", - "end": "d_i + 7", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "15", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "128" - } - } - }, - "src": "23", - "dst": "21", - "dst_connector": null, - "src_connector": "OUT_A" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "16384*K", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "b_i", - "end": "b_i + 127", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "33372399-9833-4551-81fa-2bbd4fc94206", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "b_i", - "end": "b_i + 127", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "16384*K" - } - } - }, - "src": "12", - "dst": "14", - "dst_connector": "IN_A", - "src_connector": "OUT_A" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "64*K", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "b_i + d_i", - "end": "b_i + d_i + 7", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "5752458c-f2d7-412c-86ab-66dee65fb42e", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "b_i + d_i", - "end": "b_i + d_i + 7", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": 
"64*K" - } - } - }, - "src": "14", - "dst": "16", - "dst_connector": "IN_A", - "src_connector": "OUT_A" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "128", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "b_i + d_i", - "end": "b_i + d_i + 7", - "step": "1", - "tile": "1" - }, - { - "start": "k", - "end": "k + 15", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "f677499b-19ac-428a-94d6-edc64403f5cd", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "b_i + d_i", - "end": "b_i + d_i + 7", - "step": "1", - "tile": "1" - }, - { - "start": "k", - "end": "k + 15", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "128" - } - } - }, - "src": "16", - "dst": "23", - "dst_connector": "IN_A", - "src_connector": "OUT_A" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "16", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "d_i + i", - "end": "d_i + i", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "15", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shrA", - "debuginfo": null, - "guid": "22c32597-9371-45d1-8394-e06e6b9e67f7", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "d_i + i", - "end": "d_i + i", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "15", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "16" - } - } - }, - "src": "0", - "dst": "5", - "dst_connector": "IN___tmp_261_18_r_in_from_2_0", - "src_connector": "OUT_A" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "128", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "15", - "step": "1", - "tile": "1" - }, - { - "start": "d_j", - "end": "d_j + 7", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shrB", - "debuginfo": null, - "guid": "0b2cf75a-cf1c-4808-99ae-447b3c88ddb8", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "15", - "step": "1", - "tile": "1" - }, - { - "start": "d_j", - "end": "d_j + 7", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "128" - } - } - }, - "src": "22", - "dst": "20", - "dst_connector": null, - "src_connector": "OUT_B" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "16384*K", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - }, - { - "start": "b_j", - "end": "b_j + 127", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "b741ef83-a465-4021-811c-6bd84813fa19", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - }, - { - "start": "b_j", - "end": "b_j + 127", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "16384*K" - } - } - }, - "src": "12", - "dst": "14", - "dst_connector": "IN_B", - "src_connector": "OUT_B" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "64*K", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "K - 1", - "step": "1", - 
"tile": "1" - }, - { - "start": "b_j + d_j", - "end": "b_j + d_j + 7", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "1f1b7232-f62a-45eb-ad3f-e2cc948b0a85", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "K - 1", - "step": "1", - "tile": "1" - }, - { - "start": "b_j + d_j", - "end": "b_j + d_j + 7", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "64*K" - } - } - }, - "src": "14", - "dst": "16", - "dst_connector": "IN_B", - "src_connector": "OUT_B" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "128", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "k", - "end": "k + 15", - "step": "1", - "tile": "1" - }, - { - "start": "b_j + d_j", - "end": "b_j + d_j + 7", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "2742c5c3-debb-405b-9e4c-c125b0d8c5b3", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "k", - "end": "k + 15", - "step": "1", - "tile": "1" - }, - { - "start": "b_j + d_j", - "end": "b_j + d_j + 7", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "128" - } - } - }, - "src": "16", - "dst": "22", - "dst_connector": "IN_B", - "src_connector": "OUT_B" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "16", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "15", - "step": "1", - "tile": "1" - }, - { - "start": "d_j + j", - "end": "d_j + j", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shrB", - "debuginfo": null, - "guid": "b1cf4e06-8c3a-4f1b-a8c9-1d72c93aad79", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "15", - "step": "1", - "tile": "1" - }, - { - "start": "d_j + j", - "end": "d_j + j", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "16" - } - } - }, - "src": "0", - "dst": "5", - "dst_connector": "IN___tmp_261_28_r_in_from_2_0", - "src_connector": "OUT_B" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "M*N", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "M - 1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "28fe3bb5-718f-4fa5-aece-d31643392657", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "M - 1", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "M*N" - } - } - }, - "src": "13", - "dst": "4", - "dst_connector": null, - "src_connector": "OUT_C" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "64", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "7", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "7", - "step": "1", - "tile": "1" - } - ] - }, - "data": "tmp", - "debuginfo": null, - "guid": "f756aeb2-08bb-4a62-bdc3-7946ac999339", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "7", - "step": "1", - "tile": "1" - }, - { - 
"start": "0", - "end": "7", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "64" - } - } - }, - "src": "17", - "dst": "19", - "dst_connector": null, - "src_connector": "OUT_C" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "16384", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "b_i", - "end": "b_i + 127", - "step": "1", - "tile": "1" - }, - { - "start": "b_j", - "end": "b_j + 127", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "fe8d8836-ded8-4ef6-a40a-78bcdad3eeb6", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "b_i", - "end": "b_i + 127", - "step": "1", - "tile": "1" - }, - { - "start": "b_j", - "end": "b_j + 127", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "16384" - } - } - }, - "src": "15", - "dst": "13", - "dst_connector": "IN_C", - "src_connector": "OUT_C" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "64", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "7", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "7", - "step": "1", - "tile": "1" - } - ] - }, - "data": "tmp", - "debuginfo": null, - "guid": "11f9a0db-52f5-4d91-ad9e-3f26ca230d90", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "7", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "7", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "64" - } - } - }, - "src": "1", - "dst": "17", - "dst_connector": "IN_C", - "src_connector": "OUT_C" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "data": "tmp", - "debuginfo": null, - "guid": "4f8548fd-6128-45ea-9fc9-f6df35f09325", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "6", - "dst": "1", - "dst_connector": "IN_C", - "src_connector": "OUT___tmp1" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "data": "tmp", - "debuginfo": null, - "guid": "0be0a668-1110-4d99-90e6-c11606cc50f6", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "0", - "dst": "5", - "dst_connector": "IN___tmp1", - "src_connector": "OUT___tmp1" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "64", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "7", - "step": "1", - 
"tile": "1" - }, - { - "start": "0", - "end": "7", - "step": "1", - "tile": "1" - } - ] - }, - "data": "tmp", - "debuginfo": null, - "guid": "42cadbaf-e4d6-4b6f-aa92-1c652a7b7d0b", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "7", - "step": "1", - "tile": "1" - }, - { - "start": "0", - "end": "7", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "64" - } - } - }, - "src": "16", - "dst": "0", - "dst_connector": "IN___tmp1", - "src_connector": "OUT___tmp1" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "data": "tmp", - "debuginfo": null, - "guid": "c36f83d1-d066-4dd9-b63a-2616908697bd", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "5", - "dst": "9", - "dst_connector": "__in1", - "src_connector": "OUT___tmp1" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "d_i + i", - "end": "d_i + i", - "step": "1", - "tile": "1" - }, - { - "start": "tk", - "end": "tk", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shrA", - "debuginfo": null, - "guid": "b24525e0-08c4-4174-9baf-8d00fba27490", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "d_i + i", - "end": "d_i + i", - "step": "1", - "tile": "1" - }, - { - "start": "tk", - "end": "tk", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "5", - "dst": "7", - "dst_connector": "__in1", - "src_connector": "OUT___tmp_261_18_r_in_from_2_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "tk", - "end": "tk", - "step": "1", - "tile": "1" - }, - { - "start": "d_j + j", - "end": "d_j + j", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shrB", - "debuginfo": null, - "guid": "31287127-040d-4c00-aeac-badd43792f80", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "tk", - "end": "tk", - "step": "1", - "tile": "1" - }, - { - "start": "d_j + j", - "end": "d_j + j", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "5", - "dst": "7", - "dst_connector": "__in2", - "src_connector": "OUT___tmp_261_28_r_in_from_2_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp3", - "debuginfo": null, - "guid": "a0276658-2d4c-451c-880c-00e32a770af8", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "7", - "dst": "8", - "dst_connector": null, - 
"src_connector": "__out" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp4", - "debuginfo": null, - "guid": "d926e555-2206-4946-936f-637170fedb10", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "9", - "dst": "10", - "dst_connector": null, - "src_connector": "__out" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "data": "tmp", - "debuginfo": null, - "guid": "482095c1-5f12-45ec-885b-312e5b96205f", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - }, - { - "start": "j", - "end": "j", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "11", - "dst": "6", - "dst_connector": "IN___tmp1", - "src_connector": "__out" - } - ], - "attributes": { - "executions": "1", - "dynamic_executions": false, - "guid": "21d6aa0d-ccfb-4036-ab01-8654ab4417ed" - } - } - ], - "edges": [], - "collapsed": false, - "label": "", - "id": null, - "cfg_list_id": 0, - "start_block": 0, - "dace_version": "1.0.0" -} diff --git a/berkay_workpace/scratch/yakups_examples/smem_related/generate_sdfgs.ipynb b/berkay_workpace/scratch/yakups_examples/smem_related/generate_sdfgs.ipynb deleted file mode 100644 index e8ce4f567e..0000000000 --- a/berkay_workpace/scratch/yakups_examples/smem_related/generate_sdfgs.ipynb +++ /dev/null @@ -1,656 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "c60775aa", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import dace\n", - "import cupy as cp\n", - "import numpy as np\n", - "from IPython.display import Code\n", - "from typing import Optional\n", - "\n", - "from dace import SDFG, properties\n", - "from dace.config import Config\n", - "from dace.transformation import pass_pipeline as ppl, transformation\n", - "from dace.sdfg import nodes\n", - "from dace import dtypes\n", - "from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync\n", - "from dace.sdfg.state import LoopRegion, ConditionalBlock" - ] - }, - { - "cell_type": "markdown", - "id": "d4a9dd26", - "metadata": {}, - "source": [ - "## Weird Global To Global Example\n", - "\n", - "This is actually a nice example which does not compile when using the legacy CUDACodeGen due to const checks." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "ed5d3795", - "metadata": {}, - "outputs": [], - "source": [ - "sdfg = dace.SDFG.from_file('weird_global_to_global.sdfg')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "537b7cc2", - "metadata": {}, - "outputs": [], - "source": [ - "#sdfg.compile()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "c276f872", - "metadata": {}, - "outputs": [], - "source": [ - "#Code(sdfg.generate_code()[1].clean_code)" - ] - }, - { - "cell_type": "markdown", - "id": "958dfcbf", - "metadata": {}, - "source": [ - "## Weir Shared To Shared Example" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "a98006e2", - "metadata": {}, - "outputs": [], - "source": [ - "sdfg = dace.SDFG.from_file('weird_shared_to_shared_copy.sdfg')" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "e819c1b0", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/berkay/master-thesis/dace/dace/codegen/targets/cuda.py:1872: UserWarning: No `gpu_block_size` property specified on map \"kernel_101\". Falling back to the configuration entry `compiler.cuda.default_block_size`: 32,1,1. You can either specify the block size to use with the gpu_block_size property, or by adding nested `GPU_ThreadBlock` maps, which map work to individual threads. For more information, see https://spcldace.readthedocs.io/en/latest/optimization/gpu.html\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/html": [ - "
#include <cuda_runtime.h>\n",
-       "#include <dace/dace.h>\n",
-       "\n",
-       "\n",
-       "struct kernel_state_t {\n",
-       "    dace::cuda::Context *gpu_context;\n",
-       "};\n",
-       "\n",
-       "\n",
-       "\n",
-       "DACE_EXPORTED int __dace_init_cuda(kernel_state_t *__state, int64_t N);\n",
-       "DACE_EXPORTED int __dace_exit_cuda(kernel_state_t *__state);\n",
-       "\n",
-       "\n",
-       "\n",
-       "int __dace_init_cuda(kernel_state_t *__state, int64_t N) {\n",
-       "    int count;\n",
-       "\n",
-       "    // Check that we are able to run cuda code\n",
-       "    if (cudaGetDeviceCount(&count) != cudaSuccess)\n",
-       "    {\n",
-       "        printf("ERROR: GPU drivers are not configured or cuda-capable device "\n",
-       "               "not found\\n");\n",
-       "        return 1;\n",
-       "    }\n",
-       "    if (count == 0)\n",
-       "    {\n",
-       "        printf("ERROR: No cuda-capable devices found\\n");\n",
-       "        return 2;\n",
-       "    }\n",
-       "\n",
-       "    // Initialize cuda before we run the application\n",
-       "    float *dev_X;\n",
-       "    DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n",
-       "    DACE_GPU_CHECK(cudaFree(dev_X));\n",
-       "\n",
-       "    \n",
-       "\n",
-       "    __state->gpu_context = new dace::cuda::Context(1, 1);\n",
-       "\n",
-       "    // Create cuda streams and events\n",
-       "    for(int i = 0; i < 1; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n",
-       "        __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n",
-       "    }\n",
-       "    for(int i = 0; i < 1; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n",
-       "    }\n",
-       "\n",
-       "    \n",
-       "\n",
-       "    return 0;\n",
-       "}\n",
-       "\n",
-       "int __dace_exit_cuda(kernel_state_t *__state) {\n",
-       "    \n",
-       "\n",
-       "    // Synchronize and check for CUDA errors\n",
-       "    int __err = static_cast<int>(__state->gpu_context->lasterror);\n",
-       "    if (__err == 0)\n",
-       "        __err = static_cast<int>(cudaDeviceSynchronize());\n",
-       "\n",
-       "    // Destroy cuda streams and events\n",
-       "    for(int i = 0; i < 1; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n",
-       "    }\n",
-       "    for(int i = 0; i < 1; ++i) {\n",
-       "        DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n",
-       "    }\n",
-       "\n",
-       "    delete __state->gpu_context;\n",
-       "    return __err;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED bool __dace_gpu_set_stream(kernel_state_t *__state, int streamid, gpuStream_t stream)\n",
-       "{\n",
-       "    if (streamid < 0 || streamid >= 1)\n",
-       "        return false;\n",
-       "\n",
-       "    __state->gpu_context->streams[streamid] = stream;\n",
-       "\n",
-       "    return true;\n",
-       "}\n",
-       "\n",
-       "DACE_EXPORTED void __dace_gpu_set_all_streams(kernel_state_t *__state, gpuStream_t stream)\n",
-       "{\n",
-       "    for (int i = 0; i < 1; ++i)\n",
-       "        __state->gpu_context->streams[i] = stream;\n",
-       "}\n",
-       "\n",
-       "__global__ void __launch_bounds__(32) kernel_101_0_0_0(const double * __restrict__ A, const double * __restrict__ B, double * __restrict__ C, int N) {\n",
-       "    {\n",
-       "        int i = (blockIdx.x * 32 + threadIdx.x);\n",
-       "        double __tmp1;\n",
-       "        __shared__ double shr_A[32];\n",
-       "        __shared__ double shr_B[32];\n",
-       "        if (i < N) {\n",
-       "            dace::GlobalToShared1D<double, 32, 1, 1, 1, 1, false>(A + i, 1, shr_A + (i % 32));\n",
-       "            dace::GlobalToShared1D<double, 32, 1, 1, 1, 1, false>(B + i, 1, shr_B + (i % 32));\n",
-       "            {\n",
-       "                double __in1 = shr_A[(i % 32)];\n",
-       "                double __in2 = shr_B[(i % 32)];\n",
-       "                double __out;\n",
-       "\n",
-       "                ///////////////////\n",
-       "                // Tasklet code (_Add_)\n",
-       "                __out = (__in1 + __in2);\n",
-       "                ///////////////////\n",
-       "\n",
-       "                __tmp1 = __out;\n",
-       "            }\n",
-       "            {\n",
-       "                double __inp = __tmp1;\n",
-       "                double __out;\n",
-       "\n",
-       "                ///////////////////\n",
-       "                // Tasklet code (assign_102_16)\n",
-       "                __out = __inp;\n",
-       "                ///////////////////\n",
-       "\n",
-       "                C[i] = __out;\n",
-       "            }\n",
-       "        }\n",
-       "    }\n",
-       "}\n",
-       "\n",
-       "\n",
-       "DACE_EXPORTED void __dace_runkernel_kernel_101_0_0_0(kernel_state_t *__state, const double * __restrict__ A, const double * __restrict__ B, double * __restrict__ C, int N);\n",
-       "void __dace_runkernel_kernel_101_0_0_0(kernel_state_t *__state, const double * __restrict__ A, const double * __restrict__ B, double * __restrict__ C, int N)\n",
-       "{\n",
-       "\n",
-       "    if (((int_ceil(int_ceil(N, 1), 32)) <= 0)) {\n",
-       "\n",
-       "        return;\n",
-       "    }\n",
-       "\n",
-       "    void  *kernel_101_0_0_0_args[] = { (void *)&A, (void *)&B, (void *)&C, (void *)&N };\n",
-       "    gpuError_t __err = cudaLaunchKernel((void*)kernel_101_0_0_0, dim3(int_ceil(int_ceil(N, 1), 32), 1, 1), dim3(32, 1, 1), kernel_101_0_0_0_args, 0, __state->gpu_context->streams[0]);\n",
-       "    DACE_KERNEL_LAUNCH_CHECK(__err, "kernel_101_0_0_0", int_ceil(int_ceil(N, 1), 32), 1, 1, 32, 1, 1);\n",
-       "}\n",
-       "
\n" - ], - "text/latex": [ - "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", - "\\PY{n}{\\PYZsh{}include}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{n}{cuda\\PYZus{}runtime}\\PY{p}{.}\\PY{n}{h}\\PY{o}{\\PYZgt{}}\n", - "\\PY{n}{\\PYZsh{}include}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{n}{dace}\\PY{o}{/}\\PY{n}{dace}\\PY{p}{.}\\PY{n}{h}\\PY{o}{\\PYZgt{}}\n", - "\n", - "\n", - "\\PY{n}{struct}\\PY{+w}{ }\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{Context}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", - "\\PY{err}{\\PYZcb{}}\\PY{p}{;}\n", - "\n", - "\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{int64\\PYZus{}t}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\n", - "\n", - "\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}init\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{int64\\PYZus{}t}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n+nf}{count}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Check}\\PY{+w}{ }\\PY{n}{that}\\PY{+w}{ }\\PY{n}{we}\\PY{+w}{ }\\PY{k}{are}\\PY{+w}{ }\\PY{n}{able}\\PY{+w}{ }\\PY{k}{to}\\PY{+w}{ }\\PY{n}{run}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{code}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{cudaGetDeviceCount}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n+nf}{count}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{!=}\\PY{+w}{ }\\PY{n}{cudaSuccess}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s+ss}{\\PYZdq{}ERROR: GPU drivers are not configured or cuda\\PYZhy{}capable device \\PYZdq{}}\n", - "\\PY{+w}{ }\\PY{l+s+ss}{\\PYZdq{}not found\\PYZbs{}n\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n+nf}{count}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{printf}\\PY{p}{(}\\PY{l+s+ss}{\\PYZdq{}ERROR: No cuda\\PYZhy{}capable devices found\\PYZbs{}n\\PYZdq{}}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Initialize}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{k}{before}\\PY{+w}{ }\\PY{n}{we}\\PY{+w}{ }\\PY{n}{run}\\PY{+w}{ }\\PY{n}{the}\\PY{+w}{ }\\PY{n}{application}\n", - "\\PY{+w}{ }\\PY{n+nc}{float}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{dev\\PYZus{}X}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaMalloc}\\PY{p}{(}\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{o}{*}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{n}{dev\\PYZus{}X}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ 
}\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaFree}\\PY{p}{(}\\PY{n}{dev\\PYZus{}X}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{k}{new}\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n+nl}{cuda}\\PY{p}{:}\\PY{err}{:}\\PY{n}{Context}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Create}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{streams}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{n}{events}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaStreamNonBlocking}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Allow}\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{n}{externals}\\PY{+w}{ }\\PY{k}{to}\\PY{+w}{ }\\PY{k}{modify}\\PY{+w}{ }\\PY{n}{streams}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventCreateWithFlags}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{cudaEventDisableTiming}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", - "\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}exit\\PYZus{}cuda}\\PY{p}{(}\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Synchronize}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{k}{check}\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{n}{CUDA}\\PY{+w}{ }\\PY{n}{errors}\n", - "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ 
}\\PY{n}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{n+nc}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{lasterror}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{static\\PYZus{}cast}\\PY{o}{\\PYZlt{}}\\PY{n+nc}{int}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{cudaDeviceSynchronize}\\PY{p}{(}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{k}{Destroy}\\PY{+w}{ }\\PY{n}{cuda}\\PY{+w}{ }\\PY{n}{streams}\\PY{+w}{ }\\PY{o+ow}{and}\\PY{+w}{ }\\PY{n}{events}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaStreamDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{internal\\PYZus{}streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}GPU\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{cudaEventDestroy}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{events}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{delete}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{;}\n", - "\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{bool}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}stream}\\PY{p}{(}\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{streamid}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", - "\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{+w}{ }\\PY{o}{|}\\PY{o}{|}\\PY{+w}{ }\\PY{n}{streamid}\\PY{+w}{ }\\PY{o}{\\PYZgt{}=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{false}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{streamid}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{k}{true}\\PY{p}{;}\n", - "\\PY{err}{\\PYZcb{}}\n", - "\n", - 
"\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}gpu\\PYZus{}set\\PYZus{}all\\PYZus{}streams}\\PY{p}{(}\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{gpuStream\\PYZus{}t}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{)}\n", - "\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{+}\\PY{n}{i}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{stream}\\PY{p}{;}\n", - "\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{n}{\\PYZus{}\\PYZus{}global\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}launch\\PYZus{}bounds\\PYZus{}\\PYZus{}}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{kernel\\PYZus{}101\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{const}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{const}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{i}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{blockIdx}\\PY{p}{.}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{threadIdx}\\PY{p}{.}\\PY{n}{x}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp1}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}shared\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{n}{shr\\PYZus{}A}\\PY{o}{[}\\PY{n}{32}\\PY{o}{]}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}shared\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{n}{shr\\PYZus{}B}\\PY{o}{[}\\PY{n}{32}\\PY{o}{]}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{GlobalToShared1D}\\PY{o}{\\PYZlt{}}\\PY{k}{double}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{false}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{A}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{i}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{shr\\PYZus{}A}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZpc{}}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n+nl}{dace}\\PY{p}{:}\\PY{err}{:}\\PY{n}{GlobalToShared1D}\\PY{o}{\\PYZlt{}}\\PY{k}{double}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ 
}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{false}\\PY{o}{\\PYZgt{}}\\PY{p}{(}\\PY{n}{B}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{i}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{shr\\PYZus{}B}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{i}\\PY{+w}{ }\\PY{o}{\\PYZpc{}}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}in1}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{shr\\PYZus{}A}\\PY{o}{[}\\PY{n}{(i \\PYZpc{} 32)}\\PY{o}{]}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}in2}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{shr\\PYZus{}B}\\PY{o}{[}\\PY{n}{(i \\PYZpc{} 32)}\\PY{o}{]}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Tasklet}\\PY{+w}{ }\\PY{n}{code}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}Add\\PYZus{}}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}in1}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}in2}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp1}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}tmp1}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{+w}{ }\\PY{n}{Tasklet}\\PY{+w}{ }\\PY{n}{code}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{assign\\PYZus{}102\\PYZus{}16}\\PY{p}{)}\n", - "\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}inp}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\\PY{o}{/}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{C}\\PY{o}{[}\\PY{n}{i}\\PY{o}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}out}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\\PY{err}{\\PYZcb{}}\n", - "\n", - "\n", - "\\PY{n}{DACE\\PYZus{}EXPORTED}\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}kernel\\PYZus{}101\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{const}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ 
}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{const}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{n}{void}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}dace\\PYZus{}runkernel\\PYZus{}kernel\\PYZus{}101\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{(}\\PY{n}{kernel\\PYZus{}state\\PYZus{}t}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{const}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{const}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{double}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}restrict\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{n+nc}{int}\\PY{+w}{ }\\PY{n}{N}\\PY{p}{)}\n", - "\\PY{err}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{N}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZlt{}=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\n", - "\n", - "\\PY{+w}{ }\\PY{k}{return}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\n", - "\n", - "\\PY{+w}{ }\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{n}{kernel\\PYZus{}101\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZus{}args}\\PY{err}{[}\\PY{err}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{err}{\\PYZob{}}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{A}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{B}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{C}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{p}{)}\\PY{o}{\\PYZam{}}\\PY{n}{N}\\PY{+w}{ }\\PY{err}{\\PYZcb{}}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{gpuError\\PYZus{}t}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cudaLaunchKernel}\\PY{p}{(}\\PY{p}{(}\\PY{n}{void}\\PY{o}{*}\\PY{p}{)}\\PY{n}{kernel\\PYZus{}101\\PYZus{}0\\PYZus{}0\\PYZus{}0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{N}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{dim3}\\PY{p}{(}\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{kernel\\PYZus{}101\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZus{}args}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}state}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{gpu\\PYZus{}context}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{streams}\\PY{o}{[}\\PY{n}{0}\\PY{o}{]}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{+w}{ }\\PY{n}{DACE\\PYZus{}KERNEL\\PYZus{}LAUNCH\\PYZus{}CHECK}\\PY{p}{(}\\PY{n}{\\PYZus{}\\PYZus{}err}\\PY{p}{,}\\PY{+w}{ 
}\\PY{l+s+ss}{\\PYZdq{}kernel\\PYZus{}101\\PYZus{}0\\PYZus{}0\\PYZus{}0\\PYZdq{}}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{int\\PYZus{}ceil}\\PY{p}{(}\\PY{n}{N}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{32}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\n", - "\\PY{err}{\\PYZcb{}}\n", - "\\end{Verbatim}\n" - ], - "text/plain": [ - "\n", - "#include \n", - "#include \n", - "\n", - "\n", - "struct kernel_state_t {\n", - " dace::cuda::Context *gpu_context;\n", - "};\n", - "\n", - "\n", - "\n", - "DACE_EXPORTED int __dace_init_cuda(kernel_state_t *__state, int64_t N);\n", - "DACE_EXPORTED int __dace_exit_cuda(kernel_state_t *__state);\n", - "\n", - "\n", - "\n", - "int __dace_init_cuda(kernel_state_t *__state, int64_t N) {\n", - " int count;\n", - "\n", - " // Check that we are able to run cuda code\n", - " if (cudaGetDeviceCount(&count) != cudaSuccess)\n", - " {\n", - " printf(\"ERROR: GPU drivers are not configured or cuda-capable device \"\n", - " \"not found\\n\");\n", - " return 1;\n", - " }\n", - " if (count == 0)\n", - " {\n", - " printf(\"ERROR: No cuda-capable devices found\\n\");\n", - " return 2;\n", - " }\n", - "\n", - " // Initialize cuda before we run the application\n", - " float *dev_X;\n", - " DACE_GPU_CHECK(cudaMalloc((void **) &dev_X, 1));\n", - " DACE_GPU_CHECK(cudaFree(dev_X));\n", - "\n", - " \n", - "\n", - " __state->gpu_context = new dace::cuda::Context(1, 1);\n", - "\n", - " // Create cuda streams and events\n", - " for(int i = 0; i < 1; ++i) {\n", - " DACE_GPU_CHECK(cudaStreamCreateWithFlags(&__state->gpu_context->internal_streams[i], cudaStreamNonBlocking));\n", - " __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams\n", - " }\n", - " for(int i = 0; i < 1; ++i) {\n", - " DACE_GPU_CHECK(cudaEventCreateWithFlags(&__state->gpu_context->events[i], cudaEventDisableTiming));\n", - " }\n", - "\n", - " \n", - "\n", - " return 0;\n", - "}\n", - "\n", - "int __dace_exit_cuda(kernel_state_t *__state) {\n", - " \n", - "\n", - " // Synchronize and check for CUDA errors\n", - " int __err = static_cast(__state->gpu_context->lasterror);\n", - " if (__err == 0)\n", - " __err = static_cast(cudaDeviceSynchronize());\n", - "\n", - " // Destroy cuda streams and events\n", - " for(int i = 0; i < 1; ++i) {\n", - " DACE_GPU_CHECK(cudaStreamDestroy(__state->gpu_context->internal_streams[i]));\n", - " }\n", - " for(int i = 0; i < 1; ++i) {\n", - " DACE_GPU_CHECK(cudaEventDestroy(__state->gpu_context->events[i]));\n", - " }\n", - "\n", - " delete __state->gpu_context;\n", - " return __err;\n", - "}\n", - "\n", - "DACE_EXPORTED bool __dace_gpu_set_stream(kernel_state_t *__state, int streamid, gpuStream_t stream)\n", - "{\n", - " if (streamid < 0 || streamid >= 1)\n", - " return false;\n", - "\n", - " __state->gpu_context->streams[streamid] = stream;\n", - "\n", - " return true;\n", - "}\n", - "\n", - "DACE_EXPORTED void __dace_gpu_set_all_streams(kernel_state_t *__state, gpuStream_t stream)\n", - "{\n", - " for (int i = 0; i < 1; ++i)\n", - " __state->gpu_context->streams[i] = stream;\n", - "}\n", - "\n", - "__global__ void __launch_bounds__(32) kernel_101_0_0_0(const double * __restrict__ A, const double * __restrict__ B, double * __restrict__ C, int N) {\n", - " {\n", - " int i = (blockIdx.x * 32 + 
threadIdx.x);\n", - " double __tmp1;\n", - " __shared__ double shr_A[32];\n", - " __shared__ double shr_B[32];\n", - " if (i < N) {\n", - " dace::GlobalToShared1D(A + i, 1, shr_A + (i % 32));\n", - " dace::GlobalToShared1D(B + i, 1, shr_B + (i % 32));\n", - " {\n", - " double __in1 = shr_A[(i % 32)];\n", - " double __in2 = shr_B[(i % 32)];\n", - " double __out;\n", - "\n", - " ///////////////////\n", - " // Tasklet code (_Add_)\n", - " __out = (__in1 + __in2);\n", - " ///////////////////\n", - "\n", - " __tmp1 = __out;\n", - " }\n", - " {\n", - " double __inp = __tmp1;\n", - " double __out;\n", - "\n", - " ///////////////////\n", - " // Tasklet code (assign_102_16)\n", - " __out = __inp;\n", - " ///////////////////\n", - "\n", - " C[i] = __out;\n", - " }\n", - " }\n", - " }\n", - "}\n", - "\n", - "\n", - "DACE_EXPORTED void __dace_runkernel_kernel_101_0_0_0(kernel_state_t *__state, const double * __restrict__ A, const double * __restrict__ B, double * __restrict__ C, int N);\n", - "void __dace_runkernel_kernel_101_0_0_0(kernel_state_t *__state, const double * __restrict__ A, const double * __restrict__ B, double * __restrict__ C, int N)\n", - "{\n", - "\n", - " if (((int_ceil(int_ceil(N, 1), 32)) <= 0)) {\n", - "\n", - " return;\n", - " }\n", - "\n", - " void *kernel_101_0_0_0_args[] = { (void *)&A, (void *)&B, (void *)&C, (void *)&N };\n", - " gpuError_t __err = cudaLaunchKernel((void*)kernel_101_0_0_0, dim3(int_ceil(int_ceil(N, 1), 32), 1, 1), dim3(32, 1, 1), kernel_101_0_0_0_args, 0, __state->gpu_context->streams[0]);\n", - " DACE_KERNEL_LAUNCH_CHECK(__err, \"kernel_101_0_0_0\", int_ceil(int_ceil(N, 1), 32), 1, 1, 32, 1, 1);\n", - "}\n" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Code(sdfg.generate_code()[1].clean_code)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "686e4fc6", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "dace_env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/berkay_workpace/scratch/yakups_examples/smem_related/nice_global_to_shared_copy.sdfg b/berkay_workpace/scratch/yakups_examples/smem_related/nice_global_to_shared_copy.sdfg deleted file mode 100644 index 1f276cf47a..0000000000 --- a/berkay_workpace/scratch/yakups_examples/smem_related/nice_global_to_shared_copy.sdfg +++ /dev/null @@ -1,1278 +0,0 @@ -{ - "type": "SDFG", - "attributes": { - "name": "kernel", - "arg_names": [ - "A", - "B", - "C" - ], - "_arrays": { - "A": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "B": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "C": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - 
"debuginfo": null - } - }, - "__tmp3": { - "type": "Scalar", - "attributes": { - "dtype": "float64", - "shape": [ - "1" - ], - "transient": true, - "debuginfo": null - } - }, - "shr_A": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "256", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "256" - ], - "transient": true, - "storage": "GPU_Shared", - "debuginfo": null - } - }, - "shr_B": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "256", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "256" - ], - "transient": true, - "storage": "GPU_Shared", - "debuginfo": null - } - } - }, - "symbols": { - "N": "int64" - }, - "global_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "init_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "exit_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 20, - "end_line": 29, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "using_explicit_control_flow": true, - "guid": "82bba983-0c4a-4ea9-a197-02e81a8cff11", - "hash": "30a8e228873be667ed26ed0ded89004c1c79a380321118fa9e38753bb6cfa4f8" - }, - "nodes": [ - { - "type": "SDFGState", - "label": "MapState", - "id": 0, - "collapsed": false, - "scope_dict": { - "-1": [ - 0, - 2, - 3, - 4 - ], - "0": [ - 1, - 5, - 12, - 13 - ], - "5": [ - 6, - 7 - ], - "7": [ - 8, - 9, - 10, - 11 - ] - }, - "nodes": [ - { - "type": "MapEntry", - "label": "kernel_26[i=0:N:256]", - "attributes": { - "label": "kernel_26", - "params": [ - "i" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "256", - "tile": "1" - } - ] - }, - "schedule": "GPU_Device", - "debuginfo": { - "type": "DebugInfo", - "start_line": 26, - "end_line": 26, - "start_column": 30, - "end_column": 30, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "in_connectors": { - "IN_A": null, - "IN_B": null - }, - "out_connectors": { - "OUT_A": null, - "OUT_B": null - }, - "guid": "49c7e4c6-3cc7-4ea9-8c5c-87e19b34cd49" - }, - "id": 0, - "scope_entry": null, - "scope_exit": "1" - }, - { - "type": "MapExit", - "label": "kernel_26[i=0:N:256]", - "attributes": { - "in_connectors": { - "IN_C": null - }, - "out_connectors": { - "OUT_C": null - }, - "guid": "633311fa-fef3-47c7-811a-b55dbaf09c9b" - }, - "id": 1, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "AccessNode", - "label": "A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 26, - "end_line": 26, - "start_column": 30, - "end_column": 30, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "data": "A", - "guid": "13961939-8f39-4f3f-b505-83b06e3f89c8" - }, - "id": 2, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 26, - "end_line": 26, - "start_column": 30, - "end_column": 30, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "data": "B", - "guid": "cf20b347-bd6f-46af-b499-59775d1eb039" - }, - 
"id": 3, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "C", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 26, - "end_line": 26, - "start_column": 30, - "end_column": 30, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "data": "C", - "guid": "3f602807-15c6-4dbc-a17f-bec5154cea16" - }, - "id": 4, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "MapEntry", - "label": "kernel_26_4_27[k=0]", - "attributes": { - "label": "kernel_26_4_27", - "params": [ - "k" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "schedule": "Sequential", - "debuginfo": { - "type": "DebugInfo", - "start_line": 27, - "end_line": 27, - "start_column": 8, - "end_column": 8, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "in_connectors": { - "IN___tmp_29_37_r_in_from_1_0_in_from_1_0": null, - "IN___tmp_29_58_r_in_from_1_0_in_from_1_0": null - }, - "out_connectors": { - "OUT___tmp_29_37_r_in_from_1_0_in_from_1_0": null, - "OUT___tmp_29_58_r_in_from_1_0_in_from_1_0": null - }, - "guid": "16f44972-423e-4407-9f82-ceba8fbbd9cb" - }, - "id": 5, - "scope_entry": "0", - "scope_exit": "6" - }, - { - "type": "MapExit", - "label": "kernel_26_4_27[k=0]", - "attributes": { - "in_connectors": { - "IN___tmp_29_16_w_out_of_1_1_out_of_1_1": null - }, - "out_connectors": { - "OUT___tmp_29_16_w_out_of_1_1_out_of_1_1": null - }, - "guid": "0881fb62-ec71-49cc-8801-f2c1ec221db3" - }, - "id": 6, - "scope_entry": "5", - "scope_exit": "6" - }, - { - "type": "MapEntry", - "label": "kernel_26_4_27_8_28[j=0:256]", - "attributes": { - "label": "kernel_26_4_27_8_28", - "params": [ - "j" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "schedule": "GPU_ThreadBlock", - "debuginfo": { - "type": "DebugInfo", - "start_line": 28, - "end_line": 28, - "start_column": 12, - "end_column": 12, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "in_connectors": { - "IN___tmp_29_37_r_in_from_1_0": null, - "IN___tmp_29_58_r_in_from_1_0": null - }, - "out_connectors": { - "OUT___tmp_29_37_r_in_from_1_0": null, - "OUT___tmp_29_58_r_in_from_1_0": null - }, - "guid": "851c1aed-dd11-44c0-a861-e38e10854e37" - }, - "id": 7, - "scope_entry": "5", - "scope_exit": "8" - }, - { - "type": "MapExit", - "label": "kernel_26_4_27_8_28[j=0:256]", - "attributes": { - "in_connectors": { - "IN___tmp_29_16_w_out_of_1_1": null - }, - "out_connectors": { - "OUT___tmp_29_16_w_out_of_1_1": null - }, - "guid": "dbf82313-ebe2-4ccd-9130-359733ccfe16" - }, - "id": 8, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "Tasklet", - "label": "_Add_", - "attributes": { - "code": { - "string_data": "__out = (__in1 + __in2)", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 29, - "end_line": 29, - "start_column": 72, - "end_column": 72, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "label": "_Add_", - "in_connectors": { - "__in1": null, - "__in2": null - }, - "out_connectors": { - "__out": null - }, - "guid": "1d6a1f25-c807-4172-a42b-1a33f5ee1e75" - }, - "id": 9, - 
"scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "AccessNode", - "label": "__tmp3", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 29, - "end_line": 29, - "start_column": 72, - "end_column": 72, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "data": "__tmp3", - "guid": "25a5107c-0b90-4bd8-9deb-378f3a3463b2" - }, - "id": 10, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "Tasklet", - "label": "assign_29_16", - "attributes": { - "code": { - "string_data": "__out = __inp", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 29, - "end_line": 29, - "start_column": 30, - "end_column": 30, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "label": "assign_29_16", - "in_connectors": { - "__inp": null - }, - "out_connectors": { - "__out": null - }, - "guid": "34f9f783-0749-47eb-93db-97d53d173644" - }, - "id": 11, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "AccessNode", - "label": "shr_A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 48, - "end_line": 48, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" - }, - "data": "shr_A", - "guid": "3d052d35-1d25-4300-9969-30227e71974d" - }, - "id": 12, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "AccessNode", - "label": "shr_B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 48, - "end_line": 48, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" - }, - "data": "shr_B", - "guid": "d73f5a2f-40d2-411a-bb05-adedfe3e2166" - }, - "id": 13, - "scope_entry": "0", - "scope_exit": "1" - } - ], - "edges": [ - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256*ceiling(N/256)", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "6f946fb5-33ba-4489-8eac-8a96431fd08f", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256*ceiling(N/256)" - } - } - }, - "src": "2", - "dst": "0", - "dst_connector": "IN_A", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256*ceiling(N/256)", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "39f2af82-f36f-4955-a1d0-88813578a913", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256*ceiling(N/256)" - } - } - }, - "src": "3", - "dst": "0", - "dst_connector": "IN_B", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": 
"0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "22ac004d-8470-42c8-a6c5-3e9d8ec80b58", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "12", - "dst": "5", - "dst_connector": "IN___tmp_29_37_r_in_from_1_0_in_from_1_0", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_B", - "debuginfo": null, - "guid": "325ac1e2-6982-442c-a419-6bc987ac7c89", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "13", - "dst": "5", - "dst_connector": "IN___tmp_29_58_r_in_from_1_0_in_from_1_0", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp3", - "debuginfo": null, - "guid": "373cdd55-1fd6-4899-aa53-b4390f7eb5d1", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "10", - "dst": "11", - "dst_connector": "__inp", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "other_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "ebd0daa9-e1db-403e-9aa7-d420b50a67e3", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "0", - "dst": "12", - "dst_connector": null, - "src_connector": "OUT_A" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "other_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "6c6c4371-2f5f-4ce1-8d3e-9e103de2bf8c", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "0", - "dst": "13", - "dst_connector": null, - "src_connector": "OUT_B" - }, - { - "type": 
"MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256*ceiling(N/256)", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "b655475c-0841-4511-8a7e-0314a72ac3e9", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "256*ceiling(N/256)" - } - } - }, - "src": "1", - "dst": "4", - "dst_connector": null, - "src_connector": "OUT_C" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "cc08b6fa-9597-4e8d-b7ed-c8b640f64093", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "256" - } - } - }, - "src": "8", - "dst": "6", - "dst_connector": "IN___tmp_29_16_w_out_of_1_1_out_of_1_1", - "src_connector": "OUT___tmp_29_16_w_out_of_1_1" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "a052d8cc-d3fc-4ceb-9f77-5b9cafc2ca66", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "256" - } - } - }, - "src": "6", - "dst": "1", - "dst_connector": "IN_C", - "src_connector": "OUT___tmp_29_16_w_out_of_1_1_out_of_1_1" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "j + 256*k", - "end": "j + 256*k", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "95a58f89-1cfe-4e1f-b042-a165849dd0eb", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "j + 256*k", - "end": "j + 256*k", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "7", - "dst": "9", - "dst_connector": "__in1", - "src_connector": "OUT___tmp_29_37_r_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "256*k", - "end": "256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "2ff7abff-b07d-4f3f-89f1-877c46f9d44b", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "256*k", - "end": "256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "5", - "dst": "7", - "dst_connector": "IN___tmp_29_37_r_in_from_1_0", - "src_connector": "OUT___tmp_29_37_r_in_from_1_0_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - 
"attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "j + 256*k", - "end": "j + 256*k", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_B", - "debuginfo": null, - "guid": "2d56b2f0-9785-47d4-9aec-d79abfd11d5f", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "j + 256*k", - "end": "j + 256*k", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "7", - "dst": "9", - "dst_connector": "__in2", - "src_connector": "OUT___tmp_29_58_r_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "256*k", - "end": "256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_B", - "debuginfo": null, - "guid": "0a44e0d1-4899-490e-827b-31557145c45d", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "256*k", - "end": "256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "5", - "dst": "7", - "dst_connector": "IN___tmp_29_58_r_in_from_1_0", - "src_connector": "OUT___tmp_29_58_r_in_from_1_0_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp3", - "debuginfo": null, - "guid": "30d11bb4-e9e3-49d3-9e54-2bd7285f0136", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "9", - "dst": "10", - "dst_connector": null, - "src_connector": "__out" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i + j + 256*k", - "end": "i + j + 256*k", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "53b5e6e3-8ae7-4baf-91c2-94fbfdf344d5", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i + j + 256*k", - "end": "i + j + 256*k", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "11", - "dst": "8", - "dst_connector": "IN___tmp_29_16_w_out_of_1_1", - "src_connector": "__out" - } - ], - "attributes": { - "guid": "ee35ff06-cf28-4b20-985b-04aa7d3a2686", - "executions": "1", - "dynamic_executions": false - } - } - ], - "edges": [], - "collapsed": false, - "label": "", - "id": null, - "cfg_list_id": 0, - "start_block": 0, - "dace_version": "1.0.0" -} diff --git a/berkay_workpace/scratch/yakups_examples/smem_related/weird_global_to_global.sdfg b/berkay_workpace/scratch/yakups_examples/smem_related/weird_global_to_global.sdfg deleted file mode 100644 index afc8a6b4a8..0000000000 --- a/berkay_workpace/scratch/yakups_examples/smem_related/weird_global_to_global.sdfg +++ /dev/null @@ -1,1404 +0,0 @@ -{ - "type": "SDFG", - "attributes": { - "name": "kernel", - "arg_names": [ - "A", - "B", - "C" - ], - "_arrays": { - "A": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ 
- "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "B": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "C": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "__tmp3": { - "type": "Scalar", - "attributes": { - "dtype": "float64", - "shape": [ - "1" - ], - "transient": true, - "debuginfo": null - } - }, - "shr_A": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "256", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "256" - ], - "transient": true, - "storage": "GPU_Shared", - "debuginfo": null - } - }, - "shr_B": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "256", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "256" - ], - "transient": true, - "storage": "GPU_Shared", - "debuginfo": null - } - } - }, - "symbols": { - "N": "int64" - }, - "global_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "init_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "exit_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 20, - "end_line": 29, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "using_explicit_control_flow": true, - "guid": "29a0076f-cd9e-40c6-83b8-3792f2274970", - "hash": "6a91f9230ef48fd70071ef71d500050e80a256b5cf8904637d47d3b2ae04184f" - }, - "nodes": [ - { - "type": "SDFGState", - "label": "MapState", - "id": 0, - "collapsed": false, - "scope_dict": { - "-1": [ - 0, - 2, - 3, - 4 - ], - "0": [ - 1, - 5, - 12, - 13, - 14, - 15 - ], - "5": [ - 6, - 7 - ], - "7": [ - 8, - 9, - 10, - 11 - ] - }, - "nodes": [ - { - "type": "MapEntry", - "label": "kernel_26[i=0:N:256]", - "attributes": { - "label": "kernel_26", - "params": [ - "i" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "256", - "tile": "1" - } - ] - }, - "schedule": "GPU_Device", - "debuginfo": { - "type": "DebugInfo", - "start_line": 26, - "end_line": 26, - "start_column": 30, - "end_column": 30, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "in_connectors": { - "IN_A": null, - "IN_B": null - }, - "out_connectors": { - "OUT_A": null, - "OUT_B": null - }, - "guid": "f808be18-fafd-4130-8275-065342d16162" - }, - "id": 0, - "scope_entry": null, - "scope_exit": "1" - }, - { - "type": "MapExit", - "label": "kernel_26[i=0:N:256]", - "attributes": { - "in_connectors": { - "IN_C": null - }, - "out_connectors": { - "OUT_C": null - }, - "guid": "19fb21a2-5228-44dc-9921-5b0daaaa2bca" - }, - "id": 1, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "AccessNode", - "label": "A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 26, - "end_line": 26, - "start_column": 30, - "end_column": 30, - "filename": 
"/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "data": "A", - "guid": "0bdcca6e-5c4e-4971-837a-a83b7000befc" - }, - "id": 2, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 26, - "end_line": 26, - "start_column": 30, - "end_column": 30, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "data": "B", - "guid": "d97310f7-f1f4-45b0-9e32-33588cf45de2" - }, - "id": 3, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "C", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 26, - "end_line": 26, - "start_column": 30, - "end_column": 30, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "data": "C", - "guid": "e62d00c9-d840-4fcd-820b-2e6e6cc02db3" - }, - "id": 4, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "MapEntry", - "label": "kernel_26_4_27[k=0]", - "attributes": { - "label": "kernel_26_4_27", - "params": [ - "k" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "schedule": "Sequential", - "debuginfo": { - "type": "DebugInfo", - "start_line": 27, - "end_line": 27, - "start_column": 8, - "end_column": 8, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "in_connectors": { - "IN___tmp_29_37_r_in_from_1_0_in_from_1_0": null, - "IN___tmp_29_58_r_in_from_1_0_in_from_1_0": null - }, - "out_connectors": { - "OUT___tmp_29_37_r_in_from_1_0_in_from_1_0": null, - "OUT___tmp_29_58_r_in_from_1_0_in_from_1_0": null - }, - "guid": "18a4a94c-d93a-4787-aad4-f5f35504caf2" - }, - "id": 5, - "scope_entry": "0", - "scope_exit": "6" - }, - { - "type": "MapExit", - "label": "kernel_26_4_27[k=0]", - "attributes": { - "in_connectors": { - "IN___tmp_29_16_w_out_of_1_1_out_of_1_1": null - }, - "out_connectors": { - "OUT___tmp_29_16_w_out_of_1_1_out_of_1_1": null - }, - "guid": "504168ed-6581-4939-abf1-8533df45e0d6" - }, - "id": 6, - "scope_entry": "5", - "scope_exit": "6" - }, - { - "type": "MapEntry", - "label": "kernel_26_4_27_8_28[j=0:256]", - "attributes": { - "label": "kernel_26_4_27_8_28", - "params": [ - "j" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "schedule": "GPU_ThreadBlock", - "debuginfo": { - "type": "DebugInfo", - "start_line": 28, - "end_line": 28, - "start_column": 12, - "end_column": 12, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "in_connectors": { - "IN___tmp_29_37_r_in_from_1_0": null, - "IN___tmp_29_58_r_in_from_1_0": null - }, - "out_connectors": { - "OUT___tmp_29_37_r_in_from_1_0": null, - "OUT___tmp_29_58_r_in_from_1_0": null - }, - "guid": "b2a484ce-0085-4176-bb6a-f1883b41450e" - }, - "id": 7, - "scope_entry": "5", - "scope_exit": "8" - }, - { - "type": "MapExit", - "label": "kernel_26_4_27_8_28[j=0:256]", - "attributes": { - "in_connectors": { - "IN___tmp_29_16_w_out_of_1_1": null - }, - "out_connectors": { - "OUT___tmp_29_16_w_out_of_1_1": null - }, - "guid": "5b163845-29df-4006-bc69-bd20f003db25" - }, - "id": 8, 
- "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "Tasklet", - "label": "_Add_", - "attributes": { - "code": { - "string_data": "__out = (__in1 + __in2)", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 29, - "end_line": 29, - "start_column": 72, - "end_column": 72, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "label": "_Add_", - "in_connectors": { - "__in1": null, - "__in2": null - }, - "out_connectors": { - "__out": null - }, - "guid": "89e5325f-ec0d-4d3c-9377-cf96b2c7d445" - }, - "id": 9, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "AccessNode", - "label": "__tmp3", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 29, - "end_line": 29, - "start_column": 72, - "end_column": 72, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "data": "__tmp3", - "guid": "f089e4d9-82ab-443f-bd07-97db1bfcdbff" - }, - "id": 10, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "Tasklet", - "label": "assign_29_16", - "attributes": { - "code": { - "string_data": "__out = __inp", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 29, - "end_line": 29, - "start_column": 30, - "end_column": 30, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "label": "assign_29_16", - "in_connectors": { - "__inp": null - }, - "out_connectors": { - "__out": null - }, - "guid": "5d9c843f-085b-4c53-acc3-f8ea40cfdfda" - }, - "id": 11, - "scope_entry": "7", - "scope_exit": "8" - }, - { - "type": "AccessNode", - "label": "A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 32, - "end_line": 32, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" - }, - "data": "A", - "guid": "d53c93ef-ec10-4a01-8b4d-17ad5c6c6010" - }, - "id": 12, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "AccessNode", - "label": "shr_A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 33, - "end_line": 33, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" - }, - "data": "shr_A", - "guid": "ea981031-8002-4cab-be84-affd26c3fe30" - }, - "id": 13, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "AccessNode", - "label": "B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 32, - "end_line": 32, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" - }, - "data": "B", - "guid": "8eb37d53-7ea4-47eb-8082-d375d39320aa" - }, - "id": 14, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "AccessNode", - "label": "shr_B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 33, - "end_line": 33, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" - }, - "data": "shr_B", - "guid": "62cd600a-509d-4362-b577-c1fb48b7d3e6" - }, - "id": 15, - "scope_entry": "0", - "scope_exit": "1" - } - ], - 
"edges": [ - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "other_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "15aa0bf1-5e41-4e8c-9cbf-dbf3f2807f0b", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "12", - "dst": "13", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "other_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "ebc72b7f-f62d-4e50-96c9-3b61c345593b", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "14", - "dst": "15", - "dst_connector": null, - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256*ceiling(N/256)", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "31eeb421-3699-4131-bca0-927923ab3a01", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256*ceiling(N/256)" - } - } - }, - "src": "2", - "dst": "0", - "dst_connector": "IN_A", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256*ceiling(N/256)", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "3a919458-c1c6-49b9-a3ab-100f74087649", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256*ceiling(N/256)" - } - } - }, - "src": "3", - "dst": "0", - "dst_connector": "IN_B", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "eb547942-f49e-4344-8601-9ef92363566e", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, 
- "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "13", - "dst": "5", - "dst_connector": "IN___tmp_29_37_r_in_from_1_0_in_from_1_0", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_B", - "debuginfo": null, - "guid": "9b95e40b-23ca-4d63-8698-f72a5e191e9f", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "15", - "dst": "5", - "dst_connector": "IN___tmp_29_58_r_in_from_1_0_in_from_1_0", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp3", - "debuginfo": null, - "guid": "a19e5bc2-904d-4337-9e4b-83d41cfe29d5", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "10", - "dst": "11", - "dst_connector": "__inp", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "d1281909-b405-44f5-a31e-1f6d3d656aee", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "0", - "dst": "12", - "dst_connector": null, - "src_connector": "OUT_A" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "d6c0b98c-3ce7-40bd-ba2e-94ac0446b651", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "0", - "dst": "14", - "dst_connector": null, - "src_connector": "OUT_B" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256*ceiling(N/256)", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "7eb38a56-2770-46fc-81df-c0c386363b73", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "256*ceiling(N/256)" - } - } - }, - "src": "1", - "dst": "4", - "dst_connector": null, - "src_connector": "OUT_C" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - 
"volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "aaf84bab-0c41-4b0d-8886-da40683baa16", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i + 256*k", - "end": "i + 256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "256" - } - } - }, - "src": "8", - "dst": "6", - "dst_connector": "IN___tmp_29_16_w_out_of_1_1_out_of_1_1", - "src_connector": "OUT___tmp_29_16_w_out_of_1_1" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "4b2bd584-3030-4cfa-b4e3-cbd9263a1924", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i + 255", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "256" - } - } - }, - "src": "6", - "dst": "1", - "dst_connector": "IN_C", - "src_connector": "OUT___tmp_29_16_w_out_of_1_1_out_of_1_1" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "j + 256*k", - "end": "j + 256*k", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "6004ab2e-675e-4b25-8f4c-ac9bb8ebfec3", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "j + 256*k", - "end": "j + 256*k", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "7", - "dst": "9", - "dst_connector": "__in1", - "src_connector": "OUT___tmp_29_37_r_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "256*k", - "end": "256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "e9fcd9dd-119f-49b8-92f8-35dae9b73999", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "256*k", - "end": "256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "5", - "dst": "7", - "dst_connector": "IN___tmp_29_37_r_in_from_1_0", - "src_connector": "OUT___tmp_29_37_r_in_from_1_0_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "j + 256*k", - "end": "j + 256*k", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_B", - "debuginfo": null, - "guid": "1e6fa9ca-2a9e-4dfc-826c-868c214ac387", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "j + 256*k", - "end": "j + 256*k", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "7", - "dst": "9", - "dst_connector": "__in2", - "src_connector": "OUT___tmp_29_58_r_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "256", - "subset": { - "type": "Range", - 
"ranges": [ - { - "start": "256*k", - "end": "256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_B", - "debuginfo": null, - "guid": "98d23be2-1bf1-484e-85dc-a408af3c4acd", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "256*k", - "end": "256*k + 255", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "256" - } - } - }, - "src": "5", - "dst": "7", - "dst_connector": "IN___tmp_29_58_r_in_from_1_0", - "src_connector": "OUT___tmp_29_58_r_in_from_1_0_in_from_1_0" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp3", - "debuginfo": null, - "guid": "805ec629-7f1a-4533-9488-81aa404fef26", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "9", - "dst": "10", - "dst_connector": null, - "src_connector": "__out" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i + j + 256*k", - "end": "i + j + 256*k", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "5c3be07e-084d-45b5-95a2-39985435aa51", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i + j + 256*k", - "end": "i + j + 256*k", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "11", - "dst": "8", - "dst_connector": "IN___tmp_29_16_w_out_of_1_1", - "src_connector": "__out" - } - ], - "attributes": { - "guid": "1958e62b-83b6-42ac-9a59-bb49bb2433f4", - "executions": "1", - "dynamic_executions": false - } - } - ], - "edges": [], - "collapsed": false, - "label": "", - "id": null, - "cfg_list_id": 0, - "start_block": 0, - "dace_version": "1.0.0" -} diff --git a/berkay_workpace/scratch/yakups_examples/smem_related/weird_shared_to_shared_copy.sdfg b/berkay_workpace/scratch/yakups_examples/smem_related/weird_shared_to_shared_copy.sdfg deleted file mode 100644 index 5842b832b1..0000000000 --- a/berkay_workpace/scratch/yakups_examples/smem_related/weird_shared_to_shared_copy.sdfg +++ /dev/null @@ -1,896 +0,0 @@ -{ - "type": "SDFG", - "attributes": { - "name": "kernel", - "arg_names": [ - "A", - "B", - "C" - ], - "_arrays": { - "A": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "B": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "C": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "N", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "N" - ], - "storage": "GPU_Global", - "debuginfo": null - } - }, - "__tmp1": { - "type": "Scalar", - "attributes": { - "dtype": "float64", - "shape": [ - "1" - ], - "transient": true, - "debuginfo": null - } - }, - "shr_A": { - "type": "Array", - 
"attributes": { - "strides": [ - "1" - ], - "total_size": "32", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "32" - ], - "transient": true, - "storage": "GPU_Shared", - "debuginfo": null - } - }, - "shr_B": { - "type": "Array", - "attributes": { - "strides": [ - "1" - ], - "total_size": "32", - "offset": [ - "0" - ], - "optional": false, - "dtype": "float64", - "shape": [ - "32" - ], - "transient": true, - "storage": "GPU_Shared", - "debuginfo": null - } - } - }, - "symbols": { - "N": "int64" - }, - "global_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "init_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "exit_code": { - "frame": { - "string_data": "", - "language": "CPP" - } - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 95, - "end_line": 102, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "using_explicit_control_flow": true, - "guid": "6f84fc47-c268-49be-b4bf-b3db5d8f4afc", - "hash": "02c373fd95a88d2386204512c0f24ede7d6118e3675ac1cf6abf4aef43326074" - }, - "nodes": [ - { - "type": "SDFGState", - "label": "MapState", - "id": 0, - "collapsed": false, - "scope_dict": { - "-1": [ - 0, - 2, - 3, - 4 - ], - "0": [ - 1, - 5, - 6, - 7, - 8, - 9 - ] - }, - "nodes": [ - { - "type": "MapEntry", - "label": "kernel_101[i=0:N]", - "attributes": { - "label": "kernel_101", - "params": [ - "i" - ], - "range": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "schedule": "GPU_Device", - "debuginfo": { - "type": "DebugInfo", - "start_line": 101, - "end_line": 101, - "start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "in_connectors": { - "IN_A": null, - "IN_B": null - }, - "out_connectors": { - "OUT_A": null, - "OUT_B": null - }, - "guid": "65bd4c66-c8b1-4bb0-a886-38f7742d80d8" - }, - "id": 0, - "scope_entry": null, - "scope_exit": "1" - }, - { - "type": "MapExit", - "label": "kernel_101[i=0:N]", - "attributes": { - "in_connectors": { - "IN_C": null - }, - "out_connectors": { - "OUT_C": null - }, - "guid": "a4aeac5c-af3c-4250-bdae-6a9c6eed9d0c" - }, - "id": 1, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "AccessNode", - "label": "A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 101, - "end_line": 101, - "start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "data": "A", - "guid": "0f4fdd7b-2487-4972-be44-c65f2e9706a2" - }, - "id": 2, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 101, - "end_line": 101, - "start_column": 4, - "end_column": 4, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "data": "B", - "guid": "0d49a9f9-f91e-43ad-98b0-564a96450129" - }, - "id": 3, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "AccessNode", - "label": "C", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 101, - "end_line": 101, - "start_column": 4, - "end_column": 4, - "filename": 
"/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "data": "C", - "guid": "bfea091e-eac1-4556-835f-a8541497b4fb" - }, - "id": 4, - "scope_entry": null, - "scope_exit": null - }, - { - "type": "Tasklet", - "label": "_Add_", - "attributes": { - "code": { - "string_data": "__out = (__in1 + __in2)", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 102, - "end_line": 102, - "start_column": 32, - "end_column": 32, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "label": "_Add_", - "in_connectors": { - "__in1": null, - "__in2": null - }, - "out_connectors": { - "__out": null - }, - "guid": "f88baeb1-b915-4c5e-bffa-7ba8c5ecc856" - }, - "id": 5, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "AccessNode", - "label": "__tmp1", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 102, - "end_line": 102, - "start_column": 32, - "end_column": 32, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "data": "__tmp1", - "guid": "6dffddb3-1a08-41da-ba88-077ab159c935" - }, - "id": 6, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "Tasklet", - "label": "assign_102_16", - "attributes": { - "code": { - "string_data": "__out = __inp", - "language": "Python" - }, - "debuginfo": { - "type": "DebugInfo", - "start_line": 102, - "end_line": 102, - "start_column": 18, - "end_column": 18, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/shrmem_test.py" - }, - "label": "assign_102_16", - "in_connectors": { - "__inp": null - }, - "out_connectors": { - "__out": null - }, - "guid": "749cbeda-09f4-4238-9e12-034c49b8e6df" - }, - "id": 7, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "AccessNode", - "label": "shr_A", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 117, - "end_line": 117, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" - }, - "data": "shr_A", - "guid": "b6be5ff8-a202-495d-acf9-436948e970c1" - }, - "id": 8, - "scope_entry": "0", - "scope_exit": "1" - }, - { - "type": "AccessNode", - "label": "shr_B", - "attributes": { - "debuginfo": { - "type": "DebugInfo", - "start_line": 117, - "end_line": 117, - "start_column": 0, - "end_column": 0, - "filename": "/home/primrose/Work/DaceLayoutAndScheduleTransformations/layout_and_schedule_transformations/tests/test_utils.py" - }, - "data": "shr_B", - "guid": "b7ba403f-3139-4e7a-974f-44f41106ae37" - }, - "id": 9, - "scope_entry": "0", - "scope_exit": "1" - } - ], - "edges": [ - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "N", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "2368aa63-cb9d-4289-b1de-558cbc048fd4", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "N" - } - } - }, - "src": "2", - "dst": "0", - "dst_connector": "IN_A", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", 
- "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "N", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "7abc4434-9e30-4369-aad3-d1f6b7c8443f", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "N" - } - } - }, - "src": "3", - "dst": "0", - "dst_connector": "IN_B", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(i, 32)", - "end": "Mod(i, 32)", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_A", - "debuginfo": null, - "guid": "9e7599f9-608f-4c22-ba63-e13fa392c2a3", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(i, 32)", - "end": "Mod(i, 32)", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "8", - "dst": "5", - "dst_connector": "__in1", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(i, 32)", - "end": "Mod(i, 32)", - "step": "1", - "tile": "1" - } - ] - }, - "data": "shr_B", - "debuginfo": null, - "guid": "81574450-25db-4cd3-b433-d1e1ff842de8", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(i, 32)", - "end": "Mod(i, 32)", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "9", - "dst": "5", - "dst_connector": "__in2", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp1", - "debuginfo": null, - "guid": "400ec4c5-929d-4d54-8a49-0c699b2611dd", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": null, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "6", - "dst": "7", - "dst_connector": "__inp", - "src_connector": null - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - } - ] - }, - "other_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(i, 32)", - "end": "Mod(i, 32)", - "step": "1", - "tile": "1" - } - ] - }, - "data": "A", - "debuginfo": null, - "guid": "673015a0-2f78-45a9-a1a9-1030396e4f10", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(i, 32)", - "end": "Mod(i, 32)", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "0", - "dst": "8", - "dst_connector": null, - "src_connector": "OUT_A" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": 
"1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - } - ] - }, - "other_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(i, 32)", - "end": "Mod(i, 32)", - "step": "1", - "tile": "1" - } - ] - }, - "data": "B", - "debuginfo": null, - "guid": "b4750339-ce40-4ea4-a6ad-5e89c9860a1b", - "src_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - } - ] - }, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "Mod(i, 32)", - "end": "Mod(i, 32)", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": true, - "num_accesses": "1" - } - } - }, - "src": "0", - "dst": "9", - "dst_connector": null, - "src_connector": "OUT_B" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "N", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "5322f7ea-bfea-4063-9d7b-9b25e05d480c", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "N - 1", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "N" - } - } - }, - "src": "1", - "dst": "4", - "dst_connector": null, - "src_connector": "OUT_C" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "data": "__tmp1", - "debuginfo": null, - "guid": "2239ae7c-60ae-43dd-b95e-4f4534646662", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "0", - "end": "0", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "5", - "dst": "6", - "dst_connector": null, - "src_connector": "__out" - }, - { - "type": "MultiConnectorEdge", - "attributes": { - "data": { - "type": "Memlet", - "attributes": { - "volume": "1", - "subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - } - ] - }, - "data": "C", - "debuginfo": null, - "guid": "c9b5e42d-2483-40f5-a125-a1693a6687b0", - "src_subset": null, - "dst_subset": { - "type": "Range", - "ranges": [ - { - "start": "i", - "end": "i", - "step": "1", - "tile": "1" - } - ] - }, - "is_data_src": false, - "num_accesses": "1" - } - } - }, - "src": "7", - "dst": "1", - "dst_connector": "IN_C", - "src_connector": "__out" - } - ], - "attributes": { - "guid": "3b24f0bd-0925-4793-bb62-44efcd062222", - "executions": "1", - "dynamic_executions": false - } - } - ], - "edges": [], - "collapsed": false, - "label": "", - "id": null, - "cfg_list_id": 0, - "start_block": 0, - "dace_version": "1.0.0" -} diff --git a/berkay_workpace/tests/experimental_features_tests/warp_level_test.py b/berkay_workpace/tests/experimental_features_tests/warp_level_test.py deleted file mode 100644 index 78a4c7da72..0000000000 --- a/berkay_workpace/tests/experimental_features_tests/warp_level_test.py +++ /dev/null @@ -1,445 +0,0 @@ -import dace -import pytest -import cupy as cp - -from IPython.display import Code -from dace.config import Config - -####################### Testing correct mapping of indices to WarpIds ################## - - -# NOTE: Focus in these section is not on the tasklet (just used to have a simple -# verification 
option) and the SDFG is not correct, dataFlow to warps includes 32 elements -# and not only 1 element. But there is no support for correct representation (yet). However, -# the construction of the warpIds is not affected by this. Correct SDFGs appear in the next -# test section -@pytest.mark.gpu -@pytest.mark.parametrize("start, end, stride", [(0, 32, 1), (3, 16, 1), (5, 17, 3)]) -def test_warp_map_single_TB(start, end, stride): - - @dace.program - def simple_warp_map(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, - B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global): - """ - 1D check with different start, end and strides. - """ - for i in dace.map[0:1024:1024] @ dace.dtypes.ScheduleType.GPU_Device: - for j in dace.map[0:1024] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: - for _ in dace.map[start:end:stride] @ dace.dtypes.ScheduleType.GPU_Warp: - mask = 0xffffffff - value = A[j] - result = dace.define_local_scalar(dace.uint32) - with dace.tasklet(dace.Language.CPP): - inp_mask << mask - inp_value << value - out_result >> result - """ - out_result = __reduce_add_sync(inp_mask, inp_value); - """ - - B[j] = result - - sdfg = simple_warp_map.to_sdfg() - - A = cp.ones(1024, dtype=cp.uint32) - B = cp.zeros(1024, dtype=cp.uint32) - - sdfg(A=A, B=B) - - expected = cp.full(1024, 0, dtype=cp.uint32) - for tid in range(1024): - warpId = tid // 32 - if warpId in range(start, end, stride): - expected[tid] = 32 - - cp.testing.assert_array_equal(B, expected) - - -@pytest.mark.gpu -@pytest.mark.parametrize("start, end, stride", [(2, 16, 6), (3, 15, 3)]) -def test_warp_map_multiple_TB(start, end, stride): - - @dace.program - def multTB_warp_map(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, - B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global): - """ - The case where we have more than one ThreadBlock. - """ - for i in dace.map[0:1024:512] @ dace.dtypes.ScheduleType.GPU_Device: - for j in dace.map[0:512] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: - for _ in dace.map[start:end:stride] @ dace.dtypes.ScheduleType.GPU_Warp: - mask = 0xffffffff - value = A[i + j] - result = dace.define_local_scalar(dace.uint32) - with dace.tasklet(dace.Language.CPP): - inp_mask << mask - inp_value << value - out_result >> result - """ - out_result = __reduce_add_sync(inp_mask, inp_value); - """ - - B[i + j] = result - - sdfg = multTB_warp_map.to_sdfg() - - A = cp.ones(1024, dtype=cp.uint32) - B = cp.zeros(1024, dtype=cp.uint32) - - sdfg(A=A, B=B) - - expected = cp.full(1024, 0, dtype=cp.uint32) - for block_start in range(0, 1024, 512): - for tid in range(512): - warpId = tid // 32 - if warpId in range(start, end, stride): - expected[block_start + tid] = 32 - - cp.testing.assert_array_equal(B, expected) - - -@pytest.mark.gpu -@pytest.mark.parametrize("b1, e1, s1, b2, e2, s2", [ - (0, 4, 1, 0, 4, 1), - (0, 3, 2, 0, 5, 3), -]) -def test_warp_map_2D(b1, e1, s1, b2, e2, s2): - - @dace.program - def multTB_warp_map_2D(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, - B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global): - """ - Simple functionality check of 2D maps, focus is on 2D and less on multible TB. 
- """ - for i in dace.map[0:1024:512] @ dace.dtypes.ScheduleType.GPU_Device: - for j in dace.map[0:512] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: - for k, l in dace.map[b1:e1:s1, b2:e2:s2] @ dace.dtypes.ScheduleType.GPU_Warp: - mask = 0xffffffff - value = A[i + j] - result = dace.define_local_scalar(dace.uint32) - with dace.tasklet(dace.Language.CPP): - inp_mask << mask - inp_value << value - out_result >> result - """ - out_result = __reduce_add_sync(inp_mask, inp_value); - """ - - B[i + j] = result - - sdfg = multTB_warp_map_2D.to_sdfg() - - A = cp.ones(1024, dtype=cp.uint32) - B = cp.zeros(1024, dtype=cp.uint32) - - sdfg(A=A, B=B) - - # Check whether result is as expected - expected = cp.full(1024, 0, dtype=cp.uint32) - for block_start in range(0, 1024, 512): - for tid in range(512): - warpId = (tid // 32) - if warpId >= e1 * e2: - continue - warpIdx = (warpId % e2) - warpIdy = (warpId // e2) % e1 - if (warpIdx - b2) % s2 == 0 and (warpIdy - b1) % s1 == 0: - expected[block_start + tid] = 32 - - cp.testing.assert_array_equal(B, expected) - - -@pytest.mark.gpu -@pytest.mark.parametrize("b1, e1, s1, b2, e2, s2, b3, e3, s3", [ - (0, 4, 1, 0, 4, 2, 0, 2, 1), - (0, 3, 2, 1, 5, 3, 1, 2, 1), -]) -def test_warp_map_3D(b1, e1, s1, b2, e2, s2, b3, e3, s3): - - @dace.program - def warp_map_3D(A: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global, - B: dace.uint32[1024] @ dace.dtypes.StorageType.GPU_Global): - """ - Simple functionality check of 3D maps - """ - for i in dace.map[0:1024:1024] @ dace.dtypes.ScheduleType.GPU_Device: - for j in dace.map[0:1024] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: - for k, l, m in dace.map[b1:e1:s1, b2:e2:s2, b3:e3:s3] @ dace.dtypes.ScheduleType.GPU_Warp: - mask = 0xffffffff - value = A[i + j] - result = dace.define_local_scalar(dace.uint32) - with dace.tasklet(dace.Language.CPP): - inp_mask << mask - inp_value << value - out_result >> result - """ - out_result = __reduce_add_sync(inp_mask, inp_value); - """ - - B[i + j] = result - - sdfg = warp_map_3D.to_sdfg() - - A = cp.ones(1024, dtype=cp.uint32) - B = cp.zeros(1024, dtype=cp.uint32) - - sdfg(A=A, B=B) - - # Check whether result is as expected - expected = cp.full(1024, 0, dtype=cp.uint32) - for block_start in range(0, 1024, 1024): - for tid in range(1024): - warpId = (tid // 32) - if warpId >= e1 * e2 * e3: - continue - warpIdx = warpId % e3 - warpIdy = (warpId // e3) % e2 - warpIdz = (warpId // (e3 * e2)) % e1 - if ((warpIdx - b3) % s3 == 0 and warpIdx >= b3 and (warpIdy - b2) % s2 == 0 and warpIdx >= b2 - and (warpIdz - b1) % s1 == 0 and warpIdx >= b1): - expected[block_start + tid] = 32 - - cp.testing.assert_array_equal(B, expected) - - -@pytest.mark.gpu -@pytest.mark.parametrize("bs, ns", [(512, 1024), (1024, 2048)]) -def test_symbolic_warp_map(bs, ns): - - BS = dace.symbol('BS') - NS = dace.symbol('NS') - - START = dace.symbol('START') - WS = dace.symbol('WS') - STRIDE = dace.symbol('STRIDE') - - start = 2 - stride = 3 - ws = bs // 32 - - @dace.program - def symbolic_warp_map(A: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global, - B: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global): - """ - Focus is in the use of symbolic variables in the MAP. 
- """ - for i in dace.map[0:NS:BS] @ dace.dtypes.ScheduleType.GPU_Device: - for j in dace.map[0:BS] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: - - for k in dace.map[START:WS:STRIDE] @ dace.dtypes.ScheduleType.GPU_Warp: - mask = 0xffffffff - value = A[i + j] - result = dace.define_local_scalar(dace.uint32) - with dace.tasklet(dace.Language.CPP): - inp_mask << mask - inp_value << value - out_result >> result - """ - out_result = __reduce_add_sync(inp_mask, inp_value); - """ - - B[i + j] = result - - sdfg = symbolic_warp_map.to_sdfg() - - A = cp.ones(ns, dtype=cp.uint32) - B = cp.zeros(ns, dtype=cp.uint32) - - sdfg(A=A, B=B, START=start, WS=ws, STRIDE=stride, BS=bs, NS=ns) - - expected = cp.full(ns, 0, dtype=cp.uint32) - for block_start in range(0, ns, bs): - for tid in range(bs): - warpId = tid // 32 - if warpId in range(start, ws, stride): - expected[block_start + tid] = 32 - - cp.testing.assert_array_equal(B, expected) - - -@pytest.mark.gpu -def test_dynamic_warpSize_warp_map(): - - STRIDE = 3 # just smth else than 1, 1 is easy to pass - BS = dace.symbol('BS') - NS = dace.symbol('NS') - - bs = 1024 - ns = 2024 - - @dace.program - def symbolic_warp_map(A: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global, - B: dace.uint32[NS] @ dace.dtypes.StorageType.GPU_Global): - """ - What if warpSize is determined at runtime. - """ - for i in dace.map[0:NS:BS] @ dace.dtypes.ScheduleType.GPU_Device: - for j in dace.map[0:BS] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: - ws = bs // 32 - for k in dace.map[0:ws:STRIDE] @ dace.dtypes.ScheduleType.GPU_Warp: - mask = 0xffffffff - value = A[i + j] - result = dace.define_local_scalar(dace.uint32) - with dace.tasklet(dace.Language.CPP): - inp_mask << mask - inp_value << value - out_result >> result - """ - out_result = __reduce_add_sync(inp_mask, inp_value); - """ - - B[i + j] = result - - sdfg = symbolic_warp_map.to_sdfg() - - A = cp.ones(ns, dtype=cp.uint32) - B = cp.zeros(ns, dtype=cp.uint32) - - sdfg(A=A, B=B, BS=bs, NS=ns) - - expected = cp.full(ns, 0, dtype=cp.uint32) - for block_start in range(0, ns, bs): - for tid in range(bs): - ws = bs // 32 - warpId = tid // 32 - if warpId in range(0, ws, STRIDE): - expected[block_start + tid] = 32 - - cp.testing.assert_array_equal(B, expected) - - -####################### Testing simple warplevel programs ################# - - -@pytest.mark.gpu -def test_warp_reduce_add(): - """ - Best way to understand this is to copy paste it and - to look at the sdfg. A simple explanation: It tests whether - the most basic functionality of warp maps work and whether - we can use "__reduce_add_sync(mask, value)" on by definining a - custom tasklet. 
- """ - - # Generate framework - sdfg = dace.SDFG("Warp_test_1") - state = sdfg.add_state("main") - - # Generate access nodes - a_dev = sdfg.add_array("A", (32, ), dace.uint32, dace.dtypes.StorageType.GPU_Global) - b_dev = sdfg.add_array("B", (32, ), dace.uint32, dace.dtypes.StorageType.GPU_Global) - a_acc = state.add_access("A") - b_acc = state.add_access("B") - - # Generate maps, connect entries with access data - gpu_map_entry, gpu_map_exit = state.add_map(name="GPU_Map", - ndrange=dict(i='0:32:32'), - schedule=dace.dtypes.ScheduleType.GPU_Device) - state.add_edge(a_acc, None, gpu_map_entry, None, dace.memlet.Memlet('A[0:32]')) - - tblock_map_entry, tblock_map_exit = state.add_map(name="Block_Map", - ndrange=dict(j='0:32'), - schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock) - state.add_edge(gpu_map_entry, None, tblock_map_entry, None, dace.memlet.Memlet('A[0:32]')) - - tasklet, warp_scope_entry, warp_scope_exit = state.add_mapped_tasklet( - name='WarpLevel_Operation', - map_ranges=dict(_='0:1'), - inputs=dict(inp=dace.Memlet('A[0:32]', volume=32)), - code=""" -value = inp[j] -out = __reduce_add_sync(0xFFFFFFFF, value); -""", - outputs=dict(out=dace.Memlet("B[j]")), - schedule=dace.dtypes.ScheduleType.GPU_Warp) - - state.add_edge(tblock_map_entry, None, warp_scope_entry, None, dace.memlet.Memlet('A[0:32]')) - - # Connect Exit nodes - state.add_edge(warp_scope_exit, None, tblock_map_exit, None, dace.memlet.Memlet('B[j]')) - state.add_edge(tblock_map_exit, None, gpu_map_exit, None, dace.memlet.Memlet('B[j]')) - state.add_edge(gpu_map_exit, None, b_acc, None, dace.memlet.Memlet('B[0:32]')) - - sdfg.fill_scope_connectors() - - A = cp.ones(32, dtype=cp.uint32) - B = cp.zeros(32, dtype=cp.uint32) - - sdfg(A=A, B=B) - - all_32 = cp.full(32, 32, dtype=cp.uint32) - cp.testing.assert_array_equal(B, all_32) - - -@pytest.mark.gpu -def test_warp_shfl_op(): - """ - Best way to understand this is to copy paste it and - to look at the sdfg. A simple explanation: It tests now another - warpLevel primitive, namely __shfl_down_sync and __shfl_up_sync. 
- """ - sdfg = dace.SDFG("Warp_test_1") - state = sdfg.add_state("main") - - # Generate access nodes - a_dev = sdfg.add_array("A", (32, ), dace.uint32, dace.dtypes.StorageType.GPU_Global) - b_dev = sdfg.add_array("B", (32, ), dace.uint32, dace.dtypes.StorageType.GPU_Global) - a_acc = state.add_access("A") - b_acc = state.add_access("B") - - # Generate maps, connect entries with access data - gpu_map_entry, gpu_map_exit = state.add_map(name="GPU_Map", - ndrange=dict(i='0:32:32'), - schedule=dace.dtypes.ScheduleType.GPU_Device) - state.add_edge(a_acc, None, gpu_map_entry, None, dace.memlet.Memlet('A[0:32]')) - - tblock_map_entry, tblock_map_exit = state.add_map(name="Block_Map", - ndrange=dict(j='0:32'), - schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock) - state.add_edge(gpu_map_entry, None, tblock_map_entry, None, dace.memlet.Memlet('A[0:32]')) - - tasklet, warp_scope_entry, warp_scope_exit = state.add_mapped_tasklet( - name='WarpLevel_Operation', - map_ranges=dict(_='0:1'), - inputs=dict(inp=dace.Memlet('A[0:32]', volume=32)), - code=""" -tid = j; -value = inp[tid]; -up = __shfl_down_sync(0xFFFFFFFF, value, 16); -low = __shfl_up_sync(0xFFFFFFFF, value, 16); -if tid < 16: - value = up; -else: - value = low -out= value - -""", - outputs=dict(out=dace.Memlet("B[j]")), - schedule=dace.dtypes.ScheduleType.GPU_Warp) - - state.add_edge(tblock_map_entry, None, warp_scope_entry, None, dace.memlet.Memlet('A[0:32]')) - - # Connect Exit nodes - state.add_edge(warp_scope_exit, None, tblock_map_exit, None, dace.memlet.Memlet('B[j]')) - state.add_edge(tblock_map_exit, None, gpu_map_exit, None, dace.memlet.Memlet('B[j]')) - state.add_edge(gpu_map_exit, None, b_acc, None, dace.memlet.Memlet('B[0:32]')) - - sdfg.fill_scope_connectors() - - A = cp.array([0 if False else i for i in range(32)], dtype=cp.uint32) - B = cp.zeros(32, dtype=cp.uint32) - - sdfg(A=A, B=B) - - expected = cp.array(cp.concatenate((A[16:32], A[0:16]))) - cp.testing.assert_array_equal(B, expected) - - -if __name__ == '__main__': - - # Warnings are ignored - #test_warp_map(0, 32, 1) - pytest.main(["-v", "-p", "no:warnings", __file__]) - - # Use this if you want to see the warning - # pytest.main(["-v", __file__]) diff --git a/berkay_workpace/tests/gpu_map_tests/device_map_test.py b/berkay_workpace/tests/gpu_map_tests/device_map_test.py deleted file mode 100644 index 618b3073f4..0000000000 --- a/berkay_workpace/tests/gpu_map_tests/device_map_test.py +++ /dev/null @@ -1,153 +0,0 @@ -import dace -import random -import cupy as cp -import pytest - -from dace.config import Config - - -@pytest.mark.gpu -@pytest.mark.parametrize("vec_size", - [0, 15, 32, 67]) # default block size is 32, so these parameters handle interesting groups -def test_1d_maps_fixed_sizes(vec_size): - """ - Tests flat 1D vector copy from B to A using a single GPU_Device map (no thread blocking) for fixed size arrays. - The vector sizes are chosen to cover interesting cases considering a default block size is 32. 
- """ - - @dace.program - def vector_copy_flat(A: dace.float64[vec_size] @ dace.dtypes.StorageType.GPU_Global, - B: dace.float64[vec_size] @ dace.dtypes.StorageType.GPU_Global): - for i in dace.map[0:vec_size] @ dace.dtypes.ScheduleType.GPU_Device: - A[i] = B[i] - - sdfg = vector_copy_flat.to_sdfg() - - # Initialize random CUDA arrays - A = cp.zeros(vec_size, dtype=cp.float64) # Output array - B = cp.random.rand(vec_size).astype(cp.float64) # Input array - - # Ensure arrays differ at start - if vec_size != 0: - assert not cp.allclose(A, B), "Arrays are unexpectedly equal before copy." - - # Run the SDFG - sdfg(A=A, B=B) - - # Assert values match - cp.testing.assert_array_equal(A, B) - - -@pytest.mark.gpu -@pytest.mark.parametrize("n", [0, 15, 32, 67]) -def test_1d_maps_dynamic_sizes(n): - """ - Tests flat 1D vector copy from B to A using a single GPU_Device map (no thread blocking) for variable size arrays. - The vector sizes are chosen to cover interesting cases considering a default block size is 32. - """ - N = dace.symbol('N') - - @dace.program - def vector_copy_dyn_sizes(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, - B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global): - for i in dace.map[0:N] @ dace.dtypes.ScheduleType.GPU_Device: - A[i] = B[i] - - sdfg = vector_copy_dyn_sizes.to_sdfg() - - # Initialize random CUDA arrays - A = cp.zeros(n, dtype=cp.float64) # Output array - B = cp.random.rand(n).astype(cp.float64) # Input array - - # Ensure arrays differ at start - if n != 0: - assert not cp.allclose(A, B), "Arrays are unexpectedly equal before copy." - - sdfg(A=A, B=B, N=n) - - # Assert values match - cp.testing.assert_array_equal(A, B) - - -@pytest.mark.gpu -@pytest.mark.parametrize("s", [1, 2, 32, 33]) -def test_1d_maps_strides(s): - """ - Tests flat 1D vector copy from B to A using a single GPU_Device map (no thread blocking) for different strides. - N is variable in the sdfg/code but we just test for N = 67 here. - """ - N = dace.symbol('N') - n = 67 - - @dace.program - def vector_copy_strides(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, - B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global): - for i in dace.map[0:N:s] @ dace.dtypes.ScheduleType.GPU_Device: - A[i] = B[i] - - sdfg = vector_copy_strides.to_sdfg() - - # Initialize random CUDA arrays - A = cp.zeros(n, dtype=cp.float64) # Output array - B = cp.random.rand(n).astype(cp.float64) # Input array - - # Ensure arrays differ at start - if n != 0: - assert not cp.allclose(A, B), "Arrays are unexpectedly equal before copy." - - sdfg(A=A, B=B, N=n) - - # Check at stride positions: A[i] == B[i] - cp.testing.assert_array_equal(A[::s], B[::s]) - - # Check non-stride positions: A[i] == 0 - mask = cp.ones(n, dtype=bool) - mask[::s] = False - cp.testing.assert_array_equal(A[mask], cp.zeros_like(A[mask])) - - -@pytest.mark.gpu -@pytest.mark.parametrize("shape", [(2, 16), (3, 32)]) -def test_2d_maps_dynamic_sizes(shape): - """ - Tests 2D matrix copy from B to A using a GPU_Device map for variable-sized matrices. 
- """ - M = dace.symbol('M') - N = dace.symbol('N') - m, n = shape - - @dace.program - def matrix_copy(A: dace.float64[M, N] @ dace.dtypes.StorageType.GPU_Global, - B: dace.float64[M, N] @ dace.dtypes.StorageType.GPU_Global): - for i, j in dace.map[0:M, 0:N] @ dace.ScheduleType.GPU_Device: - A[i, j] = B[i, j] - - sdfg = matrix_copy.to_sdfg() - - # Initialize arrays - A = cp.zeros((m, n), dtype=cp.float64) - B = cp.random.rand(m, n).astype(cp.float64) - - # Ensure they differ at start - assert not cp.allclose(A, B), "Arrays are unexpectedly equal before copy." - - # Run the SDFG - sdfg(A=A, B=B, M=m, N=n) - - # Assert result - cp.testing.assert_array_equal(A, B) - - -# higher dimensions in old tests - -if __name__ == '__main__': - - print( - f"\n\n\033[94m[INFO] You are using the \033[92m{Config.get('compiler', 'cuda', 'implementation')}\033[94m CUDA implementation.\033[0m \n\n" - ) - - # Warnings are ignored - pytest.main(["-v", "-p", "no:warnings", __file__]) - - # Use this if you want to see the warning - # pytest.main(["-v", __file__]) diff --git a/berkay_workpace/tests/gpu_map_tests/threadBlock_test.py b/berkay_workpace/tests/gpu_map_tests/threadBlock_test.py deleted file mode 100644 index 9898875565..0000000000 --- a/berkay_workpace/tests/gpu_map_tests/threadBlock_test.py +++ /dev/null @@ -1,95 +0,0 @@ -import dace -import cupy as cp -import pytest - -from dace.config import Config - -# More tests at old tests, see /reusable_test - - -@pytest.mark.gpu -@pytest.mark.parametrize("vec_size, block_size, stride", [ - (32, 32, 2), - (64, 32, 4), - (67, 32, 2), - (128, 64, 8), -]) -def test_tb_map_strided(vec_size, block_size, stride): - """ - Tests strided copy from B to A using nested GPU maps: outer map with GPU_Device and - inner map with GPU_ThreadBlock. Only indices matching the stride are written. - """ - - N = dace.symbol('N') - - @dace.program - def vector_copy_strided(A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global, - B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global): - for i in dace.map[0:N:block_size] @ dace.dtypes.ScheduleType.GPU_Device: - for j in dace.map[0:block_size:stride] @ dace.dtypes.ScheduleType.GPU_ThreadBlock: - if i + j < N: - A[i + j] = B[i + j] - - sdfg = vector_copy_strided.to_sdfg() - - A = cp.zeros(vec_size, dtype=cp.float64) - B = cp.random.rand(vec_size).astype(cp.float64) - - assert not cp.allclose(A, B), "Arrays are unexpectedly equal at the start." - - sdfg(A=A, B=B, N=vec_size) - - # Check stride positions - cp.testing.assert_array_equal(A[::stride], B[::stride]) - - # Check untouched values (non-stride positions) - mask = cp.ones(vec_size, dtype=bool) - mask[::stride] = False - cp.testing.assert_array_equal(A[mask], cp.zeros_like(A[mask])) - - -@pytest.mark.gpu -@pytest.mark.parametrize("n", [40, 64, 100, 128, 149]) -def test_skewed_like_map_range_flat_add(n): - """ - Tests vector addition C = A + B using a skewed-style inner map: - outer GPU_Device map over blocks of size 32, and inner GPU_ThreadBlock map over absolute indices. 
- """ - - N = dace.symbol('N') - - @dace.program - def vadd_flat_skew_like(A: dace.float32[N] @ dace.StorageType.GPU_Global, - B: dace.float32[N] @ dace.StorageType.GPU_Global, - C: dace.float32[N] @ dace.StorageType.GPU_Global): - for i in dace.map[0:N:32] @ dace.ScheduleType.GPU_Device: - for j in dace.map[i:(i + 32)] @ dace.ScheduleType.GPU_ThreadBlock: - if j < N: - C[j] = A[j] + B[j] - - sdfg = vadd_flat_skew_like.to_sdfg() - - # Allocate test data - A = cp.random.rand(n).astype(cp.float32) - B = cp.random.rand(n).astype(cp.float32) - C = cp.zeros(n, dtype=cp.float32) - C_expected = A + B - - # Run the program - sdfg(A=A, B=B, C=C, N=n) - - # Validate output - cp.testing.assert_allclose(C, C_expected, rtol=1e-5, err_msg=f"Mismatch in output vector C for n={n}") - - -if __name__ == '__main__': - - print( - f"\n\n\033[94m[INFO] You are using the \033[92m{Config.get('compiler', 'cuda', 'implementation')}\033[94m CUDA implementation.\033[0m \n\n" - ) - - # Warnings are ignored - pytest.main(["-v", "-p", "no:warnings", __file__]) - - # Use this if you want to see the warning - # pytest.main(["-v", __file__]) diff --git a/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py b/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py deleted file mode 100644 index 509aa767e8..0000000000 --- a/berkay_workpace/tests/memcopy_tests/out_of_kernel_memcpy_test.py +++ /dev/null @@ -1,280 +0,0 @@ -import dace -import cupy as cp -import numpy as np -import pytest - -from dace.codegen import common -""" -NOTE: -This test suite focuses on GPU memory copies that are generated outside the kernel code using DaCe and aims to -remain backend-agnostic (CUDA/HIP). While HIP support has not been verified, care was taken to ensure tests are -not backend-specific. - -Design notes: -- A small number of test cases is used intentionally to avoid redundancy while still covering a broad set of scenarios. -- The test set alternates between different offsets, symbolic sizes, fixed sizes and different locations of the source and destination - (GPU or CPU) to simulate common usage patterns. -- At the time of writing, the DaCe Python frontend does not correctly translate some advanced slicing patterns - (e.g., `dst[b1:e1:s1] = src[b2:e2:s2]`) into valid SDFG representations. - Therefore, such cases are implemented directly through the SDFG API for full control and correctness. -""" - -BACKEND = common.get_gpu_backend() - - -#------------------ 1D Memory Copy Tests ----------------------- -@pytest.mark.gpu -def test_1d_out_of_kernel_memcpy(): - """ - Test simple 1D out-of-kernel memory copy. - The size of both arrays is symbolic, both are defined on - the GPU. 
- """ - # Symbolic array size - N = dace.symbol('N') - - sdfg = dace.SDFG("simple_1D_memory_copy") - state = sdfg.add_state("main") - - # Access nodes - sdfg.add_array("src", (N, ), dace.uint32, dace.dtypes.StorageType.GPU_Global) - sdfg.add_array("dst", (N, ), dace.uint32, dace.dtypes.StorageType.GPU_Global) - src_acc = state.add_access("src") - dst_acc = state.add_access("dst") - - # Create memlet/edge - state.add_edge(src_acc, None, dst_acc, None, dace.memlet.Memlet(expr='[0:N] -> dst[0:N]', volume=N)) - sdfg.fill_scope_connectors() - - # Check correctness - - # Initialize arrays on GPU - n = 100 - src = cp.ones(n, dtype=cp.uint32) - dst = cp.zeros(n, dtype=cp.uint32) - - # Run SDFG - sdfg(src=src, dst=dst, N=n) - - # Check generated code for correct memcpy usage - func_name = f"{BACKEND}MemcpyAsync" - kind = f"{BACKEND}MemcpyDeviceToDevice" - code = sdfg.generate_code()[0].code - assert func_name in code and kind in code - - # Check correctness - cp.testing.assert_array_equal(dst, src) - - -@pytest.mark.gpu -def test_1d_out_of_kernel_memcpy_strided(): - """ - Test strided 1D out-of-kernel memcpy. - Here, the copy shape is strided (different strides for source and destination) - and we use fixed sizes. Src is a CPU array, dst a GPU one. - """ - - sdfg = dace.SDFG("strided_1D_memory_copy") - state = sdfg.add_state("main") - - # Access nodes of fixed shapes - sdfg.add_array("src", (40, ), dace.uint32) - sdfg.add_array("dst", (20, ), dace.uint32, dace.dtypes.StorageType.GPU_Global) - src_acc = state.add_access("src") - dst_acc = state.add_access("dst") - - # copy is of the form: src[0:40:4] -> dst[0:20:2], Volume 10 - state.add_edge(src_acc, None, dst_acc, None, dace.memlet.Memlet('[0:40:4] -> dst[0:20:2]')) - sdfg.fill_scope_connectors() - - # Check correctness - - # Initialize arrays - src = np.ones(40, dtype=cp.uint32) - dst = cp.zeros(20, dtype=cp.uint32) - - # Run program - sdfg(src=src, dst=dst) - - # Check generated code for expected memcpy usage - # NOTE: Memcpy2DAsync is used! Check the codegen, neat trick :) - func_name = f"{BACKEND}Memcpy2DAsync" - kind = f"{BACKEND}MemcpyHostToDevice" - code = sdfg.generate_code()[0].code - assert func_name in code and kind in code - - #Check whether result is as expected - expected = cp.zeros(20, dtype=cp.uint32) - expected[::2] = 1 - cp.testing.assert_array_equal(expected, dst) - - -#------------------ 2D Memory Copy Tests ----------------------- -@pytest.mark.gpu -def test_2d_out_of_kernel_memcpy(): - """ - Test 2D out-of-kernel memcpy. - Here, the copy shape is contigous (copy contiguous src to contigous dst), - we use fixed sizes and only copy a subset of the array. - Source is on GPU, destination an array on CPU. - """ - sdfg = dace.SDFG("simple_2D_memory_copy") - state = sdfg.add_state("main") - - # Access nodes of fixed shape (5,10) - sdfg.add_array("src", ( - 5, - 10, - ), dace.uint32, dace.dtypes.StorageType.GPU_Global) - sdfg.add_array("dst", ( - 5, - 10, - ), dace.uint32) - src_acc = state.add_access("src") - dst_acc = state.add_access("dst") - - # Copying only subset of src to dst, i.e. 
src[2:4,5:8] -> dst[2:4,5:8] - state.add_edge(src_acc, None, dst_acc, None, dace.memlet.Memlet('[2:4,5:8] -> dst[2:4,5:8]')) - sdfg.fill_scope_connectors() - - # Check correctness - - # Initialize arrays - src = cp.ones((5, 10), dtype=cp.uint32) - dst = np.zeros((5, 10), dtype=cp.uint32) - - # Run program - sdfg(src=src, dst=dst) - - # Check generated code for expected memcpy usage - func_name = f"{BACKEND}Memcpy2DAsync" - kind = f"{BACKEND}MemcpyDeviceToHost" - code = sdfg.generate_code()[0].code - assert func_name in code and kind in code - - #Check whether result is as expected - expected = np.zeros((5, 10), dtype=cp.uint32) - expected[2:4, 5:8] = 1 - np.testing.assert_array_equal(dst, expected) - - -@pytest.mark.gpu -def test_2d_out_of_kernel_memcpy_one_strided(): - """ - Test strided 2D out-of-kernel memcpy. - Symbolic sizes are used, stride is non-contigous - only in one access node. - """ - - N = dace.symbol('N') - M = dace.symbol('M') - sdfg = dace.SDFG("one_strided_2D_memory_copy") - state = sdfg.add_state("main") - - # Access nodes - sdfg.add_array("src", ( - N, - 2 * M, - ), dace.uint32) - sdfg.add_array("dst", ( - N, - M, - ), dace.uint32, dace.dtypes.StorageType.GPU_Global) - src_acc = state.add_access("src") - dst_acc = state.add_access("dst") - - # the edge/memlet - state.add_edge(src_acc, None, dst_acc, None, dace.memlet.Memlet('[0:N,0:2*M:2] -> dst[0:N,0:M]')) - sdfg.fill_scope_connectors() - - # Check correctness - - # Initialize arrays - n = 3 - m = 10 - src = np.ones((n, 2 * m), dtype=cp.uint32) - dst = cp.zeros((n, m), dtype=cp.uint32) - - # Run program - sdfg(src=src, dst=dst, N=n, M=m) - - # Check generated code for expected memcpy usage - func_name = f"{BACKEND}Memcpy2DAsync" - kind = f"{BACKEND}MemcpyHostToDevice" - code = sdfg.generate_code()[0].code - assert func_name in code and kind in code - - #Check whether result is as expected - expected = cp.ones((n, m), dtype=cp.uint32) - cp.testing.assert_array_equal(dst, expected) - - -@pytest.mark.gpu -def test_2d_oofkmemcpy_strided(): - """ - Test strided 2D out-of-kernel memcpy. - """ - - sdfg = dace.SDFG("strided_2D_memory_copy") - state = sdfg.add_state("main") - - # Access nodes - sdfg.add_array("src", ( - 2, - 20, - ), dace.uint32, dace.dtypes.StorageType.GPU_Global) - sdfg.add_array("dst", ( - 2, - 10, - ), dace.uint32, dace.dtypes.StorageType.GPU_Global) - src_acc = state.add_access("src") - dst_acc = state.add_access("dst") - - # the edge/memlet - state.add_edge(src_acc, None, dst_acc, None, dace.memlet.Memlet('[0:2,0:20:10] -> dst[0:2,0:10:5]')) - sdfg.fill_scope_connectors() - - # Check correctness - - # Initialize arrays - src = cp.ones((2, 20), dtype=cp.uint32) - dst = cp.zeros((2, 10), dtype=cp.uint32) - - # Execute program - sdfg(src=src, dst=dst) - - # Compute expected result & verify - expected = cp.zeros((2, 10), dtype=cp.uint32) - expected[0:2, 0:10:5] = src[0:2, 0:20:10] - cp.testing.assert_array_equal(dst, expected) - - -# ---------- Higher-Dimensional (>2D) Memory Copy Tests -------- -@pytest.mark.gpu -def test_3d_oofkmemcpy(): - """ - Test simple 3D out-of-kernel memcpy. 
- """ - - sdfg = dace.SDFG("simple_3D_memory_copy") - state = sdfg.add_state("main") - - # Access nodes - sdfg.add_array("src", (2, 2, 4), dace.uint32, dace.dtypes.StorageType.GPU_Global) - sdfg.add_array("dst", (2, 2, 4), dace.uint32, dace.dtypes.StorageType.GPU_Global) - src_acc = state.add_access("src") - dst_acc = state.add_access("dst") - - # the edge/memlet - state.add_edge(src_acc, None, dst_acc, None, dace.memlet.Memlet('[0:2,0:2,0:4] -> dst[0:2,0:2,0:4]')) - sdfg.fill_scope_connectors() - - # Check correctness - - # Initialize arrays - src = cp.ones((2, 2, 4), dtype=cp.uint32) - dst = cp.zeros((2, 2, 4), dtype=cp.uint32) - - # run and check - sdfg(src=src, dst=dst) - cp.testing.assert_array_equal(dst, src) diff --git a/berkay_workpace/tests/reusable_tests/cuda_block_test.py b/berkay_workpace/tests/reusable_tests/cuda_block_test.py deleted file mode 100644 index aead6ea25f..0000000000 --- a/berkay_workpace/tests/reusable_tests/cuda_block_test.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. -import dace -from dace.transformation.dataflow import GPUTransformMap -from dace.transformation.interstate import GPUTransformSDFG -import numpy as np -import pytest - -N = dace.symbol('N') - - -@dace.program(dace.float64[N], dace.float64[N]) -def cudahello(V, Vout): - - @dace.mapscope(_[0:N:32]) - def multiplication(i): - - # I don't understand why this is here - # Probably will be removed later? - @dace.map(_[0:32]) - def mult_block(bi): - in_V << V[i + bi] - out >> Vout[i + bi] - out = in_V * 2 - - @dace.map(_[0:32]) - def mult_block_2(bi): - in_V << V[i + bi] - out >> Vout[i + bi] - out = in_V * 2 - - -def _test(sdfg): - N = 128 - - print('Vector double CUDA (block) %d' % (N)) - - V = dace.ndarray([N], dace.float64) - Vout = dace.ndarray([N], dace.float64) - V[:] = np.random.rand(N).astype(dace.float64.type) - Vout[:] = dace.float64(0) - - cudahello(V=V, Vout=Vout, N=N) - - diff = np.linalg.norm(2 * V - Vout) / N - print("Difference:", diff) - assert diff <= 1e-5 - - -def test_cpu(): - _test(cudahello.to_sdfg()) - - -@pytest.mark.gpu -def test_gpu(): - sdfg = cudahello.to_sdfg() - assert sdfg.apply_transformations(GPUTransformMap) == 1 - _test(sdfg) - - -@pytest.mark.gpu -def test_different_block_sizes_nesting(): - - @dace.program - def nested(V: dace.float64[34], v1: dace.float64[1]): - with dace.tasklet: - o >> v1(-1) - # Tasklet that does nothing - pass - - for i in dace.map[0:34]: - with dace.tasklet: - inp << V[i] - out >> v1(1, lambda a, b: a + b)[0] - out = inp + inp - - @dace.program - def nested2(V: dace.float64[34], v1: dace.float64[1]): - with dace.tasklet: - o >> v1(-1) - # Tasklet that does nothing - pass - - nested(V, v1) - - @dace.program - def diffblocks(V: dace.float64[130], v1: dace.float64[4], v2: dace.float64[128]): - for bi in dace.map[1:129:32]: - for i in dace.map[0:32]: - with dace.tasklet: - in_V << V[i + bi] - out >> v2[i + bi - 1] - out = in_V * 3 - - nested2(V[bi - 1:bi + 33], v1[bi // 32:bi // 32 + 1]) - - sdfg = diffblocks.to_sdfg() - assert sdfg.apply_transformations(GPUTransformSDFG, dict(sequential_innermaps=False)) == 1 - V = np.random.rand(130) - v1 = np.zeros([4], np.float64) - v2 = np.random.rand(128) - expected_v2 = V[1:129] * 3 - expected_v1 = np.zeros([4], np.float64) - for i in range(4): - expected_v1[i] = np.sum(V[i * 32:(i + 1) * 32 + 2]) * 2 - - sdfg(V, v1, v2) - assert np.linalg.norm(v1 - expected_v1) <= 1e-6 - assert np.allclose(v2, expected_v2) - - -@pytest.mark.gpu -def 
test_custom_block_size_onemap(): - - @dace.program - def tester(A: dace.float64[400, 300]): - for i, j in dace.map[0:400, 0:300]: - with dace.tasklet: - a >> A[i, j] - a = 1 - - sdfg = tester.to_sdfg() - sdfg.apply_gpu_transformations() - mapentry: dace.nodes.MapEntry = next(n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry)) - - # Test 1: too many dimensions - mapentry.map.gpu_block_size = (13, 5, 3, 4) - code = sdfg.generate_code()[1].clean_code # Get GPU code (second file) - assert 'dim3(13, 5, 12)' in code - - # Test 2: too few dimensions - mapentry.map.gpu_block_size = (127, 5) - code = sdfg.generate_code()[1].clean_code # Get GPU code (second file) - assert 'dim3(127, 5, 1)' in code - - # Test 3: compilation - sdfg.compile() - - -@pytest.mark.gpu -def test_custom_block_size_twomaps(): - - @dace.program - def tester(A: dace.float64[400, 300, 2, 32]): - for i, j in dace.map[0:400, 0:300]: - for bi, bj in dace.map[0:2, 0:32]: - with dace.tasklet: - a >> A[i, j, bi, bj] - a = 1 - - sdfg = tester.to_sdfg() - sdfg.apply_gpu_transformations(sequential_innermaps=True) - mapentry: dace.nodes.MapEntry = next( - n for n, _ in sdfg.all_nodes_recursive() - if isinstance(n, dace.nodes.MapEntry) and n.map.schedule == dace.ScheduleType.GPU_Device) - - mapentry.map.gpu_block_size = (127, 5) - code = sdfg.generate_code()[1].clean_code # Get GPU code (second file) - assert 'dim3(127, 5, 1)' in code - - # Test 3: compilation - sdfg.compile() - - -""" -# Not implemened -@pytest.mark.gpu -def test_block_thread_specialization(): - - @dace.program - def tester(A: dace.float64[200]): - for i in dace.map[0:200:32]: - for bi in dace.map[0:32]: - with dace.tasklet: - a >> A[i + bi] - a = 1 - with dace.tasklet: # Tasklet to be specialized - a >> A[i + bi] - a = 2 - - sdfg = tester.to_sdfg() - sdfg.apply_gpu_transformations(sequential_innermaps=False) - tasklet = next(n for n, _ in sdfg.all_nodes_recursive() - if isinstance(n, dace.nodes.Tasklet) and '2' in n.code.as_string) - tasklet.location['gpu_thread'] = dace.subsets.Range.from_string('2:9:3') - tasklet.location['gpu_block'] = 1 - - code = sdfg.generate_code()[1].clean_code # Get GPU code (second file) - assert '>= 2' in code and '<= 8' in code - assert ' == 1' in code - - a = np.random.rand(200) - ref = np.ones_like(a) - ref[32:64][2:9:3] = 2 - sdfg(a) - assert np.allclose(a, ref) -""" - -if __name__ == "__main__": - test_cpu() - test_gpu() - test_different_block_sizes_nesting() - test_custom_block_size_onemap() - test_custom_block_size_twomaps() - #test_block_thread_specialization() diff --git a/berkay_workpace/tests/reusable_tests/cuda_highdim_kernel_test.py b/berkay_workpace/tests/reusable_tests/cuda_highdim_kernel_test.py deleted file mode 100644 index fed6f72fe1..0000000000 --- a/berkay_workpace/tests/reusable_tests/cuda_highdim_kernel_test.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
-import dace -from dace.transformation.dataflow import GPUTransformMap -import numpy as np -import pytest - -# Symbols -N = dace.symbol('N') -M = dace.symbol('M') -K = dace.symbol('K') -L = dace.symbol('L') - -X = dace.symbol('X') -Y = dace.symbol('Y') -Z = dace.symbol('Z') -W = dace.symbol('W') -U = dace.symbol('U') - - -@dace.program -def highdim(A: dace.uint64[N, M, K, L, X, Y, Z, W, U], B: dace.uint64[N, M, K, L]): - - @dace.mapscope - def kernel(i: _[5:N - 5], j: _[0:M], k: _[7:K - 1], l: _[0:L]): - - @dace.map - def block(a: _[0:X], b: _[0:Y], c: _[1:Z], d: _[2:W - 1], e: _[0:U]): - input << A[i, j, k, l, a, b, c, d, e] - output >> B(1, lambda a, b: a + b)[i, j, k, l] - output = input - - -def makendrange(*args): - result = [] - for i in range(0, len(args), 2): - result.append(slice(args[i], args[i + 1], 1)) - return result - - -def _test(sdfg): - # 4D kernel with 5D block - N = 12 - M = 3 - K = 14 - L = 15 - X = 1 - Y = 2 - Z = 3 - W = 4 - U = 5 - dims = tuple(s for s in (N, M, K, L, X, Y, Z, W, U)) - outdims = tuple(s for s in (N, M, K, L)) - print('High-dimensional GPU kernel test', dims) - - A = dace.ndarray((N, M, K, L, X, Y, Z, W, U), dtype=dace.uint64) - B = dace.ndarray((N, M, K, L), dtype=dace.uint64) - A[:] = np.random.randint(10, size=dims).astype(np.uint64) - B[:] = np.zeros(outdims, dtype=np.uint64) - B_regression = np.zeros(outdims, dtype=np.uint64) - - # Equivalent python code - for i, j, k, l in dace.ndrange(makendrange(5, N - 5, 0, M, 7, K - 1, 0, L)): - for a, b, c, d, e in dace.ndrange(makendrange(0, X, 0, Y, 1, Z, 2, W - 1, 0, U)): - B_regression[i, j, k, l] += A[i, j, k, l, a, b, c, d, e] - - sdfg(A=A, B=B, N=N, M=M, K=K, L=L, X=X, Y=Y, Z=Z, W=W, U=U) - - diff = np.linalg.norm(B_regression - B) / (N * M * K * L) - print('Difference:', diff) - assert diff <= 1e-5 - - -def test_cpu(): - _test(highdim.to_sdfg()) - - -@pytest.mark.gpu -def test_gpu(): - sdfg = highdim.to_sdfg() - assert sdfg.apply_transformations(GPUTransformMap, options=dict(fullcopy=True)) == 1 - _test(sdfg) - - -@pytest.mark.gpu -def test_highdim_implicit_block(): - - @dace.program - def tester(x: dace.float64[32, 90, 80, 70]): - for i, j, k, l in dace.map[0:32, 0:90, 0:80, 0:70]: - x[i, j, k, l] = 2.0 - - # Create GPU SDFG - sdfg = tester.to_sdfg() - sdfg.apply_gpu_transformations() - - # Change map implicit block size - for node, _ in sdfg.all_nodes_recursive(): - if isinstance(node, dace.nodes.MapEntry): - node.map.gpu_block_size = [8, 2, 4] - - a = np.random.rand(32, 90, 80, 70) - sdfg(a) - assert np.allclose(a, 2) - - -@pytest.mark.gpu -def test_highdim_implicit_block_threadsplit(): - - @dace.program - def tester(x: dace.float64[2, 2, 80, 70]): - for i, j, k, l in dace.map[0:2, 0:2, 0:80, 0:70]: - x[i, j, k, l] = 2.0 - - # Create GPU SDFG - sdfg = tester.to_sdfg() - sdfg.apply_gpu_transformations() - - # Change map implicit block size - for node, _ in sdfg.all_nodes_recursive(): - if isinstance(node, dace.nodes.MapEntry): - node.map.gpu_block_size = [8, 2, 3] - - a = np.random.rand(2, 2, 80, 70) - sdfg(a) - assert np.allclose(a, 2) - - -def test_highdim_default_block_size(): - - @dace.program - def tester(a: dace.float64[1024, 1024] @ dace.StorageType.GPU_Global): - for i, j in dace.map[0:1024, 0:1024] @ dace.ScheduleType.GPU_Device: - a[i, j] = 1 - - with dace.config.set_temporary('compiler', 'cuda', 'default_block_size', value='32, 8, 2'): - with pytest.warns(UserWarning, match='has more dimensions'): - sdfg = tester.to_sdfg() - gpu_code = sdfg.generate_code()[1] - assert 'dim3(32, 16, 1)' 
in gpu_code.code - - -def test_block_size_mismatch_warning(): - - @dace.program - def tester(a: dace.float64[1024, 1024] @ dace.StorageType.GPU_Global): - for i, j in dace.map[0:512:2, 0:512:2] @ dace.ScheduleType.GPU_Device: - for bi, bj in dace.map[0:2, 0:2] @ dace.ScheduleType.GPU_ThreadBlock: - a[i + bi, j + bj] = 1 - for bi, bj in dace.map[0:2, 0:1] @ dace.ScheduleType.GPU_ThreadBlock: - a[i + bi, j + bj] = 1 - - sdfg = tester.to_sdfg() - with pytest.warns(UserWarning, match='Multiple thread-block maps'): - sdfg.generate_code() - - -def test_block_size_mismatch_error(): - - @dace.program - def tester(a: dace.float64[1024, 1024] @ dace.StorageType.GPU_Global): - for i, j in dace.map[0:512:2, 0:512:2] @ dace.ScheduleType.GPU_Device: - for bi, bj in dace.map[0:2, 0:2] @ dace.ScheduleType.GPU_ThreadBlock: - a[i + bi, j + bj] = 1 - - sdfg = tester.to_sdfg() - for n, _ in sdfg.all_nodes_recursive(): - if isinstance(n, dace.nodes.MapEntry) and n.schedule == dace.ScheduleType.GPU_Device: - n.gpu_block_size = [4, 2, 1] - - with pytest.raises(ValueError): - sdfg.generate_code() - - -def test_block_size_too_large(): - - @dace.program - def tester(a: dace.float64[1024, 1024] @ dace.StorageType.GPU_Global): - for i, j in dace.map[0:1024, 0:1024] @ dace.ScheduleType.GPU_Device: - a[i, j] = 1 - - sdfg = tester.to_sdfg() - for n, _ in sdfg.all_nodes_recursive(): - if isinstance(n, dace.nodes.MapEntry) and n.schedule == dace.ScheduleType.GPU_Device: - n.gpu_block_size = [64, 32, 1] - - with pytest.raises(ValueError): - sdfg.generate_code() - - -def test_highdim_block_size_too_large(): - BX, BY, BZ, BW = 64, 2, 2, 2 - - @dace.program - def tester(a: dace.float64[1024, 2, 2, 20] @ dace.StorageType.GPU_Global): - for i, j, k, l in dace.map[0:16, 0:1, 0:1, 0:10:2] @ dace.ScheduleType.GPU_Device: - for bi, bj, bk, bl in dace.map[0:BX, 0:BY, 0:BZ, 0:BW] @ dace.ScheduleType.GPU_ThreadBlock: - a[i + bi, j + bj, k + bk, l + bl] = 1 - - sdfg = tester.to_sdfg() - with pytest.raises(ValueError): - sdfg.generate_code() - - -if __name__ == "__main__": - # test_cpu() - # test_gpu() - test_highdim_implicit_block() - # test_highdim_implicit_block_threadsplit() - # test_highdim_default_block_size() - # test_block_size_mismatch_warning() - # test_block_size_mismatch_error() - # test_block_size_too_large() - # test_highdim_block_size_too_large() diff --git a/berkay_workpace/tests/reusable_tests/cuda_smem2d_test.py b/berkay_workpace/tests/reusable_tests/cuda_smem2d_test.py deleted file mode 100644 index 0b2225daef..0000000000 --- a/berkay_workpace/tests/reusable_tests/cuda_smem2d_test.py +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
-import dace -import numpy as np -from dace import nodes -from dace.transformation.dataflow import GPUTransformMap, InLocalStorage -import pytest - -H = dace.symbol('H') -W = dace.symbol('W') - - -@dace.program(dace.float64[H, W], dace.float64[H, W]) -def cudahello(V, Vout): - - @dace.mapscope(_[0:H:8, 0:W:32]) - def multiplication(i, j): - - @dace.map(_[0:8, 0:32]) - def mult_block(bi, bj): - in_V << V[i + bi, j + bj] - out >> Vout[i + bi, j + bj] - out = in_V * 2.0 - - -def _test(sdfg): - W = 128 - H = 64 - - print('Vector double CUDA (shared memory 2D) %dx%d' % (W, H)) - - V = dace.ndarray([H, W], dace.float64) - Vout = dace.ndarray([H, W], dace.float64) - V[:] = np.random.rand(H, W).astype(dace.float64.type) - Vout[:] = dace.float64(0) - - sdfg(V=V, Vout=Vout, H=H, W=W) - - diff = np.linalg.norm(2 * V - Vout) / (H * W) - print("Difference:", diff) - assert diff <= 1e-5 - - -def test_cpu(): - sdfg = cudahello.to_sdfg() - sdfg.name = "cuda_smem2d_cpu" - _test(sdfg) - - -@pytest.mark.gpu -def test_gpu(): - sdfg = cudahello.to_sdfg() - sdfg.name = "cuda_smem2d_gpu" - _test(sdfg) - - -@pytest.mark.gpu -def test_gpu_localstorage(): - sdfg = cudahello.to_sdfg() - sdfg.name = "cuda_smem2d_gpu_localstorage" - assert sdfg.apply_transformations([GPUTransformMap, InLocalStorage], options=[{}, {'array': 'gpu_V'}]) == 2 - _test(sdfg) - - -@pytest.mark.gpu -def test_gpu_2localstorage(): - - @dace.program - def addtwoandmult(A: dace.float64[H, W], B: dace.float64[H, W], Vout: dace.float64[H, W]): - for i, j in dace.map[0:H:8, 0:W:32]: - for bi, bj in dace.map[0:8, 0:32]: - with dace.tasklet: - a << A[i + bi, j + bj] - b << B[i + bi, j + bj] - out = (a + b) * 2.0 - out >> Vout[i + bi, j + bj] - - sdfg = addtwoandmult.to_sdfg() - sdfg.name = "cuda_2_smem2d_gpu_localstorage" - assert sdfg.apply_transformations([GPUTransformMap, InLocalStorage, InLocalStorage], - options=[{}, { - 'array': 'gpu_A' - }, { - 'array': 'gpu_B' - }]) == 3 - - A = np.random.rand(128, 64) - B = np.random.rand(128, 64) - out = np.random.rand(128, 64) - refout = (A + B) * 2 - sdfg(A, B, out, H=128, W=64) - assert np.allclose(refout, out) - - -@pytest.mark.gpu -def test_gpu_2shared_for(): - - @dace.program - def addtwoandmult(A: dace.float64[H, W], B: dace.float64[H, W], Vout: dace.float64[H, W]): - for i, j in dace.map[0:H:8, 0:W:32]: - for _ in range(1): - local_a = dace.ndarray([8, 32], dtype=dace.float64, storage=dace.StorageType.GPU_Shared) - local_b = dace.ndarray([8, 32], dtype=dace.float64, storage=dace.StorageType.GPU_Shared) - local_a << A[i:i + 8, j:j + 32] - local_b << B[i:i + 8, j:j + 32] - for bi, bj in dace.map[0:8, 0:32]: - with dace.tasklet: - a << local_a[bi, bj] - b << local_b[bi, bj] - out = (a + b) * 2.0 - out >> Vout[i + bi, j + bj] - - sdfg = addtwoandmult.to_sdfg() - sdfg.name = "cuda_2_shared_for" - state = sdfg.nodes()[0] - map_entry = -1 - for node in state.nodes(): - if isinstance(node, nodes.MapEntry) and 'i' in node.map.params: - map_entry = state.node_id(node) - break - transformation = GPUTransformMap() - transformation.setup_match(sdfg, 0, 0, {GPUTransformMap.map_entry: map_entry}, 0) - transformation.apply(state, sdfg) - - A = np.random.rand(128, 64) - B = np.random.rand(128, 64) - out = np.random.rand(128, 64) - refout = (A + B) * 2 - sdfg(A, B, out, H=128, W=64) - assert np.allclose(refout, out) - - -def _find_map_by_param(sdfg: dace.SDFG, pname: str) -> dace.nodes.MapEntry: - """ Finds the first map entry node by the given parameter name. 
""" - return next(n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, dace.nodes.MapEntry) and pname in n.params) - - -@pytest.mark.gpu -def test_gpu_2shared_map(): - K = dace.symbol('K') - - @dace.program - def addtwoandmult(A: dace.float64[H, W], B: dace.float64[H, W], Vout: dace.float64[H, W]): - for i, j in dace.map[0:H:8, 0:W:32]: - for _ in dace.map[0:K]: - local_a = dace.ndarray([8, 32], dtype=dace.float64, storage=dace.StorageType.GPU_Shared) - local_b = dace.ndarray([8, 32], dtype=dace.float64, storage=dace.StorageType.GPU_Shared) - local_a << A[i:i + 8, j:j + 32] - local_b << B[i:i + 8, j:j + 32] - for bi, bj in dace.map[0:8, 0:32]: - with dace.tasklet: - a << local_a[bi, bj] - b << local_b[bi, bj] - out = (a + b) * 2.0 - out >> Vout[i + bi, j + bj] - - sdfg = addtwoandmult.to_sdfg() - sdfg.name = "cuda_2_shared_map" - - me = _find_map_by_param(sdfg, '_') - me.schedule = dace.ScheduleType.Sequential - sdfg.apply_gpu_transformations() - me = _find_map_by_param(sdfg, 'bi') - me.schedule = dace.ScheduleType.GPU_ThreadBlock - - A = np.random.rand(128, 64) - B = np.random.rand(128, 64) - out = np.random.rand(128, 64) - refout = (A + B) * 2 - sdfg(A, B, out, H=128, W=64, K=1) - assert np.allclose(refout, out) - - -if __name__ == "__main__": - test_cpu() - test_gpu_2localstorage() - test_gpu_2shared_for() - test_gpu_2shared_map() diff --git a/berkay_workpace/tests/reusable_tests/cuda_smem_test.py b/berkay_workpace/tests/reusable_tests/cuda_smem_test.py deleted file mode 100644 index e7191a2631..0000000000 --- a/berkay_workpace/tests/reusable_tests/cuda_smem_test.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. - -import dace -from dace.transformation.dataflow import GPUTransformMap, InLocalStorage -from dace.transformation.passes import gpustream_scheduling -import numpy as np -import pytest - -N = dace.symbol('N') - - -@dace.program(dace.float64[N], dace.float64[N]) -def cudahello(A, Vout): - - @dace.mapscope(_[0:ceiling(N / 32)]) - def multiplication(i): - - @dace.map(_[i * 32:min(N, (i + 1) * 32)]) - def mult_block(bi): - in_V << A[bi] - out >> Vout[bi] - out = in_V * 2.0 - - -def _test(sdfg): - N = 144 - - print('Vector double CUDA (shared memory) %d' % (N)) - - V = dace.ndarray([N], dace.float64) - Vout = dace.ndarray([N], dace.float64) - V[:] = np.random.rand(N).astype(dace.float64.type) - Vout[:] = dace.float64(0) - - sdfg(A=V, Vout=Vout, N=N) - - diff = np.linalg.norm(2 * V - Vout) / N - print("Difference:", diff) - assert diff <= 1e-5 - - -def test_cpu(): - _test(cudahello.to_sdfg()) - - -@pytest.mark.gpu -def test_gpu(): - sdfg = cudahello.to_sdfg() - assert sdfg.apply_transformations(GPUTransformMap) == 1 - _test(sdfg) - - -@pytest.mark.gpu -def test_gpu_localstorage(): - sdfg = cudahello.to_sdfg() - assert sdfg.apply_transformations([GPUTransformMap, InLocalStorage], options=[{}, {'array': 'gpu_A'}]) == 2 - _test(sdfg) - - -if __name__ == "__main__": - test_cpu() - test_gpu() diff --git a/berkay_workpace/tests/reusable_tests/gpu_launch_bounds_test.py b/berkay_workpace/tests/reusable_tests/gpu_launch_bounds_test.py deleted file mode 100644 index 48ae32665c..0000000000 --- a/berkay_workpace/tests/reusable_tests/gpu_launch_bounds_test.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
- -import dace -import pytest - - -@pytest.mark.gpu -def test_launch_bounds_default(): - - @dace.program - def prog(a: dace.float64[100, 20] @ dace.StorageType.GPU_Global): - for i, j in dace.map[0:100, 0:20] @ dace.ScheduleType.GPU_Device: - a[i, j] = 1 - - with dace.config.set_temporary('compiler', 'cuda', 'default_block_size', value='32,2,1'): - assert '__launch_bounds__(64)' in prog.to_sdfg().generate_code()[1].code - - -@pytest.mark.gpu -def test_launch_bounds_implicit(): - - @dace.program - def prog(a: dace.float64[100, 20] @ dace.StorageType.GPU_Global): - for i, j in dace.map[0:50, 0:10] @ dace.ScheduleType.GPU_Device: - for bi, bj in dace.map[0:2, 0:2] @ dace.ScheduleType.GPU_ThreadBlock: - a[i * 2 + bi, j * 2 + bj] = 1 - - assert '__launch_bounds__(4)' in prog.to_sdfg().generate_code()[1].code - - -@pytest.mark.gpu -def test_launch_bounds_implicit_sym(): - B = dace.symbol('B') - - @dace.program - def prog(a: dace.float64[100, 20] @ dace.StorageType.GPU_Global): - for i, j in dace.map[0:50, 0:10] @ dace.ScheduleType.GPU_Device: - for bi, bj in dace.map[0:B, 0:B] @ dace.ScheduleType.GPU_ThreadBlock: - a[i * B + bi, j * B + bj] = 1 - - assert '__launch_bounds__' not in prog.to_sdfg().generate_code()[1].code - - -@pytest.mark.gpu -def test_launch_bounds_explicit(): - B = 2 - - @dace.program - def prog(a: dace.float64[100, 20] @ dace.StorageType.GPU_Global): - for i, j in dace.map[0:50, 0:10] @ dace.ScheduleType.GPU_Device: - for bi, bj in dace.map[0:B, 0:B] @ dace.ScheduleType.GPU_ThreadBlock: - a[i * B + bi, j * B + bj] = 1 - - sdfg = prog.to_sdfg() - for n, _ in sdfg.all_nodes_recursive(): - if isinstance(n, dace.nodes.MapEntry) and n.map.schedule == dace.ScheduleType.GPU_Device: - mapentry = n - break - - mapentry.map.gpu_launch_bounds = '-1' - assert '__launch_bounds__' not in sdfg.generate_code()[1].code - mapentry.map.gpu_launch_bounds = '5, 1' - assert '__launch_bounds__(5, 1)' in sdfg.generate_code()[1].code - - -if __name__ == '__main__': - test_launch_bounds_default() - test_launch_bounds_implicit() - test_launch_bounds_implicit_sym() - test_launch_bounds_explicit() diff --git a/berkay_workpace/tests/reusable_tests/halfvec_cudatest.py b/berkay_workpace/tests/reusable_tests/halfvec_cudatest.py deleted file mode 100644 index 8772d6b24d..0000000000 --- a/berkay_workpace/tests/reusable_tests/halfvec_cudatest.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. -""" Tests for half-precision syntax quirks. """ - -import dace -import math -import numpy as np -import pytest -from dace.transformation.dataflow import MapFusion, Vectorization -from dace.transformation.optimizer import Optimizer - -N = dace.symbol('N') - - -def _config(): - # Prerequisite for test: CUDA compute capability >= 6.0 - dace.Config.set('compiler', 'cuda', 'cuda_arch', value='60') - - -def _test_half(veclen): - """ Tests a set of elementwise operations on a vector half type. 
""" - _config() - - @dace.program - def halftest(A: dace.float16[N], B: dace.float16[N]): - return A * B + A - - A = np.random.rand(24).astype(np.float16) - B = np.random.rand(24).astype(np.float16) - sdfg = halftest.to_sdfg() - sdfg.simplify() - sdfg.apply_gpu_transformations() - - # Apply vectorization on each map and count applied - applied = 0 - for xform in Optimizer(sdfg).get_pattern_matches(patterns=Vectorization, - options=dict(vector_len=veclen, postamble=False)): - xform.apply(sdfg.node(xform.state_id), sdfg) - applied += 1 - assert applied == 2 - - out = sdfg(A=A, B=B, N=24) - assert np.allclose(out, A * B + A) - - -@pytest.mark.gpu -def test_half4(): - """ Tests a set of elementwise operations on half with vector length 4. """ - _test_half(4) - - -@pytest.mark.gpu -def test_half8(): - """ Tests a set of elementwise operations on half with vector length 8. """ - _test_half(8) - - -@pytest.mark.gpu -def test_exp_vec(): - """ Tests an exp operator on a vector half type. """ - _config() - - @dace.program - def halftest(A: dace.float16[N]): - out = np.ndarray([N], dace.float16) - for i in dace.map[0:N]: - with dace.tasklet: - a << A[i] - o >> out[i] - o = math.exp(a) - return out - - A = np.random.rand(24).astype(np.float16) - sdfg = halftest.to_sdfg() - sdfg.apply_gpu_transformations() - assert sdfg.apply_transformations(Vectorization, dict(vector_len=8)) == 1 - out = sdfg(A=A, N=24) - assert np.allclose(out, np.exp(A)) - - -@pytest.mark.gpu -def test_relu_vec(): - """ Tests a ReLU operator on a vector half type. """ - _config() - - @dace.program - def halftest(A: dace.float16[N]): - out = np.ndarray([N], dace.float16) - for i in dace.map[0:N]: - with dace.tasklet: - a << A[i] - o >> out[i] - o = max(a, dace.float16(0)) - return out - - A = np.random.rand(24).astype(np.float16) - sdfg = halftest.to_sdfg() - sdfg.apply_gpu_transformations() - assert sdfg.apply_transformations(Vectorization, dict(vector_len=8)) == 1 - out = sdfg(A=A, N=24) - assert np.allclose(out, np.maximum(A, 0)) - - -@pytest.mark.gpu -def test_dropout_vec(): - """ Tests a dropout operator on a vector half type. """ - _config() - - @dace.program - def halftest(A: dace.float16[N], mask: dace.float16[N]): - out = np.ndarray([N], dace.float16) - for i in dace.map[0:N]: - with dace.tasklet: - a << A[i] - d << mask[i] - o >> out[i] - o = a * d - return out - - A = np.random.rand(24).astype(np.float16) - mask = np.random.randint(0, 2, size=[24]).astype(np.float16) - sdfg: dace.SDFG = halftest.to_sdfg() - sdfg.apply_gpu_transformations() - assert sdfg.apply_transformations(Vectorization, dict(vector_len=8)) == 1 - out = sdfg(A=A, mask=mask, N=24) - assert np.allclose(out, A * mask) - - -@pytest.mark.gpu -def test_gelu_vec(): - """ Tests a GELU operator on a vector half type. 
""" - _config() - s2pi = math.sqrt(2.0 / math.pi) - - @dace.program - def halftest(A: dace.float16[N]): - out = np.ndarray([N], dace.float16) - for i in dace.map[0:N]: - with dace.tasklet: - a << A[i] - o >> out[i] - o = dace.float16(0.5) * a * (dace.float16(1) + - math.tanh(dace.float16(s2pi) * (a + dace.float16(0.044715) * (a**3)))) - return out - - A = np.random.rand(24).astype(np.float16) - sdfg = halftest.to_sdfg() - sdfg.apply_gpu_transformations() - assert sdfg.apply_transformations(Vectorization, dict(vector_len=4)) == 1 - out = sdfg(A=A, N=24) - expected = 0.5 * A * (1 + np.tanh(math.sqrt(2.0 / math.pi) * (A + 0.044715 * (A**3)))) - assert np.allclose(out, expected, rtol=1e-2, atol=1e-4) - - -if __name__ == '__main__': - test_half4() - test_half8() - test_exp_vec() - test_relu_vec() - test_dropout_vec() - test_gelu_vec() diff --git a/berkay_workpace/tests/reusable_tests/multiprogram_cudatest.py b/berkay_workpace/tests/reusable_tests/multiprogram_cudatest.py deleted file mode 100644 index 1b8dae3247..0000000000 --- a/berkay_workpace/tests/reusable_tests/multiprogram_cudatest.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. -import dace -from dace.transformation import optimizer -from dace.transformation.dataflow import GPUTransformMap -import numpy as np -import pytest - - -@dace.program -def prog1(A: dace.float32[32], B: dace.float32[32]): - - @dace.map - def work1(i: _[0:32]): - a << A[i] - b >> B[i] - b = a * 2.0 - - -@dace.program -def prog2(A: dace.float32[32], B: dace.float32[32]): - - @dace.map - def work2(i: _[0:32]): - a << A[i] - b >> B[i] - b = a / 2.0 - - -###################################### -@pytest.mark.gpu -def test_multiprogram(): - print('Multi-program CUDA test') - - A = np.random.rand(32).astype(np.float32) - B = np.random.rand(32).astype(np.float32) - C = np.random.rand(32).astype(np.float32) - - s1 = prog1.to_sdfg() - s1.apply_transformations(GPUTransformMap) - - s2 = prog2.to_sdfg() - s2.apply_transformations(GPUTransformMap) - - s1func = s1.compile() - s2func = s2.compile() - - s1func(A=A, B=B) - s2func(A=B, B=C) - - diff = np.linalg.norm(A - C) - - print('Difference:', diff) - assert diff <= 1e-5 - - -if __name__ == '__main__': - test() diff --git a/berkay_workpace/tests/reusable_tests/multistream_copy_cudatest.py b/berkay_workpace/tests/reusable_tests/multistream_copy_cudatest.py deleted file mode 100644 index df307d9958..0000000000 --- a/berkay_workpace/tests/reusable_tests/multistream_copy_cudatest.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
-import dace -import numpy as np -import pytest - - -###################################### -@pytest.mark.gpu -def test_multistream_copy(): - sdfg = dace.SDFG('multistream') - - _, A = sdfg.add_array('A', [2], dace.float32, storage=dace.StorageType.CPU_Pinned) - _, B = sdfg.add_array('B', [2], dace.float32, storage=dace.StorageType.CPU_Pinned) - _, C = sdfg.add_array('C', [2], dace.float32, storage=dace.StorageType.CPU_Pinned) - - gA = sdfg.add_transient('gA', [2], dace.float32, storage=dace.StorageType.GPU_Global) - gB = sdfg.add_transient('gB', [2], dace.float32, storage=dace.StorageType.GPU_Global) - gC = sdfg.add_transient('gC', [2], dace.float32, storage=dace.StorageType.GPU_Global) - - state = sdfg.add_state('s0') - - a1 = state.add_read('A') - a2 = state.add_access('gA') - - b1 = state.add_read('B') - b2 = state.add_access('gB') - - c1 = state.add_access('gC') - c2 = state.add_write('C') - - state.add_nedge(a1, a2, dace.Memlet.from_array('A', A)) - state.add_nedge(b1, b2, dace.Memlet.from_array('B', B)) - state.add_nedge(c1, c2, dace.Memlet.from_array('C', C)) - - state.add_nedge(a2, c1, dace.Memlet.simple('gA', '0')) - state.add_nedge(b2, c1, dace.Memlet.simple('gB', '1', other_subset_str='1')) - - # Validate correctness of initial SDFG - sdfg.validate() - - a = np.random.rand(2).astype(np.float32) - b = np.random.rand(2).astype(np.float32) - c = np.random.rand(2).astype(np.float32) - - sdfg(A=a, B=b, C=c) - - refC = np.array([a[0], b[1]], dtype=np.float32) - diff = np.linalg.norm(c - refC) - print('Difference:', diff) - assert diff <= 1e-5 - - -@pytest.mark.gpu -def test_copy_sync(): - sdfg = dace.SDFG('h2dsync') - sdfg.add_scalar('scal_outer', dace.float32) - sdfg.add_scalar('gpu_scal_outer', dace.float32, dace.StorageType.GPU_Global, transient=True) - sdfg.add_array('output_outer', [1], dace.float32) - - nsdfg = dace.SDFG('nested') - nsdfg.add_scalar('gpu_scal', dace.float32, dace.StorageType.GPU_Global) - nsdfg.add_scalar('cpu_scal', dace.float32, transient=True) - nsdfg.add_array('output', [1], dace.float32) - - nstate = nsdfg.add_state() - r = nstate.add_read('gpu_scal') - a = nstate.add_access('cpu_scal') - nt = nstate.add_tasklet('addone', {'inp'}, {'out'}, 'out = inp + 1') - w = nstate.add_write('output') - nstate.add_nedge(r, a, dace.Memlet('gpu_scal')) - nstate.add_edge(a, None, nt, 'inp', dace.Memlet('cpu_scal')) - nstate.add_edge(nt, 'out', w, None, dace.Memlet('output')) - - state = sdfg.add_state() - r = state.add_read('scal_outer') - w = state.add_write('gpu_scal_outer') - state.add_nedge(r, w, dace.Memlet('scal_outer')) - - state = sdfg.add_state_after(state) - ro = state.add_read('gpu_scal_outer') - wo = state.add_write('output_outer') - nsdfg_node = state.add_nested_sdfg(nsdfg, None, {'gpu_scal'}, {'output'}) - state.add_edge(ro, None, nsdfg_node, 'gpu_scal', dace.Memlet('gpu_scal_outer')) - state.add_edge(nsdfg_node, 'output', wo, None, dace.Memlet('output_outer')) - - out = np.random.rand(1).astype(np.float32) - sdfg(scal_outer=np.float32(2), output_outer=out) - assert np.allclose(out, 3) - - -if __name__ == '__main__': - test_multistream_copy() - test_copy_sync() diff --git a/berkay_workpace/tests/reusable_tests/multistream_kernel_cudatest.py b/berkay_workpace/tests/reusable_tests/multistream_kernel_cudatest.py deleted file mode 100644 index f1451003ac..0000000000 --- a/berkay_workpace/tests/reusable_tests/multistream_kernel_cudatest.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright 2019-2021 ETH Zurich and the DaCe authors. All rights reserved. 
-import dace -import numpy as np -import pytest - -sdfg = dace.SDFG('multistream_kernel') - -sdfg.add_array('A', [2], dace.float32, storage=dace.StorageType.CPU_Pinned) -sdfg.add_array('B', [2], dace.float32, storage=dace.StorageType.CPU_Pinned) -sdfg.add_array('C', [2], dace.float32, storage=dace.StorageType.CPU_Pinned) - -sdfg.add_transient('gA1', [2], dace.float32, storage=dace.StorageType.GPU_Global) -sdfg.add_transient('gA2', [2], dace.float32, storage=dace.StorageType.GPU_Global) -sdfg.add_transient('gB1', [2], dace.float32, storage=dace.StorageType.GPU_Global) -sdfg.add_transient('gB2', [2], dace.float32, storage=dace.StorageType.GPU_Global) -sdfg.add_transient('gC', [2], dace.float32, storage=dace.StorageType.GPU_Global) - -state = sdfg.add_state('s0') - -a = state.add_read('A') -ga1 = state.add_access('gA1') -ga2 = state.add_access('gA2') -state.add_nedge(a, ga1, dace.Memlet.simple('A', '0:2')) - -b = state.add_read('B') -gb1 = state.add_access('gB1') -gb2 = state.add_access('gB2') -state.add_nedge(b, gb1, dace.Memlet.simple('B', '0:2')) - -gc = state.add_access('gC') -c = state.add_write('C') -state.add_nedge(gc, c, dace.Memlet.simple('gC', '0:2')) - -t1, me1, mx1 = state.add_mapped_tasklet('addone', dict(i='0:2'), - dict(inp=dace.Memlet.simple('gA1', 'i')), 'out = inp + 1', - dict(out=dace.Memlet.simple('gA2', 'i')), dace.ScheduleType.GPU_Device) -t2, me2, mx2 = state.add_mapped_tasklet('addtwo', dict(i='0:2'), - dict(inp=dace.Memlet.simple('gB1', 'i')), 'out = inp + 2', - dict(out=dace.Memlet.simple('gB2', 'i')), dace.ScheduleType.GPU_Device) - -t2, me3, mx3 = state.add_mapped_tasklet('twoarrays', dict(i='0:2'), - dict(inp1=dace.Memlet.simple('gA2', 'i'), - inp2=dace.Memlet.simple('gB2', 'i')), 'out = inp1 * inp2', - dict(out=dace.Memlet.simple('gC', 'i')), dace.ScheduleType.GPU_Device) - -state.add_nedge(ga1, me1, dace.Memlet.simple('gA1', '0:2')) -state.add_nedge(gb1, me2, dace.Memlet.simple('gB1', '0:2')) -state.add_nedge(mx1, ga2, dace.Memlet.simple('gA2', '0:2')) -state.add_nedge(mx2, gb2, dace.Memlet.simple('gB2', '0:2')) - -state.add_nedge(ga2, me3, dace.Memlet.simple('gA2', '0:2')) -state.add_nedge(gb2, me3, dace.Memlet.simple('gB2', '0:2')) -state.add_nedge(mx3, gc, dace.Memlet.simple('gC', '0:2')) - -sdfg.fill_scope_connectors() - -# Validate correctness of initial SDFG -sdfg.validate() - - -###################################### -@pytest.mark.gpu -def test_multistream_kernel(): - print('Multi-stream kernel test') - - a = np.random.rand(2).astype(np.float32) - b = np.random.rand(2).astype(np.float32) - c = np.random.rand(2).astype(np.float32) - - sdfg(A=a, B=b, C=c) - - refC = (a + 1) * (b + 2) - diff = np.linalg.norm(c - refC) - print('Difference:', diff) - assert diff <= 1e-5 - - -if __name__ == "__main__": - test_multistream_kernel() diff --git a/berkay_workpace/tests/smem_tests/default_smem_sync_pass_test.py b/berkay_workpace/tests/smem_tests/default_smem_sync_pass_test.py deleted file mode 100644 index 554c7b68a2..0000000000 --- a/berkay_workpace/tests/smem_tests/default_smem_sync_pass_test.py +++ /dev/null @@ -1,337 +0,0 @@ -import dace -import dace.sdfg.nodes as nodes -from dace.sdfg.state import LoopRegion -from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync - -import pytest -""" -Simple tests checking core functionality of the "DefaultSharedMemorySync" pass. -""" - - -@pytest.mark.gpu -def test_scalar_multiplic(): - """ - Constructs an SDFG that performs scalar multiplication on a vector. 
- - In this test, a sequential loop is placed inside the GPU kernel, reusing shared memory. - As a result, the 'DefaultSharedMemorySync' pass should insert a "__syncthreads();" - at the end of each iteration to ensure correctness. - - Note: This test is designed to evaluate where the 'DefaultSharedMemorySync' pass places - synchronization tasklets. In this particular example, the inserted synchronizations are - not strictly necessary and could be avoided with more advanced analysis, which is beyond - the scope of this pass. - """ - - #----------------- Build test program/SDFG-------------------- - - # Create SDFG and state - sdfg = dace.SDFG("scalarMultiplication_smem") - state = sdfg.add_state("main") - - # Add arrays - sdfg.add_array("A", (128, ), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global) - sdfg.add_scalar("scalar", dace.uint32) - sdfg.add_array("S", (32, ), - dace.uint32, - storage=dace.dtypes.StorageType.GPU_Shared, - transient=True, - lifetime=dace.dtypes.AllocationLifetime.Scope) - - # Add access nodes - a_acc = state.add_read("A") - a_store = state.add_write("A") - scalar_acc = state.add_access("scalar") - s_acc = state.add_access("S") - - # Sequential map (outermost) - seq_map_entry, seq_map_exit = state.add_map( - "seq_map", - dict(k="0:4"), - schedule=dace.dtypes.ScheduleType.Sequential, - ) - - # GPU Device map - gpu_map_entry, gpu_map_exit = state.add_map( - "gpu_map", - dict(i="0:32:32"), - schedule=dace.dtypes.ScheduleType.GPU_Device, - ) - - # GPU TB map - tb_map_entry, tb_map_exit = state.add_map( - "tb", - dict(j="0:32"), - schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock, - ) - - # Add tasklets for A -> S -> B - tasklet1 = state.add_tasklet("addMult", - inputs={"__inp_A", "__inp_scalar"}, - outputs={"__out"}, - code="__out = __inp_A * __inp_scalar;", - language=dace.dtypes.Language.CPP) - - tasklet2 = state.add_tasklet("store_to_global", - inputs={"__inp"}, - outputs={"__out"}, - code="__out = __inp;", - language=dace.dtypes.Language.CPP) - - # Edges - - # A and scalar to first map - state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet("A[0:128]")) - state.add_edge(scalar_acc, None, gpu_map_entry, None, dace.Memlet("scalar[0]")) - - # Add both down to last map, the threadblock map - state.add_edge(gpu_map_entry, None, seq_map_entry, None, dace.Memlet("A[0:128]")) - state.add_edge(gpu_map_entry, None, seq_map_entry, None, dace.Memlet("scalar[0]")) - state.add_edge(seq_map_entry, None, tb_map_entry, None, dace.Memlet("A[32 * k: 32 * (k+1)]")) - state.add_edge(seq_map_entry, None, tb_map_entry, None, dace.Memlet("scalar[0]")) - - # connect to tasklets - state.add_edge(tb_map_entry, None, tasklet1, "__inp_A", dace.Memlet("A[j + 32* k]")) - state.add_edge(tb_map_entry, None, tasklet1, "__inp_scalar", dace.Memlet("scalar[0]")) - state.add_edge(tasklet1, "__out", s_acc, None, dace.Memlet("S[j]")) - state.add_edge(s_acc, None, tasklet2, "__inp", dace.Memlet("S[j]")) - - # connect to all map exit nodes and then back to A to store back - state.add_edge(tasklet2, "__out", tb_map_exit, None, dace.Memlet("A[j + 32* k]")) - state.add_edge(tb_map_exit, None, seq_map_exit, None, dace.Memlet("A[32 * k: 32 * (k+1)]")) - state.add_edge(seq_map_exit, None, gpu_map_exit, None, dace.Memlet("A[0:128]")) - state.add_edge(gpu_map_exit, None, a_store, None, dace.Memlet("A[0:128]")) - - sdfg.fill_scope_connectors() - - #----------------- Apply pass -------------------- - - DefaultSharedMemorySync().apply_pass(sdfg, None) - - #----------------- Check correct insertion of sync 
tasklets -------------------- - - # s_acc has a sync tasklet successor - found = None - for succ in state.successors(s_acc): - if (hasattr(succ, "_label") and succ._label == "pre_sync_barrier" and isinstance(succ, nodes.Tasklet) - and "__syncthreads();" in succ.code.code): - found = succ - break - - assert found is not None, "There should be a synchronization tasklet after the shared memory access" - - # smem is reused in seq map, so we need synchronization after each iteration - found = None - for pred in state.predecessors(seq_map_exit): - if (hasattr(pred, "_label") and pred._label == "post_sync_barrier" and isinstance(pred, nodes.Tasklet) - and "__syncthreads();" in pred.code.code): - found = pred - break - - assert found is not None, "There should be a synchronization tasklet after each iteration of the sequential map" - - -@pytest.mark.gpu -def test_scalar_multiplic_special(): - """ - Constructs an SDFG that performs scalar multiplication on a vector. - - Similar to 'test_scalar_multiplic()', but now, since the sequential map - only iterates once, there is no post synchronization required and should be - omitted (although having it would not lead to wrong computations). - - """ - - #----------------- Build test program/SDFG-------------------- - - # Create SDFG and state - sdfg = dace.SDFG("scalarMultiplication_smem") - state = sdfg.add_state("main") - - # Add arrays - sdfg.add_array("A", (32, ), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global) - sdfg.add_scalar("scalar", dace.uint32) - sdfg.add_array("S", (32, ), - dace.uint32, - storage=dace.dtypes.StorageType.GPU_Shared, - transient=True, - lifetime=dace.dtypes.AllocationLifetime.Scope) - - # Add access nodes - a_acc = state.add_read("A") - a_store = state.add_write("A") - scalar_acc = state.add_access("scalar") - s_acc = state.add_access("S") - - # Sequential map (outermost) - seq_map_entry, seq_map_exit = state.add_map( - "seq_map", - dict(k="0:1"), - schedule=dace.dtypes.ScheduleType.Sequential, - ) - - # GPU Device map - gpu_map_entry, gpu_map_exit = state.add_map( - "gpu_map", - dict(i="0:32:32"), - schedule=dace.dtypes.ScheduleType.GPU_Device, - ) - - # GPU TB map - tb_map_entry, tb_map_exit = state.add_map( - "tb", - dict(j="0:32"), - schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock, - ) - - # Add tasklets for A -> S -> B - tasklet1 = state.add_tasklet("addMult", - inputs={"__inp_A", "__inp_scalar"}, - outputs={"__out"}, - code="__out = __inp_A * __inp_scalar;", - language=dace.dtypes.Language.CPP) - - tasklet2 = state.add_tasklet("store_to_global", - inputs={"__inp"}, - outputs={"__out"}, - code="__out = __inp;", - language=dace.dtypes.Language.CPP) - - # Edges - - # A and scalar to first map - state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet("A[0:32]")) - state.add_edge(scalar_acc, None, gpu_map_entry, None, dace.Memlet("scalar[0]")) - - # Add both down to last map, the threadblock map - state.add_edge(gpu_map_entry, None, seq_map_entry, None, dace.Memlet("A[0:32]")) - state.add_edge(gpu_map_entry, None, seq_map_entry, None, dace.Memlet("scalar[0]")) - state.add_edge(seq_map_entry, None, tb_map_entry, None, dace.Memlet("A[32 * k: 32 * (k+1)]")) - state.add_edge(seq_map_entry, None, tb_map_entry, None, dace.Memlet("scalar[0]")) - - # connect to tasklets - state.add_edge(tb_map_entry, None, tasklet1, "__inp_A", dace.Memlet("A[j + 32* k]")) - state.add_edge(tb_map_entry, None, tasklet1, "__inp_scalar", dace.Memlet("scalar[0]")) - state.add_edge(tasklet1, "__out", s_acc, None, dace.Memlet("S[j]")) - 
state.add_edge(s_acc, None, tasklet2, "__inp", dace.Memlet("S[j]")) - - # connect to all map exit nodes and then back to A to store back - state.add_edge(tasklet2, "__out", tb_map_exit, None, dace.Memlet("A[j + 32* k]")) - state.add_edge(tb_map_exit, None, seq_map_exit, None, dace.Memlet("A[32 * k: 32 * (k+1)]")) - state.add_edge(seq_map_exit, None, gpu_map_exit, None, dace.Memlet("A[0:32]")) - state.add_edge(gpu_map_exit, None, a_store, None, dace.Memlet("A[0:32]")) - - sdfg.fill_scope_connectors() - - #----------------- Apply pass -------------------- - - DefaultSharedMemorySync().apply_pass(sdfg, None) - - #----------------- Check correct insertion of sync tasklets -------------------- - - # s_acc has a sync tasklet successor - found = None - for succ in state.successors(s_acc): - if (hasattr(succ, "_label") and succ._label == "pre_sync_barrier" and isinstance(succ, nodes.Tasklet) - and "__syncthreads();" in succ.code.code): - found = succ - break - - assert found is not None, "There should be a synchronization tasklet after the shared memory access" - - # smem is NOT reused in seq map - found = None - for pred in state.predecessors(seq_map_exit): - if (hasattr(pred, "_label") and pred._label == "post_sync_barrier" and isinstance(pred, nodes.Tasklet) - and "__syncthreads();" in pred.code.code): - found = pred - break - - assert found is None, "The DefaultSharedMemorySync pass should not have inserted at the end of the sequential map body" - - -@pytest.mark.gpu -def test_scalar_multiplic_loopRegion(): - """ - Constructs an SDFG that performs scalar multiplication on a vector. - - Analogous to 'test_scalar_multiplic()', where a for loop instead of a sequential map - is used. - """ - - #----------------- Build test program/SDFG-------------------- - - sdfg = dace.SDFG("scalarMultiplication_smem") - state = sdfg.add_state("main") - - # Arrays and access nodes - sdfg.add_array("A", (128, ), dace.uint32, storage=dace.dtypes.StorageType.GPU_Global) - sdfg.add_scalar("scalar", dace.uint32) - a_acc = state.add_read("A") - a_store = state.add_write("A") - scalar_acc = state.add_access("scalar") - - # Device and thread-block maps - gpu_map_entry, gpu_map_exit = state.add_map("gpu_map", - dict(i="0:32:32"), - schedule=dace.dtypes.ScheduleType.GPU_Device) - tb_map_entry, tb_map_exit = state.add_map("tb", dict(j="0:32"), schedule=dace.dtypes.ScheduleType.GPU_ThreadBlock) - - # Nested SDFG setup - inner_sdfg = dace.SDFG('nested_sdfg') - nested = state.add_nested_sdfg(inner_sdfg, sdfg, inputs={'__inp_A', '__inp_scalar'}, outputs={'tmp_ret'}) - - loopreg = LoopRegion("loop", "k < 4", "k", "k = 0", "k = (k + 1)", False, inner_sdfg) - inner_sdfg.add_node(loopreg) - inner_state = loopreg.add_state("use_smem") - - # Shared memory and result - inner_sdfg.add_array("S", (32, ), dace.uint32, storage=dace.dtypes.StorageType.GPU_Shared, transient=True) - inner_sdfg.add_scalar("tmp_ret", dace.uint32) - s_acc = inner_state.add_access("S") - ret = inner_state.add_write("tmp_ret") - - # Tasklets - tasklet1 = inner_state.add_tasklet("assign_to_smem", - inputs={}, - outputs={"__out1"}, - code="__out1 = __inp_A[j + 32 * k]", - language=dace.dtypes.Language.CPP) - tasklet2 = inner_state.add_tasklet("addMult", - inputs={"__inp2"}, - outputs={"__out2"}, - code="__out2 = __inp2 * __inp_scalar;", - language=dace.dtypes.Language.CPP) - - # Main SDFG edges - state.add_edge(a_acc, None, gpu_map_entry, None, dace.Memlet("A[0:128]")) - state.add_edge(scalar_acc, None, gpu_map_entry, None, dace.Memlet("scalar[0]")) - 
state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet("A[0:128]")) - state.add_edge(gpu_map_entry, None, tb_map_entry, None, dace.Memlet("scalar[0]")) - state.add_edge(tb_map_entry, None, nested, "__inp_A", dace.Memlet("A[j : j + 97 : 32]")) - state.add_edge(tb_map_entry, None, nested, "__inp_scalar", dace.Memlet("scalar[0]")) - state.add_edge(nested, "tmp_ret", tb_map_exit, None, dace.Memlet("A[j : j + 97 : 32]")) - state.add_edge(tb_map_exit, None, gpu_map_exit, None, dace.Memlet("A[0:128]")) - state.add_edge(gpu_map_exit, None, a_store, None, dace.Memlet("A[0:128]")) - - # Inner SDFG edges - inner_state.add_edge(tasklet1, "__out1", s_acc, None, dace.Memlet("S[j]")) - inner_state.add_edge(s_acc, None, tasklet2, "__inp2", dace.Memlet("S[j]")) - inner_state.add_edge(tasklet2, "__out2", ret, None, dace.Memlet("S[j]")) - - sdfg.fill_scope_connectors() - - #----------------- Apply pass -------------------- - - DefaultSharedMemorySync().apply_pass(sdfg, None) - - #----------------- Check correct insertion of sync tasklets -------------------- - - try: - # there should be only one successor of the ret accessNode, which is a sync tasklet - post_sync_tasklet = inner_state.successors(ret)[0] - assert "__syncthreads();" in post_sync_tasklet.code.code, "Post synchronization tasklet is not correctly inserted" - except: - # Any other weird failures - assert False, "Post synchronization tasklet is not correctly inserted" diff --git a/berkay_workpace/tests/smem_tests/gemm_test.py b/berkay_workpace/tests/smem_tests/gemm_test.py deleted file mode 100644 index f6c7b9b081..0000000000 --- a/berkay_workpace/tests/smem_tests/gemm_test.py +++ /dev/null @@ -1,33 +0,0 @@ -import dace -from dace import dtypes - -import cupy as cp -import pytest -import os - - -@pytest.mark.gpu -def test_gemm(): - """ - Advanced test: Checks shared memory synchronization and numerical correctness - of a GEMM SDFG using 2D block tiling with custom copy. - """ - current_dir = os.path.dirname(os.path.abspath(__file__)) - sdfg_path = os.path.join(current_dir, - '../../scratch/yakups_examples/smem_related/2d_blocktiled_gemm_with_custom_copy.sdfg') - sdfg = dace.SDFG.from_file(sdfg_path) - - m, n, k = 1024, 1024, 1024 - A = cp.random.rand(m, k).astype(cp.float32) - B = cp.random.rand(k, n).astype(cp.float32) - C = cp.random.rand(m, n).astype(cp.float32) - - # Count __syncthreads(); calls across all generated files - generated_code = sdfg.generate_code() - nr_sync_barriers = sum(f.clean_code.count("__syncthreads();") for f in generated_code) - assert nr_sync_barriers == 2, f"Expected exactly 2 '__syncthreads();' calls, but found {nr_sync_barriers}" - - # Compute expected result - expected = A @ B - sdfg(A=A, B=B, C=C, M=m, N=n, K=k) - cp.testing.assert_allclose(C, expected, atol=0.001, err_msg="Mismatch: unexpected GEMM result") diff --git a/berkay_workpace/tests/smem_tests/special_sync_pass_test.py b/berkay_workpace/tests/smem_tests/special_sync_pass_test.py deleted file mode 100644 index b3e98312ea..0000000000 --- a/berkay_workpace/tests/smem_tests/special_sync_pass_test.py +++ /dev/null @@ -1,37 +0,0 @@ -import dace -from dace import dtypes - -import cupy as cp -import pytest -import os - - -@pytest.mark.gpu -def test_correctness_and_reuse(): - """ - Only one synchronization barrier should be her (other tests verify - already that at the end of this seq map there is no synchronization, because - the range has size 1). 
This tests essentially shows that we reuse the sync tasklet - (which is more optimal) by checking that only one such barrier is in the generated code - (we also check correcntess, which is however not interesting here since threads only access - smem locations which they also write to, so synchronization is not stictly needed here) - """ - current_dir = os.path.dirname(os.path.abspath(__file__)) - sdfg_path = os.path.join(current_dir, '../../scratch/yakups_examples/smem_related/nice_global_to_shared_copy.sdfg') - sdfg = dace.SDFG.from_file(sdfg_path) - - size = 512 - a = cp.random.rand(size, dtype=cp.float64) - b = cp.random.rand(size, dtype=cp.float64) - c = cp.zeros((size, ), dtype=cp.float64) - - # count that there is only one __syncthread(); call. You can also inspect the final SDFG in the cache for that - generated_code = sdfg.generate_code()[1].clean_code - nr_sync_barriers = generated_code.count("__syncthreads();") - - assert nr_sync_barriers == 1, f"expected only 1 '__syncthreads(); call, but got '{nr_sync_barriers}" - - # Check whether result is correctly computed - expected_res = a + b - sdfg(A=a, B=b, C=c, N=size) - cp.testing.assert_allclose(c, expected_res, err_msg="Mismatch: Not expected result") From bd632d6051efd91c4734550228886bc592b0397a Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 14 Jul 2025 10:38:57 +0200 Subject: [PATCH 59/94] provisional implementation for constant checks --- dace/codegen/targets/cpp.py | 27 +-- dace/codegen/targets/experimental_cuda.py | 220 +++++++++++------- .../copy_strategies.py | 6 +- .../experimental_cuda_helpers/gpu_utils.py | 15 +- .../scope_strategies.py | 2 +- 5 files changed, 163 insertions(+), 107 deletions(-) diff --git a/dace/codegen/targets/cpp.py b/dace/codegen/targets/cpp.py index 89bade5d7e..6523cf033c 100644 --- a/dace/codegen/targets/cpp.py +++ b/dace/codegen/targets/cpp.py @@ -236,14 +236,22 @@ def memlet_copy_to_absolute_strides(dispatcher: 'TargetDispatcher', def is_cuda_codegen_in_device(framecode) -> bool: """ - Check the state of the CUDA code generator, whether it is inside device code. + Check the state of the (Experimental) CUDA code generator, whether it is inside device code. 
""" from dace.codegen.targets.cuda import CUDACodeGen + from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen + + cuda_impl = Config.get('compiler', 'cuda', 'implementation') + if cuda_impl == 'legacy': + cudaClass = CUDACodeGen + elif cuda_impl == 'experimental': + cudaClass = ExperimentalCUDACodeGen + if framecode is None: cuda_codegen_in_device = False else: for codegen in framecode.targets: - if isinstance(codegen, CUDACodeGen): + if isinstance(codegen, cudaClass): cuda_codegen_in_device = codegen._in_device_code break else: @@ -266,25 +274,12 @@ def ptr(name: str, desc: data.Data, sdfg: SDFG = None, framecode=None) -> str: root = name.split('.')[0] if root in sdfg.arrays and isinstance(sdfg.arrays[root], data.Structure): name = name.replace('.', '->') - # Special case: If memory is persistent and defined in this SDFG, add state # struct to name if (desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External)): - - # Avoid import loop - from dace.codegen.targets.cuda import CUDACodeGen - from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen - - # Check whether we are in kernel/ device code of GPU backend - cuda_impl = Config.get('compiler', 'cuda', 'implementation') - if cuda_impl == "legacy": - in_device_code = CUDACodeGen._in_device_code - elif cuda_impl == "experimental": - in_device_code = ExperimentalCUDACodeGen._in_kernel_code - if desc.storage == dtypes.StorageType.CPU_ThreadLocal: # Use unambiguous name for thread-local arrays return f'__{sdfg.cfg_id}_{name}' - elif not in_device_code: # GPU kernels cannot access state + elif not is_cuda_codegen_in_device(framecode): # GPU kernels cannot access state return f'__state->__{sdfg.cfg_id}_{name}' elif (sdfg, name) in framecode.where_allocated and framecode.where_allocated[(sdfg, name)] is not sdfg: return f'__{sdfg.cfg_id}_{name}' diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index fd676e338f..f602b53861 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -27,7 +27,6 @@ from dace.codegen.targets.target import IllegalCopy, TargetCodeGenerator, make_absolute # DaCe transformation imports -from dace.transformation import helpers from dace.transformation.passes import analysis as ap from dace.transformation.passes.gpustream_scheduling import NaiveGPUStreamScheduler from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync @@ -36,7 +35,7 @@ # Experimental CUDA helper imports from dace.codegen.targets.experimental_cuda_helpers.gpu_stream_manager import GPUStreamManager -from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import symbolic_to_cpp, product, emit_sync_debug_checks +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import symbolic_to_cpp, emit_sync_debug_checks, get_defined_type # Type checking imports (conditional) if TYPE_CHECKING: @@ -45,9 +44,6 @@ # add symbolic_to_cpp ! -# TODO's harder: -# 1. 
Include constant expressions - @registry.autoregister_params(name='experimental_cuda') class ExperimentalCUDACodeGen(TargetCodeGenerator): @@ -63,7 +59,7 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._frame: DaCeCodeGenerator = frame_codegen # creates the frame code, orchestrates the code generation for targets self._dispatcher: TargetDispatcher = frame_codegen.dispatcher # responsible for dispatching code generation to the appropriate target - ExperimentalCUDACodeGen._in_kernel_code = False + self._in_device_code = False self._cpu_codegen: Optional['CPUCodeGen'] = None # NOTE: Moved from preprossessing to here @@ -335,23 +331,43 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub KernelScopeGenerator, ThreadBlockScopeGenerator, WarpScopeGenerator) + # Entry Node of the scope + scope_entry = dfg_scope.source_nodes()[0] #--------------- Start of Kernel Function Code Generation -------------------- - if not ExperimentalCUDACodeGen._in_kernel_code: - - # Prepare and cache kernel metadata (name, dimensions, arguments, etc.) - self._current_kernel_spec = KernelSpec(cudaCodeGen=self, - sdfg=sdfg, - cfg=cfg, - dfg_scope=dfg_scope, - state_id=state_id) - - # Generate wrapper function - self._generate_kernel_wrapper(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + if not self._in_device_code: # Enter kernel context and recursively generate device code - ExperimentalCUDACodeGen._in_kernel_code = True + self._in_device_code = True + + # New scope for defined variables (kernel functions scope) + self._dispatcher.defined_vars.enter_scope(scope_entry) + + # Store kernel metadata (name, dimensions, arguments, etc.) in a KernelSpec object + # and save it as an attribute + kernel_spec = KernelSpec(cudaCodeGen=self, + sdfg=sdfg, + cfg=cfg, + dfg_scope=dfg_scope, + state_id=state_id) + + self._current_kernel_spec = kernel_spec + + # Update types of constant variables in the current scope + for dname, data_desc in kernel_spec.arglist.items(): + ptr_name = ptr(dname, data_desc, sdfg, self._frame) + defined_type, ctype = self._dispatcher.defined_vars.get(ptr_name) + + if dname in kernel_spec.kernel_constants: + ctype = f"const {ctype}" + + self._dispatcher.defined_vars.add(ptr_name, defined_type, ctype, allow_shadowing=True) + + # declare and call kernel wrapper function (in the CPU-side code) + self._declare_and_invoke_kernel_wrapper(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + + # Recursively generate GPU code into the kernel_stream (will be in a .cu file) kernel_stream = CodeIOStream() kernel_function_stream = self._globalcode @@ -366,15 +382,17 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub self._localcode.write(kernel_stream.getvalue() + '\n') # Exit kernel context - ExperimentalCUDACodeGen._in_kernel_code = False + self._in_device_code = False + + # Generate kernel wrapper, i.e. 
function which will launch the kernel + self._generate_kernel_wrapper(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) - # Generate kernel launch - self._generate_kernel_launch(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) + # Exit scope for defined variables + self._dispatcher.defined_vars.exit_scope(scope_entry) return #--------------- Nested GPU Scope -------------------- - supported_strategies: List[ScopeGenerationStrategy] = [ ThreadBlockScopeGenerator(codegen=self), WarpScopeGenerator(codegen=self) @@ -388,8 +406,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub #--------------- Unsupported Cases -------------------- # Note: We are inside a nested GPU scope at this point. - node = dfg_scope.source_nodes()[0] - schedule_type = node.map.schedule + schedule_type = scope_entry.map.schedule if schedule_type == dace.ScheduleType.GPU_Device: raise NotImplementedError("Dynamic parallelism (nested GPU_Device schedules) is not supported.") @@ -398,8 +415,8 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub f"Scope generation for schedule type '{schedule_type}' is not implemented in ExperimentalCUDACodeGen. " "Please check for supported schedule types or implement the corresponding strategy.") - def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, - function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + def _declare_and_invoke_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: scope_entry = dfg_scope.source_nodes()[0] @@ -408,21 +425,17 @@ def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope kernel_wrapper_args_as_input = kernel_spec.kernel_wrapper_args_as_input kernel_wrapper_args_typed = kernel_spec.kernel_wrapper_args_typed - # Declaration of the function which launches the kernel (C++ code) + # Declaration of the kernel wrapper function (in the CPU-side code) function_stream.write( 'DACE_EXPORTED void __dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(kernel_wrapper_args_typed)), cfg, state_id, scope_entry) - # Calling the function which launches the kernel (C++ code) + # Calling the kernel wrapper function (in the CPU-side code) callsite_stream.write('__dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(kernel_wrapper_args_as_input)), cfg, state_id, scope_entry) - def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, - function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - - # NOTE: This generates the function that launches the kernel. - # Do not confuse it with CUDA's internal "LaunchKernel" API — - # the generated function *calls* that API, but we also refer to it as a "launch function". 
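For illustration, the const-handling introduced in generate_scope above (prefixing "const " onto the C type of every kernel argument that appears in the kernel's constant set) can be restated outside of DaCe as the following standalone sketch. The argument names and C type strings below are invented for the example and are not taken from the patch.

    # Illustrative sketch (not from the patch): const-qualifying kernel arguments
    # based on a set of names known to be read-only inside the kernel.
    def typed_kernel_signature(arglist, kernel_constants):
        """arglist maps argument names to C type strings; returns a C signature string."""
        parts = []
        for name, ctype in arglist.items():
            qualifier = 'const ' if name in kernel_constants else ''
            parts.append(f'{qualifier}{ctype} {name}')
        return ', '.join(parts)

    example_args = {'gpu_A': 'double *', 'gpu_B': 'double *', 'N': 'int'}
    print(typed_kernel_signature(example_args, kernel_constants={'gpu_A', 'N'}))
    # prints: const double * gpu_A, double * gpu_B, const int N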
+ def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, + function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: scope_entry = dfg_scope.source_nodes()[0] @@ -441,11 +454,14 @@ def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: gpu_stream = self._gpu_stream_manager.get_stream_node(scope_entry) # ----------------- Kernel Launch Function Declaration ----------------------- + self._localcode.write( - """ - DACE_EXPORTED void __dace_runkernel_{fname}({fargs}); - void __dace_runkernel_{fname}({fargs}) - """.format(fname=kernel_name, fargs=', '.join(kernel_launch_args_typed)), cfg, state_id, scope_entry) + f""" + DACE_EXPORTED void __dace_runkernel_{kernel_name}({', '.join(kernel_launch_args_typed)}); + void __dace_runkernel_{kernel_name}({', '.join(kernel_launch_args_typed)}) + """, + cfg, state_id, scope_entry + ) # Open bracket self._localcode.write('{', cfg, state_id, scope_entry) @@ -475,18 +491,15 @@ def _generate_kernel_launch(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: }}''', cfg, state_id, scope_entry) # ----------------- Kernel Launch Invocation ----------------------- + kargs = ', '.join(['(void *)&' + arg for arg in kernel_args_as_input]) + self._localcode.write( - ''' - void *{kname}_args[] = {{ {kargs} }}; - gpuError_t __err = {backend}LaunchKernel( (void*){kname}, dim3({gdims}), dim3({bdims}), {kname}_args, {dynsmem}, {stream} - ); - '''.format(kname=kernel_name, - kargs=', '.join(['(void *)&' + arg for arg in kernel_args_as_input]), - gdims=gdims, - bdims=bdims, - dynsmem='0', - stream=gpu_stream, - backend=self.backend), cfg, state_id, scope_entry) + f''' + void *{kernel_name}_args[] = {{ {kargs} }}; + gpuError_t __err = {self.backend}LaunchKernel( + (void*){kernel_name}, dim3({gdims}), dim3({bdims}), {kernel_name}_args, {0}, {gpu_stream} + ); + ''', cfg, state_id, scope_entry) self._localcode.write(f'DACE_KERNEL_LAUNCH_CHECK(__err, "{kernel_name}", {gdims}, {bdims});') emit_sync_debug_checks(self.backend, self._localcode) @@ -536,7 +549,7 @@ def state_dispatch_predicate(self, sdfg, state): Returns True if the generator is currently generating kernel code. 
""" - return ExperimentalCUDACodeGen._in_kernel_code + return self._in_device_code def node_dispatch_predicate(self, sdfg, state, node): """ @@ -552,7 +565,7 @@ def node_dispatch_predicate(self, sdfg, state, node): if schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: return True - if ExperimentalCUDACodeGen._in_kernel_code: + if self._in_device_code: return True return False @@ -572,7 +585,7 @@ def generate_state(self, self._frame.generate_state(sdfg, cfg, state, function_stream, callsite_stream) # Special: Release of pooled memory if not in device code that need to be released her - if not ExperimentalCUDACodeGen._in_kernel_code: + if not self._in_device_code: handled_keys = set() backend = self.backend @@ -630,9 +643,9 @@ def generate_nsdfg_call(self, sdfg, cfg, state, node, memlet_references, sdfg_la state_struct=False) def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node): - result = self._cpu_codegen.generate_nsdfg_arguments(sdfg, cfg, dfg, state, node) - return result - + args = self._cpu_codegen.generate_nsdfg_arguments(sdfg, cfg, dfg, state, node) + return args + def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.NestedSDFG, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -640,9 +653,35 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub self._toplevel_schedule = node.schedule old_codegen = self._cpu_codegen.calling_codegen self._cpu_codegen.calling_codegen = self - + + # Determine and update ctype of new constant data and symbols within the NSDFG + parent_state: SDFGState = cfg.state(state_id) + nsdfg = node.sdfg + + dispatcher: TargetDispatcher = self._dispatcher + dispatcher.defined_vars.enter_scope(node) + + # update const data + new_const_data = sdutil.get_constant_data(node, parent_state) - self._current_kernel_spec.kernel_constants + for name in new_const_data: + desc = nsdfg.arrays[name] + ptr_name = ptr(name, desc, nsdfg, self._frame) + defined_type= get_defined_type(desc) + ctype = f"const {desc.ctype}" + dispatcher.defined_vars.add(ptr_name, defined_type, ctype, allow_shadowing=True) + + # update const symbols + new_const_symbols = sdutil.get_constant_symbols(node, parent_state) - self._current_kernel_spec.kernel_constants + for name in new_const_symbols: + defined_type = DefinedType.Scalar + ctype = f"const {nsdfg.symbols[name].ctype}" + dispatcher.defined_vars.add(name, defined_type, ctype, allow_shadowing=True) + self._cpu_codegen._generate_NestedSDFG(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + # Exit scope + dispatcher.defined_vars.exit_scope(node) + self._cpu_codegen.calling_codegen = old_codegen self._toplevel_schedule = old_schedule @@ -1104,38 +1143,36 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro dfg_scope: ScopeSubgraphView, state_id: int): # Get kernel entry/exit nodes and current state - kernel_entry_node = dfg_scope.source_nodes()[0] - kernel_exit_node = dfg_scope.sink_nodes()[0] - state: SDFGState = cfg.state(state_id) + kernel_map_entry = dfg_scope.source_nodes()[0] + kernel_parent_state: SDFGState = cfg.state(state_id) - self._kernel_entry_node: nodes.MapEntry = kernel_entry_node + self._kernel_map_entry: nodes.MapEntry = kernel_map_entry + self._kernels_state: SDFGState = kernel_parent_state # Kernel name - self._kernel_name: str = f'{kernel_entry_node.map.label}_{cfg.cfg_id}_{state.block_id}_{state.node_id(kernel_entry_node)}' - - # Kernel arguments - 
arglist: Dict[str, Any] = {} - for state_, node, defined_syms in sdutil.traverse_sdfg_with_defined_symbols(sdfg, recursive=True): - if node is kernel_entry_node: - shared_transients = state_.parent.shared_transients() - arglist = state_.scope_subgraph(node).arglist(defined_syms, shared_transients) - break - - self._args: Dict[str, Any] = arglist - - # Typed arguments and argument access as input - self._args_typed: list[str] = [adata.as_arg(name=aname) for aname, adata in self._args.items()] - self._args_as_input: list[str] = [ - ptr(aname, adata, sdfg, cudaCodeGen._frame) for aname, adata in self._args.items() - ] + self._kernel_name: str = f'{kernel_map_entry.map.label}_{cfg.cfg_id}_{kernel_parent_state.block_id}_{kernel_parent_state.node_id(kernel_map_entry)}' + + # Get and store kernel constants — needed for applying 'const' and updating defined + # constant variable types in the dispatcher (handled at GPU codegen) + kernel_const_data = sdutil.get_constant_data(kernel_map_entry, kernel_parent_state) + kernel_const_symbols = sdutil.get_constant_symbols(kernel_map_entry, kernel_parent_state) + + kernel_constants = kernel_const_data | kernel_const_symbols + self._kernel_constants: Set[str] = kernel_constants - # Used for the kernel wrapper function, be careful: a change in the name __state will probably lead to compilation errors - state_param: list[str] = [f'{mangle_dace_state_struct_name(cudaCodeGen._global_sdfg)} *__state'] - self._kernel_wrapper_args_as_input: list[str] = ['__state'] + self._args_as_input - self._kernel_wrapper_args_typed: list[str] = state_param + self._args_typed + # Retrieve arguments required for the kernels subgraph + arglist: Dict[str, dt.Data] = kernel_parent_state.scope_subgraph(kernel_map_entry).arglist() + self._arglist = arglist + + # Format arguments for input passing and function signatures (kernel and kernel wrapper) + self._args_as_input = [ptr(name, data, sdfg, cudaCodeGen._frame) for name, data in arglist.items()] + self._args_typed = [('const ' if name in kernel_constants else '') + data.as_arg(name=name) for name, data in arglist.items()] + + self._kernel_wrapper_args_as_input = ['__state'] + self._args_as_input + self._kernel_wrapper_args_typed = [f'{mangle_dace_state_struct_name(cudaCodeGen._global_sdfg)} *__state'] + self._args_typed # The kernel's grid and block dimensions - self._grid_dims, self._block_dims = cudaCodeGen._kernel_dimensions_map[kernel_entry_node] + self._grid_dims, self._block_dims = cudaCodeGen._kernel_dimensions_map[kernel_map_entry] # C type of block, thread, and warp indices (as a string) self._gpu_index_ctype: str = self.get_gpu_index_ctype() @@ -1170,23 +1207,36 @@ def get_gpu_index_ctype(self, config_key='gpu_index_type') -> str: 'Please use a valid type from dace.dtypes (e.g., "int32", "uint64").') return dtype.ctype + @property + def kernel_constants(self) -> Set[str]: + """Returns the kernel's constant data and symbols.""" + return self._kernel_constants + @property def kernel_name(self) -> list[str]: """Returns the kernel (function's) name.""" return self._kernel_name @property - def kernel_entry_node(self) -> nodes.MapEntry: + def kernel_map_entry(self) -> nodes.MapEntry: """ Returns the entry node of the kernel, which is a MapEntry node scheduled with dace.dtypes.ScheduleType.GPU_Device. 
""" - return self._kernel_entry_node + return self._kernel_map_entry @property def kernel_map(self) -> nodes.Map: """Returns the kernel's map node.""" - return self._kernel_entry_node.map + return self._kernel_map_entry.map + + @property + def arglist(self) -> Dict[str, dt.Data]: + """ + Returns a dictionary of arguments for the kernel's subgraph, + mapping each data name to its corresponding data descriptor. + """ + return self._arglist @property def args_as_input(self) -> list[str]: diff --git a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py index 3dcf29cd9f..5c55fd9d1f 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py @@ -201,7 +201,7 @@ def applicable(self, copy_context: CopyContext) -> bool: # TODO: I don't understand why all of these conditions are needed, look into it cpu_storage_types = [StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned] - not_in_kernel_code = not ExperimentalCUDACodeGen._in_kernel_code + not_in_device_code = not copy_context.codegen._in_device_code is_between_access_nodes = (isinstance(copy_context.src_node, nodes.AccessNode) and isinstance(copy_context.dst_node, nodes.AccessNode)) @@ -212,7 +212,7 @@ def applicable(self, copy_context: CopyContext) -> bool: is_not_cpu_to_cpu = not (copy_context.src_storage in cpu_storage_types and copy_context.dst_storage in cpu_storage_types) - is_gpu_host_copy = (not_in_kernel_code and is_between_access_nodes and involves_gpu_or_pinned + is_gpu_host_copy = (not_in_device_code and is_between_access_nodes and involves_gpu_or_pinned and is_not_cpu_to_cpu) return is_gpu_host_copy @@ -456,7 +456,7 @@ def generate_copy(self, copy_context: CopyContext) -> None: if reduction_type != dtypes.ReductionType.Custom: # Use predefined reduction reduction_type_str = str(reduction_type).split('.')[-1] # e.g., "Sum" - reduction_template = f"<{reduction_type_str}>" + reduction_template = f"" else: custom_reduction = [unparse_cr(sdfg, wcr, dtype)] reduction_template = "" diff --git a/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py index 3ae2f3a347..c7554b2c16 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py +++ b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py @@ -1,11 +1,13 @@ import functools + import sympy from typing import List -from dace import Config, symbolic +from dace import Config, symbolic, data as dt +from dace.sdfg import nodes from dace.codegen import cppunparse +from dace.codegen.dispatcher import DefinedType from dace.codegen.prettycode import CodeIOStream -from dace.sdfg import nodes def symbolic_to_cpp(arr): @@ -108,3 +110,12 @@ def emit_sync_debug_checks(backend: str, codestream: CodeIOStream): if Config.get_bool('compiler', 'cuda', 'syncdebug'): codestream.write(f"DACE_GPU_CHECK({backend}GetLastError());\n" f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n") + +def get_defined_type(data: dt.Data) -> DefinedType: + if isinstance(data, dt.Scalar): + return DefinedType.Scalar + elif isinstance(data, dt.Array): + return DefinedType.Pointer + else: + raise NotImplementedError(f"Data type '{type(data).__name__}' is not supported for defined type inference." 
+ "Only Scalars and Arrays are expected for Kernels.") \ No newline at end of file diff --git a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py index e0af55adb9..7824ed36da 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py @@ -87,7 +87,7 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV # ----------------- Retrieve kernel configuration ----------------------- kernel_spec = self._current_kernel_spec - kernel_entry_node = kernel_spec._kernel_entry_node # == dfg_scope.source_nodes()[0] + kernel_entry_node = kernel_spec._kernel_map_entry # == dfg_scope.source_nodes()[0] kernel_map = kernel_spec.kernel_map # ----------------- Kernel/Map Range Preprocessing ----------------------- From 09caf2c4aeca2efb3529ef3144b44eb0dc9fdc80 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 14 Jul 2025 10:40:02 +0200 Subject: [PATCH 60/94] ensure correct CUDA backend is selected --- dace/registry.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/dace/registry.py b/dace/registry.py index fe14b8a3ba..69d4958a0e 100644 --- a/dace/registry.py +++ b/dace/registry.py @@ -37,12 +37,23 @@ def autoregister(cls: Type, **kwargs): that automatically registers the subclass with the superclass registry upon creation. """ - - if 'name' in kwargs and (kwargs['name'] == 'cuda' or kwargs['name'] == 'experimental_cuda'): - from dace.config import Config - if Config.get('compiler', 'cuda', 'implementation') == 'experimental' and kwargs['name'] == 'cuda': - return - if Config.get('compiler', 'cuda', 'implementation') == 'legacy' and kwargs['name'] == 'experimental_cuda': + # Ensures that the correct CUDA implementation is selected and the other is not registered. + # Registering both leads to errors. + from dace.config import Config + + name = kwargs.get('name') + impl = Config.get('compiler', 'cuda', 'implementation') + + valid_impls = {'legacy', 'experimental'} + if impl not in valid_impls: + raise ValueError( + f"Invalid CUDA implementation: {impl}. " + f"Please select one of {valid_impls} under compiler.cuda.implementation in the configs." 
+ ) + + # Only the CUDA implementation selected in Config is registered + if name in {'cuda', 'experimental_cuda'}: + if (impl == 'experimental' and name == 'cuda') or (impl == 'legacy' and name == 'experimental_cuda'): return registered = False From 31fe8f887c8a80bdb44c61e5137b19a912625142 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 14 Jul 2025 10:42:30 +0200 Subject: [PATCH 61/94] Yakups fixes during Meeting --- dace/sdfg/nodes.py | 5 ++++- dace/sdfg/state.py | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index 167081a686..d2022b1232 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -927,7 +927,10 @@ def used_symbols_within_scope(self, parent_state: 'dace.SDFGState', all_symbols: continue if hasattr(n, 'used_symbols'): - free_symbols |= n.used_symbols(parent_state, all_symbols) + if isinstance(n, dace.nodes.NestedSDFG): + free_symbols |= n.used_symbols(all_symbols) + else: + free_symbols |= n.used_symbols(parent_state, all_symbols) else: free_symbols |= n.free_symbols diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index 42c2ce6746..00249efcc5 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -1548,7 +1548,8 @@ def _repr_html_(self): from dace.sdfg import SDFG arrays = set(n.data for n in self.data_nodes()) sdfg = SDFG(self.label) - sdfg._arrays = {k: self.sdfg.arrays[k] for k in arrays} + sdfg._arrays = dace.sdfg.NestedDict({k: self.sdfg.arrays[k] for k in arrays}) + #sdfg._arrays = {k: self.sdfg.arrays[k] for k in arrays} sdfg.add_node(self) return sdfg._repr_html_() From 20b4e090d7c8ddbb9042cce6e1eb415555c3c48e Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Mon, 14 Jul 2025 12:21:24 +0200 Subject: [PATCH 62/94] Update map free symbols --- dace/sdfg/nodes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dace/sdfg/nodes.py b/dace/sdfg/nodes.py index d2022b1232..01ddd8caad 100644 --- a/dace/sdfg/nodes.py +++ b/dace/sdfg/nodes.py @@ -942,6 +942,9 @@ def used_symbols_within_scope(self, parent_state: 'dace.SDFGState', all_symbols: free_symbols |= e.data.used_symbols(all_symbols, e) + # Update with the symbols needed by the map + free_symbols |= self.free_symbols + # Do not consider SDFG constants as symbols new_symbols.update(set(parent_sdfg.constants.keys())) return free_symbols - new_symbols From 1838a17ffcf8eeb13843ac7fe9e9d29a760fea17 Mon Sep 17 00:00:00 2001 From: Yakup Budanaz Date: Mon, 14 Jul 2025 12:33:58 +0200 Subject: [PATCH 63/94] Update --- dace/sdfg/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dace/sdfg/utils.py b/dace/sdfg/utils.py index db85958ddc..4094934e03 100644 --- a/dace/sdfg/utils.py +++ b/dace/sdfg/utils.py @@ -2263,9 +2263,9 @@ def _get_used_symbols_impl(scope: Union[SDFG, ControlFlowRegion, SDFGState, nd.M :return: A set of constant symbol names. 
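With the registration guard above, only one of the two CUDA backends is registered, chosen by the compiler.cuda.implementation configuration entry. A minimal usage sketch of that entry follows; since the decorator runs when the target modules are imported, the value is normally set in .dace.conf before DaCe is used, and the call below only illustrates the configuration API.

    # Illustrative sketch (not from the patch): selecting the experimental CUDA backend.
    from dace.config import Config

    Config.set('compiler', 'cuda', 'implementation', value='experimental')
    assert Config.get('compiler', 'cuda', 'implementation') in {'legacy', 'experimental'}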
""" - def _get_assignments(cfg: Union[ControlFlowRegion, SDFG]) -> Set[str]: + def _get_assignments(cfg: ControlFlowRegion) -> Set[str]: written_symbols = set() - for edge in cfg.all_edges(*list(cfg.all_control_flow_blocks())): + for edge in cfg.all_interstate_edges(): if edge.data is not None and isinstance(edge.data, dace.InterstateEdge): written_symbols = written_symbols.union(edge.data.assignments.keys()) return written_symbols From a71b0ff21a626a06e9b166b8d9bdf38508555128 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Tue, 15 Jul 2025 15:21:25 +0200 Subject: [PATCH 64/94] Ensure no gpu stream synchronization within Kernels occur --- .../experimental_cuda_helpers/gpu_utils.py | 35 ++++++++++++++++--- .../passes/gpustream_scheduling.py | 20 ++++++++--- 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py index c7554b2c16..de8913c176 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py +++ b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py @@ -1,13 +1,15 @@ import functools import sympy -from typing import List +from typing import Set, List, Optional -from dace import Config, symbolic, data as dt -from dace.sdfg import nodes +import dace +from dace import Config, symbolic, data as dt, dtypes +from dace.sdfg import nodes, SDFGState from dace.codegen import cppunparse from dace.codegen.dispatcher import DefinedType from dace.codegen.prettycode import CodeIOStream +from dace.transformation.helpers import get_parent_map def symbolic_to_cpp(arr): @@ -118,4 +120,29 @@ def get_defined_type(data: dt.Data) -> DefinedType: return DefinedType.Pointer else: raise NotImplementedError(f"Data type '{type(data).__name__}' is not supported for defined type inference." - "Only Scalars and Arrays are expected for Kernels.") \ No newline at end of file + "Only Scalars and Arrays are expected for Kernels.") + +def is_within_schedule_types(state: SDFGState, node: nodes.Node, schedules: Set[dtypes.ScheduleType]) -> bool: + """ + Checks if the given node is enclosed within a Map whose schedule type + matches any in the `schedules` set. + + Args: + state (SDFGState): The State where the node resides + node (nodes.Node): The node to check. + schedules (set[dtypes.ScheduleType]): A set of schedule types to match (e.g., {dtypes.ScheduleType.GPU_Device}). + + Returns: + True if the node is enclosed by a Map with a schedule type in `schedules`, False otherwise. 
+ """ + current = node + + while current is not None: + if isinstance(current, nodes.MapEntry): + if current.map.schedule in schedules: + return True + + parent = get_parent_map(state, current) + if parent is None: + return False + current, state = parent \ No newline at end of file diff --git a/dace/transformation/passes/gpustream_scheduling.py b/dace/transformation/passes/gpustream_scheduling.py index f463ac8053..dcb423a661 100644 --- a/dace/transformation/passes/gpustream_scheduling.py +++ b/dace/transformation/passes/gpustream_scheduling.py @@ -4,6 +4,7 @@ from dace import SDFG, properties, SDFGState from dace import dtypes from dace.codegen import common +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import is_within_schedule_types from dace.config import Config from dace.transformation import pass_pipeline as ppl, transformation from dace.sdfg import nodes @@ -269,6 +270,7 @@ def _identify_sync_locations(self, sdfg: SDFG, Synchronization is needed: - At the end of a state, if we copy to/from GPU AccessNodes. - Immediately after a node, if data leaves GPU memory and is further used. + - Furthermore, never within the kernel code. Returns: - sync_state: Maps each SDFGState to a set of stream IDs to sync at the end of the state. @@ -290,6 +292,12 @@ def is_kernel_exit(node): def is_sink_node(node, state): return state.out_degree(node) == 0 + + def edge_within_kernel(state, src, dst): + gpu_schedules = dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN + src_in_kernel = is_within_schedule_types(state, src, gpu_schedules) + dst_in_kernel = is_within_schedule_types(state, dst, gpu_schedules) + return src_in_kernel and dst_in_kernel # ------------------ Sync detection logic ----------------------------- @@ -304,17 +312,21 @@ def is_sink_node(node, state): sync_state[state] = set() # --- Heuristics for when to sync --- - if is_gpu_accessnode(src, state) and is_nongpu_accessnode(dst, state) and is_sink_node(dst, state): + if (is_gpu_accessnode(src, state) and is_nongpu_accessnode(dst, state) and + is_sink_node(dst, state) and not edge_within_kernel(state, src, dst)): sync_state[state].add(assigned_nodes[dst]) - elif is_gpu_accessnode(src, state) and is_nongpu_accessnode(dst, state) and not is_sink_node(dst, state): + elif (is_gpu_accessnode(src, state) and is_nongpu_accessnode(dst, state) and + not is_sink_node(dst, state) and not edge_within_kernel(state, src, dst)): sync_state[state].add(assigned_nodes[dst]) sync_node[dst] = state - elif is_nongpu_accessnode(src, state) and is_gpu_accessnode(dst, state): + elif (is_nongpu_accessnode(src, state) and is_gpu_accessnode(dst, state) and + not edge_within_kernel(state, src, dst)): sync_state[state].add(assigned_nodes[dst]) - elif is_kernel_exit(src) and is_gpu_accessnode(dst, state) and is_sink_node(dst, state): + elif (is_kernel_exit(src) and is_gpu_accessnode(dst, state) and + is_sink_node(dst, state)): sync_state[state].add(assigned_nodes[dst]) else: From 1ec735eb794798ccc95fd30a817763ba2b91fe0d Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 28 Jul 2025 19:52:49 +0200 Subject: [PATCH 65/94] Handle GPU_global, in-kernel defined transients used for backwards compatibility --- .../interstate/gpu_transform_sdfg.py | 70 +- .../passes/move_array_out_of_kernel.py | 885 ++++++++++++++++++ 2 files changed, 953 insertions(+), 2 deletions(-) create mode 100644 dace/transformation/passes/move_array_out_of_kernel.py diff --git a/dace/transformation/interstate/gpu_transform_sdfg.py b/dace/transformation/interstate/gpu_transform_sdfg.py index 
de1dfcf645..31cbdb45e4 100644 --- a/dace/transformation/interstate/gpu_transform_sdfg.py +++ b/dace/transformation/interstate/gpu_transform_sdfg.py @@ -618,8 +618,74 @@ def _create_copy_out(arrays_used: Set[str]) -> Dict[str, str]: for devicename, hostname in mapping.items(): block.replace_meta_accesses({devicename: hostname}) + # Step 9: Simplify - if not self.simplify: + if self.simplify: + sdfg.simplify() + + ######################################################################## + # In case the ExperimentalCUDACodeGen is selected, we handle, for backwards + # compatibility, the use of in-kernel, transient GPU_Global stored array here. + from dace.config import Config + if not Config.get('compiler', 'cuda', 'implementation') == 'experimental': return + + # import needed modules + from dace.transformation import helpers + from dace.transformation.passes.move_array_out_of_kernel import MoveArrayOutOfKernel + import warnings + + # Detect transient GPU_Global arrays inside GPU_Device-scheduled maps + transients_in_kernels: Set[Tuple[str, data.Array, nodes.MapEntry]] = set() + transient_outside_kernels: Set[Tuple[str, data.Array]] = set() + + for node, parent in sdfg.all_nodes_recursive(): + # ---------- Consider only transient GPU_Global arrays ------- + if not isinstance(node, nodes.AccessNode): + continue - sdfg.simplify() + desc = node.desc(parent) + if not isinstance(desc, data.Array): + continue + if not desc.transient: + continue + if desc.storage != dtypes.StorageType.GPU_Global: + continue + + #------- Check whether transient/access node occurs within a kernel -------- + in_kernel = False + parent_map_info = helpers.get_parent_map(state=parent, node=node) + while parent_map_info is not None: + map_entry, map_state = parent_map_info + if (isinstance(map_entry, nodes.MapEntry) and + map_entry.map.schedule == dtypes.ScheduleType.GPU_Device): + in_kernel = True + break + parent_map_info = helpers.get_parent_map(map_state, map_entry) + + if in_kernel: + transients_in_kernels.add((node.data, desc, map_entry)) + else: + transient_outside_kernels.add((node.data, desc)) + + # Skip transients that are used outside of GPU kernels, unless a separate, strictly kernel-local + # transient with the same name exists inside a kernel. In such cases, 'MoveArrayOutOfKernel' is + # still applied to the local one, and naming conflicts are handled automatically. + transient_defined_inside_kernel: Set[Tuple[str, nodes.MapEntry]] = set() + for data_name, array_desc, kernel_entry in transients_in_kernels: + if (data_name, array_desc) in transient_outside_kernels: + continue + else: + transient_defined_inside_kernel.add((data_name, kernel_entry)) + + # Apply the pass and warn the user of its use + for data_name, kernel_entry in transient_defined_inside_kernel: + warnings.warn( + f"Transient array '{data_name}' with storage type GPU_Global detected inside kernel {kernel_entry}. " + "GPU_Global memory cannot be allocated within GPU kernels, so this usage is semantically invalid. " + "As a best-effort fix, the array will be lifted outside the kernel as a non-transient GPU_Global array. " + "Any naming conflicts are resolved automatically. " + "Please avoid this pattern, as it is strongly discouraged and may lead to undefined behavior. " + "Note that this fix provides no guarantees, especially for unusual or complex use cases." 
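The warning above targets programs like the following sketch, in which a transient with GPU_Global storage is defined inside a GPU_Device map. Sizes, names, and the trivial computation are invented for illustration. After MoveArrayOutOfKernel runs, tmp is lifted out of the kernel and reshaped so that every (x, y) iteration owns a disjoint slice.

    # Illustrative sketch (not from the patch): the discouraged pattern that triggers
    # the warning above, a GPU_Global transient defined inside a GPU_Device map.
    import dace

    @dace.program
    def kernel_local_global(A: dace.float64[128, 32, 64] @ dace.dtypes.StorageType.GPU_Global):
        for x, y in dace.map[0:128, 0:32] @ dace.dtypes.ScheduleType.GPU_Device:
            tmp = dace.define_local([64], dace.float64, storage=dace.dtypes.StorageType.GPU_Global)
            for i in range(64):
                tmp[i] = A[x, y, i] * 2.0
                A[x, y, i] = tmp[i]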
+ ) + MoveArrayOutOfKernel().apply_pass(sdfg, kernel_entry, data_name) diff --git a/dace/transformation/passes/move_array_out_of_kernel.py b/dace/transformation/passes/move_array_out_of_kernel.py new file mode 100644 index 0000000000..a53677a2f2 --- /dev/null +++ b/dace/transformation/passes/move_array_out_of_kernel.py @@ -0,0 +1,885 @@ +from typing import Dict, FrozenSet, Set, Tuple, List, Optional +import copy +import functools +from collections import deque + +import sympy + +import dace +from dace import SDFG, SDFGState, dtypes, data as dt +from dace.sdfg import nodes +from dace.properties import make_properties +from dace.transformation import transformation, helpers +from dace.transformation.pass_pipeline import Pass +from dace.subsets import Range +from dace.sdfg.graph import MultiConnectorEdge +from dace.memlet import Memlet +from dace.symbolic import symbol + +@make_properties +@transformation.explicit_cf_compatible +class MoveArrayOutOfKernel(Pass): + """ + This pass supports a legacy use case in the 'ExperimentalCUDACodeGen' backend: the use of + transient arrays with dtypes.StorageType.GPU_Global inside GPU_Device scheduled maps (kernels). + Previously, the old 'CUDACodeGen' moved such arrays outside the kernel during codegen, which caused: + + 1. Mismatches between the SDFG and the generated code, + 2. Complex, misplaced logic in codegen, + 3. Incorrect semantics — a single shared array was reused instead of per-iteration replication, + leading to race conditions. + + This pass fixes these issues by explicitly lifting such arrays out of GPU_Device maps + and creating disjoint arrays per map iteration. Unlike the legacy approach, the transformation + is now visible and consistent at the SDFG level, avoiding naming collisions and improving clarity. + + NOTE: There is no true "local device (GPU_Device) memory" on GPUs, but DaCe supports this + pattern for legacy reasons. This pass exists purely for backward compatibility, and its use + is strongly discouraged. + """ + + def __init__(self): + """ + Initializes caches for mapping nodes to their states and SDFGs. + + This avoids repeatedly traversing the SDFG structure during the pass. + The caches are populated in `apply_pass` for convenience. + """ + self._node_to_state_cache: Dict[nodes.Node, SDFGState] = dict() + self._node_to_sdfg_cache: Dict[nodes.Node, SDFG] = dict() + + # Entry point + def apply_pass(self, root_sdfg: SDFG, kernel_entry: nodes.MapEntry, array_name: str) -> None: + """ + Applies the pass to move a transient GPU_Global array out of a GPU_Device map. + + Args: + root_sdfg: The top-level SDFG to operate on. + kernel_entry: The MapEntry node representing the GPU_Device scheduled map (i.e., the kernel) + that contains the transient array. + array_name: The name of the transient array to move. Note that multiple arrays with the + same name may exist within the kernel. All will be lifted. 
+ """ + # Cache every nodes parent state and parent sdfg + for node, parent in root_sdfg.all_nodes_recursive(): + if isinstance(node, nodes.Node): + assert isinstance(parent, SDFGState) + self._node_to_state_cache[node] = parent + self._node_to_sdfg_cache[node] = parent.sdfg + + # Check if all access nodes to 'array_name' within the kernel are defined in the same SDFG as the map + kernel_parent_sdfg = self._node_to_sdfg_cache[kernel_entry] + simple_case = True + for (_, outermost_sdfg, _, _) in self.collect_array_descriptor_usage(kernel_entry, array_name): + if outermost_sdfg != kernel_parent_sdfg: + simple_case = False + break + + if simple_case: + # All access nodes are in the same SDFG as the kernel map - easy + access_nodes = [an for an, _, _ in self.get_access_nodes_within_map(kernel_entry, array_name)] + self.move_array_out_of_kernel_flat(kernel_entry, array_name, access_nodes) + else: + # Access nodes span nested maps or SDFGs — more involved (more checks, naming conflicts, several seperate + # array descriptors with the same array_name) + self.move_array_out_of_kernel_nested(kernel_entry, array_name) + + # Main transformation algorithms and helpers + def move_array_out_of_kernel_flat(self, kernel_entry: nodes.MapEntry, array_name: str, + access_nodes: List[nodes.AccessNode]) -> None: + """ + Moves a transient GPU_Global array out of a GPU_Device map (kernel) in the flat case. + + This function handles the simpler case where all access nodes to the array are in the same + SDFG and state as the kernel map. Therefore, there are no nested SDFGs or naming conflicts + (since an SDFG cannot define multiple descriptors with the same name). + + The array is reshaped to allocate a disjoint slice per map iteration. For example, given: + + for x, y in dace.map[0:128, 0:32] @ GPU_Device: + gpu_A = dace.define_local([64], dtype, storage=GPU_Global) + + the array shape will be updated to [128, 32, 64], and memlets will ensure each thread + accesses [x, y, 0:64]. + + Additionally, this method inserts the necessary access nodes and edges to correctly move + the array out of the map scope and maintain correctness. + + Args: + kernel_entry: The MapEntry node representing the GPU kernel. + array_name: Name of the transient array to move. + access_nodes: List of access nodes referring to the array inside the map. + """ + # A closest AccessNode of kernel exit is used + parent_state = self._node_to_state_cache[kernel_entry] + kernel_exit: nodes.MapExit = parent_state.exit_node(kernel_entry) + closest_an = self.get_nearest_access_node(access_nodes, kernel_exit) + array_desc = closest_an.desc(parent_state) + + # Get the chain of MapEntries from the AccessNode up to and including the kernel map entry + map_entry_chain, _= self.get_maps_between(kernel_entry, closest_an) + + # Store the original full-range subset of the array. + # Needed to define correct memlets when moving the array out of the kernel. 
+ old_subset = [(0, dim - 1, 1) for dim in array_desc.shape] + + # Update the array + new_shape, new_strides, new_total_size, new_offsets = self.get_new_shape_info(array_desc, map_entry_chain) + array_desc.set_shape(new_shape=new_shape, strides=new_strides, total_size=new_total_size, offset=new_offsets) + + # Update all memlets + self.update_memlets(kernel_entry, array_name, closest_an, access_nodes) + + + # add new edges to move access Node out of map + in_connector: str = 'IN_' + array_name + out_connector: str = 'OUT_' + array_name + previous_node = closest_an + previous_out_connector = None + for next_map_entry in map_entry_chain: + + next_map_exit = parent_state.exit_node(next_map_entry) + if in_connector not in next_map_exit.in_connectors: + next_map_state = self._node_to_state_cache[next_map_exit] + next_map_exit.add_in_connector(in_connector) + next_map_exit.add_out_connector(out_connector) + + next_entries, _ = self.get_maps_between(kernel_entry, previous_node) + memlet_subset = Range(self.get_memlet_subset(next_entries, previous_node) + old_subset) + + next_map_state.add_edge(previous_node, previous_out_connector, next_map_exit, in_connector, Memlet(data= array_name, subset=memlet_subset)) + + previous_node = next_map_exit + previous_out_connector = out_connector + + # New Access Node outside of the target map, connected to the exit + access_node_outside = parent_state.add_access(array_name) + parent_state.add_edge(kernel_exit, out_connector, access_node_outside, None, Memlet.from_array(array_name, array_desc)) + + def move_array_out_of_kernel_nested(self, kernel_entry: nodes.MapEntry, array_name: str) -> None: + """ + Moves a transient GPU_Global array out of a GPU_Device map (kernel) in the nested case. + + This function handles the more complex scenario where access nodes to the array may be + defined inside nested SDFGs within the kernel's parent SDFG. It moves the array out of + all nested maps and SDFGs, updating shapes and memlets accordingly, and resolves naming + conflicts that arise from multiple descriptors with the same name in different scopes + (by renaming). + + The method also ensures that the array is correctly lifted through all nested SDFGs + between its original definition and the kernel map, updating symbols and connectors + along the way. + + Args: + kernel_entry: The MapEntry node representing the GPU kernel. + array_name: Name of the transient array to move. + """ + # Collect all information about every distinct data descriptor with the same name "array_name" + array_descriptor_usage = self.collect_array_descriptor_usage(kernel_entry, array_name) + original_array_name = array_name + kernel_parent_sdfg = self._node_to_sdfg_cache[kernel_entry] + + for array_desc, outermost_sdfg, sdfg_defined, access_nodes in array_descriptor_usage: + + if outermost_sdfg == kernel_parent_sdfg: + # Special case: There are nested accesss nodes, but their descriptor is defined at + # the same sdfg as the kernel. Thus, we can use the simpler algorithm. + self.move_array_out_of_kernel_flat(kernel_entry, original_array_name, list(access_nodes)) + continue + + # The outermost node + nsdfg_node = outermost_sdfg.parent_nsdfg_node + map_entry_chain, _ = self.get_maps_between(kernel_entry, nsdfg_node) + + # Store the original full-range subset of the array. + # Needed to define correct memlets when moving the array out of the kernel. 
+ old_subset = [(0, dim - 1, 1) for dim in array_desc.shape] + + # Update array_descriptor + new_shape, new_strides, new_total_size, new_offsets = self.get_new_shape_info(array_desc, map_entry_chain) + array_desc.set_shape(new_shape=new_shape, strides=new_strides, total_size=new_total_size, offset=new_offsets) + array_desc.transient = False + + # Update memlets data movement + self.update_memlets(kernel_entry, original_array_name, nsdfg_node, access_nodes) + + # Update name if names conflict + required, array_name = self.new_name_required(kernel_entry, original_array_name, sdfg_defined) + if required: + self.replace_array_name(sdfg_defined, original_array_name, array_name, array_desc) + + # Ensure required symbols are defined + self.update_symbols(map_entry_chain, kernel_parent_sdfg) + + # Collect all SDFGs from the outermost definition to the target map's parent (inclusive) + sdfg_hierarchy: List[SDFG] = [outermost_sdfg] + current_sdfg = outermost_sdfg + while current_sdfg != kernel_parent_sdfg: + current_sdfg = current_sdfg.parent_sdfg + sdfg_hierarchy.append(current_sdfg) + + # Validate collected SDFGs: no None entries + if any(sdfg is None for sdfg in sdfg_hierarchy): + raise ValueError("Invalid SDFG hierarchy: contains 'None' entries. This should not happen.") + + # Validate depth: must include at least outer + target SDFG + if len(sdfg_hierarchy) < 2: + raise ValueError( + f"Invalid SDFG hierarchy: only one SDFG found. " + f"Expected at least two levels, since {outermost_sdfg} is not equal to " + "the kernel map's SDFG and is contained within it — the last entry should " + "be the kernel's parent SDFG." + ) + + self.lift_array_through_nested_sdfgs(array_name, kernel_entry, sdfg_hierarchy, old_subset) + + def lift_array_through_nested_sdfgs(self, array_name:str, kernel_entry: nodes.MapEntry, + sdfg_hierarchy: List[SDFG], old_subset: List) -> None: + """ + Lifts a transient array through nested SDFGs. + + For each SDFG in the hierarchy (from inner to outer), this deepcopies the array descriptor + and adds edges from the NestedSDFG node through any enclosing maps to a new access node. + This is done until the kernel is exited. + Memlets are updated using `old_subset` and enclosing map parameters. + + Args: + array_name: Name of the array to lift. + kernel_entry: Innermost GPU kernel MapEntry. + sdfg_hierarchy: Ordered list of nested SDFGs (inner to outer). + old_subset: Inner array subset used for memlet construction. + """ + # Move array out ouf the kernel map entry through nested SDFGs + outer_sdfg = sdfg_hierarchy.pop(0) + while sdfg_hierarchy: + inner_sdfg = outer_sdfg + outer_sdfg = sdfg_hierarchy.pop(0) + nsdfg_node = inner_sdfg.parent_nsdfg_node + nsdfg_parent_state = self._node_to_state_cache[nsdfg_node] + + # copy and add the descriptor to the outer sdfg + old_desc = inner_sdfg.arrays[array_name] + new_desc = copy.deepcopy(old_desc) + outer_sdfg.add_datadesc(array_name, new_desc) + + # Get all parent scopes to detect how the data needs to flow. + # E.g. nsdfg_node -> MapExit needs to be nsdfg_node -> MapExit -> AccessNode (new) + parent_scopes: List[nodes.MapEntry] = [] + current_parent_scope = nsdfg_node + scope_dict = nsdfg_parent_state.scope_dict() + while scope_dict[current_parent_scope] is not None and current_parent_scope is not kernel_entry: + parent_scopes.append(scope_dict[current_parent_scope]) + current_parent_scope = scope_dict[current_parent_scope] + + # Get a new AccessNode where the nsdfg node's parent state is. 
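The get_new_shape_info call used above prepends one dimension per enclosing GPU map so that each iteration gets its own slice. The arithmetic, restated standalone with the example values from the docstrings (maps over [0:128, 0:32] around a [64]-element array), is a sketch only and mirrors the pass rather than defining it.

    # Illustrative sketch (not from the patch): shape/stride/size extension as
    # performed by get_new_shape_info, using the docstring example values.
    import functools
    import sympy

    def extend_shape(inner_shape, inner_strides, inner_total_size, map_ranges):
        range_sizes = [end + 1 - start for (start, end, _step) in map_ranges]
        new_shape = range_sizes + list(inner_shape)
        new_strides = [1] * len(range_sizes) + list(inner_strides)   # mirrors the pass
        new_total_size = functools.reduce(sympy.Mul, range_sizes, 1) * inner_total_size
        return new_shape, new_strides, new_total_size

    shape, strides, total = extend_shape([64], [1], 64, [(0, 127, 1), (0, 31, 1)])
    assert shape == [128, 32, 64] and total == 128 * 32 * 64
    # Each (x, y) iteration then reads and writes the disjoint slice [x, y, 0:64].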
+ # Note: This is in the OUTER sdfg, so this is the first accessNode accessing + # the current array descriptor + exit_access_node = nsdfg_parent_state.add_access(array_name) + + # Cache its location + self._node_to_state_cache[exit_access_node] = nsdfg_parent_state + self._node_to_sdfg_cache[exit_access_node] = outer_sdfg + + # Create a dataflow path from the NestedSDFG node to the new exit access node, + # passing through any enclosing map scopes (if the NestedSDFG is nested within maps). + src = nsdfg_node + for scope_entry in parent_scopes: + # next destination is the scope exit + scope_exit = nsdfg_parent_state.exit_node(scope_entry) + dst = scope_exit + + # Next, add edge between src and dst in 2 steps: + # 1.1 Determine source connector name and register it based on src type + if isinstance(src, nodes.NestedSDFG): + src_conn = array_name + src.add_out_connector(src_conn) + elif isinstance(src, nodes.MapExit): + src_conn = f"OUT_{array_name}" + src.add_out_connector(src_conn) + else: + raise NotImplementedError(f"Unsupported source node type '{type(src).__name__}' — only NestedSDFG or MapExit are expected.") + + # 1.2 Determine destination connector name and register it based on dst type + if isinstance(dst, nodes.AccessNode): + dst_conn = None # AccessNodes use implicit connectors + elif isinstance(dst, nodes.MapExit): # Assuming dst is the entry for parent scope + dst_conn = f"IN_{array_name}" + dst.add_in_connector(dst_conn) + else: + raise NotImplementedError(f"Unsupported destination node type '{type(dst).__name__}' — expected AccessNode or MapEntry.") + + # 2. Add the edge using the connector names determined in Step 1. + next_entries, _ = self.get_maps_between(kernel_entry, src) + memlet_subset = Range(self.get_memlet_subset(next_entries, src) + old_subset) + nsdfg_parent_state.add_edge(src, src_conn, dst, dst_conn, Memlet(data= array_name, subset=memlet_subset)) + + # Continue by setting the dst as source + src = dst + + + # After processing all scopes, the last src (which is either the last MapExit or the intial nsdfg if there are no parent scope) + # needs to be connected to the exit access node added before + dst = exit_access_node + + if isinstance(src, nodes.NestedSDFG): + src_conn = array_name + src.add_out_connector(src_conn) + elif isinstance(src, nodes.MapExit): + src_conn = f"OUT_{array_name}" + src.add_out_connector(src_conn) + else: + raise NotImplementedError(f"Unsupported source node type '{type(src).__name__}' — only NestedSDFG or MapExit are expected.") + + next_entries, _ = self.get_maps_between(kernel_entry, src) + memlet_subset = Range(self.get_memlet_subset(next_entries, src) + old_subset) + nsdfg_parent_state.add_edge(src, src_conn, dst, None, Memlet(data= array_name, subset=memlet_subset)) + + # At the outermost sdfg we set the array descriptor to be transient again, + # Since it is not needed beyond it. Furthermore, this ensures that the codegen + # allocates the array and does not expect it as input to the kernel + new_desc.transient = True + + # Memlet related helper functions + def get_memlet_subset(self, map_chain: List[nodes.MapEntry], node: nodes.Node): + """ + Compute the memlet subset to access an array based on the position of a node within nested GPU maps. + + For each GPU_Device or GPU_ThreadBlock map in the chain: + - If the node lies inside the map (but is not the map entry or exit itself), + the subset is the single index corresponding to the map parameter (symbolic). + - Otherwise, the full range of the map dimension is used. 
+
+        This ensures that memlets correctly represent per-thread or per-block slices
+        when moving arrays out of kernel scopes.
+
+        Args:
+            map_chain: List of MapEntry nodes representing nested maps from outermost to innermost.
+            node: The node for which to determine the subset (could be an access node or map entry/exit).
+
+        Returns:
+            A list of subset (start, end, stride) tuples, one per map dimension.
+        """
+        subset = []
+        for next_map in map_chain:
+            if next_map.map.schedule not in [dtypes.ScheduleType.GPU_Device, dtypes.ScheduleType.GPU_ThreadBlock]:
+                continue
+
+            map_parent_state = self._node_to_state_cache[next_map]
+            for param, (start, end, stride) in zip(next_map.map.params, next_map.map.range.ndrange()):
+
+                node_is_map = (
+                    (isinstance(node, nodes.MapEntry) and node == next_map) or
+                    (isinstance(node, nodes.MapExit) and map_parent_state.exit_node(next_map) == node)
+                )
+                node_state = self._node_to_state_cache[node]
+                if helpers.contained_in(node_state, node, next_map) and not node_is_map:
+                    index = symbol(param)
+                    subset.append((index, index, 1))
+                else:
+                    subset.append((start, end, stride))
+
+        return subset
+
+    def update_memlets(self, kernel_entry: nodes.MapEntry, array_name: str, outermost_node: nodes.Node,
+                       access_nodes: Set[nodes.AccessNode]) -> None:
+        """
+        Updates all memlets related to a given transient array to reflect correct data
+        movement when moving the array out of the kernel entry.
+
+        Any map enclosing the `outermost_node` also encloses all access nodes and is
+        used to determine which maps are strictly above the access nodes. Based on this,
+        we compute the correct memlet subset that includes the additional dimensions
+        from the GPU map hierarchy.
+
+        Args:
+            kernel_entry: The MapEntry node representing the GPU kernel scope.
+            array_name: Name of the transient array being moved out.
+            outermost_node: The node whose enclosing GPU maps determine the prepended dimensions
+                (e.g., the NestedSDFG node through which the array is accessed).
+            access_nodes: Set of AccessNodes inside the kernel that reference the same array.
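+
+        Example (illustrative sketch, assuming a kernel map over `i in [0:128]` and a memlet
+        subset `[0:64]` on the lifted array inside it): after the update the subset becomes
+        `[i, 0:64]`, i.e. the GPU map parameters are prepended as additional dimensions,
+        mirroring the extended shape computed by `get_new_shape_info`.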
+ """ + map_entry_chain, _ = self.get_maps_between(kernel_entry, outermost_node) + params_as_ranges = self.get_memlet_subset(map_entry_chain, outermost_node) + + # Update in and out path memlets + visited: Set[MultiConnectorEdge[Memlet]] = set() + for access_node in access_nodes: + # in paths + for path in self.in_paths(access_node): + for edge in path: + + # Guards + if edge in visited: + continue + + if edge.data.data == array_name: + old_range = edge.data.subset.ndrange() + new_range = params_as_ranges + old_range + edge.data.subset = Range(new_range) + visited.add(edge) + + elif edge.data.data != array_name and edge.dst is access_node and edge.data.dst_subset is not None: + old_range = edge.data.dst_subset.ndrange() + new_range = params_as_ranges + old_range + edge.data.dst_subset = Range(new_range) + visited.add(edge) + + else: + continue + + + # out paths + for path in self.out_paths(access_node): + for edge in path: + if edge in visited: + continue + + if edge.data.data == array_name: + old_range = edge.data.subset.ndrange() + new_range = params_as_ranges + old_range + edge.data.subset = Range(new_range) + visited.add(edge) + + elif(edge.data.data != array_name) and edge.src is access_node and edge.data.src_subset is not None: + old_range = edge.data.src_subset.ndrange() + new_range = params_as_ranges + old_range + edge.data.src_subset = Range(new_range) + visited.add(edge) + + else: + continue + + # Array, symbol and renaming related helper functions + def get_new_shape_info(self, array_desc: dt.Array, map_exit_chain: List[nodes.MapEntry]): + """ + Calculate the new shape, strides, total size, and offsets for a transient array + when moving it out of a GPU_Device kernel. + + Each GPU_Device map adds dimensions to allocate disjoint slices per thread. + + For example: + + for x, y in dace.map[0:128, 0:32] @ GPU_Device: + gpu_A = dace.define_local([64], dtype, storage=GPU_Global) + + gpu_A's shape changes from [64] to [128, 32, 64] to give each thread its own slice + (i.e. gpu_A[x, y, 64]). + + Args: + array_desc: Original array descriptor. + map_exit_chain: List of MapEntry nodes between array and kernel exit. + + Returns: + Tuple (new_shape, new_strides, new_total_size, new_offsets) for the updated array. + """ + extended_size = [] + new_strides = list(array_desc.strides) + new_offsets = list(array_desc.offset) + for next_map in map_exit_chain: + if not next_map.map.schedule in [dtypes.ScheduleType.GPU_Device, dtypes.ScheduleType.GPU_ThreadBlock]: + continue + + map_range: Range = next_map.map.range + max_elements = map_range.max_element() + min_elements = map_range.min_element() + range_size = [max_elem + 1 - min_elem for max_elem, min_elem in zip(max_elements, min_elements)] + + extended_size = range_size + extended_size + new_strides = [1 for _ in next_map.map.params] + new_strides # add 1 per dimension + new_offsets = [0 for _ in next_map.map.params] + new_offsets # add 0 per dimension + + new_shape = extended_size + list(array_desc.shape) + new_total_size = functools.reduce(sympy.Mul, extended_size, 1) * array_desc.total_size + + return new_shape, new_strides, new_total_size, new_offsets + + # TODO: Ask Yakup -> No states test but this should be alright + def replace_array_name(self, sdfgs: FrozenSet[SDFG], old_name: str, new_name: str, array_desc: dt.Array) -> None: + """ + Replaces all occurrences of an array name in the given SDFGs, including its data descriptor, + memlets, connectors and access nodes with a new name. 
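+
+        Illustrative example (hypothetical names): renaming `gpu_A` to `local_0_gpu_A` also
+        rewrites the `IN_gpu_A`/`OUT_gpu_A` connectors on affected edges to
+        `IN_local_0_gpu_A`/`OUT_local_0_gpu_A`.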
+ + Args: + sdfgs (Set[SDFG]): The SDFGs in which to perform the renaming. + old_name (str): The original array name to be replaced. + new_name (str): The new array name. + new_descriptor (dt.Array): The data descriptor associated with the old and new name. + """ + for sdfg in sdfgs: + + # Replace by removing the data descriptor and adding it with the new name + sdfg.remove_data(old_name, False) + sdfg.add_datadesc(new_name, array_desc) + sdfg.replace(old_name, new_name) + + # Find all states + for state in sdfg.states(): + for edge in state.edges(): + + # Update out connectors + src = edge.src + old_out_conn = f"OUT_{old_name}" + new_out_conn = f"OUT_{new_name}" + if edge.src_conn == old_out_conn: + edge.src_conn = new_out_conn + src.remove_out_connector(old_out_conn) + src.add_out_connector(new_out_conn) + + # Update in connectors + dst = edge.dst + old_in_conn = f"IN_{old_name}" + new_in_conn = f"IN_{new_name}" + if edge.dst_conn == old_in_conn: + edge.dst_conn = new_in_conn + dst.remove_in_connector(old_in_conn) + dst.add_in_connector(new_in_conn) + + def update_symbols(self, map_entry_chain: List[nodes.MapEntry], top_sdfg: SDFG) -> None: + """ + Ensures symbols from GPU maps are defined in all nested SDFGs. + + When lifting arrays out of GPU maps, any used symbols (e.g., map indices) + must be available in nested SDFGs for correct memlet updates. + This function collects such symbols from the map scopes and adds them to + the symbol tables and mappings of all nested SDFGs under `top_sdfg`. + + Args: + map_entry_chain: List of GPU MapEntry nodes whose symbols are relevant. + top_sdfg: The top-level SDFG under which symbols will be propagated. + """ + all_symbols = set() + for next_map in map_entry_chain: + if not next_map.map.schedule in [dace.dtypes.ScheduleType.GPU_Device, dace.dtypes.ScheduleType.GPU_ThreadBlock]: + continue + all_symbols = all_symbols | next_map.used_symbols_within_scope(self._node_to_state_cache[next_map]) + + + + for sdfg in top_sdfg.all_sdfgs_recursive(): + nsdfg_node = sdfg.parent_nsdfg_node + if nsdfg_node is None: + continue + + for symbol in all_symbols: + if str(symbol) not in sdfg.symbols: + sdfg.add_symbol(str(symbol), dace.dtypes.int32) + if str(symbol) not in nsdfg_node.symbol_mapping: + nsdfg_node.symbol_mapping[symbol] = dace.symbol(symbol) + + # Array analysis and metadata functions + def collect_array_descriptor_usage(self, map_entry: nodes.MapEntry, + array_name: str) -> Set[Tuple[dt.Array, SDFG, FrozenSet[SDFG], FrozenSet[nodes.AccessNode]]]: + """ + Tracks usage of a transient array across nested SDFGs within the scope of a map. + + For each array it collects: + - the outermost SDFG where it is defined or passed through, + - all SDFGs in which it is accessed or passed via connectors, + - all AccessNodes referencing it in those SDFGs. + + Note: By "same array" we mean arrays with the same name and connected via memlets; + multiple descriptor objects (dt.Array) may exist across SDFGs for the same logical array. + + Args: + map_entry: The MapEntry node whose scope is used for analysis. + array_name: The name of the array to analyze. + + Returns: + A set of tuples, each containing: + - one of potentially many dt.Array descriptors, + - the outermost defining or using SDFG, + - a frozenset of all involved SDFGs, + - a frozenset of all AccessNodes using this array. 
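+
+        Example (illustrative sketch, assuming `map_entry` is a GPU_Device map and "gpu_A"
+        a transient used within its scope):
+
+            for desc, outer_sdfg, sdfgs, access_nodes in self.collect_array_descriptor_usage(map_entry, "gpu_A"):
+                assert outer_sdfg in sdfgs
+                assert all(an.data == "gpu_A" for an in access_nodes)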
+ """ + access_nodes_info: List[Tuple[nodes.AccessNode, SDFGState, SDFG]] = self.get_access_nodes_within_map(map_entry, array_name) + last_sdfg: SDFG = self._node_to_sdfg_cache[map_entry] + + result: Set[Tuple[dt.Array, SDFG, Set[SDFG], Set[nodes.AccessNode]]] = set() + visited_sdfgs: Set[SDFG] = set() + + for access_node, state, sdfg in access_nodes_info: + + # Skip visited sdfgs where the array name is defined + if sdfg in visited_sdfgs: + continue + + # Get the array_desc (there may be several copies across SDFG, but + # we are only interested in the information thus this is fine) + array_desc = access_node.desc(state) + + # Collect all sdfgs and access nodes which refer to the same array + # (we determine this by inspecting if the array name is passed via connectors) + sdfg_set: Set[SDFG] = set() + access_nodes_set: Set[nodes.AccessNode] = set() + access_nodes_set.add(access_node) + + # Get all parent SDFGs and the outermost sdfg where defined + current_sdfg = sdfg + outermost_sdfg = current_sdfg + while True: + sdfg_set.add(current_sdfg) + + # We have reached the map's sdfg, so this is the + # outermost_sdfg we consider + if current_sdfg == last_sdfg: + outermost_sdfg = current_sdfg + break + + nsdfg_node = current_sdfg.parent_nsdfg_node + if array_name in nsdfg_node.in_connectors or array_name in nsdfg_node.out_connectors: + current_sdfg = current_sdfg.parent_sdfg + outermost_sdfg = current_sdfg + else: + break + + # Get all child SDFGs where the array was also passed to + queue = [sdfg] + while queue: + current_sdfg = queue.pop(0) + for child_state in current_sdfg.states(): + for node in child_state.nodes(): + if not isinstance(node, nodes.NestedSDFG): + continue + + nsdfg_node = node + if array_name in nsdfg_node.in_connectors or array_name in nsdfg_node.out_connectors: + queue.append(nsdfg_node.sdfg) + sdfg_set.add(nsdfg_node.sdfg) + + # Get all access nodes with the array name used in the sdfgs we found + for current_sdfg in sdfg_set: + for current_state in current_sdfg.states(): + for node in current_state.nodes(): + if isinstance(node, nodes.AccessNode) and node.data == array_name: + access_nodes_set.add(node) + + # Update all visited sdfgs + visited_sdfgs.update(sdfg_set) + + # Finally add information to the result + result.add((array_desc, outermost_sdfg, frozenset(sdfg_set), frozenset(access_nodes_set))) + + return result + + def new_name_required(self, map_entry: nodes.MapEntry, array_name: str, sdfg_defined: FrozenSet[SDFG]) -> Tuple[bool, str]: + """ + Returns whether the array_name is also used at an SDFG which is not in the sdfg_defined set. + This means that the array_name at that SDFG refers to another data descriptor. + Another new name is suggested if this case occurs. + + Args: + map_entry: The MapEntry node whose scope is used to determine name usage. + array_name: The name of the data descriptor of interest + sdfg_defined: where the data descriptor is defined + + Returns: + A Tuple where first element is indicatin whether a new name is required, and + the other is either the same name if no new name is required or otherwise a new name suggestion. 
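+
+        Example (illustrative, hypothetical names): if `gpu_A` also names another descriptor or
+        symbol in an SDFG within the kernel scope that is not in `sdfg_defined`, the method
+        returns `(True, "local_0_gpu_A")` (incrementing the counter until the suggestion is
+        free); otherwise it returns `(False, "gpu_A")`.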
+ """ + map_parent_sdfg = self._node_to_sdfg_cache[map_entry] + taken_names = set() + + for sdfg in map_parent_sdfg.all_sdfgs_recursive(): + + # Continue if sdfg is neither the map's parent state + # or not contained within the map scope + nsdfg_node = sdfg.parent_nsdfg_node + state = self._node_to_state_cache[nsdfg_node] if nsdfg_node else None + + if not ((nsdfg_node and state and helpers.contained_in(state, nsdfg_node, map_entry)) + or sdfg is map_parent_sdfg): + continue + + # Taken names are all symbol and array identifiers of sdfgs in which + # the array_name's data descriptor we are interested in IS NOT defined + if sdfg not in sdfg_defined: + taken_names.update(sdfg.arrays.keys()) + taken_names.update(sdfg.used_symbols(True)) + + + if array_name in taken_names: + counter = 0 + new_name = f"local_{counter}_{array_name}" + while new_name in taken_names: + counter += 1 + new_name = f"local_{counter}_{array_name}" + + return True, new_name + else: + return False, array_name + + # Utility functions - basic building blocks + def get_access_nodes_within_map(self, map_entry: nodes.MapEntry, data_name: str) -> List[Tuple[nodes.AccessNode, SDFGState, SDFG]]: + """ + Finds all AccessNodes that refer to the given `data_name` and are located inside + the scope of the specified MapEntry. + + Returns: + A list of tuples, each consisting of: + - the matching AccessNode, + - the SDFGState in which it resides, + - and the parent SDFG containing the node. + """ + starting_sdfg = self._node_to_sdfg_cache[map_entry] + matching_access_nodes = [] + + for node, parent_state in starting_sdfg.all_nodes_recursive(): + + if (isinstance(node, nodes.AccessNode) and node.data == data_name and + helpers.contained_in(parent_state, node, map_entry)): + + parent_sdfg = self._node_to_sdfg_cache[node] + matching_access_nodes.append((node, parent_state, parent_sdfg)) + + return matching_access_nodes + + def get_maps_between(self, stop_map_entry: nodes.MapEntry, + node: nodes.Node) -> Tuple[List[nodes.MapEntry], List[nodes.MapExit]]: + """ + Returns all MapEntry/MapExit pairs between `node` and `stop_map_entry`, inclusive. + + Maps are returned from innermost to outermost, starting at the scope of `node` and + ending at `stop_map_entry`. Assumes that `node` is (directly or indirectly via a + nestedSDFG) contained within the `stop_map_entry`'s scope. + + Args: + stop_map_entry: The outermost MapEntry to stop at (inclusive). + node: The node from which to begin scope traversal. + + Returns: + A tuple of two lists: + - List of MapEntry nodes (from inner to outer scope), + - List of corresponding MapExit nodes. + """ + stop_state = self._node_to_state_cache[stop_map_entry] + stop_exit = stop_state.exit_node(stop_map_entry) + + entries: List[nodes.MapEntry] = [] + exits: List[nodes.MapExit] = [] + + current_state = self._node_to_state_cache[node] + parent_info = helpers.get_parent_map(current_state, node) + + while True: + if parent_info is None: + raise ValueError("Expected node to be in scope of stop_map_entry, but no parent map was found.") + + entry, state = parent_info + exit_node = state.exit_node(entry) + + entries.append(entry) + exits.append(exit_node) + + if exit_node == stop_exit: + break + + parent_info = helpers.get_parent_map(state, entry) + + return entries, exits + + def get_nearest_access_node(self, access_nodes: List[nodes.AccessNode], node: nodes.Node) -> nodes.AccessNode: + """ + Finds the closest access node (by graph distance) to the given node + within the same state. Direction is ignored. 
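+
+        Illustrative example: if `node` is a Tasklet directly connected to one of the candidate
+        AccessNodes, that AccessNode is returned after a single step of the breadth-first
+        search implemented below.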
+ + Args: + access_nodes: List of candidate AccessNodes to search from. + node: The node from which to start the search. + + Returns: + The closest AccessNode (by number of edges traversed). + + Raises: + RuntimeError: If no access node is conected in the node's state to the node. + """ + state = self._node_to_state_cache[node] + + visited = set() + queue = [node] + while queue: + current = queue.pop(0) + if current in access_nodes: + return current + + visited.add(current) + for neighbor in state.neighbors(current): + if neighbor not in visited: + queue.append(neighbor) + + raise RuntimeError(f"No access node found connected to the given node {node}.") + + def in_paths(self, access_node: nodes.AccessNode) -> List[List[MultiConnectorEdge[Memlet]]]: + """ + Traces all incoming dataflow paths to the given AccessNode. + Only searches in the same state where the AccessNode is. + + Returns: + A list of edge paths (each a list of edges). + """ + state = self._node_to_state_cache[access_node] + + # Start paths with in-edges to the access node. + initial_paths = [[edge] for edge in state.in_edges(access_node)] + queue = deque(initial_paths) + complete_paths = [] + + while queue: + # Get current path and see whether the starting node has in-edges carrying the access nodes data + current_path = queue.popleft() + first_edge = current_path[0] + current_node = first_edge.src + incoming_edges = [edge for edge in state.in_edges(current_node)] + + # If no incoming edges found, this path is complete + if len(incoming_edges) == 0: + + complete_paths.append(current_path) + continue + + # Otherwise, extend the current path and add it to the queue for further processing + for edge in incoming_edges: + if edge in current_path: + raise ValueError("Unexpected cycle detected") + + extended_path = [edge] + current_path + queue.append(extended_path) + + return complete_paths + + def out_paths(self, access_node: nodes.AccessNode) -> List[List[MultiConnectorEdge[Memlet]]]: + """ + Traces all outgoing dataflow paths to the given AccessNode. + Only searches in the same state where the AccessNode is. + + Returns: + A list of edge paths (each a list of edges). 
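+
+        Illustrative example (hypothetical state): for a chain `AccessNode -(e1)-> MapEntry -(e2)-> Tasklet`
+        in which the Tasklet has no further outgoing edges, the single returned path is `[e1, e2]`;
+        nodes with several outgoing edges produce one path per reachable sink.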
+ """ + state: SDFGState = self._node_to_state_cache[access_node] + + initial_paths = [[edge] for edge in state.out_edges(access_node)] + queue = deque(initial_paths) + complete_paths = [] + + while queue: + # Get current path and see whether the last node has out-edges carrying the access nodes data + current_path = queue.popleft() + last_edge = current_path[-1] + current_node = last_edge.dst + outgoing_edges = [edge for edge in state.out_edges(current_node)] + + # If no such edges found, this path is complete + if len(outgoing_edges) == 0: + complete_paths.append(current_path) + continue + + # Otherwise, extend the current path and add it to the queue for further processing + for edge in outgoing_edges: + + if edge in current_path: + raise ValueError("Unexpected cycle detected") + + extended_path = current_path + [edge] + queue.append(extended_path) + + return complete_paths + \ No newline at end of file From 1b63608b45b31d4a27d0aa35bc5b472e57119da6 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Tue, 5 Aug 2025 13:11:03 +0200 Subject: [PATCH 66/94] small refactoring --- dace/codegen/targets/experimental_cuda.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index f602b53861..bb7fb2e77e 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -762,13 +762,12 @@ def _prepare_GPU_Global_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta dataname = ptr(node.data, nodedesc, sdfg, self._frame) # ------------------- Declaration ------------------- - array_ctype = f'{nodedesc.dtype.ctype} *' declared = self._dispatcher.declared_arrays.has(dataname) if not declared: + array_ctype = f'{nodedesc.dtype.ctype} *' declaration_stream.write(f'{array_ctype} {dataname};\n', cfg, state_id, node) - - self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) + self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) # ------------------- Allocation ------------------- arrsize = nodedesc.total_size @@ -803,13 +802,12 @@ def _prepare_CPU_Pinned_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta dataname = ptr(node.data, nodedesc, sdfg, self._frame) # ------------------- Declaration ------------------- - array_ctype = f'{nodedesc.dtype.ctype} *' declared = self._dispatcher.declared_arrays.has(dataname) if not declared: + array_ctype = f'{nodedesc.dtype.ctype} *' declaration_stream.write(f'{array_ctype} {dataname};\n', cfg, state_id, node) - - self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) + self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) # ------------------- Allocation ------------------- arrsize = nodedesc.total_size From b8f282b0c8a5d6ed814ebbc142cb3165b1643ddb Mon Sep 17 00:00:00 2001 From: aydogdub Date: Tue, 5 Aug 2025 14:11:25 +0200 Subject: [PATCH 67/94] Experimental way to support Stream objects --- dace/codegen/targets/experimental_cuda.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index bb7fb2e77e..54b8c7c53b 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -802,6 +802,13 @@ def _prepare_CPU_Pinned_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta dataname = ptr(node.data, nodedesc, sdfg, self._frame) # ------------------- Declaration ------------------- + # NOTE: 
Experimental for GPU stream + if nodedesc.dtype == dtypes.gpuStream_t: + array_ctype = f'{nodedesc.dtype.ctype} *' + declaration_stream.write(f'{nodedesc.dtype.ctype} * {dataname} = __state->gpu_context->streams;\n', cfg, state_id, node) + self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) + return + declared = self._dispatcher.declared_arrays.has(dataname) if not declared: @@ -904,6 +911,8 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap callsite_stream.write(f'DACE_GPU_CHECK({self.backend}Free({dataname}));\n', cfg, state_id, node) elif nodedesc.storage == dtypes.StorageType.CPU_Pinned: + if nodedesc.dtype == dtypes.gpuStream_t: + return callsite_stream.write(f'DACE_GPU_CHECK({self.backend}FreeHost({dataname}));\n', cfg, state_id, node) elif nodedesc.storage in {dtypes.StorageType.GPU_Shared, dtypes.StorageType.Register}: From 8206f3e5b69447032ec0709644deb9d25ff07193 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Tue, 5 Aug 2025 14:12:19 +0200 Subject: [PATCH 68/94] streams as opaque types --- dace/dtypes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dace/dtypes.py b/dace/dtypes.py index 7a2ade50a2..2a5ac8c765 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -1268,7 +1268,7 @@ def isconstant(var): complex128 = typeclass(numpy.complex128) string = stringtype() MPI_Request = opaque('MPI_Request') - +gpuStream_t = opaque('gpuStream_t') @undefined_safe_enum @extensible_enum @@ -1288,6 +1288,7 @@ class Typeclasses(aenum.AutoNumberEnum): float64 = float64 complex64 = complex64 complex128 = complex128 + gpuStream_t = gpuStream_t _bool = bool From 7245b1b1681ee118998ecc688fcb43bedbd82c13 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Wed, 6 Aug 2025 18:15:19 +0200 Subject: [PATCH 69/94] Revert merge and Implement initial support for dynamic inputs --- dace/codegen/targets/experimental_cuda.py | 26 +++++++++++++++++++++++ dace/sdfg/state.py | 3 ++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index bb7fb2e77e..7a936b6f10 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -430,10 +430,27 @@ def _declare_and_invoke_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, 'DACE_EXPORTED void __dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(kernel_wrapper_args_typed)), cfg, state_id, scope_entry) + # If there are dynamic Map inputs, put the kernel invocation in its own scope to avoid redefinitions. + state = sdfg.state(state_id) + if dace.sdfg.has_dynamic_map_inputs(state, scope_entry): + callsite_stream.write('{', cfg, state_id, scope_entry) + + # Synchronize all events leading to dynamic map range connectors + for e in dace.sdfg.dynamic_map_inputs(state, scope_entry): + callsite_stream.write( + self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), + cfg, state_id, scope_entry) + # Calling the kernel wrapper function (in the CPU-side code) callsite_stream.write('__dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(kernel_wrapper_args_as_input)), cfg, state_id, scope_entry) + + + # If there are dynamic Map inputs, put the kernel invocation in its own scope to avoid redefinitions. 
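+        # (Illustrative sketch of the emitted host code for a kernel with one dynamic map
+        # input "N" -- names and types are hypothetical:
+        #     {
+        #         <ctype> N = ...;   // from memlet_definition
+        #         __dace_runkernel_<kernel_name>(__state, ..., N);
+        #     }
+        # Without dynamic inputs, only the __dace_runkernel_ call is emitted, with no scope.)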
+ if dace.sdfg.has_dynamic_map_inputs(state, scope_entry): + callsite_stream.write('}', cfg, state_id, scope_entry) + def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -1160,6 +1177,15 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro # Retrieve arguments required for the kernels subgraph arglist: Dict[str, dt.Data] = kernel_parent_state.scope_subgraph(kernel_map_entry).arglist() + + # Add also dynamic inputs required for the kernel + for e in dace.sdfg.dynamic_map_inputs(kernel_parent_state, kernel_map_entry): + var_name = str(e.dst_conn) + data_desc = e.src.desc(sdfg) + defined_type = get_defined_type(data_desc) + arglist[var_name] = data_desc + cudaCodeGen._dispatcher.defined_vars.add(var_name, defined_type, data_desc.ctype) + self._arglist = arglist # Format arguments for input passing and function signatures (kernel and kernel wrapper) diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index 00249efcc5..7fd8cfcb84 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -976,7 +976,8 @@ def unordered_arglist(self, defined_syms = defined_syms or self.defined_symbols() scalar_args.update({ k: dt.Scalar(defined_syms[k]) if k in defined_syms else sdfg.arrays[k] - for k in self.used_symbols(all_symbols=False) if not k.startswith('__dace') and k not in sdfg.constants + for k in self.used_symbols(all_symbols=False) + if not k.startswith('__dace') and k not in sdfg.constants and (k in defined_syms or k in sdfg.arrays) }) # Add scalar arguments from free symbols of data descriptors From 217d8c252c069613f747724484c81939cb7a95b9 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Fri, 15 Aug 2025 14:43:07 +0200 Subject: [PATCH 70/94] New approach for GPU streams- make it explicit --- dace/codegen/targets/cpu.py | 9 + dace/codegen/targets/experimental_cuda.py | 97 ++-- .../copy_strategies.py | 9 +- .../new_copy_strategies.py | 454 ++++++++++++++++++ dace/config_schema.yml | 13 +- dace/sdfg/state.py | 7 + .../gpu_stream_topology_simplification.py | 272 +++++++++++ .../passes/gpustream/gpustream_scheduling.py | 195 ++++++++ .../insert_gpu_stream_sync_tasklets.py | 287 +++++++++++ .../insert_gpu_streams_to_kernels.py | 79 +++ .../passes/gpustream_scheduling.py | 353 -------------- .../passes/insert_gpu_copy_tasklets.py | 180 +++++++ tests/codegen/gpu_memcpy_test.py | 7 +- 13 files changed, 1576 insertions(+), 386 deletions(-) create mode 100644 dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py create mode 100644 dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py create mode 100644 dace/transformation/passes/gpustream/gpustream_scheduling.py create mode 100644 dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py create mode 100644 dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py delete mode 100644 dace/transformation/passes/gpustream_scheduling.py create mode 100644 dace/transformation/passes/insert_gpu_copy_tasklets.py diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 631a3af269..6e8ce7355a 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -1001,6 +1001,11 @@ def process_out_memlets(self, dst_edge = dfg.memlet_path(edge)[-1] dst_node = dst_edge.dst + if isinstance(dst_node, nodes.AccessNode) and dst_node.desc(state).dtype == dtypes.gpuStream_t: + # Special case: GPU Streams do not represent 
data flow - they assing GPU Streams to kernels/tasks + # Thus, nothing needs to be written and out memlets of this kind should be ignored. + continue + # Target is neither a data nor a tasklet node if isinstance(node, nodes.AccessNode) and (not isinstance(dst_node, nodes.AccessNode) and not isinstance(dst_node, nodes.CodeNode)): @@ -1579,6 +1584,10 @@ def define_out_memlet(self, sdfg: SDFG, cfg: ControlFlowRegion, state_dfg: State cdtype = src_node.out_connectors[edge.src_conn] if isinstance(sdfg.arrays[edge.data.data], data.Stream): pass + elif isinstance(dst_node, nodes.AccessNode) and dst_node.desc(state_dfg).dtype == dtypes.gpuStream_t: + # Special case: GPU Streams do not represent data flow - they assing GPU Streams to kernels/tasks + # Thus, nothing needs to be written. + pass elif isinstance(cdtype, dtypes.pointer): # If pointer, also point to output desc = sdfg.arrays[edge.data.data] diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index a485b08f5a..3f00dab61d 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -1,3 +1,4 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. # Standard library imports import warnings from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union @@ -28,7 +29,12 @@ # DaCe transformation imports from dace.transformation.passes import analysis as ap -from dace.transformation.passes.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.pass_pipeline import Pipeline +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels +from dace.transformation.passes.insert_gpu_copy_tasklets import InsertGPUCopyTasklets +from dace.transformation.passes.gpustream.gpu_stream_topology_simplification import GPUStreamTopologySimplification +from dace.transformation.passes.gpustream.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync from dace.transformation.dataflow.add_threadblock_map import AddThreadBlockMap from dace.transformation.passes.analysis.infer_gpu_grid_and_block_size import InferGPUGridAndBlockSize @@ -37,6 +43,9 @@ from dace.codegen.targets.experimental_cuda_helpers.gpu_stream_manager import GPUStreamManager from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import symbolic_to_cpp, emit_sync_debug_checks, get_defined_type + +from dace.codegen.targets import cpp + # Type checking imports (conditional) if TYPE_CHECKING: from dace.codegen.targets.framecode import DaCeCodeGenerator @@ -126,7 +135,7 @@ def preprocess(self, sdfg: SDFG) -> None: - Adding explicit ThreadBlock Maps where missing and infer Grid and Block dimensions for every Kernel in the SDFG - Handling GPU<->GPU strided copies. - - Assigning backend GPU streams (e.g., CUDA streams) and creating the GPUStreamManager. + - TODO: update: Assigning backend GPU streams (e.g., CUDA streams) and creating the GPUStreamManager. 
- Handling memory pool management """ @@ -198,13 +207,36 @@ def preprocess(self, sdfg: SDFG) -> None: # Define backend stream access expression (e.g., CUDA stream handle) gpu_stream_access_template = "__state->gpu_context->streams[{gpu_stream}]" - # Initialize and configure GPU stream scheduling pass - gpu_stream_pass = NaiveGPUStreamScheduler() - gpu_stream_pass.set_gpu_stream_access_template(gpu_stream_access_template) - assigned_streams = gpu_stream_pass.apply_pass(sdfg, None) + # TODO: Update + stream_pipeline = Pipeline( + [ + NaiveGPUStreamScheduler(), + InsertGPUStreamsToKernels(), + InsertGPUStreamSyncTasklets(), + InsertGPUCopyTasklets(), + GPUStreamTopologySimplification(), + ] + ) + + self._dispatcher._used_targets.add(self) + gpustream_assignments = stream_pipeline.apply_pass(sdfg, {})['NaiveGPUStreamScheduler'] # Initialize runtime GPU stream manager - self._gpu_stream_manager = GPUStreamManager(sdfg, assigned_streams, gpu_stream_access_template) + # TODO: probably to be deleted + self._gpu_stream_manager = GPUStreamManager(sdfg, gpustream_assignments, gpu_stream_access_template) + + # Get GPU stream persistent array name used in state struct + # NOTE: GPU stream array name from the configurations is prepended with an ID for consistency, + # since struct definition and access are handled elsewhere (e.g., framecode.py, cpu.py, cpp.py) + # TODO: Nicer + self._initialize_gpustreams = "" + gpu_stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(",")[0] + for csdfg, name, desc in sdfg.arrays_recursive(include_nested_data=True): + if name == gpu_stream_array_name and desc.lifetime == dtypes.AllocationLifetime.Persistent: + gpu_stream_field_name = f'__{csdfg.cfg_id}_{name}' + self._initialize_gpustreams += f"__state->{gpu_stream_field_name} = __state->gpu_context->streams;\n" + + #----------------- Shared Memory Synchronization related Logic ----------------- @@ -339,7 +371,6 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub if not self._in_device_code: # Enter kernel context and recursively generate device code - self._in_device_code = True # New scope for defined variables (kernel functions scope) self._dispatcher.defined_vars.enter_scope(scope_entry) @@ -371,6 +402,8 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub kernel_stream = CodeIOStream() kernel_function_stream = self._globalcode + self._in_device_code = True + kernel_scope_generator = KernelScopeGenerator(codegen=self) if kernel_scope_generator.applicable(sdfg, cfg, dfg_scope, state_id, kernel_function_stream, kernel_stream): kernel_scope_generator.generate(sdfg, cfg, dfg_scope, state_id, kernel_function_stream, kernel_stream) @@ -431,7 +464,7 @@ def _declare_and_invoke_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, state_id, scope_entry) # If there are dynamic Map inputs, put the kernel invocation in its own scope to avoid redefinitions. - state = sdfg.state(state_id) + state = cfg.state(state_id) if dace.sdfg.has_dynamic_map_inputs(state, scope_entry): callsite_stream.write('{', cfg, state_id, scope_entry) @@ -445,7 +478,6 @@ def _declare_and_invoke_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, callsite_stream.write('__dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(kernel_wrapper_args_as_input)), cfg, state_id, scope_entry) - # If there are dynamic Map inputs, put the kernel invocation in its own scope to avoid redefinitions. 
if dace.sdfg.has_dynamic_map_inputs(state, scope_entry): @@ -509,12 +541,12 @@ def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope # ----------------- Kernel Launch Invocation ----------------------- kargs = ', '.join(['(void *)&' + arg for arg in kernel_args_as_input]) - + _, stream_var_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') self._localcode.write( f''' void *{kernel_name}_args[] = {{ {kargs} }}; gpuError_t __err = {self.backend}LaunchKernel( - (void*){kernel_name}, dim3({gdims}), dim3({bdims}), {kernel_name}_args, {0}, {gpu_stream} + (void*){kernel_name}, dim3({gdims}), dim3({bdims}), {kernel_name}_args, {0}, {stream_var_name} ); ''', cfg, state_id, scope_entry) @@ -764,6 +796,10 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): nodedesc = update_persistent_desc(nodedesc, sdfg) + # NOTE: Experimental for GPU stream + if nodedesc.dtype == dtypes.gpuStream_t: + return + # ------------------- Allocation/Declaration ------------------- # Call the appropriate handler based on storage type @@ -819,13 +855,6 @@ def _prepare_CPU_Pinned_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta dataname = ptr(node.data, nodedesc, sdfg, self._frame) # ------------------- Declaration ------------------- - # NOTE: Experimental for GPU stream - if nodedesc.dtype == dtypes.gpuStream_t: - array_ctype = f'{nodedesc.dtype.ctype} *' - declaration_stream.write(f'{nodedesc.dtype.ctype} * {dataname} = __state->gpu_context->streams;\n', cfg, state_id, node) - self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) - return - declared = self._dispatcher.declared_arrays.has(dataname) if not declared: @@ -1040,6 +1069,9 @@ def get_generated_codeobjects(self): DACE_GPU_CHECK({backend}EventCreateWithFlags(&__state->gpu_context->events[i], {backend}EventDisableTiming)); }} + // Here + {other_gpustream_init} + {initcode} return 0; @@ -1091,6 +1123,7 @@ def get_generated_codeobjects(self): file_header=fileheader.getvalue(), nstreams=self._gpu_stream_manager.num_gpu_streams, nevents=self._gpu_stream_manager.num_gpu_events, + other_gpustream_init=self._initialize_gpustreams, backend=self.backend, backend_header=backend_header, pool_header=pool_header, @@ -1167,7 +1200,7 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro dfg_scope: ScopeSubgraphView, state_id: int): # Get kernel entry/exit nodes and current state - kernel_map_entry = dfg_scope.source_nodes()[0] + kernel_map_entry: nodes.MapEntry = dfg_scope.source_nodes()[0] kernel_parent_state: SDFGState = cfg.state(state_id) self._kernel_map_entry: nodes.MapEntry = kernel_map_entry @@ -1187,13 +1220,23 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro # Retrieve arguments required for the kernels subgraph arglist: Dict[str, dt.Data] = kernel_parent_state.scope_subgraph(kernel_map_entry).arglist() - # Add also dynamic inputs required for the kernel + # Add also dynamic inputs required for the kernel to arglist except streams + # streams are only needed for the kernel wrapper and launcher function + stream_args = [] + stream_args_typed = [] for e in dace.sdfg.dynamic_map_inputs(kernel_parent_state, kernel_map_entry): - var_name = str(e.dst_conn) data_desc = e.src.desc(sdfg) - defined_type = get_defined_type(data_desc) - arglist[var_name] = data_desc - 
cudaCodeGen._dispatcher.defined_vars.add(var_name, defined_type, data_desc.ctype) + var_name = str(e.dst_conn) + + if data_desc.dtype == dtypes.gpuStream_t: + _, stream_var_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + stream_args.append(f"{var_name}") + stream_args_typed.append(f"gpuStream_t {stream_var_name}") + else: + var_name = str(e.dst_conn) + arglist[var_name] = data_desc + defined_type = get_defined_type(data_desc) + cudaCodeGen._dispatcher.defined_vars.add(var_name, defined_type, data_desc.ctype, allow_shadowing=True) self._arglist = arglist @@ -1201,8 +1244,8 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro self._args_as_input = [ptr(name, data, sdfg, cudaCodeGen._frame) for name, data in arglist.items()] self._args_typed = [('const ' if name in kernel_constants else '') + data.as_arg(name=name) for name, data in arglist.items()] - self._kernel_wrapper_args_as_input = ['__state'] + self._args_as_input - self._kernel_wrapper_args_typed = [f'{mangle_dace_state_struct_name(cudaCodeGen._global_sdfg)} *__state'] + self._args_typed + self._kernel_wrapper_args_as_input = ['__state'] + self._args_as_input + stream_args + self._kernel_wrapper_args_typed = [f'{mangle_dace_state_struct_name(cudaCodeGen._global_sdfg)} *__state'] + self._args_typed + stream_args_typed # The kernel's grid and block dimensions self._grid_dims, self._block_dims = cudaCodeGen._kernel_dimensions_map[kernel_map_entry] diff --git a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py index 5c55fd9d1f..6d037c2ae9 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py @@ -193,7 +193,7 @@ def applicable(self, copy_context: CopyContext) -> bool: This function returns True if: - We are not currently generating kernel code - The copy occurs between two AccessNodes - - The storage types involve a CPU and a GPU (but not CPU-to-CPU or GPU-to-GPU) + - This check is used to detect and handle transfers between host and device memory spaces. """ @@ -220,6 +220,7 @@ def applicable(self, copy_context: CopyContext) -> bool: def generate_copy(self, copy_context: CopyContext) -> None: """Execute host-device copy with CUDA memory operations""" + return # guard _, _, _, _, memlet = copy_context.edge if memlet.wcr is not None: @@ -357,9 +358,9 @@ def _generate_nd_copy(self, copy_context: CopyContext) -> None: src = f'{src_expr} + {offset_src}' dst = f'{dst_expr} + {offset_dst}' - dpitch = f'{dst_strides[-2]} + sizeof({ctype})' - spitch = f'{src_strides[-2]} + sizeof({ctype})' - width = f'{copy_shape[-1]} + sizeof({ctype})' + dpitch = f'{dst_strides[-2]} * sizeof({ctype})' + spitch = f'{src_strides[-2]} * sizeof({ctype})' + width = f'{copy_shape[-1]} * sizeof({ctype})' height = copy_shape[-2] kind = f'{backend}Memcpy{src_location}To{dst_location}' diff --git a/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py new file mode 100644 index 0000000000..66ee23cf34 --- /dev/null +++ b/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py @@ -0,0 +1,454 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
+from abc import ABC, abstractmethod +from typing import Any, Dict, List, Tuple, Union + +from dace import SDFG, SDFGState, data, dtypes, subsets +from dace import memlet as mm +from dace.codegen import common +from dace.codegen.targets import cpp +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import symbolic_to_cpp +from dace.config import Config +from dace.dtypes import StorageType +from dace.sdfg import nodes, scope_contains_scope +from dace.sdfg.graph import MultiConnectorEdge +from dace.transformation import helpers + +class CopyContext: + + def __init__(self, sdfg: SDFG, state: SDFGState, src_node: nodes.Node, dst_node: nodes.Node, + edge: MultiConnectorEdge[mm.Memlet], gpustream_assignments: Dict[nodes.Node, Union[int, str]]): + + # Store the basic context as attributes + self.sdfg = sdfg + self.state = state + self.src_node = src_node + self.dst_node = dst_node + self.edge = edge + self.gpustream_assignments = gpustream_assignments + + memlet = edge.data + + self.copy_shape = memlet.subset.size_exact() + if isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode): + copy_shape, src_strides, dst_strides, src_expr, dst_expr = self.get_accessnode_to_accessnode_copy_info() + else: + copy_shape = memlet.subset.size_exact() + src_strides = dst_strides = src_expr = dst_expr = None + + self.copy_shape = copy_shape + self.src_strides = src_strides + self.dst_strides = dst_strides + self.src_expr = src_expr + self.dst_expr = dst_expr + + def get_storage_type(self, node: nodes.Node): + if isinstance(node, nodes.Tasklet): + storage_type = StorageType.Register + + elif isinstance(node, nodes.AccessNode): + storage_type = node.desc(self.sdfg).storage + + else: + raise NotImplementedError( + f"Unsupported node type {type(node)} for storage type retrieval; " + "expected AccessNode or Tasklet. Please extend this method accordingly." + ) + + return storage_type + + def get_assigned_gpustream(self) -> str: + src_stream = self.gpustream_assignments.get(self.src_node) + dst_stream = self.gpustream_assignments.get(self.dst_node) + + # 1. Catch unsupported cases + if src_stream is None or dst_stream is None: + raise ValueError("GPU stream assignment missing for source or destination node.") + + if src_stream != dst_stream: + raise ValueError( + f"Mismatch in assigned GPU streams: src_node has '{src_stream}', " + f"dst_node has '{dst_stream}'. They must be the same." + ) + + # 2. Generate GPU stream expression + + gpustream = src_stream + if gpustream == 'nullptr': + raise NotImplementedError("nullptr GPU stream not supported yet.") + + gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] + gpustream_expr = f"{gpustream_var_name_prefix}{gpustream}" + + return gpustream_expr + + def get_memory_location(self) -> Tuple[str, str]: + src_storage = self.get_storage_type(self.src_node) + dst_storage = self.get_storage_type(self.dst_node) + src_location = 'Device' if src_storage == dtypes.StorageType.GPU_Global else 'Host' + dst_location = 'Device' if dst_storage == dtypes.StorageType.GPU_Global else 'Host' + + return src_location, dst_location + + def get_ctype(self) -> Any: + sdfg = self.sdfg + src_node, dst_node = self.src_node, self.dst_node + + if isinstance(src_node, nodes.AccessNode): + return src_node.desc(sdfg).ctype + + if isinstance(dst_node, nodes.AccessNode): + return dst_node.desc(sdfg).ctype + + raise NotImplementedError( + f"Cannot determine ctype: neither src nor dst node is an AccessNode. 
" + f"Got src_node type: {type(src_node).__name__}, dst_node type: {type(dst_node).__name__}. " + "Please extend this case or fix the issue." + ) + + def get_accessnode_to_accessnode_copy_info(self): + src_node, dst_node = self.src_node, self.dst_node + sdfg = self.sdfg + edge = self.edge + memlet = self.edge.data + state = self.state + copy_shape = self.copy_shape + + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + raise TypeError( + f"get_accessnode_to_accessnode_copy_info requires both source and destination " + f"to be AccessNode instances, but got {type(src_node).__name__} and {type(dst_node).__name__}." + ) + + src_nodedesc = src_node.desc(sdfg) + dst_nodedesc = dst_node.desc(sdfg) + + src_subset = memlet.get_src_subset(edge, state) + dst_subset = memlet.get_dst_subset(edge, state) + + if src_subset is None: + src_subset = subsets.Range.from_array(src_nodedesc) + + if dst_subset is None: + dst_subset = subsets.Range.from_array(dst_nodedesc) + + src_strides = src_subset.absolute_strides(src_nodedesc.strides) + dst_strides = dst_subset.absolute_strides(dst_nodedesc.strides) + + # Try to turn into degenerate/strided ND copies + result = cpp.ndcopy_to_strided_copy( + copy_shape, + src_nodedesc.shape, + src_strides, + dst_nodedesc.shape, + dst_strides, + memlet.subset, + src_subset, + dst_subset, + ) + if result is not None: + copy_shape, src_strides, dst_strides = result + else: + # If other_subset is defined, reduce its dimensionality by + # removing the "empty" dimensions (size = 1) and filter the + # corresponding strides out + src_strides = ([stride + for stride, s in zip(src_strides, src_subset.size()) if s != 1] + src_strides[len(src_subset):] + ) # Include tiles + if not src_strides: + src_strides = [1] + dst_strides = ([stride + for stride, s in zip(dst_strides, dst_subset.size()) if s != 1] + dst_strides[len(dst_subset):] + ) # Include tiles + if not dst_strides: + dst_strides = [1] + copy_shape = [s for s in copy_shape if s != 1] + if not copy_shape: + copy_shape = [1] + + # Extend copy shape to the largest among the data dimensions, + # and extend other array with the appropriate strides + if len(dst_strides) != len(copy_shape) or len(src_strides) != len(copy_shape): + if memlet.data == src_node.data: + copy_shape, dst_strides = cpp.reshape_strides(src_subset, src_strides, dst_strides, copy_shape) + elif memlet.data == dst_node.data: + copy_shape, src_strides = cpp.reshape_strides(dst_subset, dst_strides, src_strides, copy_shape) + + + src_name = src_node.data + if isinstance(src_nodedesc, data.Scalar) and src_nodedesc.storage in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN: + parent_nsdfg_node = state.sdfg.parent_nsdfg_node + if parent_nsdfg_node is not None and src_name in parent_nsdfg_node.in_connectors: + src_expr = f"&{src_name}" + else: + src_expr = src_name + + elif isinstance(src_nodedesc, data.Scalar): + src_expr = f"&{src_name}" + + elif isinstance(src_nodedesc, data.Array): + src_offset = cpp.cpp_offset_expr(src_nodedesc, src_subset) + src_expr = f"{src_name} + {src_offset}" if src_offset != "0" else src_name + + else: + raise NotImplementedError( + f"Expected {src_name} to be either data.Scalar or data.Array, " + f"but got {type(src_nodedesc).__name__}." 
+ ) + + dst_name = dst_node.data + if isinstance(dst_nodedesc, data.Scalar) and dst_nodedesc.storage in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN: + parent_nsdfg_node = state.sdfg.parent_nsdfg_node + if parent_nsdfg_node is not None and dst_name in parent_nsdfg_node.in_connectors: + dst_expr = f"&{dst_name}" + else: + dst_expr = dst_name + + elif isinstance(dst_nodedesc, data.Scalar): + dst_expr = f"&{dst_name}" + + elif isinstance(dst_nodedesc, data.Array): + dst_offset = cpp.cpp_offset_expr(dst_nodedesc, dst_subset) + dst_expr = f"{dst_name} + {dst_offset}" if dst_offset != "0" else dst_name + + else: + raise NotImplementedError( + f"Expected {dst_name} to be either data.Scalar or data.Array, " + f"but got {type(dst_nodedesc).__name__}." + ) + + return copy_shape, src_strides, dst_strides, src_expr, dst_expr + + +class CopyStrategy(ABC): + + @abstractmethod + def applicable(self, copy_context: CopyContext) -> bool: + """ + Return True if this strategy can handle the given memory copy. + """ + raise NotImplementedError('Abstract class') + + @abstractmethod + def generate_copy(self, copy_context: CopyContext) -> str: + """ + Generates and returns the copy code for the supported pattern. + """ + raise NotImplementedError('Abstract class') + + +class OutOfKernelCopyStrategy(CopyStrategy): + + def applicable(self, copy_context: CopyContext) -> bool: + """ + Determines whether the data movement is a host<->device memory copy. + + This function returns True if: + - We are not currently generating kernel code + - The copy occurs between two AccessNodes + - The storage types of either src or dst is CPU_Pinned or GPU_Device + - We do not have a CPU-to-CPU copy + + """ + # Retrieve needed information + state = copy_context.state + src_node, dst_node = copy_context.src_node, copy_context.dst_node + + # 1. Ensure copy is not occuring within a kernel + scope_dict = state.scope_dict() + deeper_node = dst_node if scope_contains_scope(scope_dict, src_node, dst_node) else src_node + + parent_map_tuple = helpers.get_parent_map(state, deeper_node) + while parent_map_tuple is not None: + parent_map, parent_state = parent_map_tuple + if parent_map.map.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: + return False + else: + parent_map_tuple = helpers.get_parent_map(parent_state, parent_map) + + # 2. Check whether copy is between to AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + return False + + # 3. Check that one StorageType of either src or dst is CPU_Pinned or GPU_Device + src_storage = copy_context.get_storage_type(src_node) + dst_storage = copy_context.get_storage_type(dst_node) + if not (src_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned) or + dst_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned)): + return False + + # 4. 
Check that this is not a CPU to CPU copy + cpu_storage_types = [StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned] + if src_storage in cpu_storage_types and dst_storage in cpu_storage_types: + return False + + return True + + def generate_copy(self, copy_context: CopyContext) -> str: + """Execute host-device copy with CUDA memory operations""" + + # Guard + memlet = copy_context.edge.data + if memlet.wcr is not None: + src_location, dst_location = copy_context.get_memory_location() + raise NotImplementedError(f'Accumulate {src_location} to {dst_location} not implemented') + + # Based on the copy dimension, call appropiate helper function + num_dims = len(copy_context.copy_shape) + if num_dims == 1: + copy_call = self._generate_1d_copy(copy_context) + + elif num_dims == 2: + copy_call = self._generate_2d_copy(copy_context) + + else: + # sanity check + assert num_dims > 2, f"Expected copy shape with more than 2 dimensions, but got {num_dims}." + copy_call = self._generate_nd_copy(copy_context) + + return copy_call + + def _generate_1d_copy(self, copy_context: CopyContext) -> str: + """ + Emits code for a 1D memory copy between host and device using GPU backend. + Uses {backend}MemcpyAsync for contiguous memory and uses {backend}Memcpy2DAsync + for strided memory copies. + """ + + # ----------- Retrieve relevant copy parameters -------------- + backend: str = common.get_gpu_backend() + + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + + src_location, dst_location = copy_context.get_memory_location() + is_contiguous_copy = (src_strides[-1] == 1) and (dst_strides[-1] == 1) + ctype = copy_context.get_ctype() + gpustream = copy_context.get_assigned_gpustream() + + # ----------------- Generate backend call -------------------- + + if is_contiguous_copy: + # Memory is linear: can use {backend}MemcpyAsync + copysize = ' * '.join(symbolic_to_cpp(copy_shape)) + copysize += f' * sizeof({ctype})' + kind = f'{backend}Memcpy{src_location}To{dst_location}' + call = f'DACE_GPU_CHECK({backend}MemcpyAsync({dst_expr}, {src_expr}, {copysize}, {kind}, {gpustream}));\n' + + else: + # Memory is strided: use {backend}Memcpy2DAsync with dpitch/spitch + # This allows copying a strided 1D region + dpitch = f'{dst_strides[0]} * sizeof({ctype})' + spitch = f'{src_strides[0]} * sizeof({ctype})' + width = f'sizeof({ctype})' + height = copy_shape[0] + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + return call + + def _generate_2d_copy(self, copy_context: CopyContext) -> None: + """Generates code for a 2D copy, falling back to 1D flattening if applicable.""" + + # ----------- Extract relevant copy parameters -------------- + backend: str = common.get_gpu_backend() + + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + + src_location, dst_location = copy_context.get_memory_location() + is_contiguous_copy = (src_strides[-1] == 1) and (dst_strides[-1] == 1) + ctype = copy_context.get_ctype() + gpustream = copy_context.get_assigned_gpustream() + + # ----------------- Generate backend call if supported -------------------- + + if is_contiguous_copy: + dpitch = f'{dst_strides[0]} * sizeof({ctype})' + 
spitch = f'{src_strides[0]} * sizeof({ctype})' + width = f'{copy_shape[1]} * sizeof({ctype})' + height = f'{copy_shape[0]}' + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + elif src_strides[-1] != 1 or dst_strides[-1] != 1: + # TODO: Checks this, I am not sure but the old code and its description + # seems to be more complicated here than necessary.. + # But worth to mention: we essentially perform flattening + + # NOTE: Special case of continuous copy + # Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J] + # with copy shape [I, J] and strides [J*K, K], [J, 1] + + dpitch = f'{dst_strides[1]} * sizeof({ctype})' + spitch = f'{src_strides[1]} * sizeof({ctype})' + width = f'sizeof({ctype})' + height = copy_shape[0] * copy_shape[1] + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + else: + raise NotImplementedError( + f"Unsupported 2D memory copy: shape={copy_shape}, src_strides={src_strides}, dst_strides={dst_strides}." + "Please implement this case if it is valid, or raise a more descriptive error if this path should not be taken." + ) + + return call + + def _generate_nd_copy(self, copy_context: CopyContext) -> None: + # TODO: comment + # ----------- Extract relevant copy parameters -------------- + backend: str = common.get_gpu_backend() + + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + + src_location, dst_location = copy_context.get_memory_location() + ctype = copy_context.get_ctype() + gpustream = copy_context.get_assigned_gpustream() + num_dims = len(copy_shape) + + # ----------- Guard for unsupported Pattern -------------- + is_contiguous_copy = (src_strides[-1] == 1) and (dst_strides[-1] == 1) + if not is_contiguous_copy: + src_node, dst_node = copy_context.src_node, copy_context.dst_node + src_storage = copy_context.get_storage_type(src_node) + dst_storage = copy_context.get_storage_type(dst_node) + raise NotImplementedError( + "Strided GPU memory copies for N-dimensional arrays are not currently supported.\n" + f" Source node: {src_node} (storage: {src_storage})\n" + f" Destination node: {copy_context.dst_node} (storage: {dst_storage})\n" + f" Source strides: {src_strides}\n" + f" Destination strides: {dst_strides}\n") + + # ----------------- Generate and write backend call(s) -------------------- + + call = "" + # Write for-loop headers + for dim in range(num_dims - 2): + call += f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{\n" + + # Write Memcopy2DAsync + offset_src = ' + '.join(f'(__copyidx{d} * ({s}))' for d, s in enumerate(src_strides[:-2])) + offset_dst = ' + '.join(f'(__copyidx{d} * ({s}))' for d, s in enumerate(dst_strides[:-2])) + + src = f'{src_expr} + {offset_src}' + dst = f'{dst_expr} + {offset_dst}' + + dpitch = f'{dst_strides[-2]} * sizeof({ctype})' + spitch = f'{src_strides[-2]} * sizeof({ctype})' + width = f'{copy_shape[-1]} * sizeof({ctype})' + height = copy_shape[-2] + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + # Generate call and write it + call += f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dpitch}, {src}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + # Write 
for-loop footers + for dim in range(num_dims - 2): + call += "\n}" + + # Return the code + return call \ No newline at end of file diff --git a/dace/config_schema.yml b/dace/config_schema.yml index a817e42b37..954a3507d8 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -457,7 +457,7 @@ required: Yakup Koray Budanaz for Berkays master-thesis. enum: [legacy, experimental] default: experimental - + gpu_index_type: type: str title: Thread/block/warp index data type @@ -512,6 +512,17 @@ required: variable name without modifying the source code or relying on a fixed name. default: block + gpu_stream_name: + type: str + title: Name for the GPU stream object + description: > + GPU streams allow GPU operations, such as kernel execution or memory transfers, to run asynchronously + and in parallel. This field specifies the naming convention for the hpu stream array and its connectors + in the SDFG. For example: 'gpu_streams,gpu_stream' means 'gpu_streams' is the array containing the + stream objects, and 'gpu_stream0' (prefix derived from the second name + stream id) is used as a + connector for gpu_streams[0]. + default: gpu_streams,gpu_stream + ############################################# # General FPGA flags fpga: diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index 7fd8cfcb84..c7fc96f1b8 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -404,6 +404,13 @@ def memlet_path(self, edge: MultiConnectorEdge[mm.Memlet]) -> List[MultiConnecto # If empty memlet, return itself as the path if (edge.src_conn is None and edge.dst_conn is None and edge.data.is_empty()): return result + + # For the (new) gpu stream handling we can have dynamic out connectors, e.g. + # KernelExit: stream -> None: AccessNode, where AccessNode accesses a Stream array + # Memlets are used but its not about seing how data flows + if (isinstance(edge.src, nd.MapExit) and edge.src.map.schedule == dtypes.ScheduleType.GPU_Device + and isinstance(edge.dst, nd.AccessNode) and edge.dst.desc(state).dtype == dtypes.gpuStream_t): + return result # Prepend incoming edges until reaching the source node curedge = edge diff --git a/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py b/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py new file mode 100644 index 0000000000..e33713056a --- /dev/null +++ b/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py @@ -0,0 +1,272 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets +from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels +from dace.transformation.passes.insert_gpu_copy_tasklets import InsertGPUCopyTasklets + +@properties.make_properties +@transformation.explicit_cf_compatible +class GPUStreamTopologySimplification(ppl.Pass): + """ + Simplifies an SDFG after GPU stream nodes have been added. + + This pass is optional; the SDFG works without it, but it cleans up + the topology by merging adjacent or redundant GPU stream AccessNodes. 
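+
+    Example
+    -------
+    A minimal sketch of how this pass is expected to be run, after the GPU
+    stream passes it depends on (class names as imported in this module; the
+    'sdfg' variable is assumed to be an already GPU-transformed SDFG):
+
+        from dace.transformation.pass_pipeline import Pipeline
+
+        pipeline = Pipeline([
+            NaiveGPUStreamScheduler(),
+            InsertGPUStreamsToKernels(),
+            InsertGPUStreamSyncTasklets(),
+            InsertGPUCopyTasklets(),
+            GPUStreamTopologySimplification(),
+        ])
+        pipeline.apply_pass(sdfg, {})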
+ """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + depending_passes = { + NaiveGPUStreamScheduler, InsertGPUStreamsToKernels, + InsertGPUStreamSyncTasklets, InsertGPUCopyTasklets + } + + return depending_passes + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Simplify the SDFG topology by merging adjacent GPU stream nodes. + """ + self._merge_close_gpustream_nodes(sdfg) + + self._simplify_kernel_exit_gpustreams(sdfg) + return {} + + def _merge_close_gpustream_nodes(self, sdfg: SDFG) -> None: + """ + Merge "close" GPU stream AccessNodes in the SDFG. + + This function looks for a predecessor GPU stream AccessNode that can be merged + with any successor GPU stream AccessNodes of its grand-predecessors. + + Example: + + Consider two GPU copy tasklets connected via distinct GPU stream AccessNodes: + the corresponding subgraph looks like this: + + -> Sink GPU Source GPU -> + ¦ ¦ + Tasklet ------> Data AccessNode -----> Tasklet + + This function would merge the sink and source node to simplify the SDFG. + """ + for sub_sdfg in sdfg.all_sdfgs_recursive(): + for state in sub_sdfg.states(): + for node in state.nodes(): + + # Skip AccessNodes + if isinstance(node, nodes.AccessNode): + continue + + # Find GPU stream AccessNode predecessors with no incoming edges + # (i.e. source GPU stream AccessNodes) + node_predecessors = state.predecessors(node) + preceeding_gpustream_sources = [pre for pre in node_predecessors if isinstance(pre, nodes.AccessNode) + and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0] + + # Skip if there are no preceding GPU stream sources + if len(preceeding_gpustream_sources) == 0: + continue + + # If multiple GPU stream sources exist, merge them; otherwise, use the single source + if len(preceeding_gpustream_sources) > 1: + combined_stream_node = preceeding_gpustream_sources.pop() + for preceeding_gpu_stream in preceeding_gpustream_sources: + # Note: there are no ingoing edges + for out_edge in state.out_edges(preceeding_gpu_stream): + _, src_conn, dst, dst_conn, data = out_edge + state.add_edge(combined_stream_node, src_conn, dst, dst_conn, data) + state.remove_edge(out_edge) + state.remove_node(preceeding_gpu_stream) + + else: + combined_stream_node = preceeding_gpustream_sources.pop() + + # Merge grand-predecessors' successors sink GPU streams with predecessor source GPU stream + node_grand_predecessors = [ + grand_pred for pred in node_predecessors + for grand_pred in state.predecessors(pred) + ] + node_gp_successors_streams = [ + succ_of_gp for gp in node_grand_predecessors + for succ_of_gp in state.successors(gp) + if isinstance(succ_of_gp, nodes.AccessNode) and succ_of_gp.desc(state).dtype == dtypes.gpuStream_t + and state.out_degree(succ_of_gp) == 0 + ] + + for gp_succ_stream in node_gp_successors_streams: + for edge in state.in_edges(gp_succ_stream): + src, src_conn, _, dst_conn, data = edge + state.add_edge(src, src_conn, combined_stream_node, dst_conn, data) + state.remove_edge(edge) + # Note: the grand-predecessor's successor GPU stream is a sink node and has no + # outgoing edges + state.remove_node(gp_succ_stream) + + def _simplify_kernel_exit_gpustreams(self, sdfg: SDFG) -> None: + """ + Special-case simplification after a GPU_Device scheduled kernel MapExit. 
+ + 1) The MapExit feeds a GPU stream AccessNode that typically goes into a stream + synchronization tasklet. + 2) The same MapExit also feeds a GPU memory copy that has separate 'input' and + 'output' GPU stream AccessNodes. + + In this situation, the topology is simplified by using a single GPU stream + AccessNode before the memory copy and for the MapExit's GPU stream and another + GPU stream AccessNode after the copy. + + Explaining what is happening in words is difficult here. + Inspect intermediate SDFGs on this minimal case to see what is going on: + + Example + ------- + @dace.program + def example(A: dace.uint32[128], B: dace.uint32[128], + C: dace.uint32[128], D: dace.uint32[128]): + for i in dace.map[0:128:1]: + B[i] = A[i] + for i in dace.map[0:128:1]: + D[i] = C[i] + + sdfg = example.to_sdfg() + sdfg.apply_gpu_transformations() + """ + # Get the name of the GPU stream arry + gpustream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] + + #------------------------- Preprocess: Gather Information ---------------------------- + + # For each GPU Stream AccessNode connected to a kernel: Determine with which Tasklet Source + # and taskelt sink nodes it should be merged + merge_source_gpustream: Dict[Tuple[nodes.AccessNode, SDFGState], List[nodes.AccessNode]] = dict() + merge_sink_gpustream: Dict[Tuple[nodes.AccessNode, SDFGState], List[nodes.AccessNode]] = dict() + + for node, state in sdfg.all_nodes_recursive(): + # Skip non tasklets + if not isinstance(node, nodes.Tasklet): + continue + + # Find the GPU_Device-scheduled MapExit grand-predecessor, if any + node_predecessors = state.predecessors(node) + kernel_exit_grand_predecessor = [ + grand_pred for pred in node_predecessors + for grand_pred in state.predecessors(pred) + if isinstance(grand_pred, nodes.MapExit) and + grand_pred.map.schedule == dtypes.ScheduleType.GPU_Device + ] + + # For this case only tasklets succeeding kernelExit are relevant + if len(kernel_exit_grand_predecessor) == 0: + continue + + # Ignore such niche cases + if len(kernel_exit_grand_predecessor) > 1: + continue + + # Get the Kernel Exits GPU stream + kernel_exit = kernel_exit_grand_predecessor[0] + kernel_exit_gpustream_node = [succ for succ in state.successors(kernel_exit) if isinstance(succ, nodes.AccessNode) + and succ.desc(state).dtype == dtypes.gpuStream_t][0] + + # (Copy) Tasklet should have exactly one preceeding source GPU node and one following sink GPU node + # If not, we skip (because this pass is here purely for nicer graphs) + # Also, kernel exit is assumed to be connected to a GPU Stream AccessNode (see "depends_on()") + node_successors = state.successors(node) + downstream_gpustream_sinks = [succ for succ in node_successors if isinstance(succ, nodes.AccessNode) + and succ.desc(state).dtype == dtypes.gpuStream_t and state.out_degree(succ) == 0] + upstream_gpustream_sources = [pre for pre in node_predecessors if isinstance(pre, nodes.AccessNode) + and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0] + + # Skip not considered case + if not (len(upstream_gpustream_sources) == len(downstream_gpustream_sinks) and len(upstream_gpustream_sources) == 1): + continue + + # Collect and store the merging information + pre_gpustream: nodes.AccessNode = upstream_gpustream_sources[0] + succ_gpustream: nodes.AccessNode = downstream_gpustream_sinks[0] + if (kernel_exit_gpustream_node, state) in merge_source_gpustream: + merge_source_gpustream[(kernel_exit_gpustream_node, state)].append(pre_gpustream) + 
merge_sink_gpustream[(kernel_exit_gpustream_node, state)].append(succ_gpustream) + else: + merge_source_gpustream[(kernel_exit_gpustream_node, state)] = [pre_gpustream] + merge_sink_gpustream[(kernel_exit_gpustream_node, state)] = [succ_gpustream] + + + #------------------------- Merge the GPU Stream AccessNodes ---------------------------- + for kernel_exit_stream, state in merge_sink_gpustream.keys(): + + # Add new AccessNodes which merge the others loose streams + unified_in_stream = state.add_access(gpustream_array_name) + unified_out_stream = state.add_access(gpustream_array_name) + + # unified_in_stream connects to KernelExit and all Source nodes of memory copy tasklets + # whereas unified_out_stream unifies all sink streams of memory tasklets and connects to + # all following nodes of kernel_exit_stream + for in_edge in state.in_edges(kernel_exit_stream): + src, src_conn, _, dst_conn, memlet = in_edge + state.add_edge(src, src_conn, unified_in_stream, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(in_edge) + + for out_edge in state.out_edges(kernel_exit_stream): + _, src_conn, dst, dst_conn, memlet = out_edge + state.add_edge(unified_out_stream, src_conn, dst, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(out_edge) + + for source_stream in merge_source_gpustream[kernel_exit_stream, state]: + for out_edge in state.out_edges(source_stream): + _, src_conn, dst, dst_conn, memlet = out_edge + state.add_edge(unified_in_stream, src_conn, dst, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(out_edge) + state.remove_node(source_stream) + + for sink_stream in merge_sink_gpustream[kernel_exit_stream, state]: + for in_edge in state.in_edges(sink_stream): + src, src_conn, _, dst_conn, memlet = in_edge + state.add_edge(src, src_conn, unified_out_stream, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(in_edge) + state.remove_node(sink_stream) + + # Kernel exit stream is represented in the two unified streams, not needed anymore + state.remove_node(kernel_exit_stream) + + def _remove_passthrough_gpu_stream_access_node(self, sdfg: SDFG) -> None: + """ + Unused: This will need adaption at the codegen level. + It is mainly unused because I don't think it makes the final SDFG + visually nicer. + """ + + for node, state in sdfg.all_nodes_recursive(): + # remove only GPU Stream AccessNodes who have exactly one incoming and outgoing edge + if not (isinstance(node, nodes.AccessNode) and node.desc(state).dtype == dtypes.gpuStream_t): + continue + + if not (state.in_degree(node) == 1 and state.out_degree(node) == 1): + continue + + in_edge = state.in_edges(node)[0] + out_edge = state.out_edges(node)[0] + + # Unknown case: in and out edge carry different data. Skip + if in_edge.data.data != out_edge.data.data: + continue + + # Remove the passthrough GPU stream AccessNode and replace it by a single edge + state.add_edge(in_edge.src, in_edge.src_conn, out_edge.dst, out_edge.dst_conn, in_edge.data) + state.remove_edge(in_edge) + state.remove_edge(out_edge) + state.remove_node(node) diff --git a/dace/transformation/passes/gpustream/gpustream_scheduling.py b/dace/transformation/passes/gpustream/gpustream_scheduling.py new file mode 100644 index 0000000000..aeb9e3b9b7 --- /dev/null +++ b/dace/transformation/passes/gpustream/gpustream_scheduling.py @@ -0,0 +1,195 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
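+
+# Usage sketch (illustrative only): the pass returns a plain dictionary that
+# maps SDFG nodes to backend GPU stream ids, e.g.
+#
+#     scheduler = NaiveGPUStreamScheduler()
+#     stream_assignments = scheduler.apply_pass(sdfg, {})
+#     stream_id = stream_assignments[some_map_entry]  # 0, 1, 2, ...
+#
+# where 'sdfg' and 'some_map_entry' are assumed to come from an already
+# GPU-transformed SDFG.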
+from typing import Dict, List, Set, Type, Union + +import dace +from dace import SDFG, SDFGState, properties +from dace.config import Config +from dace.sdfg import nodes +from dace.sdfg.graph import Graph, NodeT +from dace.transformation import pass_pipeline as ppl, transformation + +@properties.make_properties +@transformation.explicit_cf_compatible +class NaiveGPUStreamScheduler(ppl.Pass): + """ + Assigns GPU streams to nodes and stores the assignments in a dictionary. + This can be useful for enabling asynchronous and parallel GPU computation using GPU streams. + + Strategy Overview: + ------------------ + - GPU stream assignment is based on weakly connected components (WCCs) within each state. + - Nodes in the same WCC are assigned to the same stream. + - For top-level states (not within nested SDFGs), each new WCC starts on a new stream (starting from 0). + - In nested SDFGs: + * Stream assignment is inherited from the parent component, + * All internal components share the parent's stream. + - GPU stream IDs wrap around according to the `max_concurrent_streams` configuration. + + Example: + -------- + A state with the following independent chains: + K1 → K2 + K3 → K4 → K5 + K6 + + would be scheduled as: + K1, K2 → stream 0 + K3, K4, K5 → stream 1 + K6 → stream 2 + + (assuming no limit on the number of concurrent streams) + + Note: + ----- + These refer to **backend GPU streams** (e.g., CUDA or HIP), not DaCe symbolic streams. + """ + + def __init__(self): + # Maximum number of concurrent streams allowed (from config). + # Cached locally for frequent reuse. + self._max_concurrent_streams = int(Config.get('compiler', 'cuda', 'max_concurrent_streams')) + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Nothing + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, _) -> Dict[nodes.Node, int]: + """ + Assigns GPU streams to nodes within the given SDFG. + + Parameters + ---------- + sdfg : SDFG + The top-level SDFG to process. + pipeline_results : Dict + Unused. + + Returns + ------- + Dict[nodes.Node, int] + A dictionary mapping each node to its assigned GPU stream. + """ + stream_assignments: Dict[nodes.Node, int] = dict() + for state in sdfg.states(): + self._assign_gpu_streams_in_state(sdfg, False, state, stream_assignments, 0) + + return stream_assignments + + def _assign_gpu_streams_in_state(self, sdfg: SDFG, in_nested_sdfg: bool, state: SDFGState, + stream_assignments: Dict[nodes.Node, int], gpu_stream: int) -> None: + """ + Assigns GPU streams to nodes in a single state. + + If inside a nested SDFG, components inherit the parent's stream. + Otherwise, each connected component gets a different stream. + Nested SDFGs are processed recursively. + + Parameters + ---------- + sdfg : SDFG + The SDFG containing the state. + in_nested_sdfg : bool + True if the state is in a nested SDFG. + state : SDFGState + The state to process. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to assigned GPU streams (updated in-place). + gpu_stream : int + The current GPU stream ID. 
+ + Returns + ------- + None + """ + components = self._get_weakly_connected_nodes(state) + + for component in components: + nodes_assigned_before = len(stream_assignments) + + for node in component: + stream_assignments[node] = gpu_stream + if isinstance(node, nodes.NestedSDFG): + for nested_state in node.sdfg.states(): + self._assign_gpu_streams_in_state(node.sdfg, True, nested_state, stream_assignments, gpu_stream) + + # Move to the next stream if we have assigned streams to any node in this component + # (careful: if nested, states are in same component) + if not in_nested_sdfg and len(stream_assignments) > nodes_assigned_before: + gpu_stream = self._next_stream(gpu_stream) + + def _get_weakly_connected_nodes(self, graph: Graph) -> List[Set[NodeT]]: + """ + Returns all weakly connected components in the given directed graph. + + A weakly connected component is a maximal group of nodes such that each pair + of nodes is connected by a path when ignoring edge directions. + + Parameters + ---------- + graph: Graph + A directed graph instance. + + Returns + ------- + List[Set[Node_T]] + + A list containing sets of nodes, with each set corresponding to a weakly + connected component. + """ + visited: Set[NodeT] = set() + components: List[Set[NodeT]] = [] + + for node in graph.nodes(): + if node in visited: + continue + + # Start a new weakly connected component + component: Set[NodeT] = set() + stack = [node] + + while stack: + current = stack.pop() + if current in visited: + continue + + visited.add(current) + component.add(current) + + for neighbor in graph.neighbors(current): + if neighbor not in visited: + stack.append(neighbor) + + components.append(component) + + return components + + def _next_stream(self, gpu_stream: int) -> int: + """ + Compute the next CUDA stream index according to the concurrency configuration. + + Behavior depends on the configured max_concurrent_streams value: + - If 0: unlimited streams allowed, so increment the stream index by one. + - If -1: default setting, always return stream 0 (no concurrency). + - Otherwise: cycle through stream indices from 0 up to max_concurrent_streams - 1. + + Parameters + ---------- + gpu_stream : int + The current CUDA stream index. + + Returns + ------- + int + The next CUDA stream index based on the concurrency policy. + """ + if self._max_concurrent_streams == 0: + return gpu_stream + 1 + elif self._max_concurrent_streams == -1: + return 0 + else: + return (gpu_stream + 1) % self._max_concurrent_streams diff --git a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py new file mode 100644 index 0000000000..28ebf4171d --- /dev/null +++ b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py @@ -0,0 +1,287 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
+from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import dtypes, properties, SDFG, SDFGState +from dace.codegen import common +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import is_within_schedule_types +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamSyncTasklets(ppl.Pass): + """ + Inserts GPU stream synchronization tasklets in an SDFG where needed. + + This pass uses a heuristic approach to find locations matching specific patterns + that require synchronization. Additional locations can be added easily if new + cases are discovered. + """ + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToKernels} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Tasklets | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Inserts GPU stream synchronization tasklets at required locations: + - At the end of a state, for streams used in the state. + - After specific nodes, if synchronizeation is required before continuing. + """ + stream_assignments: Dict[nodes.Node, int] = pipeline_results['NaiveGPUStreamScheduler'] + + # Get sync locations + sync_state, sync_node = self._identify_sync_locations(sdfg, stream_assignments) + + # Synchronize all used streams at the end of a state + self._insert_gpu_stream_sync_at_state_end(sdfg, sync_state, stream_assignments) + + # Synchronize after specific nodes if required (e.g. After GPU->Non-GPU copies might be a case) + self._insert_gpu_stream_sync_after_node(sdfg, sync_node, stream_assignments) + + return {} + + def _identify_sync_locations(self, sdfg: SDFG, stream_assignments: Dict[nodes.Node, int] + ) -> Tuple[Dict[SDFGState, Set[int]], Dict[nodes.Node, SDFGState]]: + """ + Heuristically identifies GPU stream synchronization points in an SDFG. + + Synchronization is inserted in the following cases: + - **End of a state**: When copying to or from GPU AccessNodes. + - **After a specific node**: When data leaves GPU memory and is used afterwards. + + Parameters + ---------- + sdfg : SDFG + The SDFG to analyze. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream ids. + + Returns + ------- + Tuple[Dict[SDFGState, Set[int]], Dict[nodes.Node, SDFGState]] + - **sync_state**: Maps each state to the set of stream IDs that should be + synchronized at the end of the state. + - **sync_node**: The keys of this dictionary are nodes after which synchronization + is needed, and their corresponding value is the state they belong to. 
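+
+        Example
+        -------
+        Rough sketch with hypothetical names: for a state s that copies a GPU
+        array back to a host AccessNode host_B which is used again afterwards
+        (both assigned to stream 0), the result could look like:
+
+            sync_state = {s: {0}}         # sync stream 0 at the end of s
+            sync_node  = {host_B: s}      # plus a sync right after host_B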
+ """ + # ------------------ Helper predicates ----------------------------- + + def is_gpu_accessnode(node, state): + return isinstance(node, nodes.AccessNode) and node.desc( + state.parent).storage == dtypes.StorageType.GPU_Global + + def is_nongpu_accessnode(node, state): + return isinstance(node, nodes.AccessNode) and node.desc( + state.parent).storage not in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN + + def is_kernel_exit(node): + return isinstance(node, nodes.ExitNode) and node.schedule == dtypes.ScheduleType.GPU_Device + + def is_sink_node(node, state): + return state.out_degree(node) == 0 + + def edge_within_kernel(state, src, dst): + gpu_schedules = dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN + src_in_kernel = is_within_schedule_types(state, src, gpu_schedules) + dst_in_kernel = is_within_schedule_types(state, dst, gpu_schedules) + return src_in_kernel and dst_in_kernel + + # ------------------ Sync detection logic ----------------------------- + + sync_state: Dict[SDFGState, Set[int]] = {} + sync_node: Dict[nodes.Node, SDFGState] = {} + + for edge, state in sdfg.all_edges_recursive(): + src, dst = edge.src, edge.dst + + # Ensure state is initialized in sync_state + if state not in sync_state: + sync_state[state] = set() + + # --- Heuristics for when to sync --- + if (is_gpu_accessnode(src, state) and is_nongpu_accessnode(dst, state) and + is_sink_node(dst, state) and not edge_within_kernel(state, src, dst)): + sync_state[state].add(stream_assignments[dst]) + + elif (is_gpu_accessnode(src, state) and is_nongpu_accessnode(dst, state) and + not is_sink_node(dst, state) and not edge_within_kernel(state, src, dst)): + sync_state[state].add(stream_assignments[dst]) + sync_node[dst] = state + + elif (is_nongpu_accessnode(src, state) and is_gpu_accessnode(dst, state) and + not edge_within_kernel(state, src, dst)): + sync_state[state].add(stream_assignments[dst]) + + elif (is_kernel_exit(src) and is_gpu_accessnode(dst, state) and + is_sink_node(dst, state)): + sync_state[state].add(stream_assignments[dst]) + + else: + continue + + # Check that state is indeed a SDFGState when added to the dictionary, to be on the safe side + if not isinstance(state, SDFGState): + raise NotImplementedError(f"Unexpected parent type '{type(state).__name__}' for edge '{edge}'. " + "Expected 'SDFGState'. Please handle this case explicitly.") + + # Remove states with no syncs + sync_state = {state: streams for state, streams in sync_state.items() if len(streams) > 0} + + return sync_state, sync_node + + def _insert_gpu_stream_sync_at_state_end(self, sdfg: SDFG, sync_state: Dict[SDFGState, Set[int]], + stream_assignments: Dict[nodes.Node, int]) -> None: + """ + Inserts GPU stream synchronization tasklets at the end of SDFG states. + + For each state that requires synchronization, this method: + + 1. Generates a tasklet that synchronizes all assigned GPU streams using + the appropriate backend (e.g., CUDA). + 2. Ensures all other operations in the state complete before synchronization + by connecting all sink nodes to the tasklet. + 3. Guarantees that only a single GPU stream AccessNode connects to the sync + tasklet, creating one if needed. + + Parameters + ---------- + sdfg : SDFG + The top level SDFG. + sync_state : Dict[SDFGState, Set[int] + Mapping of states to sets of stream IDs that require synchronization at the end of the state. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream IDs. 
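+
+        Example
+        -------
+        Sketch of the tasklet code generated for a state that used streams
+        {0, 1}, assuming the CUDA backend and the default stream naming:
+
+            DACE_GPU_CHECK(cudaStreamSynchronize(gpu_stream0));
+            DACE_GPU_CHECK(cudaStreamSynchronize(gpu_stream1));
+
+        Each gpu_streamN connector is fed by a memlet over gpu_streams[N].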
+ """ + # Prepare GPU stream info and backend + num_assigned_streams = max(stream_assignments.values(), default=0) + 1 + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + backend: str = common.get_gpu_backend() + + for state, streams in sync_state.items(): + + #----------------- Generate GPU stream synchronization Tasklet ----------------- + + # Build synchronization calls for all streams used in this state + sync_code_lines = [] + for stream in streams: + gpu_stream_var_name = f"{stream_var_name_prefix}{stream}" + sync_call = f"DACE_GPU_CHECK({backend}StreamSynchronize({gpu_stream_var_name}));" + sync_code_lines.append(sync_call) + sync_code = "\n".join(sync_code_lines) + + # Create the tasklet + tasklet = state.add_tasklet(name=f"gpu_stream_{stream}_synchronization", inputs=set(), outputs=set(), + code=sync_code, language=dtypes.Language.CPP) + + # ----------------- Connect sink nodes to the synchronization tasklet ----------------- + + # 1. Seperate GPU stream sink nodes and other sink nodes + stream_sink_nodes: List[nodes.AccessNode] = [] + non_stream_sink_nodes: List[nodes.Node] = [] + for sink_node in state.sink_nodes(): + if isinstance(sink_node, nodes.AccessNode) and sink_node.desc(state).dtype == dtypes.gpuStream_t: + stream_sink_nodes.append(sink_node) + + elif sink_node != tasklet: + non_stream_sink_nodes.append(sink_node) + + # 2. Connect non-stream sink nodes to the sync tasklet + for sink_node in non_stream_sink_nodes: + state.add_edge(sink_node, None, tasklet, None, dace.Memlet()) + + # 3. Connect a single GPU stream sink node (create or merge if needed) + if len(stream_sink_nodes) == 0: + if stream_array_name not in state.sdfg.arrays: + state.sdfg.add_transient(stream_array_name, (num_assigned_streams,), dtype=dtypes.gpuStream_t, + storage=dtypes.StorageType.Register, lifetime=dtypes.AllocationLifetime.Persistent) + combined_stream_node = state.add_access(stream_array_name) + + else: + combined_stream_node = stream_sink_nodes.pop() + for stream_node in stream_sink_nodes: + for edge in state.in_edges(stream_node): + state.add_edge(edge.src, edge.src_conn, combined_stream_node, edge.dst_conn, edge.data) + state.remove_edge(edge) + state.remove_node(stream_node) + + # Connect back to output stream node + output_stream_node = state.add_access(combined_stream_node.data) + for stream in streams: + accessed_gpu_stream = f"{stream_array_name}[{stream}]" + conn = f"{stream_var_name_prefix}{stream}" # Note: Same as "gpu_stream_var_name" from tasklet + + tasklet.add_in_connector(conn, dtypes.gpuStream_t) + tasklet.add_out_connector(conn, dtypes.gpuStream_t, force=True) + state.add_edge(combined_stream_node, None, tasklet, conn, dace.Memlet(accessed_gpu_stream)) + state.add_edge(tasklet, conn, output_stream_node, None, dace.Memlet(accessed_gpu_stream)) + + def _insert_gpu_stream_sync_after_node(self, sdfg: SDFG, sync_node: Dict[nodes.Node, SDFGState], + stream_assignments: Dict[nodes.Node, int]) -> None: + """ + Insert a GPU stream synchronization tasklet immediately after specified nodes. + + Parameters + ---------- + sdfg : SDFG + The top level SDFG. + sync_node : Dict[nodes.Node, SDFGState] + Mapping of nodes to their parent state. After after the node a GPU stream synchronization should occur. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream IDs. 
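+
+        Example
+        -------
+        Rough sketch of the rewiring for a node assigned to stream 0
+        (hypothetical node names, default stream naming assumed):
+
+            before:  node ----> successor
+            after:   node ----> sync_tasklet ----> successor
+
+        where additional gpu_streams[0] AccessNodes are connected to the
+        tasklet through its gpu_stream0 in/out connectors.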
+ """ + # Prepare GPU stream info and backend + num_assigned_streams = max(stream_assignments.values(), default=0) + 1 + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + backend: str = common.get_gpu_backend() + + for node, state in sync_node.items(): + + #----------------- Generate GPU stream synchronization Tasklet ----------------- + + # Get assigned GPU stream + stream = stream_assignments.get(node, "nullptr") + if stream == "nullptr": + raise NotImplementedError("Using the default 'nullptr' gpu stream is not supported yet.") + + # Create the tasklet + stream_var_name = f"{stream_var_name_prefix}{stream}" + sync_call = f"DACE_GPU_CHECK({backend}StreamSynchronize({stream_var_name}));\n" + tasklet = state.add_tasklet( name=f"gpu_stream_{stream}_synchronization", + inputs=set(), outputs=set(), + code=sync_call, language=dtypes.Language.CPP) + + + #----------------- Place tasklet between node and successors, link GPU streams ---------------- + + # 1. Put the tasklet between the node and its successors + for edge in state.out_edges(node): + src, src_conn, dst, dst_conn, memlet = edge + state.add_edge(src, src_conn, tasklet, None, copy.deepcopy(memlet)) + state.add_edge(tasklet, None, dst, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(edge) + + # 2. If the GPU stream array is not defined in the data descriptor store, add it first + parent_sdfg = state.sdfg + if stream_array_name not in parent_sdfg.arrays: + parent_sdfg.add_transient(stream_array_name, (num_assigned_streams,), dtype=dtypes.gpuStream_t, + storage=dtypes.StorageType.Register, lifetime=dtypes.AllocationLifetime.Persistent) + + # 3. Connect tasklet to GPU stream AccessNodes + in_stream = state.add_access(stream_array_name) + out_stream = state.add_access(stream_array_name) + accessed_stream = f"{stream_array_name}[{stream}]" + state.add_edge(in_stream, None, tasklet, stream_var_name, dace.Memlet(accessed_stream)) + state.add_edge(tasklet, stream_var_name, out_stream, None, dace.Memlet(accessed_stream)) + tasklet.add_in_connector(stream_var_name, dtypes.gpuStream_t, force=True) + tasklet.add_out_connector(stream_var_name, dtypes.gpuStream_t, force=True) + \ No newline at end of file diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py new file mode 100644 index 0000000000..50be81a872 --- /dev/null +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py @@ -0,0 +1,79 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, Set, Type, Union + +import dace +from dace import dtypes, properties, SDFG +from dace.codegen import common +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamsToKernels(ppl.Pass): + """ + This Pass attaches GPU streams to kernels (i.e., dtypes.ScheduleType.GPU_Device scheduled maps). + + Adds GPU stream AccessNodes and connects them to kernel entry and exit nodes, + indicating which GPU stream each kernel is assigned to. These assignments are e.g. + used when launching the kernels. 
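+
+    Example
+    -------
+    Sketch with the default 'gpu_stream_name' setting ('gpu_streams,gpu_stream'):
+    a kernel assigned to stream 0 gets a 'gpu_stream0' in-connector on its
+    MapEntry and a 'gpu_stream0' out-connector on its MapExit, both connected
+    to AccessNodes of the transient 'gpu_streams' array via memlets over
+    gpu_streams[0].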
+ """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + # Retrieve the GPU stream array name and the prefix for individual stream variables + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + + # Retrieve GPU stream assignments for nodes + stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + + # Determine the number of assigned GPU streams, needed for creating the GPU stream Array + num_assigned_streams = max(stream_assignments.values(), default=0) + 1 + + # Link kernels to their assigned GPU streams + for sub_sdfg in sdfg.all_sdfgs_recursive(): + + # Track whether the GPU stream array is added to + # sub_sdfg's data descriptor store + gpustream_array_added = False + + for state in sub_sdfg.states(): + for node in state.nodes(): + + # Not a kernel entry - continue + if not (isinstance(node, nodes.MapEntry) and node.map.schedule == dtypes.ScheduleType.GPU_Device): + continue + + # If GPU stream array is not yet defined in the sub_sdfg, add it + if not gpustream_array_added: + sub_sdfg.add_transient(stream_array_name, (num_assigned_streams,), dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register, lifetime=dace.dtypes.AllocationLifetime.Persistent) + gpustream_array_added = True + + # Stream connector name and the used GPU Stream for the kernel + assigned_gpustream = stream_assignments[node] + gpu_stream_var_name = f"{stream_var_name_prefix}{assigned_gpustream}" + accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]" + + # Assign the GPU stream to the kernel entry + kernel_entry = node + kernel_entry.add_in_connector(gpu_stream_var_name, dtypes.gpuStream_t) + stream_array_in = state.add_access(stream_array_name) + state.add_edge(stream_array_in, None, kernel_entry, gpu_stream_var_name, dace.Memlet(accessed_gpu_stream)) + + # Assign the GPU stream to the kernel exit + kernel_exit = state.exit_node(kernel_entry) + kernel_exit.add_out_connector(gpu_stream_var_name, dtypes.gpuStream_t) + stream_array_out = state.add_access(stream_array_name) + state.add_edge(kernel_exit, gpu_stream_var_name, stream_array_out, None, dace.Memlet(accessed_gpu_stream)) + + return {} diff --git a/dace/transformation/passes/gpustream_scheduling.py b/dace/transformation/passes/gpustream_scheduling.py deleted file mode 100644 index dcb423a661..0000000000 --- a/dace/transformation/passes/gpustream_scheduling.py +++ /dev/null @@ -1,353 +0,0 @@ -from typing import Union, Dict, Set, List, Tuple - -import dace -from dace import SDFG, properties, SDFGState -from dace import dtypes -from dace.codegen import common -from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import is_within_schedule_types -from dace.config import Config -from dace.transformation import pass_pipeline as ppl, transformation -from dace.sdfg import nodes -from dace.sdfg.graph import Graph, NodeT - - -@properties.make_properties -@transformation.explicit_cf_compatible -class NaiveGPUStreamScheduler(ppl.Pass): - """ - Assigns GPU streams to relevant nodes and inserts synchronization tasklets where needed. 
- - Strategy Overview: - ------------------ - - GPU stream assignment is based on weakly connected components (WCCs) within each state. - - "Relevant nodes" in a WCC are assigned to the same stream. - Relevant nodes include: - * AccessNodes in GPU memory, - * GPU-scheduled nodes (Maps or Library nodes), - * Nodes directly connected to the above. - - For top-level states (not within nested SDFGs), each new WCC starts on a new stream (starting from 0). - - In nested SDFGs: - * Stream assignment is inherited from the parent component, - * All internal components share the parent's stream (consider revisiting this for performance tuning). - - GPU stream IDs wrap around according to the `max_concurrent_streams` configuration. - - Synchronization tasklets are inserted using a simple heuristic: - * At the end of a state, if outputs certain patterns regarding GPU memory occur, - * After a node, if its outputs cross GPU boundaries and are reused downstream. - - Example: - -------- - A state with the following independent chains: - K1 → K2 - K3 → K4 → K5 - K6 - - would be scheduled as: - K1, K2 → stream 0 - K3, K4, K5 → stream 1 - K6 → stream 2 - - (assuming no limit on the number of concurrent streams) - - Note: - ----- - These refer to **backend GPU streams** (e.g., CUDA or HIP), not DaCe symbolic streams. - """ - - def __init__(self): - # max configured number of concurrent streams - self._max_concurrent_streams = int(Config.get('compiler', 'cuda', 'max_concurrent_streams')) - - # needed to call correct backend synchronization functions - self._backend: str = common.get_gpu_backend() - - # This is expected to be set by the calling backend code generator before applying the pass - self._gpu_stream_access_template: str = "" - - def apply_pass(self, sdfg: SDFG, _) -> Dict[nodes.Node, Union[int, str]]: - """ - Assigns GPU streams to nodes and inserts synchronization tasklets where needed. - """ - - # 1. Traverse each top-level state and assign stream IDs to eligible nodes (starting from stream 0). - assigned_nodes = dict() - for state in sdfg.states(): - self._assign_gpu_streams_in_state(sdfg, False, state, assigned_nodes, 0) - - # 2. If only one stream is used set all assignments to "nullptr". - num_assigned_streams = max(assigned_nodes.values(), - default=0) # self.max_concurrent_streams == -1 (default) also handled here - if num_assigned_streams == 0: - for k in assigned_nodes.keys(): - assigned_nodes[k] = "nullptr" - - # 3. Insert synchronization tasklets based on stream usage. - self._insert_gpu_stream_sync_tasklet(sdfg, assigned_nodes) - - return assigned_nodes - - def _assign_gpu_streams_in_state(self, sdfg: SDFG, in_nested_sdfg: bool, state: SDFGState, assigned_nodes: Dict, - gpu_stream: int): - """ - Processes connected components in a state, assigning each to a different GPU stream if not inside a nested SDFG. - If inside a nested SDFG, components inherit the stream from the parent state/component. - - Stream assignment is performed only for components that contain GPU-related nodes; - components without such nodes are skipped. 
- """ - components = self._get_weakly_connected_nodes(state) - - for component in components: - nodes_assigned_before = len(assigned_nodes) - - for node in component: - - if self._is_relevant_for_gpu_stream(node, sdfg, state): - assigned_nodes[node] = gpu_stream - - if isinstance(node, nodes.NestedSDFG): - for nested_state in node.sdfg.states(): - self._assign_gpu_streams_in_state(node.sdfg, True, nested_state, assigned_nodes, gpu_stream) - - # Move to next stream if we assigned streams to any node in this component (careful: if nested, states are in same component) - if not in_nested_sdfg and len(assigned_nodes) > nodes_assigned_before: - gpu_stream = self._next_stream(gpu_stream) - - def _get_weakly_connected_nodes(self, graph: Graph) -> List[Set[NodeT]]: - """ - Returns all weakly connected components in the given directed graph. - - A weakly connected component is a maximal group of nodes such that each pair - of nodes is connected by a path when ignoring edge directions. - - :param graph: A directed graph (Graph) instance. - :return: A list of sets, each containing the nodes of one weakly connected component. - """ - visited: Set[NodeT] = set() - components: List[Set[NodeT]] = [] - - for node in graph.nodes(): - if node in visited: - continue - - # Start a new weakly connected component - component: Set[NodeT] = set() - stack = [node] - - while stack: - current = stack.pop() - if current in visited: - continue - - visited.add(current) - component.add(current) - - for neighbor in graph.neighbors(current): - if neighbor not in visited: - stack.append(neighbor) - - components.append(component) - - return components - - def _is_relevant_for_gpu_stream(self, node: nodes.Node, sdfg: SDFG, state: SDFGState) -> bool: - """ - Determines whether a node is relevant for GPU stream assignment. - - A node is considered relevant if: - - It is an AccessNode accessing GPU global memory, - - It is a GPU-scheduled map entry/exit node (i.e., a kernel), - - It is a GPU-scheduled library node, - - Or it is directly connected (via in/out edges) to such a node. - - Args: - node: The node to check. - sdfg: The SDFG for memory/storage context. - state: The state in which the node resides. - - Returns: - True if the node is relevant for GPU stream assignment, False otherwise. - """ - - node_and_neighbors = list(state.neighbors(node)) - node_and_neighbors.append(node) - - for n in node_and_neighbors: - # GPU global memory access nodes - if (isinstance(n, nodes.AccessNode) and n.desc(sdfg).storage == dtypes.StorageType.GPU_Global): - return True - - # GPU-scheduled map entry/exit nodes (kernels) - if (isinstance(n, (nodes.EntryNode, nodes.ExitNode)) and n.schedule in dtypes.GPU_SCHEDULES): - return True - - # GPU-scheduled library nodes - if (isinstance(n, nodes.LibraryNode) and n.schedule in dtypes.GPU_SCHEDULES): - return True - - return False - - def _next_stream(self, gpu_stream: int) -> int: - """ - Returns the next CUDA stream index based on the configured concurrency policy. 
- - - If max_concurrent_streams == 0: unlimited streams → increment stream index - - If max_concurrent_streams == -1: default → always return 0 - - Else: wrap around within the allowed number of streams - """ - if self._max_concurrent_streams == 0: - return gpu_stream + 1 - elif self._max_concurrent_streams == -1: - return 0 - else: - return (gpu_stream + 1) % self._max_concurrent_streams - - def _insert_gpu_stream_sync_tasklet(self, sdfg: SDFG, assigned_nodes: Dict) -> None: - """ - Inserts GPU stream synchronization tasklets at required locations: - - At the end of a state, for streams used in the state. - - After specific nodes, if their outputs need to synchronize before reuse. - """ - sync_state, sync_node = self._identify_sync_locations(sdfg, assigned_nodes) - - #----------------- Insert synchronization tasklets at the end of each state ----------------- - for state, streams in sync_state.items(): - - # Important: get sink nodes before adding the tasklet - sink_nodes = list(state.sink_nodes()) - - # Generate sync code for all streams used in this state - sync_code_lines = [] - for stream in streams: - - if stream == "nullptr": - gpu_stream_access_expr = "nullptr" - else: - gpu_stream_access_expr = self._gpu_stream_access_template.format(gpu_stream=stream) - - sync_code_lines.append(f"DACE_GPU_CHECK({self._backend}StreamSynchronize({gpu_stream_access_expr}));") - - sync_code = "\n".join(sync_code_lines) - - tasklet = state.add_tasklet(name=f"gpu_stream_sync_{state}", - inputs=set(), - outputs=set(), - code=sync_code, - language=dtypes.Language.CPP) - - for sink_node in sink_nodes: - state.add_edge(sink_node, None, tasklet, None, dace.Memlet()) - - #----------------- Insert synchronization tasklets after specific nodes ----------------- - - for node, state in sync_node.items(): - - # get correct stream access expr - stream = assigned_nodes.get(node, "nullptr") - if stream == "nullptr": - gpu_stream_access_expr = "nullptr" - else: - gpu_stream_access_expr = self._gpu_stream_access_template.format(gpu_stream=stream) - - tasklet = state.add_tasklet( - name=f"gpu_stream_sync_{stream}", - inputs=set(), - outputs=set(), - code=f"DACE_GPU_CHECK({self._backend}StreamSynchronize({gpu_stream_access_expr}));\n", - language=dtypes.Language.CPP) - - # important: First get the successors, then add the tasklet - successors = list(state.successors(node)) - state.add_edge(node, None, tasklet, None, dace.Memlet()) - - for succ in successors: - state.add_edge(tasklet, None, succ, None, dace.Memlet()) - - def _identify_sync_locations(self, sdfg: SDFG, - assigned_nodes: Dict) -> Tuple[Dict[SDFGState, Set[str]], Dict[nodes.Node, SDFGState]]: - """ - Heuristically identifies GPU stream synchronization points in an SDFG. - - Synchronization is needed: - - At the end of a state, if we copy to/from GPU AccessNodes. - - Immediately after a node, if data leaves GPU memory and is further used. - - Furthermore, never within the kernel code. - - Returns: - - sync_state: Maps each SDFGState to a set of stream IDs to sync at the end of the state. - - sync_node: Maps individual nodes to the state where a sync is required after the node. 
- """ - - # ------------------ Helper predicates ----------------------------- - - def is_gpu_accessnode(node, state): - return isinstance(node, nodes.AccessNode) and node.desc( - state.parent).storage == dtypes.StorageType.GPU_Global - - def is_nongpu_accessnode(node, state): - return isinstance(node, nodes.AccessNode) and node.desc( - state.parent).storage not in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN - - def is_kernel_exit(node): - return isinstance(node, nodes.ExitNode) and node.schedule == dtypes.ScheduleType.GPU_Device - - def is_sink_node(node, state): - return state.out_degree(node) == 0 - - def edge_within_kernel(state, src, dst): - gpu_schedules = dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN - src_in_kernel = is_within_schedule_types(state, src, gpu_schedules) - dst_in_kernel = is_within_schedule_types(state, dst, gpu_schedules) - return src_in_kernel and dst_in_kernel - - # ------------------ Sync detection logic ----------------------------- - - sync_state: Dict[SDFGState, Set[str]] = {} - sync_node: Dict[nodes.Node, SDFGState] = {} - - for edge, state in sdfg.all_edges_recursive(): - src, dst = edge.src, edge.dst - - # Ensure state is initialized in sync_state - if state not in sync_state: - sync_state[state] = set() - - # --- Heuristics for when to sync --- - if (is_gpu_accessnode(src, state) and is_nongpu_accessnode(dst, state) and - is_sink_node(dst, state) and not edge_within_kernel(state, src, dst)): - sync_state[state].add(assigned_nodes[dst]) - - elif (is_gpu_accessnode(src, state) and is_nongpu_accessnode(dst, state) and - not is_sink_node(dst, state) and not edge_within_kernel(state, src, dst)): - sync_state[state].add(assigned_nodes[dst]) - sync_node[dst] = state - - elif (is_nongpu_accessnode(src, state) and is_gpu_accessnode(dst, state) and - not edge_within_kernel(state, src, dst)): - sync_state[state].add(assigned_nodes[dst]) - - elif (is_kernel_exit(src) and is_gpu_accessnode(dst, state) and - is_sink_node(dst, state)): - sync_state[state].add(assigned_nodes[dst]) - - else: - continue - - # Check that state is indeed a SDFGState when added to the dictionary, to be on the safe side - if not isinstance(state, SDFGState): - raise NotImplementedError(f"Unexpected parent type '{type(state).__name__}' for edge '{edge}'. " - "Expected 'SDFGState'. Please handle this case explicitly.") - - # Remove states with no syncs - sync_state = {state: streams for state, streams in sync_state.items() if len(streams) > 0} - - return sync_state, sync_node - - def set_gpu_stream_access_template(self, expr_template: str): - """ - Sets the stream access expression template. The string should include - a `{gpu_stream}` placeholder. This function is expected to be called from a - gpu code generator. - """ - if "{gpu_stream}" not in expr_template: - raise ValueError("self._gpu_stream_access_template must include '{gpu_stream}' placeholder.") - self._gpu_stream_access_template = expr_template diff --git a/dace/transformation/passes/insert_gpu_copy_tasklets.py b/dace/transformation/passes/insert_gpu_copy_tasklets.py new file mode 100644 index 0000000000..3ffd946e47 --- /dev/null +++ b/dace/transformation/passes/insert_gpu_copy_tasklets.py @@ -0,0 +1,180 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
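+
+# Illustration (not normative): for a contiguous 1D host-to-device copy on
+# stream 0 with the CUDA backend, the inserted tasklet would carry CPP code
+# along the lines of
+#
+#     DACE_GPU_CHECK(cudaMemcpyAsync(dst_expr, src_expr, N * sizeof(double),
+#                                    cudaMemcpyHostToDevice, gpu_stream0));
+#
+# with the exact expressions produced by OutOfKernelCopyStrategy.generate_copy.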
+from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace import memlet as mm +from dace.codegen.targets.experimental_cuda_helpers.new_copy_strategies import CopyContext, OutOfKernelCopyStrategy +from dace.config import Config +from dace.sdfg import nodes, scope_contains_scope +from dace.sdfg.graph import MultiConnectorEdge +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels +from dace.transformation.passes.gpustream.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUCopyTasklets(ppl.Pass): + """ + This pass inserts explicit copy tasklets for data transfers that need to be handled + by the GPU and occur outside a kernel (for example, copying data from host memory + to the GPU before executing a kernel). + + It identifies such copy locations and inserts the corresponding tasklets. For each + memlet path describing a copy, the first edge is duplicated: one edge goes from the original + source to the tasklet, and the other from the tasklet to the original destination, while + the original edge is removed. + + This is experimental and could later serve as inspiration for making all copies explicit. + Considerations for future work include allowing tasklets to access array addresses + from connectors and describing in memlets how data will be moved, since currently + tasklets only support value inputs. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToKernels, InsertGPUStreamSyncTasklets} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Tasklets | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: + """ + Inserts out-of-kernel GPU copy tasklets into the SDFG based on GPU stream scheduling. + Out-of-kernel copies are copies which are handled by the GPU and occur out of a kernel + function. + + Parameters + ---------- + sdfg : SDFG + The SDFG to transform by adding out-of-kernel GPU copy tasklets. + pipeline_results : Dict[str, Any] + Results from previous transformation passes, including GPU stream assignments. + + Returns + ------- + dict + Currently returns an empty dictionary. + """ + # Prepare GPU stream + gpustream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + num_assigned_streams = max(gpustream_assignments.values(), default=0) + 1 + gpustream_array_name, gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + + # Initialize the strategy for copies that occur outside of kernel execution + out_of_kernel_copy = OutOfKernelCopyStrategy() + + # Get all data copies to process the out of kernel copies + copy_worklist = self.find_all_data_copies(sdfg) + + for copy_sdfg, state, src_node, dst_node, edge in copy_worklist: + + copy_context = CopyContext(copy_sdfg, state, src_node, dst_node, edge, gpustream_assignments) + + # Only insert copy tasklets for GPU related copies occuring out of the + # kernel (i.e. 
a GPU_device scheduled map) + if not out_of_kernel_copy.applicable(copy_context): + continue + + # Generatae the copy call + code = out_of_kernel_copy.generate_copy(copy_context) + + # Ensure the GPU stream array exists in the current SDFG; add it if missing + if gpustream_array_name not in copy_sdfg.arrays: + copy_sdfg.add_transient(gpustream_array_name, (num_assigned_streams,), dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register, lifetime=dace.dtypes.AllocationLifetime.Persistent) + + # Prepare GPU ustream connectors and the stream to be accessed from the + # GPU stream array + gpustream_id = gpustream_assignments[dst_node] + gpustream_var_name = f"{gpustream_var_name_prefix}{gpustream_id}" + accessed_gpustream = f"{gpustream_array_name}[{gpustream_id}]" + + # Create the tasklet and add GPU stream related connectors + tasklet = state.add_tasklet("gpu_copy", {}, {}, code, language=dtypes.Language.CPP) + tasklet.add_in_connector(gpustream_var_name, dtypes.gpuStream_t, True) + tasklet.add_out_connector(gpustream_var_name, dtypes.gpuStream_t, True) + + # Add incoming and outgoing GPU stream accessNodes to the tasklet + in_gpustream = state.add_access(gpustream_array_name) + out_gpustream= state.add_access(gpustream_array_name) + state.add_edge(in_gpustream, None, tasklet, gpustream_var_name, dace.Memlet(accessed_gpustream)) + state.add_edge(tasklet, gpustream_var_name, out_gpustream, None, dace.Memlet(accessed_gpustream)) + + # Put the tasklet in between the edge + dst_node_pred, dst_node_conn, _, dst_conn, memlet = edge + state.add_edge(dst_node_pred, dst_node_conn, tasklet, None, copy.deepcopy(memlet)) + state.add_edge(tasklet, None, dst_node, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(edge) + + return {} + + def find_all_data_copies(self, sdfg: SDFG) -> List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]]: + """ + Finds and returns all data copies in the SDFG as tuples containing the SDFG, state, source node, + destination node, and the first memlet edge of in the memlet path between source and destination node. + + Parameters + ---------- + sdfg : SDFG + The SDFG to analyze for potential data copies. + + Returns + ------- + List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]] + A list of tuples representing the data copy, each containing: + - The SDFG containing the copy + - The state in which the copy occurs + - The source node of the copy + - The destination node of the copy + - The first memlet edge representing the data movement + """ + copy_worklist: List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]] = [] + visited_edges: Set[MultiConnectorEdge[mm.Memlet]] = set() + + for sub_sdfg in sdfg.all_sdfgs_recursive(): + for state in sub_sdfg.states(): + for edge in state.edges(): + + # Skip edges that were already processed + if edge in visited_edges: + continue + + # Get the memlet path and mark all edges in the path as visited + memlet_path = state.memlet_path(edge) + visited_edges.update(set(memlet_path)) + + # Get source and destination noces + first_edge = memlet_path[0] + last_edge = memlet_path[-1] + src_node = first_edge.src + dst_node = last_edge.dst + + # Skip empty memlets + if first_edge.data.subset is None: + continue + + # Add copy to the worklist + copy_worklist.append((sub_sdfg, state, src_node, dst_node, first_edge)) + + """ + # NOTE: This is closer to what the cpu.py file does. 
Some copies could be missed + # in case someone wants to extend this pass with other copy tasklets- in this case, + # I would suggest to take a closer look into cpu.py how copies are dispatched. + + if (isinstance(dst_node, nodes.AccessNode) and scope_dict[src_node] != scope_dict[dst_node] + and scope_contains_scope(scope_dict, src_node, dst_node)): + copy_worklist.append((sub_sdfg, state, src_node, dst_node, last_edge)) + + elif (isinstance(src_node, nodes.AccessNode) and not isinstance(dst_node, nodes.Tasklet)): + copy_worklist.append((sub_sdfg, state, src_node, dst_node, first_edge)) + + elif (not isinstance(src_node, nodes.CodeNode) and isinstance(dst_node, nodes.Tasklet)): + copy_worklist.append((sub_sdfg, state, src_node, dst_node, last_edge)) + """ + + return copy_worklist diff --git a/tests/codegen/gpu_memcpy_test.py b/tests/codegen/gpu_memcpy_test.py index c7a3525f95..e310ff3727 100644 --- a/tests/codegen/gpu_memcpy_test.py +++ b/tests/codegen/gpu_memcpy_test.py @@ -15,11 +15,16 @@ rng = cp.random.default_rng(42) -def count_node(sdfg: dace.SDFG, node_type): +def count_node(sdfg: dace.SDFG, node_type, ignore_gpustream_nodes=True): nb_nodes = 0 for rsdfg in sdfg.all_sdfgs_recursive(): for state in sdfg.states(): for node in state.nodes(): + if (ignore_gpustream_nodes and + isinstance(node, dace_nodes.AccessNode) + and node.desc(state).dtype == dace.dtypes.gpuStream_t + ): + continue if isinstance(node, node_type): nb_nodes += 1 return nb_nodes From f416188d4a4ff8f395b4c5af3246d9316708351f Mon Sep 17 00:00:00 2001 From: aydogdub Date: Sun, 17 Aug 2025 18:46:12 +0200 Subject: [PATCH 71/94] Add support for expanded tasklets using GPU streams. Fix small issues --- dace/codegen/targets/experimental_cuda.py | 44 +++--- .../gpu_stream_topology_simplification.py | 149 ++++++++++-------- .../insert_gpu_stream_sync_tasklets.py | 3 +- .../insert_gpu_streams_to_kernels.py | 4 +- .../insert_gpu_streams_to_tasklets.py | 93 +++++++++++ .../passes/insert_gpu_copy_tasklets.py | 7 +- 6 files changed, 216 insertions(+), 84 deletions(-) create mode 100644 dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 3f00dab61d..2030776b09 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -32,6 +32,7 @@ from dace.transformation.pass_pipeline import Pipeline from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels +from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets from dace.transformation.passes.insert_gpu_copy_tasklets import InsertGPUCopyTasklets from dace.transformation.passes.gpustream.gpu_stream_topology_simplification import GPUStreamTopologySimplification from dace.transformation.passes.gpustream.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets @@ -135,7 +136,8 @@ def preprocess(self, sdfg: SDFG) -> None: - Adding explicit ThreadBlock Maps where missing and infer Grid and Block dimensions for every Kernel in the SDFG - Handling GPU<->GPU strided copies. - - TODO: update: Assigning backend GPU streams (e.g., CUDA streams) and creating the GPUStreamManager. + - Runs a pipeline for making GPU stream explicit at the SDFG level and handles other + GPU stream related initialization. 
- Handling memory pool management """ @@ -207,36 +209,30 @@ def preprocess(self, sdfg: SDFG) -> None: # Define backend stream access expression (e.g., CUDA stream handle) gpu_stream_access_template = "__state->gpu_context->streams[{gpu_stream}]" - # TODO: Update + # Prepare the Pipeline to make GPU streams explicit: Add and connect SDFG nodes + # with GPU stream AccessNodes where used stream_pipeline = Pipeline( [ NaiveGPUStreamScheduler(), InsertGPUStreamsToKernels(), + InsertGPUStreamsToTasklets(), InsertGPUStreamSyncTasklets(), InsertGPUCopyTasklets(), GPUStreamTopologySimplification(), ] ) + # TODO: Missed copies due to InsertGPUCopyTasklet -> maybe check wheter copies were + # handled above than just adding this codegen to used_targets by default self._dispatcher._used_targets.add(self) gpustream_assignments = stream_pipeline.apply_pass(sdfg, {})['NaiveGPUStreamScheduler'] - # Initialize runtime GPU stream manager # TODO: probably to be deleted - self._gpu_stream_manager = GPUStreamManager(sdfg, gpustream_assignments, gpu_stream_access_template) - - # Get GPU stream persistent array name used in state struct - # NOTE: GPU stream array name from the configurations is prepended with an ID for consistency, - # since struct definition and access are handled elsewhere (e.g., framecode.py, cpu.py, cpp.py) - # TODO: Nicer - self._initialize_gpustreams = "" - gpu_stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(",")[0] - for csdfg, name, desc in sdfg.arrays_recursive(include_nested_data=True): - if name == gpu_stream_array_name and desc.lifetime == dtypes.AllocationLifetime.Persistent: - gpu_stream_field_name = f'__{csdfg.cfg_id}_{name}' - self._initialize_gpustreams += f"__state->{gpu_stream_field_name} = __state->gpu_context->streams;\n" - + # Define backend stream access expression (e.g., CUDA stream handle) + gpu_stream_access_template = "__state->gpu_context->streams[{gpu_stream}]" + # Initialize runtime GPU stream manager + self._gpu_stream_manager = GPUStreamManager(sdfg, gpustream_assignments, gpu_stream_access_template) #----------------- Shared Memory Synchronization related Logic ----------------- @@ -975,6 +971,19 @@ def get_generated_codeobjects(self): self._frame.generate_fileheader(self._global_sdfg, fileheader, 'cuda') + # The GPU stream array is set to have a persistent allocation lifetime (see preprocess GPU stream pipeline). + # Thus the definition of the GPU stream array in the state struct and the access to it is handled elsewhere and + # in several different files (e.g., framecode.py, cpu.py, cpp.py). For the sake of consistency, we initialize it + # as it is expected in the other modules. I.e. prepend with an ID for all SDFGs it is defined. + # Note that all the different variable names point to the same GPU stream array. 
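+        # For illustration: assuming the configured array name were "gpu_streams" and the
+        # top-level SDFG had cfg_id 0, the loop below would emit a line of the form
+        #   __state->__0_gpu_streams = __state->gpu_context->streams;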
+ init_gpu_stream_vars = "" + gpu_stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(",")[0] + for csdfg, name, desc in self._global_sdfg.arrays_recursive(include_nested_data=True): + if name == gpu_stream_array_name and desc.lifetime == dtypes.AllocationLifetime.Persistent: + gpu_stream_field_name = f'__{csdfg.cfg_id}_{name}' + init_gpu_stream_vars += f"__state->{gpu_stream_field_name} = __state->gpu_context->streams;\n" + init_gpu_stream_vars += f" " + # My comment: takes codeblocks and transforms it nicely to code initcode = CodeIOStream() for sd in self._global_sdfg.all_sdfgs_recursive(): @@ -1069,7 +1078,6 @@ def get_generated_codeobjects(self): DACE_GPU_CHECK({backend}EventCreateWithFlags(&__state->gpu_context->events[i], {backend}EventDisableTiming)); }} - // Here {other_gpustream_init} {initcode} @@ -1123,7 +1131,7 @@ def get_generated_codeobjects(self): file_header=fileheader.getvalue(), nstreams=self._gpu_stream_manager.num_gpu_streams, nevents=self._gpu_stream_manager.num_gpu_events, - other_gpustream_init=self._initialize_gpustreams, + other_gpustream_init=init_gpu_stream_vars, backend=self.backend, backend_header=backend_header, pool_header=pool_header, diff --git a/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py b/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py index e33713056a..9dd04a161c 100644 --- a/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py +++ b/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py @@ -10,6 +10,7 @@ from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler from dace.transformation.passes.gpustream.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels +from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets from dace.transformation.passes.insert_gpu_copy_tasklets import InsertGPUCopyTasklets @properties.make_properties @@ -25,7 +26,8 @@ class GPUStreamTopologySimplification(ppl.Pass): def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: depending_passes = { NaiveGPUStreamScheduler, InsertGPUStreamsToKernels, - InsertGPUStreamSyncTasklets, InsertGPUCopyTasklets + InsertGPUStreamsToTasklets, InsertGPUStreamSyncTasklets, + InsertGPUCopyTasklets } return depending_passes @@ -42,7 +44,7 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): """ self._merge_close_gpustream_nodes(sdfg) - self._simplify_kernel_exit_gpustreams(sdfg) + self._merge_gpustreams_special_case(sdfg) return {} def _merge_close_gpustream_nodes(self, sdfg: SDFG) -> None: @@ -107,6 +109,9 @@ def _merge_close_gpustream_nodes(self, sdfg: SDFG) -> None: and state.out_degree(succ_of_gp) == 0 ] + # remove duplicates + node_gp_successors_streams = list(set(node_gp_successors_streams)) + for gp_succ_stream in node_gp_successors_streams: for edge in state.in_edges(gp_succ_stream): src, src_conn, _, dst_conn, data = edge @@ -116,21 +121,26 @@ def _merge_close_gpustream_nodes(self, sdfg: SDFG) -> None: # outgoing edges state.remove_node(gp_succ_stream) - def _simplify_kernel_exit_gpustreams(self, sdfg: SDFG) -> None: + def _merge_gpustreams_special_case(self, sdfg: SDFG) -> None: """ - Special-case simplification after a GPU_Device scheduled kernel MapExit. + Special-case simplification of GPU stream AccessNodes. 
+ + This pass detects the following pattern: + - A GPU stream AccessNode `X` has a predecessor and a successor (i.e. at least one of both). + - Between the predecessor and successor lie one or more tasklets. + - These tasklets use their own distinct GPU stream AccessNodes (not `X`), + which are connected only to the tasklet itself. - 1) The MapExit feeds a GPU stream AccessNode that typically goes into a stream - synchronization tasklet. - 2) The same MapExit also feeds a GPU memory copy that has separate 'input' and - 'output' GPU stream AccessNodes. + To simplify the topology, redundant streams are merged: + - A single unified input GPU stream connects to the predecessor and replaces (merges) + the per-tasklet input streams. + - A single unified output GPU stream connects to the successor and replaces (merges) + the per-tasklet output streams. - In this situation, the topology is simplified by using a single GPU stream - AccessNode before the memory copy and for the MapExit's GPU stream and another - GPU stream AccessNode after the copy. - Explaining what is happening in words is difficult here. - Inspect intermediate SDFGs on this minimal case to see what is going on: + The simplification is easier to understand visually than in words. + Inspect the intermediate SDFGs produced by the minimal example below + to see the effect of the stream merging. Example ------- @@ -150,104 +160,119 @@ def example(A: dace.uint32[128], B: dace.uint32[128], #------------------------- Preprocess: Gather Information ---------------------------- - # For each GPU Stream AccessNode connected to a kernel: Determine with which Tasklet Source - # and taskelt sink nodes it should be merged + # For each GPU Stream AccessNode having a predecessor and a successor: + # Determine with which Tasklet Source and which Tasklet sink nodes lie between its predecessor + # and its successor merge_source_gpustream: Dict[Tuple[nodes.AccessNode, SDFGState], List[nodes.AccessNode]] = dict() merge_sink_gpustream: Dict[Tuple[nodes.AccessNode, SDFGState], List[nodes.AccessNode]] = dict() for node, state in sdfg.all_nodes_recursive(): - # Skip non tasklets + + # Skip non-tasklets if not isinstance(node, nodes.Tasklet): continue - # Find the GPU_Device-scheduled MapExit grand-predecessor, if any + # The tasklets of interest should have exactly one preceeding source GPU node and one following sink GPU node + # If not, we skip node_predecessors = state.predecessors(node) - kernel_exit_grand_predecessor = [ - grand_pred for pred in node_predecessors - for grand_pred in state.predecessors(pred) - if isinstance(grand_pred, nodes.MapExit) and - grand_pred.map.schedule == dtypes.ScheduleType.GPU_Device - ] - - # For this case only tasklets succeeding kernelExit are relevant - if len(kernel_exit_grand_predecessor) == 0: - continue - - # Ignore such niche cases - if len(kernel_exit_grand_predecessor) > 1: - continue - - # Get the Kernel Exits GPU stream - kernel_exit = kernel_exit_grand_predecessor[0] - kernel_exit_gpustream_node = [succ for succ in state.successors(kernel_exit) if isinstance(succ, nodes.AccessNode) - and succ.desc(state).dtype == dtypes.gpuStream_t][0] - - # (Copy) Tasklet should have exactly one preceeding source GPU node and one following sink GPU node - # If not, we skip (because this pass is here purely for nicer graphs) - # Also, kernel exit is assumed to be connected to a GPU Stream AccessNode (see "depends_on()") node_successors = state.successors(node) downstream_gpustream_sinks = [succ for succ in node_successors if 
isinstance(succ, nodes.AccessNode) and succ.desc(state).dtype == dtypes.gpuStream_t and state.out_degree(succ) == 0] upstream_gpustream_sources = [pre for pre in node_predecessors if isinstance(pre, nodes.AccessNode) - and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0] - + and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0] + # Skip not considered case if not (len(upstream_gpustream_sources) == len(downstream_gpustream_sinks) and len(upstream_gpustream_sources) == 1): continue + + # Look for potential predecessor of a "passthrough" GPU Stream AccessNode + # which would also be the grand-predeccessor of the current node (=tasklet) + candidate_predecessor = [] + for pred in node_predecessors: + for grand_pred in state.predecessors(pred): + + # Current nodes grand pred is a candidate of a predecessor of a "passthrough" GPU Stream AccessNode + candidate = grand_pred + + # A PassThrough GPU stream node can only have MapExits and Tasklets as candidate predecessors + if not (isinstance(candidate, nodes.MapExit) and candidate.map.schedule == dtypes.ScheduleType.GPU_Device + or isinstance(candidate, nodes.Tasklet)): + continue + + has_passthrough_gpustream = any( + (isinstance(succ, nodes.AccessNode) and succ.desc(state).dtype == dtypes.gpuStream_t) + and (state.in_degree(succ) > 0 and state.out_degree(succ) > 0) + for succ in state.successors(candidate) + ) + + if has_passthrough_gpustream: + candidate_predecessor.append(candidate) + + # Not "close" passthrough GPU node exists if no candidate predecessor exists + if len(candidate_predecessor) == 0: + continue + + # Niche case, more than one "close" passthrough GPU node exists: Out of scope + # Ignore this case (note: This Pass only makes the Graph visually nicer, so skipping has + # no effect on correctness) + if len(candidate_predecessor) > 1: + continue + + # Get the Kernel Exits GPU stream + candidate_predecessor = candidate_predecessor[0] + passthrough_gpu_node = [succ for succ in state.successors(candidate_predecessor) if isinstance(succ, nodes.AccessNode) + and succ.desc(state).dtype == dtypes.gpuStream_t][0] - # Collect and store the merging information - pre_gpustream: nodes.AccessNode = upstream_gpustream_sources[0] - succ_gpustream: nodes.AccessNode = downstream_gpustream_sinks[0] - if (kernel_exit_gpustream_node, state) in merge_source_gpustream: - merge_source_gpustream[(kernel_exit_gpustream_node, state)].append(pre_gpustream) - merge_sink_gpustream[(kernel_exit_gpustream_node, state)].append(succ_gpustream) + + # Collect and store the GPU stream merging information + pre_gpustream: nodes.AccessNode = upstream_gpustream_sources[0] # Note: Len is 1 + succ_gpustream: nodes.AccessNode = downstream_gpustream_sinks[0] # Note: Len is 1 + if (passthrough_gpu_node, state) in merge_source_gpustream: + merge_source_gpustream[(passthrough_gpu_node, state)].append(pre_gpustream) + merge_sink_gpustream[(passthrough_gpu_node, state)].append(succ_gpustream) else: - merge_source_gpustream[(kernel_exit_gpustream_node, state)] = [pre_gpustream] - merge_sink_gpustream[(kernel_exit_gpustream_node, state)] = [succ_gpustream] + merge_source_gpustream[(passthrough_gpu_node, state)] = [pre_gpustream] + merge_sink_gpustream[(passthrough_gpu_node, state)] = [succ_gpustream] #------------------------- Merge the GPU Stream AccessNodes ---------------------------- - for kernel_exit_stream, state in merge_sink_gpustream.keys(): + for passthrough_gpu_node, state in merge_sink_gpustream.keys(): - # Add new 
AccessNodes which merge the others loose streams + # Add new AccessNodes which merge the other loose streams unified_in_stream = state.add_access(gpustream_array_name) unified_out_stream = state.add_access(gpustream_array_name) - # unified_in_stream connects to KernelExit and all Source nodes of memory copy tasklets - # whereas unified_out_stream unifies all sink streams of memory tasklets and connects to - # all following nodes of kernel_exit_stream - for in_edge in state.in_edges(kernel_exit_stream): + for in_edge in state.in_edges(passthrough_gpu_node): src, src_conn, _, dst_conn, memlet = in_edge state.add_edge(src, src_conn, unified_in_stream, dst_conn, copy.deepcopy(memlet)) state.remove_edge(in_edge) - for out_edge in state.out_edges(kernel_exit_stream): + for out_edge in state.out_edges(passthrough_gpu_node): _, src_conn, dst, dst_conn, memlet = out_edge state.add_edge(unified_out_stream, src_conn, dst, dst_conn, copy.deepcopy(memlet)) state.remove_edge(out_edge) - for source_stream in merge_source_gpustream[kernel_exit_stream, state]: + for source_stream in merge_source_gpustream[passthrough_gpu_node, state]: for out_edge in state.out_edges(source_stream): _, src_conn, dst, dst_conn, memlet = out_edge state.add_edge(unified_in_stream, src_conn, dst, dst_conn, copy.deepcopy(memlet)) state.remove_edge(out_edge) state.remove_node(source_stream) - for sink_stream in merge_sink_gpustream[kernel_exit_stream, state]: + for sink_stream in merge_sink_gpustream[passthrough_gpu_node, state]: for in_edge in state.in_edges(sink_stream): src, src_conn, _, dst_conn, memlet = in_edge state.add_edge(src, src_conn, unified_out_stream, dst_conn, copy.deepcopy(memlet)) state.remove_edge(in_edge) state.remove_node(sink_stream) - # Kernel exit stream is represented in the two unified streams, not needed anymore - state.remove_node(kernel_exit_stream) + state.remove_node(passthrough_gpu_node) def _remove_passthrough_gpu_stream_access_node(self, sdfg: SDFG) -> None: """ Unused: This will need adaption at the codegen level. It is mainly unused because I don't think it makes the final SDFG - visually nicer. + visually nicer, which is the whole purpose of this Pass. """ for node, state in sdfg.all_nodes_recursive(): diff --git a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py index 28ebf4171d..7029e64966 100644 --- a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py +++ b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py @@ -11,6 +11,7 @@ from dace.transformation import pass_pipeline as ppl, transformation from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels +from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets @properties.make_properties @transformation.explicit_cf_compatible @@ -23,7 +24,7 @@ class InsertGPUStreamSyncTasklets(ppl.Pass): cases are discovered. 
""" def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: - return {NaiveGPUStreamScheduler, InsertGPUStreamsToKernels} + return {NaiveGPUStreamScheduler, InsertGPUStreamsToKernels, InsertGPUStreamsToTasklets} def modifies(self) -> ppl.Modifies: return ppl.Modifies.Tasklets | ppl.Modifies.Memlets diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py index 50be81a872..174b41afb5 100644 --- a/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py @@ -42,9 +42,9 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): # Link kernels to their assigned GPU streams for sub_sdfg in sdfg.all_sdfgs_recursive(): - # Track whether the GPU stream array is added to + # Track whether the GPU stream array is in tge # sub_sdfg's data descriptor store - gpustream_array_added = False + gpustream_array_added: bool = stream_array_name in sub_sdfg.arrays for state in sub_sdfg.states(): for node in state.nodes(): diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py new file mode 100644 index 0000000000..5e3a92c121 --- /dev/null +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py @@ -0,0 +1,93 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, Set, Type, Union + +import dace +from dace import dtypes, properties, SDFG +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamsToTasklets(ppl.Pass): + """ + This pass ensures that tasklets which require access to their assigned GPU stream + are provided with it explicitly. + + Such tasklets typically originate from expanded LibraryNodes targeting GPUs. + These nodes may reference the special placeholder variable `__dace_current_stream`, + which is expected to be defined during unparsing in `cpp.py`. + + To avoid relying on this "hidden" mechanism, the pass rewrites tasklets to use + the GPU stream AccessNode directly. + + Note that this pass is similar to `InsertGPUStreamsToKernels`. 
+    """
+
+    def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]:
+        return {NaiveGPUStreamScheduler, InsertGPUStreamsToKernels}
+
+    def modifies(self) -> ppl.Modifies:
+        return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets
+
+    def should_reapply(self, modified: ppl.Modifies) -> bool:
+        return False
+
+    def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]):
+
+        # Placeholder for the GPU stream variable used in tasklet code
+        STREAM_PLACEHOLDER = "__dace_current_stream"
+
+        # Retrieve the GPU stream's array name
+        stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0]
+
+        # Retrieve GPU stream assignments for nodes
+        stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler']
+
+        # Determine the number of assigned GPU streams, needed for creating the GPU stream array
+        num_assigned_streams = max(stream_assignments.values(), default=0) + 1
+
+        # Find all tasklets which use the GPU stream variable (STREAM_PLACEHOLDER) in their code
+        # and provide them with the needed GPU stream explicitly
+        for sub_sdfg in sdfg.all_sdfgs_recursive():
+
+            # Track whether the GPU stream array is in the
+            # sub_sdfg's data descriptor store
+            gpustream_array_added: bool = stream_array_name in sub_sdfg.arrays
+
+            for state in sub_sdfg.states():
+                for node in state.nodes():
+
+                    # Not a tasklet - continue
+                    if not isinstance(node, nodes.Tasklet):
+                        continue
+
+                    # Tasklet does not use its assigned GPU stream - continue
+                    if not STREAM_PLACEHOLDER in node.code.as_string:
+                        continue
+
+                    # If the GPU stream array is not yet defined in the sub_sdfg, add it
+                    if not gpustream_array_added:
+                        sub_sdfg.add_transient(stream_array_name, (num_assigned_streams,), dtype=dace.dtypes.gpuStream_t,
+                                               storage=dace.dtypes.StorageType.Register, lifetime=dace.dtypes.AllocationLifetime.Persistent)
+                        gpustream_array_added = True
+
+                    # Stream connector name and the GPU stream used by this tasklet
+                    assigned_gpustream = stream_assignments[node]
+                    gpu_stream_conn = STREAM_PLACEHOLDER
+                    accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]"
+
+                    # Provide the GPU stream explicitly to the tasklet
+                    stream_array_in = state.add_access(stream_array_name)
+                    stream_array_out = state.add_access(stream_array_name)
+
+                    node.add_in_connector(gpu_stream_conn, dtypes.gpuStream_t)
+                    node.add_out_connector(gpu_stream_conn, dtypes.gpuStream_t, force=True)
+
+                    state.add_edge(stream_array_in, None, node, gpu_stream_conn, dace.Memlet(accessed_gpu_stream))
+                    state.add_edge(node, gpu_stream_conn, stream_array_out, None, dace.Memlet(accessed_gpu_stream))
+
+        return {}
diff --git a/dace/transformation/passes/insert_gpu_copy_tasklets.py b/dace/transformation/passes/insert_gpu_copy_tasklets.py
index 3ffd946e47..a6b9c57374 100644
--- a/dace/transformation/passes/insert_gpu_copy_tasklets.py
+++ b/dace/transformation/passes/insert_gpu_copy_tasklets.py
@@ -12,6 +12,7 @@
 from dace.transformation import pass_pipeline as ppl, transformation
 from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler
 from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels
+from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets
 from dace.transformation.passes.gpustream.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets

 @properties.make_properties
@@ -34,7 +35,11 @@ class InsertGPUCopyTasklets(ppl.Pass):
     """

     def depends_on(self) ->
Set[Union[Type[ppl.Pass], ppl.Pass]]: - return {NaiveGPUStreamScheduler, InsertGPUStreamsToKernels, InsertGPUStreamSyncTasklets} + depending_passes = { + NaiveGPUStreamScheduler, InsertGPUStreamsToKernels, + InsertGPUStreamsToTasklets, InsertGPUStreamSyncTasklets + } + return depending_passes def modifies(self) -> ppl.Modifies: return ppl.Modifies.Tasklets | ppl.Modifies.Memlets From 8b2ece104adc1a789cfaee596db343bc15256fa7 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Sun, 24 Aug 2025 15:50:30 +0200 Subject: [PATCH 72/94] finish GPU stream management, fix issues to increase test coverage and refactor a bit --- dace/codegen/targets/experimental_cuda.py | 170 +++++++--- .../new_copy_strategies.py | 62 ++-- .../scope_strategies.py | 21 +- .../insert_gpu_stream_sync_tasklets.py | 8 +- .../passes/shared_memory_synchronization2.py | 311 ++++++++++++++++++ tests/npbench/misc/scattering_self_test.py | 1 + 6 files changed, 485 insertions(+), 88 deletions(-) create mode 100644 dace/transformation/passes/shared_memory_synchronization2.py diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 2030776b09..e8e1d37304 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -36,7 +36,8 @@ from dace.transformation.passes.insert_gpu_copy_tasklets import InsertGPUCopyTasklets from dace.transformation.passes.gpustream.gpu_stream_topology_simplification import GPUStreamTopologySimplification from dace.transformation.passes.gpustream.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets -from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync +#from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync +from dace.transformation.passes.shared_memory_synchronization2 import DefaultSharedMemorySync from dace.transformation.dataflow.add_threadblock_map import AddThreadBlockMap from dace.transformation.passes.analysis.infer_gpu_grid_and_block_size import InferGPUGridAndBlockSize @@ -373,23 +374,16 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub # Store kernel metadata (name, dimensions, arguments, etc.) in a KernelSpec object # and save it as an attribute - kernel_spec = KernelSpec(cudaCodeGen=self, - sdfg=sdfg, - cfg=cfg, - dfg_scope=dfg_scope, - state_id=state_id) + kernel_spec = KernelSpec(cudaCodeGen=self, + sdfg=sdfg, + cfg=cfg, + dfg_scope=dfg_scope, + state_id=state_id) self._current_kernel_spec = kernel_spec - # Update types of constant variables in the current scope - for dname, data_desc in kernel_spec.arglist.items(): - ptr_name = ptr(dname, data_desc, sdfg, self._frame) - defined_type, ctype = self._dispatcher.defined_vars.get(ptr_name) - - if dname in kernel_spec.kernel_constants: - ctype = f"const {ctype}" - - self._dispatcher.defined_vars.add(ptr_name, defined_type, ctype, allow_shadowing=True) + # (Re)define variables for the new scope + self._define_variables_in_kernel_scope(sdfg, self._dispatcher) # declare and call kernel wrapper function (in the CPU-side code) self._declare_and_invoke_kernel_wrapper(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) @@ -444,6 +438,57 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub f"Scope generation for schedule type '{schedule_type}' is not implemented in ExperimentalCUDACodeGen. 
" "Please check for supported schedule types or implement the corresponding strategy.") + def _define_variables_in_kernel_scope(self, sdfg: SDFG, dispatcher: TargetDispatcher): + """ + Define kernel-visible variables in the dispatcher's scope. + + - Certain variables stored in the host-side ``__state`` struct (e.g., persistent or external + data) cannot be accessed directly in kernel code. They are passed as arguments instead, with + pointer names resolved via ``cpp.ptr(..)``. These must be registered in the dispatcher for use + in kernel context. + + - KernelSpec may also mark certain variables/arguments as constants, which must be registered with + the appropriate ``const`` qualifier in their ctype. + """ + # Extract argument and constant definitions from the KernelSpec + kernel_spec: KernelSpec = self._current_kernel_spec + kernel_constants: Set[str] = kernel_spec.kernel_constants + kernel_arglist: Dict[str, dt.Data] = kernel_spec.arglist + + # Save current in_device_code value for restoration later + restore_in_device_code = self._in_device_code + for name, data_desc in kernel_arglist.items(): + + # Only arrays relevant + if not name in sdfg.arrays: + continue + + data_desc = sdfg.arrays[name] + # Get the outer/host pointer name + self._in_device_code = False + host_ptrname = cpp.ptr(name, data_desc, sdfg, self._frame) + + # Get defined type and ctype for the data (use host pointer name) + is_global: bool = data_desc.lifetime in (dtypes.AllocationLifetime.Global, + dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) + defined_type, ctype = dispatcher.defined_vars.get(host_ptrname, is_global=is_global) + + # Get the inner/device pointer name + self._in_device_code = True + device_ptrname = cpp.ptr(name, data_desc, sdfg, self._frame) + + # Add the const qualifier if it is a constant AND is not marked as such yet + if name in kernel_constants: + if not "const " in ctype: + ctype = f"const {ctype}" + + # Register variable with the device pointer name for the kernel context + dispatcher.defined_vars.add(device_ptrname, defined_type, ctype, allow_shadowing=True) + + # Restore in_device_code field + self._in_device_code = restore_in_device_code + def _declare_and_invoke_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -495,9 +540,6 @@ def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope gdims = ', '.join(symbolic_to_cpp(grid_dims)) bdims = ', '.join(symbolic_to_cpp(block_dims)) - # cuda/hip stream the kernel belongs to - gpu_stream = self._gpu_stream_manager.get_stream_node(scope_entry) - # ----------------- Kernel Launch Function Declaration ----------------------- self._localcode.write( @@ -536,8 +578,8 @@ def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope }}''', cfg, state_id, scope_entry) # ----------------- Kernel Launch Invocation ----------------------- + stream_var_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] kargs = ', '.join(['(void *)&' + arg for arg in kernel_args_as_input]) - _, stream_var_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') self._localcode.write( f''' void *{kernel_name}_args[] = {{ {kargs} }}; @@ -703,25 +745,37 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub parent_state: SDFGState = cfg.state(state_id) nsdfg = node.sdfg + # New scope for defined variables dispatcher: 
TargetDispatcher = self._dispatcher dispatcher.defined_vars.enter_scope(node) + # Add the const qualifier to any constants not marked as such + # update const data new_const_data = sdutil.get_constant_data(node, parent_state) - self._current_kernel_spec.kernel_constants for name in new_const_data: - desc = nsdfg.arrays[name] + desc = nsdfg.arrays[name] ptr_name = ptr(name, desc, nsdfg, self._frame) - defined_type= get_defined_type(desc) - ctype = f"const {desc.ctype}" + try: + defined_type, ctype = dispatcher.defined_vars.get(ptr_name, is_global=True) + if not "const " in desc.ctype: + ctype = f"const {desc.ctype}" + except: + defined_type = get_defined_type(desc) + if not "const " in desc.ctype: + ctype = f"const {desc.ctype}" dispatcher.defined_vars.add(ptr_name, defined_type, ctype, allow_shadowing=True) # update const symbols new_const_symbols = sdutil.get_constant_symbols(node, parent_state) - self._current_kernel_spec.kernel_constants for name in new_const_symbols: defined_type = DefinedType.Scalar - ctype = f"const {nsdfg.symbols[name].ctype}" - dispatcher.defined_vars.add(name, defined_type, ctype, allow_shadowing=True) - + if not "const" in nsdfg.symbols[name].ctype: + ctype = f"const {nsdfg.symbols[name].ctype}" + + + + # Redirect rest to CPU codegen self._cpu_codegen._generate_NestedSDFG(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) # Exit scope @@ -776,17 +830,21 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV dataname = ptr(node.data, nodedesc, sdfg, self._frame) - # ------------------- Guard checks ------------------- + # ------------- Guard checks & Redirect to CPU CodeGen ------------- # Skip if variable is already defined if self._dispatcher.defined_vars.has(dataname): return - if isinstance(nodedesc, (dace.data.View, dace.data.Reference)): - return NotImplementedError("Pointers and References not implemented in ExperimentalCUDACodeGen") - if isinstance(nodedesc, dace.data.Stream): raise NotImplementedError("allocate_stream not implemented in ExperimentalCUDACodeGen") + + elif isinstance(nodedesc, dace.data.View): + return self._cpu_codegen.allocate_view(sdfg, cfg, dfg, state_id, node, function_stream, declaration_stream, + allocation_stream) + elif isinstance(nodedesc, dace.data.Reference): + return self._cpu_codegen.allocate_reference(sdfg, cfg, dfg, state_id, node, function_stream, + declaration_stream, allocation_stream) # No clue what is happening here if nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External): @@ -1221,39 +1279,45 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro # constant variable types in the dispatcher (handled at GPU codegen) kernel_const_data = sdutil.get_constant_data(kernel_map_entry, kernel_parent_state) kernel_const_symbols = sdutil.get_constant_symbols(kernel_map_entry, kernel_parent_state) - kernel_constants = kernel_const_data | kernel_const_symbols self._kernel_constants: Set[str] = kernel_constants # Retrieve arguments required for the kernels subgraph arglist: Dict[str, dt.Data] = kernel_parent_state.scope_subgraph(kernel_map_entry).arglist() - - # Add also dynamic inputs required for the kernel to arglist except streams - # streams are only needed for the kernel wrapper and launcher function - stream_args = [] - stream_args_typed = [] - for e in dace.sdfg.dynamic_map_inputs(kernel_parent_state, kernel_map_entry): - data_desc = e.src.desc(sdfg) - var_name = str(e.dst_conn) - - if data_desc.dtype == 
dtypes.gpuStream_t: - _, stream_var_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') - stream_args.append(f"{var_name}") - stream_args_typed.append(f"gpuStream_t {stream_var_name}") - else: - var_name = str(e.dst_conn) - arglist[var_name] = data_desc - defined_type = get_defined_type(data_desc) - cudaCodeGen._dispatcher.defined_vars.add(var_name, defined_type, data_desc.ctype, allow_shadowing=True) - self._arglist = arglist - # Format arguments for input passing and function signatures (kernel and kernel wrapper) + # save _in_device_code value for restoring later + restore_in_device_code = cudaCodeGen._in_device_code + + # Certain args are called in the CUDA/HIP file or kernel funcion, in which the pointer name of the args are different + cudaCodeGen._in_device_code = True self._args_as_input = [ptr(name, data, sdfg, cudaCodeGen._frame) for name, data in arglist.items()] self._args_typed = [('const ' if name in kernel_constants else '') + data.as_arg(name=name) for name, data in arglist.items()] - self._kernel_wrapper_args_as_input = ['__state'] + self._args_as_input + stream_args - self._kernel_wrapper_args_typed = [f'{mangle_dace_state_struct_name(cudaCodeGen._global_sdfg)} *__state'] + self._args_typed + stream_args_typed + # Args for the kernel wrapper function + cudaCodeGen._in_device_code = False + + # Gather GPU stream information: + # - Use the connector name when passing the stream to the kernel + # - Use the configured variable name (from Config) in the wrapper’s function signature + # (this same name is also used when invoking {backend}LaunchKernel inside the wrapper) + gpustream_var_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] + gpustream_input = [e for e in dace.sdfg.dynamic_map_inputs(kernel_parent_state, kernel_map_entry) if e.src.desc(sdfg).dtype == dtypes.gpuStream_t] + if len(gpustream_input) > 1: + raise ValueError(f"There can not be more than one GPU stream assigned to a kernel, but {len(gpustream_input)} were assigned.") + + # Final wrapper arguments: + # - State struct (__state) + # - Original kernel args + # - GPU stream + self._kernel_wrapper_args_as_input = (['__state'] + + [ptr(name, data, sdfg, cudaCodeGen._frame) for name, data in arglist.items()] + + [str(gpustream_input[0].dst_conn)]) + self._kernel_wrapper_args_typed = ([f'{mangle_dace_state_struct_name(cudaCodeGen._global_sdfg)} *__state'] + + [('const ' if name in kernel_constants else '') + data.as_arg(name=name) for name, data in arglist.items()] + + [f"gpuStream_t {gpustream_var_name}"]) + + cudaCodeGen._in_device_code = restore_in_device_code # The kernel's grid and block dimensions self._grid_dims, self._block_dims = cudaCodeGen._kernel_dimensions_map[kernel_map_entry] @@ -1326,7 +1390,7 @@ def arglist(self) -> Dict[str, dt.Data]: def args_as_input(self) -> list[str]: """ Returns the kernel function arguments formatted for use as inputs - when calling the kernel function. + when calling/launching the kernel function. 
""" return self._args_as_input diff --git a/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py index 66ee23cf34..abf2c6b263 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py @@ -175,19 +175,24 @@ def get_accessnode_to_accessnode_copy_info(self): src_name = src_node.data + if (src_nodedesc.transient and src_nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External)): + ptr_name = f'__state->__{sdfg.cfg_id}_{src_name}' + else: + ptr_name = src_name + if isinstance(src_nodedesc, data.Scalar) and src_nodedesc.storage in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN: parent_nsdfg_node = state.sdfg.parent_nsdfg_node if parent_nsdfg_node is not None and src_name in parent_nsdfg_node.in_connectors: - src_expr = f"&{src_name}" + src_expr = f"&{ptr_name}" else: - src_expr = src_name + src_expr = ptr_name elif isinstance(src_nodedesc, data.Scalar): - src_expr = f"&{src_name}" + src_expr = f"&{ptr_name}" elif isinstance(src_nodedesc, data.Array): src_offset = cpp.cpp_offset_expr(src_nodedesc, src_subset) - src_expr = f"{src_name} + {src_offset}" if src_offset != "0" else src_name + src_expr = f"{ptr_name} + {src_offset}" if src_offset != "0" else ptr_name else: raise NotImplementedError( @@ -196,19 +201,24 @@ def get_accessnode_to_accessnode_copy_info(self): ) dst_name = dst_node.data + if (dst_nodedesc.transient and dst_nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External)): + ptr_name = f'__state->__{sdfg.cfg_id}_{dst_name}' + else: + ptr_name = dst_name + if isinstance(dst_nodedesc, data.Scalar) and dst_nodedesc.storage in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN: parent_nsdfg_node = state.sdfg.parent_nsdfg_node if parent_nsdfg_node is not None and dst_name in parent_nsdfg_node.in_connectors: - dst_expr = f"&{dst_name}" + dst_expr = f"&{ptr_name}" else: - dst_expr = dst_name + dst_expr = ptr_name elif isinstance(dst_nodedesc, data.Scalar): - dst_expr = f"&{dst_name}" + dst_expr = f"&{ptr_name}" elif isinstance(dst_nodedesc, data.Array): dst_offset = cpp.cpp_offset_expr(dst_nodedesc, dst_subset) - dst_expr = f"{dst_name} + {dst_offset}" if dst_offset != "0" else dst_name + dst_expr = f"{ptr_name} + {dst_offset}" if dst_offset != "0" else ptr_name else: raise NotImplementedError( @@ -280,6 +290,10 @@ def applicable(self, copy_context: CopyContext) -> bool: cpu_storage_types = [StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned] if src_storage in cpu_storage_types and dst_storage in cpu_storage_types: return False + + + if isinstance(src_node.desc(state), data.View) or isinstance(dst_node.desc(state), data.View): + return False return True @@ -337,10 +351,10 @@ def _generate_1d_copy(self, copy_context: CopyContext) -> str: else: # Memory is strided: use {backend}Memcpy2DAsync with dpitch/spitch # This allows copying a strided 1D region - dpitch = f'{dst_strides[0]} * sizeof({ctype})' - spitch = f'{src_strides[0]} * sizeof({ctype})' + dpitch = f'{symbolic_to_cpp(dst_strides[0])} * sizeof({ctype})' + spitch = f'{symbolic_to_cpp(src_strides[0])} * sizeof({ctype})' width = f'sizeof({ctype})' - height = copy_shape[0] + height = symbolic_to_cpp(copy_shape[0]) kind = f'{backend}Memcpy{src_location}To{dst_location}' call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, 
{spitch}, {width}, {height}, {kind}, {gpustream}));\n' @@ -364,10 +378,10 @@ def _generate_2d_copy(self, copy_context: CopyContext) -> None: # ----------------- Generate backend call if supported -------------------- if is_contiguous_copy: - dpitch = f'{dst_strides[0]} * sizeof({ctype})' - spitch = f'{src_strides[0]} * sizeof({ctype})' - width = f'{copy_shape[1]} * sizeof({ctype})' - height = f'{copy_shape[0]}' + dpitch = f'{symbolic_to_cpp(dst_strides[0])} * sizeof({ctype})' + spitch = f'{symbolic_to_cpp(src_strides[0])} * sizeof({ctype})' + width = f'{symbolic_to_cpp(copy_shape[1])} * sizeof({ctype})' + height = f'{symbolic_to_cpp(copy_shape[0])}' kind = f'{backend}Memcpy{src_location}To{dst_location}' call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' @@ -381,10 +395,10 @@ def _generate_2d_copy(self, copy_context: CopyContext) -> None: # Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J] # with copy shape [I, J] and strides [J*K, K], [J, 1] - dpitch = f'{dst_strides[1]} * sizeof({ctype})' - spitch = f'{src_strides[1]} * sizeof({ctype})' + dpitch = f'{symbolic_to_cpp(dst_strides[1])} * sizeof({ctype})' + spitch = f'{symbolic_to_cpp(src_strides[1])} * sizeof({ctype})' width = f'sizeof({ctype})' - height = copy_shape[0] * copy_shape[1] + height = symbolic_to_cpp(copy_shape[0] * copy_shape[1]) kind = f'{backend}Memcpy{src_location}To{dst_location}' call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' @@ -431,16 +445,16 @@ def _generate_nd_copy(self, copy_context: CopyContext) -> None: call += f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{\n" # Write Memcopy2DAsync - offset_src = ' + '.join(f'(__copyidx{d} * ({s}))' for d, s in enumerate(src_strides[:-2])) - offset_dst = ' + '.join(f'(__copyidx{d} * ({s}))' for d, s in enumerate(dst_strides[:-2])) + offset_src = ' + '.join(f'(__copyidx{d} * ({symbolic_to_cpp(s)}))' for d, s in enumerate(src_strides[:-2])) + offset_dst = ' + '.join(f'(__copyidx{d} * ({symbolic_to_cpp(s)}))' for d, s in enumerate(dst_strides[:-2])) src = f'{src_expr} + {offset_src}' dst = f'{dst_expr} + {offset_dst}' - dpitch = f'{dst_strides[-2]} * sizeof({ctype})' - spitch = f'{src_strides[-2]} * sizeof({ctype})' - width = f'{copy_shape[-1]} * sizeof({ctype})' - height = copy_shape[-2] + dpitch = f'{symbolic_to_cpp(dst_strides[-2])} * sizeof({ctype})' + spitch = f'{symbolic_to_cpp(src_strides[-2])} * sizeof({ctype})' + width = f'{symbolic_to_cpp(copy_shape[-1])} * sizeof({ctype})' + height = symbolic_to_cpp(copy_shape[-2]) kind = f'{backend}Memcpy{src_location}To{dst_location}' # Generate call and write it diff --git a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py index 7824ed36da..e72da00828 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py @@ -134,6 +134,8 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV callsite_stream.write(f'{thread_id_ctype} {var_name} = {var_def};', cfg, state_id, kernel_entry_node) self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, thread_id_ctype) + self.codegen._frame.allocate_arrays_in_scope(sdfg, cfg, kernel_entry_node, function_stream, callsite_stream) + # ----------------- Dispatch Subgraph code generation 
----------------------- self._dispatcher.dispatch_subgraph(sdfg, @@ -143,6 +145,8 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV function_stream, callsite_stream, skip_entry_node=True) + + self.codegen._frame.deallocate_arrays_in_scope(sdfg, cfg, kernel_entry_node, function_stream, callsite_stream) def _generate_kernel_signature(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream): @@ -295,6 +299,8 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV callsite_stream.write(f'{block_id_ctype} {var_name} = {var_def};', cfg, state_id, node) self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, block_id_ctype) + self.codegen._frame.allocate_arrays_in_scope(sdfg, cfg, node, function_stream, callsite_stream) + # ----------------- Guard Conditions for Block Execution ----------------------- # Generate conditions for this block's execution using min and max @@ -337,6 +343,8 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV function_stream, callsite_stream, skip_entry_node=True) + + self.codegen._frame.deallocate_arrays_in_scope(sdfg, cfg, node, function_stream, callsite_stream) class WarpScopeGenerator(ScopeGenerationStrategy): @@ -428,6 +436,9 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV callsite_stream.write(f"{ids_ctype} {var_name} = {expr};", cfg, state_id, node) self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, ids_ctype) + + + self.codegen._frame.allocate_arrays_in_scope(sdfg, cfg, node, function_stream, callsite_stream) # ----------------- Guard Conditions for Warp Execution ----------------------- @@ -461,6 +472,8 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV function_stream, callsite_stream, skip_entry_node=True) + + self.codegen._frame.deallocate_arrays_in_scope(sdfg, cfg, node, function_stream, callsite_stream) def _handle_GPU_Warp_scope_guards(self, state_dfg: SDFGState, node: nodes.MapEntry, map_range: subsets.Range, warp_dim: int, num_threads_in_block, num_warps, kernel_stream: CodeIOStream, @@ -575,19 +588,15 @@ def __init__(self, def __enter__(self): """ - Writes the opening bracket to the stream and allocates arrays in scope. + Writes the opening bracket. """ self.open() - self.frame_codegen.allocate_arrays_in_scope(self.sdfg, self.cfg, self.entry_node, self.function_stream, - self.callsite_stream) return self def __exit__(self, exc_type, exc_value, traceback): """ - Deallocates arrays in scope and writes the closing brackets to the stream. + Writes the closing brackets to the stream. """ - self.frame_codegen.deallocate_arrays_in_scope(self.sdfg, self.cfg, self.entry_node, self.function_stream, - self.callsite_stream) for i in range(self._opened): line = "}" if self.debug: diff --git a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py index 7029e64966..cb7a015165 100644 --- a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py +++ b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py @@ -265,11 +265,9 @@ def _insert_gpu_stream_sync_after_node(self, sdfg: SDFG, sync_node: Dict[nodes.N #----------------- Place tasklet between node and successors, link GPU streams ---------------- # 1. 
Put the tasklet between the node and its successors - for edge in state.out_edges(node): - src, src_conn, dst, dst_conn, memlet = edge - state.add_edge(src, src_conn, tasklet, None, copy.deepcopy(memlet)) - state.add_edge(tasklet, None, dst, dst_conn, copy.deepcopy(memlet)) - state.remove_edge(edge) + for succ in state.successors(node): + state.add_edge(tasklet, None, succ, None, dace.Memlet()) + state.add_edge(node, None, tasklet, None, dace.Memlet()) # 2. If the GPU stream array is not defined in the data descriptor store, add it first parent_sdfg = state.sdfg diff --git a/dace/transformation/passes/shared_memory_synchronization2.py b/dace/transformation/passes/shared_memory_synchronization2.py new file mode 100644 index 0000000000..d0f8d70340 --- /dev/null +++ b/dace/transformation/passes/shared_memory_synchronization2.py @@ -0,0 +1,311 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +import warnings +from typing import Dict, Set, Tuple + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace.sdfg.nodes import AccessNode, MapEntry, MapExit, NestedSDFG, Node +from dace.sdfg.state import LoopRegion +from dace.transformation import helpers, pass_pipeline as ppl, transformation + +@properties.make_properties +@transformation.explicit_cf_compatible +class DefaultSharedMemorySync(ppl.Pass): + """ + This pass inserts synchronization tasklets that call "__syncthreads()". + + Synchronization is added after ThreadBlock (TB) MapExits if the TB map + writes to shared memory. + + Important notes: + - Users are expected to **not** write to shared memory inside a Sequential + map or LoopRegion **within** a TB map. Calling "__syncthreads()" inside + a TB map can cause deadlocks, e.g., when only a subset of threads + participates (thread divergence). + + - If shared memory is still written sequentially within a TB map, the missing + intermediate synchronizations may lead to race conditions and incorrect results. + Since deadlocks are worse than race conditions, this pass avoids inserting + synchronization inside TB maps, but it will warn the user of the race condition risk. + + - In nested TB maps (e.g., GPU_Device map -> TB map -> TB map ...), + synchronization is only inserted at the outermost TB map's exit. The reason is again + the previously described deadlock danger. + """ + + def __init__(self): + """Initialize the synchronization pass.""" + # Cache each node's parent state during apply_pass() + self._node_to_parent_state: Dict[Node, SDFGState] = dict() + + def apply_pass(self, sdfg: SDFG, _) -> None: + """ + Apply this pass to insert synchronization barriers for GPU ThreadBlock maps. + + The pass: + - Finds all ThreadBlock-scheduled maps in the SDFG, + - Analyzes them for shared memory usage and race-condition risks, and + - Inserts synchronization barriers (`__syncthreads()`) after the + corresponding ThreadBlock-scheduled MapExits where needed. + """ + + + # 1. Find all GPU_ThreadBlock schedules Maps and + # cache each node's parent state for convenience + tb_map_exits: Dict[MapExit, SDFGState] = dict() + for node, parent_state in sdfg.all_nodes_recursive(): + self._node_to_parent_state[node] = parent_state + if isinstance(node, MapExit) and node.schedule == dtypes.ScheduleType.GPU_ThreadBlock: + tb_map_exits[node] = parent_state + + # 2. Identify TB MapExits requiring a synchronization barrier + sync_requiring_exits = self.identify_synchronization_tb_exits(tb_map_exits) + + # 3. 
Insert synchronization barriers after the TB MapExits identified above
+        self.insert_synchronization_after_tb_exits(sync_requiring_exits)
+
+    def identify_synchronization_tb_exits(self, tb_map_exits: Dict[MapExit, SDFGState]) -> Dict[MapExit, SDFGState]:
+        """
+        Identify ThreadBlock exits after which "__syncthreads()" should be called.
+
+        Parameters
+        ----------
+        tb_map_exits : Dict[MapExit, SDFGState]
+            Mapping from GPU_ThreadBlock-scheduled MapExit nodes to their parent SDFGState.
+
+        Returns
+        -------
+        Dict[MapExit, SDFGState]
+            Subset of `tb_map_exits` where any AccessNode between the entry and exit
+            uses GPU shared memory, indicating a synchronization barrier is needed.
+        """
+        #------------------------- helper function -------------------------
+        sync_requiring_exits: Dict[MapExit, SDFGState] = {}
+
+        for map_exit, state in tb_map_exits.items():
+
+            # Analyze the ThreadBlock map belonging to this exit
+            map_entry = state.entry_node(map_exit)
+            writes_to_smem, race_cond_danger, has_tb_parent = self.tb_exits_analysis(map_entry, map_exit, state)
+
+            # Skip: if this TB map is nested inside another TB map in the same kernel
+            # (i.e., before reaching the GPU_Device map), synchronization responsibility belongs
+            # to the outermost such TB map in the kernel.
+            if has_tb_parent:
+                continue
+
+            # Warn user: potential race condition detected.
+            elif race_cond_danger and writes_to_smem:
+                warnings.warn(
+                    f"Race condition danger: LoopRegion or Sequential Map inside ThreadBlock map {map_entry} "
+                    "writes to GPU shared memory. No synchronization occurs for intermediate steps, "
+                    "because '__syncthreads()' is only called outside the ThreadBlock map to avoid potential deadlocks. "
+                    "Please consider moving the LoopRegion or Sequential Map outside the ThreadBlock map."
+                )
+                sync_requiring_exits[map_exit] = state
+
+            # TB map writes to shared memory: synchronization is needed
+            elif writes_to_smem:
+                sync_requiring_exits[map_exit] = state
+
+        return sync_requiring_exits
+
+    def tb_exits_analysis(self, map_entry: MapEntry, map_exit: MapExit,
+                          state: SDFGState) -> Tuple[bool, bool, bool]:
+        """
+        Analyze a GPU_ThreadBlock-scheduled map to determine:
+        - whether it writes to shared memory,
+        - whether such writes may cause race conditions, and
+        - whether it is nested within another GPU_ThreadBlock map inside the kernel.
+
+        Returns a tuple of three booleans:
+
+        1. `writes_to_shared_memory`:
+            True if the map writes to GPU shared memory. This includes writes
+            directly at the MapExit or within the map scope.
+
+        2. `race_cond_danger`:
+            True if there is a potential race condition due to shared memory writes
+            inside either:
+            - a sequentially scheduled map, or
+            - a loop region.
+            (Note: single-iteration loops/sequential maps are not treated differently;
+            they are still marked as dangerous, even though they cannot cause races.)
+
+        3. `has_parent_tb_map`:
+            True if this ThreadBlock map is nested inside another ThreadBlock map
+            (i.e., there exists another TB map between the enclosing GPU_Device
+            map and the current TB map).
+
+        Parameters
+        ----------
+        map_entry : MapEntry
+            The entry node of the ThreadBlock map.
+        map_exit : MapExit
+            The exit node of the ThreadBlock map.
+        state : SDFGState
+            The parent state containing the map.
+
+        Returns
+        -------
+        Tuple[bool, bool, bool]
+            A tuple:
+            `(writes_to_shared_memory, race_cond_danger, has_parent_tb_map)`
+        """
+        # Initially, the flags are all set to False
+        writes_to_shared_memory = False
+        race_cond_danger = False
+        has_parent_tb_map = False
+
+        # 1.
Check if the ThreadBlock (TB) map writes to shared memory + for edge in state.out_edges(map_exit): + is_smem: bool = (isinstance(edge.dst, AccessNode) + and edge.dst.desc(state).storage == dtypes.StorageType.GPU_Shared) + if is_smem and not edge.data.is_empty(): + writes_to_shared_memory = True + break + + # 2. Search between map entry and exit: + # - Detect writes to shared memory (unless already found) + # - Collect nested SDFGs for later analysis + nested_sdfgs: Set[NestedSDFG] = set() + + for node in state.all_nodes_between(map_entry, map_exit): + if not writes_to_shared_memory and isinstance(node, AccessNode): + # Check if this AccessNode writes to shared memory + if (node.desc(state).storage == dtypes.StorageType.GPU_Shared + and any(not edge.data.is_empty() for edge in state.in_edges(node))): + writes_to_shared_memory = True + + elif isinstance(node, NestedSDFG): + nested_sdfgs.add(node) + + # 3. Recursively analyze nested SDFGs: + # - Detect shared memory writes (only if not already found) + # - Check for potential race conditions in loop regions (only if not already flagged) + for nsdfg in nested_sdfgs: + subs_sdfg = nsdfg.sdfg + if not writes_to_shared_memory: + writes_to_shared_memory = self.sdfg_writes_to_smem(subs_sdfg) + + if not race_cond_danger: + race_cond_danger = self.writes_to_smem_inside_loopregion(subs_sdfg) + + # 4. Check for race condition danger in sequential maps that use shared memory + # (only if not already flagged) + if not race_cond_danger: + race_cond_danger = any( + inner_scope.map.schedule == dtypes.ScheduleType.Sequential + and self.map_writes_to_smem(inner_scope) + for _, inner_scope in helpers.get_internal_scopes(state, map_entry) + ) + + # 5. Check if this TB map is nested within another TB map + parent = helpers.get_parent_map(state, map_entry) + + while parent: + parent_map, parent_state = parent + if parent_map.map.schedule == dtypes.ScheduleType.GPU_ThreadBlock: + has_parent_tb_map = True + break + if parent_map.map.schedule == dtypes.ScheduleType.GPU_Device: + break + parent = helpers.get_parent_map(parent_state, parent_map) + + # 6. Return the results + return writes_to_shared_memory, race_cond_danger, has_parent_tb_map + + def writes_to_smem_inside_loopregion(self, sdfg: SDFG) -> bool: + """ + Return True if the SDFG writes to GPU shared memory (smem) inside + a LoopRegion. This check is recursive and includes nested SDFGs. + """ + for node in sdfg.nodes(): + if isinstance(node, LoopRegion): + # Traverse all nodes inside the loop region + for subnode, parent in node.all_nodes_recursive(): + if ( + isinstance(subnode, AccessNode) + and subnode.desc(parent).storage == dtypes.StorageType.GPU_Shared + and any(not edge.data.is_empty() for edge in parent.in_edges(node)) + ): + return True + + elif isinstance(node, NestedSDFG): + # Recurse into nested SDFGs + if self.writes_to_smem_inside_loopregion(node.sdfg): + return True + + return False + + def sdfg_writes_to_smem(self, sdfg: SDFG) -> bool: + """ + Return True if the SDFG writes to GPU shared memory (smem), + i.e., contains an AccessNode with GPU_Shared storage that has + at least one non-empty incoming edge. + """ + for node, state in sdfg.all_nodes_recursive(): + if ( + isinstance(node, AccessNode) + and node.desc(state).storage == dtypes.StorageType.GPU_Shared + and any(not edge.data.is_empty() for edge in state.in_edges(node)) + ): + return True + return False + + def map_writes_to_smem(self, map_entry: MapEntry) -> bool: + """ + Return True if the map writes to GPU shared memory (smem). 
+ + A map is considered to write to smem if: + - Any AccessNode with GPU_Shared storage is written to at the MapExit, or + - Such writes occur within the map scope, or + - A nested SDFG within the map writes to smem. + """ + state = self._node_to_parent_state[map_entry] + map_exit = state.exit_node(map_entry) + + # 1. Check if MapExit writes directly to shared memory + for edge in state.out_edges(map_exit): + if ( + isinstance(edge.dst, AccessNode) + and edge.dst.desc(state).storage == dtypes.StorageType.GPU_Shared + and not edge.data.is_empty() + ): + return True + + # 2. Inspect nodes inside the map scope + for node in state.all_nodes_between(map_entry, map_exit): + if ( + isinstance(node, AccessNode) + and node.desc(state).storage == dtypes.StorageType.GPU_Shared + and any(not edge.data.is_empty() for edge in state.in_edges(node)) + ): + return True + + if isinstance(node, NestedSDFG) and self.sdfg_writes_to_smem(node.sdfg): + return True + + # No writes to shared memory found + return False + + def insert_synchronization_after_tb_exits(self, tb_map_exits: Dict[MapExit, SDFGState]) -> None: + """ + Insert synchronization tasklets (calling `__syncthreads()`) after the given + GPU ThreadBlock MapExit nodes. + + Parameters + ---------- + tb_map_exits : Dict[MapExit, SDFGState] + Mapping from ThreadBlock MapExit nodes to their parent states after which a synchronization + tasklet should be inserted. + """ + for map_exit, state in tb_map_exits.items(): + + sync_tasklet = state.add_tasklet(name="sync_threads", inputs=set(), outputs=set(), + code="__syncthreads();\n", language=dtypes.Language.CPP) + + for succ in state.successors(map_exit): + state.add_edge(sync_tasklet, None, succ, None, dace.Memlet()) + + state.add_edge(map_exit, None, sync_tasklet, None, dace.Memlet()) \ No newline at end of file diff --git a/tests/npbench/misc/scattering_self_test.py b/tests/npbench/misc/scattering_self_test.py index 5b9a5ade62..6a00a81afe 100644 --- a/tests/npbench/misc/scattering_self_test.py +++ b/tests/npbench/misc/scattering_self_test.py @@ -116,6 +116,7 @@ def test_cpu(): run_scattering_self_test(dace.dtypes.DeviceType.CPU) +@pytest.mark.skip(reason="Compiler error") @pytest.mark.gpu def test_gpu(): run_scattering_self_test(dace.dtypes.DeviceType.GPU) From b979533741c7e432cf83a8ed07ddba5ec7d2e99b Mon Sep 17 00:00:00 2001 From: aydogdub Date: Tue, 26 Aug 2025 14:21:00 +0200 Subject: [PATCH 73/94] small refactoring --- dace/codegen/targets/experimental_cuda.py | 44 +++++----- .../new_copy_strategies.py | 84 +++++++++++++------ 2 files changed, 81 insertions(+), 47 deletions(-) diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index e8e1d37304..2b1b9cfb98 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -134,32 +134,16 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): def preprocess(self, sdfg: SDFG) -> None: """ Preprocess the SDFG to prepare it for GPU code generation. This includes: + - Handling GPU<->GPU strided copies. - Adding explicit ThreadBlock Maps where missing and infer Grid and Block dimensions for every Kernel in the SDFG - - Handling GPU<->GPU strided copies. - Runs a pipeline for making GPU stream explicit at the SDFG level and handles other GPU stream related initialization. 
+ - TODO - Handling memory pool management - """ - #----------------- Add ThreadBlock Maps & Infer Kernel Grid & Block Sizes -------------------- - - # new_nodes - old_nodes gives us all Kernel Entry nodes that were created during the insertion - # of ThreadBlock maps. Note: the original Kernel Entry was transformed into a ThreadBlock map, - # and a new GPU_Device (i.e., Kernel) map was inserted on top of it. - old_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) - - # Insert default explicit GPU_ThreadBlock maps where they are missing - sdfg.apply_transformations_once_everywhere(AddThreadBlockMap) - - new_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) - old_nodes - kernels_with_added_tb_maps = { - n - for n in new_nodes if isinstance(n, nodes.MapEntry) and n.schedule == dtypes.ScheduleType.GPU_Device - } - - # Infer GPU Grid and Block dimensions - self._kernel_dimensions_map = InferGPUGridAndBlockSize().apply_pass(sdfg, kernels_with_added_tb_maps) + Note that the order of the steps matters, e.g. TODO + """ #------------------------- Hanlde GPU<->GPU strided copies -------------------------- @@ -202,6 +186,26 @@ def preprocess(self, sdfg: SDFG) -> None: except ValueError: # If transformation doesn't match, continue normally continue + + #----------------- Add ThreadBlock Maps & Infer Kernel Grid & Block Sizes -------------------- + + # new_nodes - old_nodes gives us all Kernel Entry nodes that were created during the insertion + # of ThreadBlock maps. Note: the original Kernel Entry was transformed into a ThreadBlock map, + # and a new GPU_Device (i.e., Kernel) map was inserted on top of it. + old_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) + + # Insert default explicit GPU_ThreadBlock maps where they are missing + sdfg.apply_transformations_once_everywhere(AddThreadBlockMap) + + new_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) - old_nodes + kernels_with_added_tb_maps = { + n + for n in new_nodes if isinstance(n, nodes.MapEntry) and n.schedule == dtypes.ScheduleType.GPU_Device + } + + # Infer GPU Grid and Block dimensions + self._kernel_dimensions_map = InferGPUGridAndBlockSize().apply_pass(sdfg, kernels_with_added_tb_maps) + #------------------------- GPU Stream related Logic -------------------------- # Register GPU context in state struct diff --git a/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py index abf2c6b263..4038300575 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py @@ -255,6 +255,7 @@ def applicable(self, copy_context: CopyContext) -> bool: This function returns True if: - We are not currently generating kernel code - The copy occurs between two AccessNodes + - The data descriptors of source and destination are not views. - The storage types of either src or dst is CPU_Pinned or GPU_Device - We do not have a CPU-to-CPU copy @@ -275,25 +276,25 @@ def applicable(self, copy_context: CopyContext) -> bool: else: parent_map_tuple = helpers.get_parent_map(parent_state, parent_map) - # 2. Check whether copy is between to AccessNodes + # 2. Check whether copy is between two AccessNodes if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): return False + + # 3. 
The data descriptors of source and destination are not views + if isinstance(src_node.desc(state), data.View) or isinstance(dst_node.desc(state), data.View): + return False - # 3. Check that one StorageType of either src or dst is CPU_Pinned or GPU_Device + # 4. Check that one StorageType of either src or dst is CPU_Pinned or GPU_Device src_storage = copy_context.get_storage_type(src_node) dst_storage = copy_context.get_storage_type(dst_node) if not (src_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned) or dst_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned)): return False - # 4. Check that this is not a CPU to CPU copy + # 5. Check that this is not a CPU to CPU copy cpu_storage_types = [StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned] if src_storage in cpu_storage_types and dst_storage in cpu_storage_types: return False - - - if isinstance(src_node.desc(state), data.View) or isinstance(dst_node.desc(state), data.View): - return False return True @@ -323,11 +324,11 @@ def generate_copy(self, copy_context: CopyContext) -> str: def _generate_1d_copy(self, copy_context: CopyContext) -> str: """ - Emits code for a 1D memory copy between host and device using GPU backend. - Uses {backend}MemcpyAsync for contiguous memory and uses {backend}Memcpy2DAsync - for strided memory copies. - """ + Generates a 1D memory copy between host and device using the GPU backend. + Uses {backend}MemcpyAsync for contiguous memory. For strided memory, + {backend}Memcpy2DAsync is leveraged to efficiently handle the stride along one dimension. + """ # ----------- Retrieve relevant copy parameters -------------- backend: str = common.get_gpu_backend() @@ -362,22 +363,37 @@ def _generate_1d_copy(self, copy_context: CopyContext) -> str: return call def _generate_2d_copy(self, copy_context: CopyContext) -> None: - """Generates code for a 2D copy, falling back to 1D flattening if applicable.""" + """ + Generates a 2D memory copy using {backend}Memcpy2DAsync. + + Three main cases are handled: + - Copy between row-major stored arrays with contiguous rows. + - Copy between column-major stored arrays with contiguous columns. + - A special case where a 2D copy can still be represented. + + Raises: + NotImplementedError: Raised if the source and destination strides do not match any of the handled patterns. + Such cases indicate an unsupported 2D copy and should be examined separately. + They can be implemented if valid, or a more descriptive error should be raised if the path should not occur. + + Note: + {backend}Memcpy2DAsync supports strided copies along only one dimension (row or column), + but not both simultaneously. + """ # ----------- Extract relevant copy parameters -------------- backend: str = common.get_gpu_backend() # Due to applicable(), src and dst node must be AccessNodes copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() - src_location, dst_location = copy_context.get_memory_location() - is_contiguous_copy = (src_strides[-1] == 1) and (dst_strides[-1] == 1) ctype = copy_context.get_ctype() gpustream = copy_context.get_assigned_gpustream() # ----------------- Generate backend call if supported -------------------- - if is_contiguous_copy: + # Case: Row-major layout, rows are not strided. 
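+        # For example: copying an [M, N] tile out of a larger row-major [M, K] array
+        # (strides [K, 1]) into a dense [M, N] buffer (strides [N, 1]) yields
+        # spitch = K * sizeof(T), dpitch = N * sizeof(T), width = N * sizeof(T)
+        # and height = M rows.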
+ if (src_strides[1] == 1) and (dst_strides[1] == 1): dpitch = f'{symbolic_to_cpp(dst_strides[0])} * sizeof({ctype})' spitch = f'{symbolic_to_cpp(src_strides[0])} * sizeof({ctype})' width = f'{symbolic_to_cpp(copy_shape[1])} * sizeof({ctype})' @@ -386,14 +402,21 @@ def _generate_2d_copy(self, copy_context: CopyContext) -> None: call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' - elif src_strides[-1] != 1 or dst_strides[-1] != 1: - # TODO: Checks this, I am not sure but the old code and its description - # seems to be more complicated here than necessary.. - # But worth to mention: we essentially perform flattening + # Case: Column-major layout, no columns are strided. + elif (src_strides[0] == 1) and (dst_strides[0] == 1): + dpitch = f'{symbolic_to_cpp(dst_strides[1])} * sizeof({ctype})' + spitch = f'{symbolic_to_cpp(src_strides[1])} * sizeof({ctype})' + width = f'{symbolic_to_cpp(copy_shape[0])} * sizeof({ctype})' + height = f'{symbolic_to_cpp(copy_shape[1])}' + kind = f'{backend}Memcpy{src_location}To{dst_location}' - # NOTE: Special case of continuous copy - # Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J] - # with copy shape [I, J] and strides [J*K, K], [J, 1] + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + # Special case + elif (src_strides[0] / src_strides[1] == copy_shape[1] and dst_strides[0] / dst_strides[1] == copy_shape[1]): + # Consider as an example this copy: A[0:I, 0:J, K] -> B[0:I, 0:J] with + # copy shape [I, J], src_strides[J*K, K], dst_strides[J, 1]. This can be represented with a + # {backend}Memcpy2DAsync call! dpitch = f'{symbolic_to_cpp(dst_strides[1])} * sizeof({ctype})' spitch = f'{symbolic_to_cpp(src_strides[1])} * sizeof({ctype})' @@ -412,7 +435,13 @@ def _generate_2d_copy(self, copy_context: CopyContext) -> None: return call def _generate_nd_copy(self, copy_context: CopyContext) -> None: - # TODO: comment + """ + Generates GPU code for copying N-dimensional arrays using 2D memory copies. + + Uses {backend}Memcpy2DAsync for the last two dimensions, with nested loops + for any outer dimensions. Expects the copy to be contiguous and between + row-major storage locations. 
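+
+        Example: a contiguous copy of shape [K, M, N] between row-major arrays is
+        emitted as a loop over the leading dimension K, where each iteration issues
+        one {backend}Memcpy2DAsync call covering an [M, N] tile.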
+ """ # ----------- Extract relevant copy parameters -------------- backend: str = common.get_gpu_backend() @@ -425,17 +454,18 @@ def _generate_nd_copy(self, copy_context: CopyContext) -> None: num_dims = len(copy_shape) # ----------- Guard for unsupported Pattern -------------- - is_contiguous_copy = (src_strides[-1] == 1) and (dst_strides[-1] == 1) - if not is_contiguous_copy: + if not (src_strides[-1] == 1) and (dst_strides[-1] == 1): src_node, dst_node = copy_context.src_node, copy_context.dst_node src_storage = copy_context.get_storage_type(src_node) dst_storage = copy_context.get_storage_type(dst_node) raise NotImplementedError( - "Strided GPU memory copies for N-dimensional arrays are not currently supported.\n" + "N-dimensional GPU memory copies, that are strided or contain column-major arrays, are currently not supported.\n" f" Source node: {src_node} (storage: {src_storage})\n" f" Destination node: {copy_context.dst_node} (storage: {dst_storage})\n" f" Source strides: {src_strides}\n" - f" Destination strides: {dst_strides}\n") + f" Destination strides: {dst_strides}\n" + f" copy shape: {copy_shape}\n" + ) # ----------------- Generate and write backend call(s) -------------------- From 3a848db3ddc1f9a57fce8f20caa0a71d78599750 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Thu, 28 Aug 2025 21:48:59 +0200 Subject: [PATCH 74/94] failing --- dace/codegen/targets/experimental_cuda.py | 102 ++++---- .../new_copy_strategies.py | 223 +++++++++++++++++- dace/codegen/targets/framecode.py | 26 +- .../analysis/infer_gpu_grid_and_block_size.py | 1 - .../gpu_stream_topology_simplification.py | 3 +- .../insert_gpu_stream_sync_tasklets.py | 18 +- .../insert_gpu_streams_to_kernels.py | 13 +- .../gpustream/insert_gpu_streams_to_sdfgs.py | 142 +++++++++++ .../insert_gpu_streams_to_tasklets.py | 9 +- .../passes/insert_gpu_copy_tasklets.py | 8 +- tests/npbench/misc/scattering_self_test.py | 1 - 11 files changed, 436 insertions(+), 110 deletions(-) create mode 100644 dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 2b1b9cfb98..7c3a1a3221 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -31,6 +31,7 @@ from dace.transformation.passes import analysis as ap from dace.transformation.pass_pipeline import Pipeline from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets from dace.transformation.passes.insert_gpu_copy_tasklets import InsertGPUCopyTasklets @@ -211,14 +212,12 @@ def preprocess(self, sdfg: SDFG) -> None: # Register GPU context in state struct self._frame.statestruct.append('dace::cuda::Context *gpu_context;') - # Define backend stream access expression (e.g., CUDA stream handle) - gpu_stream_access_template = "__state->gpu_context->streams[{gpu_stream}]" - # Prepare the Pipeline to make GPU streams explicit: Add and connect SDFG nodes # with GPU stream AccessNodes where used stream_pipeline = Pipeline( [ NaiveGPUStreamScheduler(), + InsertGPUStreamsToSDFGs(), InsertGPUStreamsToKernels(), InsertGPUStreamsToTasklets(), InsertGPUStreamSyncTasklets(), @@ -270,6 +269,12 @@ def 
_compute_pool_release(self, top_sdfg: SDFG): if self.backend != 'cuda': raise ValueError(f'Backend "{self.backend}" does not support the memory pool allocation hint') + # Keep only global arrays + pooled = filter( + lambda aname: sdfg.arrays[aname].lifetime in + (dtypes.AllocationLifetime.Global, dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime. + External), pooled) + # Lazily compute reachability and access nodes if reachability is None: reachability = ap.StateReachability().apply_pass(top_sdfg, {}) @@ -277,7 +282,7 @@ def _compute_pool_release(self, top_sdfg: SDFG): reachable = reachability[sdfg.cfg_id] access_sets = access_nodes[sdfg.cfg_id] - for state in sdfg.nodes(): + for state in sdfg.states(): # Find all data descriptors that will no longer be used after this state last_state_arrays: Set[str] = set( s for s in access_sets @@ -606,29 +611,23 @@ def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView edge: Tuple[nodes.Node, str, nodes.Node, str, Memlet], function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - from dace.codegen.targets.experimental_cuda_helpers.copy_strategies import (CopyContext, CopyStrategy, - OutOfKernelCopyStrategy, - SyncCollaboritveGPUCopyStrategy, - AsyncCollaboritveGPUCopyStrategy, - FallBackGPUCopyStrategy) - - context = CopyContext(self, self._gpu_stream_manager, state_id, src_node, dst_node, edge, sdfg, cfg, dfg, - callsite_stream) - - # Order matters: fallback must come last - strategies: List[CopyStrategy] = [ - OutOfKernelCopyStrategy(), - SyncCollaboritveGPUCopyStrategy(), - AsyncCollaboritveGPUCopyStrategy(), - FallBackGPUCopyStrategy() - ] + from dace.codegen.targets.experimental_cuda_helpers.new_copy_strategies import ( + CopyContext, + OutOfKernelCopyStrategy, + SyncCollaboritveGPUCopyStrategy + ) - for strategy in strategies: - if strategy.applicable(context): - strategy.generate_copy(context) - return + context = CopyContext(sdfg, cfg.state(state_id), src_node, dst_node, edge, self._gpu_stream_manager.assigned_streams) - raise RuntimeError("No applicable GPU memory copy strategy found (this should not happen).") + if OutOfKernelCopyStrategy().applicable(context): + return + + elif SyncCollaboritveGPUCopyStrategy().applicable(context): + code = SyncCollaboritveGPUCopyStrategy().generate_copy(context, self._kernel_dimensions_map) + callsite_stream.write(code, cfg, state_id, [src_node, dst_node]) + else: + # Fallback + self._cpu_codegen.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream) ############################################################################# # Predicates for Dispatcher @@ -886,9 +885,9 @@ def _prepare_GPU_Global_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta if nodedesc.pool: gpu_stream_manager = self._gpu_stream_manager - gpu_stream = gpu_stream_manager.get_stream_node(node) + gpu_stream = gpu_stream_manager.assigned_streams[node] if gpu_stream != 'nullptr': - gpu_stream = f'__state->gpu_context->streams[{gpu_stream}]' + gpu_stream = f'__state->__0_gpu_streams[{gpu_stream}]' allocation_stream.write( f'DACE_GPU_CHECK({self.backend}MallocAsync((void**)&{dataname}, {arrsize_malloc}, {gpu_stream}));\n', cfg, state_id, node) @@ -1011,7 +1010,11 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap # Main deallocation logic by storage type if nodedesc.storage == dtypes.StorageType.GPU_Global: - if not nodedesc.pool: # If pooled, will be freed somewhere else + if nodedesc.pool: + if (sdfg, dataname) 
not in self.pool_release: + gpu_stream = "nullptr" + callsite_stream.write(f'DACE_GPU_CHECK({self.backend}FreeAsync({dataname}, {gpu_stream}));\n', cfg, state_id, node) + else: callsite_stream.write(f'DACE_GPU_CHECK({self.backend}Free({dataname}));\n', cfg, state_id, node) elif nodedesc.storage == dtypes.StorageType.CPU_Pinned: @@ -1038,13 +1041,15 @@ def get_generated_codeobjects(self): # in several different files (e.g., framecode.py, cpu.py, cpp.py). For the sake of consistency, we initialize it # as it is expected in the other modules. I.e. prepend with an ID for all SDFGs it is defined. # Note that all the different variable names point to the same GPU stream array. + cnt = 0 init_gpu_stream_vars = "" gpu_stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(",")[0] for csdfg, name, desc in self._global_sdfg.arrays_recursive(include_nested_data=True): if name == gpu_stream_array_name and desc.lifetime == dtypes.AllocationLifetime.Persistent: - gpu_stream_field_name = f'__{csdfg.cfg_id}_{name}' - init_gpu_stream_vars += f"__state->{gpu_stream_field_name} = __state->gpu_context->streams;\n" - init_gpu_stream_vars += f" " + init_gpu_stream_vars = f"__state->__{csdfg.cfg_id}_{name}" + break + + # My comment: takes codeblocks and transforms it nicely to code initcode = CodeIOStream() @@ -1131,16 +1136,14 @@ def get_generated_codeobjects(self): __state->gpu_context = new dace::cuda::Context({nstreams}, {nevents}); - // Create {backend} streams and events for(int i = 0; i < {nstreams}; ++i) {{ - DACE_GPU_CHECK({backend}StreamCreateWithFlags(&__state->gpu_context->internal_streams[i], {backend}StreamNonBlocking)); - __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams - }} - for(int i = 0; i < {nevents}; ++i) {{ - DACE_GPU_CHECK({backend}EventCreateWithFlags(&__state->gpu_context->events[i], {backend}EventDisableTiming)); + {other_gpustream_init}[i] = 0; }} - {other_gpustream_init} + // Create {backend} streams + for(int i = 0; i < {nstreams}; ++i) {{ + DACE_GPU_CHECK({backend}StreamCreateWithFlags(&{other_gpustream_init}[i], {backend}StreamNonBlocking)); + }} {initcode} @@ -1155,33 +1158,15 @@ def get_generated_codeobjects(self): if (__err == 0) __err = static_cast({backend}DeviceSynchronize()); - // Destroy {backend} streams and events + // Destroy {backend} streams for(int i = 0; i < {nstreams}; ++i) {{ - DACE_GPU_CHECK({backend}StreamDestroy(__state->gpu_context->internal_streams[i])); - }} - for(int i = 0; i < {nevents}; ++i) {{ - DACE_GPU_CHECK({backend}EventDestroy(__state->gpu_context->events[i])); + DACE_GPU_CHECK({backend}StreamDestroy({other_gpustream_init}[i])); }} delete __state->gpu_context; return __err; }} -DACE_EXPORTED bool __dace_gpu_set_stream({sdfg_state_name} *__state, int streamid, gpuStream_t stream) -{{ - if (streamid < 0 || streamid >= {nstreams}) - return false; - - __state->gpu_context->streams[streamid] = stream; - - return true; -}} - -DACE_EXPORTED void __dace_gpu_set_all_streams({sdfg_state_name} *__state, gpuStream_t stream) -{{ - for (int i = 0; i < {nstreams}; ++i) - __state->gpu_context->streams[i] = stream; -}} {localcode} """.format(params=params_comma, @@ -1230,6 +1215,7 @@ def cmake_options(): hip_arch = [ha for ha in hip_arch if ha is not None and len(ha) > 0] flags = Config.get("compiler", "cuda", "hip_args") + flags += " -G -g" flags += ' ' + ' '.join( '--offload-arch={arch}'.format(arch=arch if arch.startswith("gfx") else "gfx" + arch) for arch in hip_arch) diff 
--git a/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py index 4038300575..f8f10fa528 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py @@ -1,14 +1,17 @@ # Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. from abc import ABC, abstractmethod -from typing import Any, Dict, List, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union from dace import SDFG, SDFGState, data, dtypes, subsets from dace import memlet as mm +from dace import symbolic from dace.codegen import common from dace.codegen.targets import cpp +from dace.codegen.targets.cpp import unparse_cr from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import symbolic_to_cpp from dace.config import Config from dace.dtypes import StorageType +from dace.frontend import operations from dace.sdfg import nodes, scope_contains_scope from dace.sdfg.graph import MultiConnectorEdge from dace.transformation import helpers @@ -230,6 +233,7 @@ def get_accessnode_to_accessnode_copy_info(self): class CopyStrategy(ABC): + """Abstract base class for memory copy strategies.""" @abstractmethod def applicable(self, copy_context: CopyContext) -> bool: @@ -247,6 +251,13 @@ def generate_copy(self, copy_context: CopyContext) -> str: class OutOfKernelCopyStrategy(CopyStrategy): + """ + Copy strategy for memory transfers that occur outside of kernel execution. + + This pattern often occurs when generating host-to-device copies for kernel inputs + (since kernels cannot access host memory directly), and device-to-host copies + to retrieve results for further processing. + """ def applicable(self, copy_context: CopyContext) -> bool: """ @@ -258,7 +269,6 @@ def applicable(self, copy_context: CopyContext) -> bool: - The data descriptors of source and destination are not views. - The storage types of either src or dst is CPU_Pinned or GPU_Device - We do not have a CPU-to-CPU copy - """ # Retrieve needed information state = copy_context.state @@ -495,4 +505,211 @@ def _generate_nd_copy(self, copy_context: CopyContext) -> None: call += "\n}" # Return the code - return call \ No newline at end of file + return call + + +class SyncCollaboritveGPUCopyStrategy(CopyStrategy): + """ + Implements (synchronous) collaborative GPU copy operations. + + This strategy generates the appropriate code for copies performed + inside GPU kernels, where multiple threads cooperate to move data + between gpu memory spaces (e.g., global to shared memory). + """ + + def applicable(self, copy_context: CopyContext) -> bool: + """ + Checks if the copy is eligible for a collaborative GPU-to-GPU copy. + + Conditions: + 1. The copy is between two AccessNodes + 2. The copy is between GPU memory StorageTypes (shared or global). + 3. The innermost non-sequential map is a GPU_Device-scheduled map i.e. + the copy occurs within a kernel but is not within a GPU_ThreadBlock map. 
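+
+        A typical example is staging a tile of a GPU_Global array into GPU_Shared
+        memory directly inside a GPU_Device map, where the threads of the block
+        cooperate on the copy rather than a single thread performing it.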
+ """ + # --- Condition 1: src and dst are AccessNodes --- + src_node, dst_node = copy_context.src_node, copy_context.dst_node + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + return False + + # --- Condition 2: GPU to GPU memory transfer --- + src_storage, dst_storage = copy_context.get_storage_type(src_node), copy_context.get_storage_type(dst_node) + gpu_storages = {dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared} + + if not (src_storage in gpu_storages and dst_storage in gpu_storages): + return False + + # --- Condition 3: Next non-sequential Map is a GPU_Device Map --- + next_nonseq_parent_map = self._next_non_seq_parent_map(copy_context) + if next_nonseq_parent_map is None: + return False + else: + return next_nonseq_parent_map.map.schedule == dtypes.ScheduleType.GPU_Device + + def generate_copy(self, copy_context: CopyContext, kernel_dimensions_maps: Dict[nodes.MapEntry, Tuple[List, List]]) -> str: + """ + Generates a GPU copy call as a string using DaCe's runtime CUDA copy functions. + + The function determines the appropriate templated copy function from + `dace/libraries/runtime/include/dace/cuda/copy.cuh` and constructs + the call string with the necessary arguments, including kernel block + dimensions and optional accumulation/reduction information. + + Parameters + ---------- + copy_context : CopyContext + Helper object containing information about the copy. + + kernel_dimensions_maps : Dict[nodes.MapEntry, Tuple[List, List]] + Kernel map (GPU_Devie scheduled map) entry nodes to (grid_dims, block_dims); + block_dims needed in templating. + + Returns + ------- + str + The GPU copy call in C++ as a string. + + Notes + ----- + - The kernel block size could be derived, but since this function is typically called + from `ExperimentalCUDACodeGen`, it is provided as input to avoid recomputation. + - The template functions use a parameter called 'is_async', which is set to True here + because `ExperimentalCUDACodeGen` inserts "__syncthreads()" explicitly in tasklets. 
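+        - As an illustration, a Global-to-Shared copy of shape [32, 32] inside a kernel
+          with block dimensions (32, 8, 1) results in a call of roughly the form
+          ``dace::GlobalToShared2D<T, 32, 8, 1, 32, 32, ..., true>(...)``, matching the
+          template strings assembled below.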
+ """ + # ----------- Retrieve relevant copy information -------------- + + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + sdfg = copy_context.sdfg + dtype = copy_context.src_node.desc(sdfg).dtype + ctype = dtype.ctype + + # Get copy function name (defined in runtime library) + num_dims = len(copy_shape) + src_node, dst_node = copy_context.src_node, copy_context.dst_node + src_storage, dst_storage = copy_context.get_storage_type(src_node), copy_context.get_storage_type(dst_node) + src_storage_name = self._get_storagename(src_storage) + dst_storage_name = self._get_storagename(dst_storage) + function_name = f"dace::{src_storage_name}To{dst_storage_name}{num_dims}D" + + # Extract WCR info (accumulation template + optional custom reduction) + accum, custom_reduction = self._get_accumulation_info(copy_context) + custom_reduction = [custom_reduction] if custom_reduction else [] + + # Get parent kernel block dimensions (guaranteed GPU_Device) and sync flag + parent_kernel = self._next_non_seq_parent_map(copy_context) + block_dims = ", ".join(symbolic_to_cpp(kernel_dimensions_maps[parent_kernel][1])) + synchronized = "true" # Legacy 'is_async'; sync barriers handled by passes (see docstring) + + # ------------------------- Generate copy call ---------------------------- + + if any(symbolic.issymbolic(s, copy_context.sdfg.constants) for s in copy_shape): + args_list = ([src_expr] + src_strides + [dst_expr] + custom_reduction + dst_strides + copy_shape) + args = ", ".join(symbolic_to_cpp(args_list)) + call = f"{function_name}Dynamic<{ctype}, {block_dims}, {synchronized}>{accum}({args});" + + elif function_name == "dace::SharedToGlobal1D": + copy_size = ', '.join(symbolic_to_cpp(copy_shape)) + accum = accum or '::Copy' + args_list = ([src_expr] + src_strides + [dst_expr] + dst_strides + custom_reduction) + args = ", ".join(symbolic_to_cpp(args_list)) + call = f"{function_name}<{ctype}, {block_dims}, {copy_size}, {synchronized}>{accum}({args});" + + else: + copy_size = ', '.join(symbolic_to_cpp(copy_shape)) + args_list = ([src_expr] + src_strides + [dst_expr] + custom_reduction) + args = ", ".join(symbolic_to_cpp(args_list)) + dst_strides_unpacked = ", ".join(symbolic_to_cpp(dst_strides)) + call = f"{function_name}<{ctype}, {block_dims}, {copy_size}, {dst_strides_unpacked}, {synchronized}>{accum}({args});" + + return call + + def _get_accumulation_info(self, copy_context: CopyContext) -> Tuple[str, str]: + """ + Extracts write-conflict resolution (WCR) information from the copy context + and returns the accumulation/reduction template components needed for the + final templated function call in `generate_copy()`. + + This method processes WCR information from the memlet and generates the + appropriate C++ template strings for both predefined and custom reductions. + + Parameters + ---------- + copy_context : CopyContext + Copy context containing the copy operation details, including + the memlet with WCR information. + + Returns + ------- + Tuple[str, str] + A tuple containing: + - accum : str + Template accumulation string for the function call. Empty string if no WCR, + `"::template Accum"` for predefined reductions, or `"::template Accum"` for custom reductions. + - custom_reduction : str + C++ formatted custom reduction code string. Empty string for no WCR or predefined reductions, + unparsed custom reduction code for custom reductions. 
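+
+        For instance, a WCR of ``lambda a, b: a + b`` is detected as the predefined
+        ``Sum`` reduction, so a templated ``::template Accum<...>`` string is returned
+        together with an empty ``custom_reduction``.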
+ """ + sdfg = copy_context.sdfg + dtype = copy_context.src_node.desc(sdfg).dtype + memlet = copy_context.edge.data + wcr = memlet.wcr + reduction_type = operations.detect_reduction_type(wcr) + + if wcr is None: + accum, custom_reduction = "", "" + + elif reduction_type != dtypes.ReductionType.Custom: + # Use predefined reduction + reduction_type_str = str(reduction_type).split(".")[-1] # e.g., "Sum" + accum = f"::template Accum" + custom_reduction = "" + + else: + accum = "::template Accum" + custom_reduction = unparse_cr(sdfg, wcr, dtype) + + return accum, custom_reduction + + def _get_storagename(self, storage: dtypes.StorageType): + """ + Returns a string containing the name of the storage location. + + Example: dtypes.StorageType.GPU_Shared will return "Shared". + """ + storage_name = str(storage) + return storage_name[storage_name.rindex('_') + 1:] + + def _next_non_seq_parent_map(self, copy_context: CopyContext) -> Optional[nodes.MapEntry]: + """ + Traverse up the parent map chain from the deeper of src_node or dst_node + in `copy_context` and return the first parent MapEntry whose schedule + is not sequential. + + Parameters + ---------- + copy_context : CopyContext + Context information about the memory copy. + + Returns + ------- + Optional[nodes.MapEntry] + The first non-sequential parent MapEntry encountered, or None if no + such parent exists. + """ + src_node, dst_node = copy_context.src_node, copy_context.dst_node + state = copy_context.state + scope_dict = state.scope_dict() + + # Determine which node (src or dst) is in the deeper scope + deeper_node = dst_node if scope_contains_scope(scope_dict, src_node, dst_node) else src_node + current_node = deeper_node + while (current_node is None or not isinstance(current_node, nodes.MapEntry) + or current_node.map.schedule == dtypes.ScheduleType.Sequential): + parent = helpers.get_parent_map(state, current_node) + if parent is None: + current_node = None + break + current_node, state = parent + + return current_node \ No newline at end of file diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 449e312efa..903d15e26d 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -285,10 +285,6 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre """, sdfg) - for target in self._dispatcher.used_targets: - if target.has_initializer: - callsite_stream.write( - '__result |= __dace_init_%s(__state%s);' % (target.target_name, initparamnames_comma), sdfg) for env in self.environments: init_code = _get_or_eval_sdfg_first_arg(env.init_code, sdfg) if init_code: @@ -304,6 +300,11 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre callsite_stream.write(self._initcode.getvalue(), sdfg) + for target in self._dispatcher.used_targets: + if target.has_initializer: + callsite_stream.write( + '__result |= __dace_init_%s(__state%s);' % (target.target_name, initparamnames_comma), sdfg) + callsite_stream.write( f""" if (__result) {{ @@ -324,14 +325,6 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre callsite_stream.write( '__state->report.save("%s", __HASH_%s);' % (pathlib.Path(sdfg.build_folder) / "perf", sdfg.name), sdfg) - callsite_stream.write(self._exitcode.getvalue(), sdfg) - - for sd in sdfg.all_sdfgs_recursive(): - if None in sd.exit_code: - callsite_stream.write(codeblock_to_cpp(sd.exit_code[None]), sd) - if 'frame' in sd.exit_code: - 
callsite_stream.write(codeblock_to_cpp(sd.exit_code['frame']), sd) - for target in self._dispatcher.used_targets: if target.has_finalizer: callsite_stream.write( @@ -341,6 +334,15 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre __err = __err_{target.target_name}; }} ''', sdfg) + + callsite_stream.write(self._exitcode.getvalue(), sdfg) + + for sd in sdfg.all_sdfgs_recursive(): + if None in sd.exit_code: + callsite_stream.write(codeblock_to_cpp(sd.exit_code[None]), sd) + if 'frame' in sd.exit_code: + callsite_stream.write(codeblock_to_cpp(sd.exit_code['frame']), sd) + for env in reversed(self.environments): finalize_code = _get_or_eval_sdfg_first_arg(env.finalize_code, sdfg) if finalize_code: diff --git a/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py b/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py index 39eedc7d34..1f93f5559a 100644 --- a/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py +++ b/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py @@ -105,7 +105,6 @@ def _infer_gpu_block_size(self, state: SDFGState, kernel_map_entry: nodes.MapEnt # guard check if not threadblock_maps: - state.sdfg.save("failure.sdfg") raise ValueError(f"{self.__class__.__name__} expects at least one explicit nested GPU_ThreadBlock map, " "as it assumes AddThreadBlockMap was applied beforehand.\n" f"Check for issues in that transformation or ensure AddThreadBlockMap was applied.") diff --git a/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py b/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py index 9dd04a161c..8ebe591699 100644 --- a/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py +++ b/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py @@ -12,6 +12,7 @@ from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets from dace.transformation.passes.insert_gpu_copy_tasklets import InsertGPUCopyTasklets +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs @properties.make_properties @transformation.explicit_cf_compatible @@ -25,7 +26,7 @@ class GPUStreamTopologySimplification(ppl.Pass): def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: depending_passes = { - NaiveGPUStreamScheduler, InsertGPUStreamsToKernels, + NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, InsertGPUStreamsToKernels, InsertGPUStreamsToTasklets, InsertGPUStreamSyncTasklets, InsertGPUCopyTasklets } diff --git a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py index cb7a015165..09063d2df1 100644 --- a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py +++ b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py @@ -12,6 +12,7 @@ from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs @properties.make_properties @transformation.explicit_cf_compatible 
@@ -24,7 +25,8 @@ class InsertGPUStreamSyncTasklets(ppl.Pass): cases are discovered. """ def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: - return {NaiveGPUStreamScheduler, InsertGPUStreamsToKernels, InsertGPUStreamsToTasklets} + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, + InsertGPUStreamsToKernels, InsertGPUStreamsToTasklets} def modifies(self) -> ppl.Modifies: return ppl.Modifies.Tasklets | ppl.Modifies.Memlets @@ -123,6 +125,11 @@ def edge_within_kernel(state, src, dst): not edge_within_kernel(state, src, dst)): sync_state[state].add(stream_assignments[dst]) + elif (is_kernel_exit(src) and is_gpu_accessnode(dst, state) and + not is_sink_node(dst, state)): + sync_state[state].add(stream_assignments[src]) + sync_state[state].add(stream_assignments[src]) + elif (is_kernel_exit(src) and is_gpu_accessnode(dst, state) and is_sink_node(dst, state)): sync_state[state].add(stream_assignments[dst]) @@ -202,9 +209,6 @@ def _insert_gpu_stream_sync_at_state_end(self, sdfg: SDFG, sync_state: Dict[SDFG # 3. Connect a single GPU stream sink node (create or merge if needed) if len(stream_sink_nodes) == 0: - if stream_array_name not in state.sdfg.arrays: - state.sdfg.add_transient(stream_array_name, (num_assigned_streams,), dtype=dtypes.gpuStream_t, - storage=dtypes.StorageType.Register, lifetime=dtypes.AllocationLifetime.Persistent) combined_stream_node = state.add_access(stream_array_name) else: @@ -268,12 +272,6 @@ def _insert_gpu_stream_sync_after_node(self, sdfg: SDFG, sync_node: Dict[nodes.N for succ in state.successors(node): state.add_edge(tasklet, None, succ, None, dace.Memlet()) state.add_edge(node, None, tasklet, None, dace.Memlet()) - - # 2. If the GPU stream array is not defined in the data descriptor store, add it first - parent_sdfg = state.sdfg - if stream_array_name not in parent_sdfg.arrays: - parent_sdfg.add_transient(stream_array_name, (num_assigned_streams,), dtype=dtypes.gpuStream_t, - storage=dtypes.StorageType.Register, lifetime=dtypes.AllocationLifetime.Persistent) # 3. 
Connect tasklet to GPU stream AccessNodes in_stream = state.add_access(stream_array_name) diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py index 174b41afb5..eb5d9e015d 100644 --- a/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py @@ -8,6 +8,7 @@ from dace.sdfg import nodes from dace.transformation import pass_pipeline as ppl, transformation from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs @properties.make_properties @transformation.explicit_cf_compatible @@ -21,7 +22,7 @@ class InsertGPUStreamsToKernels(ppl.Pass): """ def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: - return {NaiveGPUStreamScheduler} + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs} def modifies(self) -> ppl.Modifies: return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets @@ -42,22 +43,12 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): # Link kernels to their assigned GPU streams for sub_sdfg in sdfg.all_sdfgs_recursive(): - # Track whether the GPU stream array is in tge - # sub_sdfg's data descriptor store - gpustream_array_added: bool = stream_array_name in sub_sdfg.arrays - for state in sub_sdfg.states(): for node in state.nodes(): # Not a kernel entry - continue if not (isinstance(node, nodes.MapEntry) and node.map.schedule == dtypes.ScheduleType.GPU_Device): continue - - # If GPU stream array is not yet defined in the sub_sdfg, add it - if not gpustream_array_added: - sub_sdfg.add_transient(stream_array_name, (num_assigned_streams,), dtype=dace.dtypes.gpuStream_t, - storage=dace.dtypes.StorageType.Register, lifetime=dace.dtypes.AllocationLifetime.Persistent) - gpustream_array_added = True # Stream connector name and the used GPU Stream for the kernel assigned_gpustream = stream_assignments[node] diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py new file mode 100644 index 0000000000..a8a39a549a --- /dev/null +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py @@ -0,0 +1,142 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
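+"""
+Makes the GPU stream array visible to every (nested) SDFG that needs it: the
+top-level SDFG receives the stream array as a persistent transient, and nested
+SDFGs that launch kernels, reference streams in tasklets, or access GPU global
+memory outside device-level scopes get the array plumbed in through their
+input connectors.
+"""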
+from typing import Any, Dict, Set, Type, Union + +import copy + +import dace +from dace import dtypes, properties, SDFG, SDFGState +from dace.codegen import common +from dace.config import Config +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.sdfg.nodes import Node, AccessNode, MapEntry, MapExit, Tasklet +from dace.sdfg.state import ControlFlowBlock, ControlFlowRegion, SDFGState + +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import is_within_schedule_types + + +from dace.sdfg import is_devicelevel_gpu + +STREAM_PLACEHOLDER = "__dace_current_stream" + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamsToSDFGs(ppl.Pass): + """ + TODO + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + + stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] + stream_assignments: Dict[Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + num_assigned_streams = max(stream_assignments.values(), default=0) + 1 + + # Add the GPU stream array as a transient to the top level SDFG + sdfg.add_transient(stream_array_name, (num_assigned_streams,), dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.CPU_Heap, lifetime=dace.dtypes.AllocationLifetime.Persistent) + + gpu_stream_desc = sdfg.arrays[stream_array_name] + for child_sdfg in self.find_child_sdfgs_requiring_gpu_stream(sdfg): + + # If GPU stream already defined (because a more inner child sdfg defined it all the way up) skip + if stream_array_name in child_sdfg.arrays: + continue + + inner_sdfg = child_sdfg + outer_sdfg = inner_sdfg.parent_sdfg + inner_sdfg.add_array(stream_array_name, (num_assigned_streams,), dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.CPU_Heap, lifetime=dace.dtypes.AllocationLifetime.Persistent) + + while stream_array_name not in outer_sdfg.arrays: + + inner_nsdfg_node = inner_sdfg.parent_nsdfg_node + inner_parent_state = inner_sdfg.parent + inner_nsdfg_node.add_in_connector(stream_array_name, dtypes.gpuStream_t) + + outer_sdfg.add_array(stream_array_name, (num_assigned_streams,), dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.CPU_Heap, lifetime=dace.dtypes.AllocationLifetime.Persistent) + inp_gpu_stream: AccessNode = inner_parent_state.add_access(stream_array_name) + inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, dace.Memlet(stream_array_name)) + + inner_sdfg = outer_sdfg + outer_sdfg = outer_sdfg.parent_sdfg + + inner_nsdfg_node = inner_sdfg.parent_nsdfg_node + inner_parent_state = inner_sdfg.parent + inner_nsdfg_node.add_in_connector(stream_array_name, dtypes.gpuStream_t) + inp_gpu_stream: AccessNode = inner_parent_state.add_access(stream_array_name) + inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, dace.Memlet(f"{stream_array_name}[0:{num_assigned_streams}]")) + + outer_sdfg = inner_sdfg.parent_sdfg + + return {} + + def find_child_sdfgs_requiring_gpu_stream(self, sdfg) -> Set[SDFG]: + """ + Identify all child SDFGs that require a GPU stream array in their + array descriptor store. 
A child SDFG requires a GPU stream if: + + - It launches GPU kernels (MapEntry/MapExit with GPU_Device schedule). + - It contains special Tasklets (e.g., from library node expansion) that + use the GPU stream they are assigned to in the code. + - It accesses GPU global memory outside device-level GPU scopes, which + implies memory copies or kernel data feeds. + + Parameters + ---------- + sdfg : SDFG + The root SDFG to inspect. + + Returns + ------- + Set[SDFG] + The set of child SDFGs that need a GPU stream array in their array descriptor + store. + """ + requiring_gpu_stream = set() + for child_sdfg in sdfg.all_sdfgs_recursive(): + + # Skip the root SDFG itself + if child_sdfg is sdfg: + continue + + for state in child_sdfg.states(): + for node in state.nodes(): + + # Case 1: Kernel launch nodes + if isinstance(node, (MapEntry, MapExit)) and node.map.schedule == dtypes.ScheduleType.GPU_Device: + requiring_gpu_stream.add(child_sdfg) + break + + # Case 2: Tasklets that use GPU stream in their code + if isinstance(node, Tasklet) and STREAM_PLACEHOLDER in node.code.as_string: + requiring_gpu_stream.add(child_sdfg) + break + + # Case 3: Accessing GPU global memory outside device-level scopes + if ( + isinstance(node, AccessNode) + and node.desc(state).storage == dtypes.StorageType.GPU_Global + and not is_devicelevel_gpu(state.sdfg, state, node) + ): + requiring_gpu_stream.add(child_sdfg) + break + + # Stop scanning this SDFG once a reason is found + if child_sdfg in requiring_gpu_stream: + break + + return requiring_gpu_stream + + + diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py index 5e3a92c121..f0c226a817 100644 --- a/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py @@ -7,6 +7,7 @@ from dace.sdfg import nodes from dace.transformation import pass_pipeline as ppl, transformation from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels @@ -28,7 +29,7 @@ class InsertGPUStreamsToTasklets(ppl.Pass): """ def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: - return {NaiveGPUStreamScheduler, InsertGPUStreamsToKernels} + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, InsertGPUStreamsToKernels} def modifies(self) -> ppl.Modifies: return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets @@ -68,12 +69,6 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): # Tasklet does not need use its assigned GPU stream - continue if not STREAM_PLACEHOLDER in node.code.as_string: continue - - # If the GPU stream array is not yet defined in the sub_sdfg, add it - if not gpustream_array_added: - sub_sdfg.add_transient(stream_array_name, (num_assigned_streams,), dtype=dace.dtypes.gpuStream_t, - storage=dace.dtypes.StorageType.Register, lifetime=dace.dtypes.AllocationLifetime.Persistent) - gpustream_array_added = True # Stream connector name and the used GPU Stream for the kernel assigned_gpustream = stream_assignments[node] diff --git a/dace/transformation/passes/insert_gpu_copy_tasklets.py b/dace/transformation/passes/insert_gpu_copy_tasklets.py index a6b9c57374..1bf5ceffac 100644 --- 
a/dace/transformation/passes/insert_gpu_copy_tasklets.py +++ b/dace/transformation/passes/insert_gpu_copy_tasklets.py @@ -14,6 +14,7 @@ from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets from dace.transformation.passes.gpustream.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs @properties.make_properties @transformation.explicit_cf_compatible @@ -36,7 +37,7 @@ class InsertGPUCopyTasklets(ppl.Pass): def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: depending_passes = { - NaiveGPUStreamScheduler, InsertGPUStreamsToKernels, + NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, InsertGPUStreamsToKernels, InsertGPUStreamsToTasklets, InsertGPUStreamSyncTasklets } return depending_passes @@ -87,11 +88,6 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: # Generatae the copy call code = out_of_kernel_copy.generate_copy(copy_context) - - # Ensure the GPU stream array exists in the current SDFG; add it if missing - if gpustream_array_name not in copy_sdfg.arrays: - copy_sdfg.add_transient(gpustream_array_name, (num_assigned_streams,), dtype=dace.dtypes.gpuStream_t, - storage=dace.dtypes.StorageType.Register, lifetime=dace.dtypes.AllocationLifetime.Persistent) # Prepare GPU ustream connectors and the stream to be accessed from the # GPU stream array diff --git a/tests/npbench/misc/scattering_self_test.py b/tests/npbench/misc/scattering_self_test.py index 6a00a81afe..5b9a5ade62 100644 --- a/tests/npbench/misc/scattering_self_test.py +++ b/tests/npbench/misc/scattering_self_test.py @@ -116,7 +116,6 @@ def test_cpu(): run_scattering_self_test(dace.dtypes.DeviceType.CPU) -@pytest.mark.skip(reason="Compiler error") @pytest.mark.gpu def test_gpu(): run_scattering_self_test(dace.dtypes.DeviceType.GPU) From 264191e9acb671e77b5d11bb6e3f30442cf9f429 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Wed, 10 Sep 2025 12:40:50 +0200 Subject: [PATCH 75/94] various fixes and clean ups, especially regarding GPU stream management --- dace/codegen/instrumentation/gpu_events.py | 48 +++- dace/codegen/targets/cpu.py | 11 + dace/codegen/targets/experimental_cuda.py | 116 ++++++---- .../gpu_stream_manager.py | 102 ++++++--- .../experimental_cuda_helpers/gpu_utils.py | 55 +++-- .../new_copy_strategies.py | 208 +++++++++++------- dace/codegen/targets/framecode.py | 26 +-- dace/config_schema.yml | 6 +- .../analysis/infer_gpu_grid_and_block_size.py | 1 + dace/transformation/passes/fix_test.py | 84 +++++++ .../gpu_stream_topology_simplification.py | 28 --- .../passes/gpustream/gpustream_scheduling.py | 56 +++++ .../insert_gpu_stream_sync_tasklets.py | 71 +----- .../insert_gpu_streams_to_kernels.py | 3 - .../gpustream/insert_gpu_streams_to_sdfgs.py | 62 ++++-- .../insert_gpu_streams_to_tasklets.py | 13 +- .../passes/insert_gpu_copy_tasklets.py | 19 +- .../passes/move_array_out_of_kernel.py | 19 +- tests/codegen/cuda_mempool_test.py | 4 +- tests/npbench/misc/scattering_self_test.py | 1 - tests/parse_state_struct_test.py | 10 +- 21 files changed, 584 insertions(+), 359 deletions(-) create mode 100644 dace/transformation/passes/fix_test.py diff --git a/dace/codegen/instrumentation/gpu_events.py b/dace/codegen/instrumentation/gpu_events.py index 9c653342cd..bf92ef16a7 100644 --- a/dace/codegen/instrumentation/gpu_events.py 
+++ b/dace/codegen/instrumentation/gpu_events.py @@ -129,7 +129,7 @@ def on_scope_entry(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, n 'GPU_Device map scopes') idstr = 'b' + self._idstr(cfg, state, node) - stream = getattr(node, '_cuda_stream', -1) + stream = self._get_gpu_stream(state, node) outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node) def on_scope_exit(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node: nodes.ExitNode, @@ -139,7 +139,7 @@ def on_scope_exit(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, no s = self._get_sobj(node) if s.instrument == dtypes.InstrumentationType.GPU_Events: idstr = 'e' + self._idstr(cfg, state, entry_node) - stream = getattr(node, '_cuda_stream', -1) + stream = self._get_gpu_stream(state, node) outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node) outer_stream.write(self._report('%s %s' % (type(s).__name__, s.label), cfg, state, entry_node), cfg, state_id, node) @@ -153,7 +153,7 @@ def on_node_begin(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, no if node.instrument == dtypes.InstrumentationType.GPU_Events: state_id = state.parent_graph.node_id(state) idstr = 'b' + self._idstr(cfg, state, node) - stream = getattr(node, '_cuda_stream', -1) + stream = self._get_gpu_stream(state, node) outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node) def on_node_end(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node: nodes.Node, @@ -165,7 +165,47 @@ def on_node_end(self, sdfg: SDFG, cfg: ControlFlowRegion, state: SDFGState, node if node.instrument == dtypes.InstrumentationType.GPU_Events: state_id = state.parent_graph.node_id(state) idstr = 'e' + self._idstr(cfg, state, node) - stream = getattr(node, '_cuda_stream', -1) + stream = self._get_gpu_stream(state, node) outer_stream.write(self._record_event(idstr, stream), cfg, state_id, node) outer_stream.write(self._report('%s %s' % (type(node).__name__, node.label), cfg, state, node), cfg, state_id, node) + + def _get_gpu_stream(self, state: SDFGState, node: nodes.Node) -> int: + """ + Return the GPU stream ID assigned to a given node. + + - In the CUDACodeGen, the stream ID is stored as the private attribute + ``_cuda_stream`` on the node. + - In the ExperimentalCUDACodeGen, streams are explicitly assigned to tasklets + and GPU_Device-scheduled maps (kernels) via a GPU stream AccessNode. For + other node types, no reliable stream assignment is available. + + Parameters + ---------- + state : SDFGState + The state containing the node. + node : dace.sdfg.nodes.Node + The node for which to query the GPU stream. + + Returns + ------- + int + The assigned GPU stream ID, or ``-1`` if none could be determined. 
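+
+        Example: with the experimental backend, a kernel entry that is connected to a
+        ``gpuStream_t`` AccessNode through a memlet over subset ``[2]`` is reported as
+        running on GPU stream ``2``.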
+ """ + if config.Config.get('compiler', 'cuda', 'implementation') == 'legacy': + stream = getattr(node, '_cuda_stream', -1) + + else: + stream = -1 + for in_edge in state.in_edges(node): + src = in_edge.src + if (isinstance(src, nodes.AccessNode) and src.desc(state).dtype == dtypes.gpuStream_t): + stream = int(in_edge.data.subset) + + for out_edge in state.out_edges(node): + dst = out_edge.dst + if (isinstance(dst, nodes.AccessNode) and dst.desc(state).dtype == dtypes.gpuStream_t): + stream = int(out_edge.data.subset) + + return stream + \ No newline at end of file diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 55e0876ddd..30d646a2e5 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -513,6 +513,13 @@ def allocate_array(self, return elif (nodedesc.storage == dtypes.StorageType.Register): + + if nodedesc.dtype == dtypes.gpuStream_t: + ctype = dtypes.gpuStream_t.ctype + allocation_stream.write(f"{ctype}* {name} = __state->gpu_context->streams;") + define_var(name, DefinedType.Pointer, ctype ) + return + ctypedef = dtypes.pointer(nodedesc.dtype).ctype if nodedesc.start_offset != 0: raise NotImplementedError('Start offset unsupported for registers') @@ -586,6 +593,10 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap return elif (nodedesc.storage == dtypes.StorageType.CPU_Heap or (nodedesc.storage == dtypes.StorageType.Register and symbolic.issymbolic(arrsize, sdfg.constants))): + + if nodedesc.dtype == dtypes.gpuStream_t: + callsite_stream.write(f"{alloc_name} = nullptr;") + return if isinstance(nodedesc, data.Array): callsite_stream.write(f"delete[] {alloc_name};\n", cfg, state_id, node) else: diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 7c3a1a3221..231ada370c 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -44,7 +44,7 @@ # Experimental CUDA helper imports from dace.codegen.targets.experimental_cuda_helpers.gpu_stream_manager import GPUStreamManager -from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import symbolic_to_cpp, emit_sync_debug_checks, get_defined_type +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import symbolic_to_cpp, generate_sync_debug_call, get_defined_type from dace.codegen.targets import cpp @@ -124,7 +124,7 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): # NOTE: # "Register illegal copies" code NOT copied from cuda.py - # Behavior unclear for me yet. + # Was never needed. 
################## New variables ########################## @@ -188,6 +188,19 @@ def preprocess(self, sdfg: SDFG) -> None: continue + """ + from dace.transformation.passes.fix_test import Fix + from dace.transformation.passes.move_array_out_of_kernel import MoveArrayOutOfKernel + sdfg.save("before.sdfg") + names = Fix().apply_pass(sdfg, {}) + for name, map_parent in names.items(): + MoveArrayOutOfKernel().apply_pass(sdfg, map_parent, name) + + sdfg.save("after.sdfg") + """ + + + #----------------- Add ThreadBlock Maps & Infer Kernel Grid & Block Sizes -------------------- # new_nodes - old_nodes gives us all Kernel Entry nodes that were created during the insertion @@ -231,12 +244,8 @@ def preprocess(self, sdfg: SDFG) -> None: self._dispatcher._used_targets.add(self) gpustream_assignments = stream_pipeline.apply_pass(sdfg, {})['NaiveGPUStreamScheduler'] - # TODO: probably to be deleted - # Define backend stream access expression (e.g., CUDA stream handle) - gpu_stream_access_template = "__state->gpu_context->streams[{gpu_stream}]" - # Initialize runtime GPU stream manager - self._gpu_stream_manager = GPUStreamManager(sdfg, gpustream_assignments, gpu_stream_access_template) + self._gpu_stream_manager = GPUStreamManager(sdfg, gpustream_assignments) #----------------- Shared Memory Synchronization related Logic ----------------- @@ -378,6 +387,22 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub # Enter kernel context and recursively generate device code + + state = cfg.state(state_id) + scope_entry = dfg_scope.source_nodes()[0] + scope_exit = dfg_scope.sink_nodes()[0] + scope_entry_stream = CodeIOStream() + scope_exit_stream = CodeIOStream() + + # Instrumentation for kernel scope + instr = self._dispatcher.instrumentation[scope_entry.map.instrument] + if instr is not None: + instr.on_scope_entry(sdfg, cfg, state, scope_entry, callsite_stream, scope_entry_stream, + self._globalcode) + outer_stream = CodeIOStream() + instr.on_scope_exit(sdfg, cfg, state, scope_exit, outer_stream, scope_exit_stream, self._globalcode) + + # New scope for defined variables (kernel functions scope) self._dispatcher.defined_vars.enter_scope(scope_entry) @@ -410,9 +435,14 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub raise ValueError("Invalid kernel configuration: This strategy is only applicable if the " "outermost GPU schedule is of type GPU_Device (most likely cause).") + + self._localcode.write(scope_entry_stream.getvalue()) + # Append generated kernel code to localcode self._localcode.write(kernel_stream.getvalue() + '\n') + self._localcode.write(scope_exit_stream.getvalue()) + # Exit kernel context self._in_device_code = False @@ -422,6 +452,9 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub # Exit scope for defined variables self._dispatcher.defined_vars.exit_scope(scope_entry) + if instr is not None: + callsite_stream.write(outer_stream.getvalue()) + return #--------------- Nested GPU Scope -------------------- @@ -597,8 +630,8 @@ def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope ); ''', cfg, state_id, scope_entry) - self._localcode.write(f'DACE_KERNEL_LAUNCH_CHECK(__err, "{kernel_name}", {gdims}, {bdims});') - emit_sync_debug_checks(self.backend, self._localcode) + self._localcode.write(f'DACE_KERNEL_LAUNCH_CHECK(__err, "{kernel_name}", {gdims}, {bdims});\n') + self._localcode.write(generate_sync_debug_call()) # Close bracket self._localcode.write('}', cfg, state_id, 
scope_entry) @@ -617,7 +650,7 @@ def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView SyncCollaboritveGPUCopyStrategy ) - context = CopyContext(sdfg, cfg.state(state_id), src_node, dst_node, edge, self._gpu_stream_manager.assigned_streams) + context = CopyContext(sdfg, cfg.state(state_id), src_node, dst_node, edge, self._gpu_stream_manager.gpustream_assignments) if OutOfKernelCopyStrategy().applicable(context): return @@ -634,13 +667,16 @@ def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView def state_dispatch_predicate(self, sdfg, state): """ - Determines whether a state should be handled by this + Determines whether a given state should be processed by this code generator (`ExperimentalCUDACodeGen`). - Returns True if the generator is currently generating kernel code. + Returns True if either: + 1. The state has associated GPU memory that needs to be released + (i.e., it appears in `self.pool_release`), or + 2. The code generator is currently generating device/kernel code. """ - return self._in_device_code - + return any(s is state for s, _ in self.pool_release.values()) or self._in_device_code + def node_dispatch_predicate(self, sdfg, state, node): """ Determines whether a node should be handled by this @@ -693,8 +729,7 @@ def generate_state(self, # Free the memory callsite_stream.write(f'DACE_GPU_CHECK({backend}Free({ptrname}));\n', pool_sdfg) - - emit_sync_debug_checks(self.backend, callsite_stream) + callsite_stream.write(generate_sync_debug_call()) # We handled the key (pool_sdfg, name) and can remove it later handled_keys.add((pool_sdfg, name)) @@ -703,6 +738,11 @@ def generate_state(self, for key in handled_keys: del self.pool_release[key] + # Invoke all instrumentation providers + for instr in self._frame._dispatcher.instrumentation.values(): + if instr is not None: + instr.on_state_end(sdfg, cfg, state, callsite_stream, function_stream) + def generate_node(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.Node, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -777,7 +817,6 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub ctype = f"const {nsdfg.symbols[name].ctype}" - # Redirect rest to CPU codegen self._cpu_codegen._generate_NestedSDFG(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) @@ -885,13 +924,14 @@ def _prepare_GPU_Global_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta if nodedesc.pool: gpu_stream_manager = self._gpu_stream_manager - gpu_stream = gpu_stream_manager.assigned_streams[node] - if gpu_stream != 'nullptr': - gpu_stream = f'__state->__0_gpu_streams[{gpu_stream}]' + gpu_stream = gpu_stream_manager.get_stream_node(node) allocation_stream.write( f'DACE_GPU_CHECK({self.backend}MallocAsync((void**)&{dataname}, {arrsize_malloc}, {gpu_stream}));\n', cfg, state_id, node) - emit_sync_debug_checks(self.backend, allocation_stream) + + # Generate synchronization and error-check calls if sync debugging is enabled + allocation_stream.write(generate_sync_debug_call()) + else: # Strides are left to the user's discretion allocation_stream.write(f'DACE_GPU_CHECK({self.backend}Malloc((void**)&{dataname}, {arrsize_malloc}));\n', @@ -1012,7 +1052,7 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap if nodedesc.storage == dtypes.StorageType.GPU_Global: if nodedesc.pool: if (sdfg, dataname) not in self.pool_release: - gpu_stream = "nullptr" + gpu_stream = 
self._gpu_stream_manager.get_stream_node(node) callsite_stream.write(f'DACE_GPU_CHECK({self.backend}FreeAsync({dataname}, {gpu_stream}));\n', cfg, state_id, node) else: callsite_stream.write(f'DACE_GPU_CHECK({self.backend}Free({dataname}));\n', cfg, state_id, node) @@ -1030,8 +1070,6 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap raise NotImplementedError(f'Deallocation not implemented for storage type: {nodedesc.storage.name}') def get_generated_codeobjects(self): - - # My comment: first part creates the header and stores it in a object property fileheader = CodeIOStream() self._frame.generate_fileheader(self._global_sdfg, fileheader, 'cuda') @@ -1050,7 +1088,6 @@ def get_generated_codeobjects(self): break - # My comment: takes codeblocks and transforms it nicely to code initcode = CodeIOStream() for sd in self._global_sdfg.all_sdfgs_recursive(): @@ -1060,7 +1097,6 @@ def get_generated_codeobjects(self): initcode.write(codeblock_to_cpp(sd.init_code['cuda']), sd) initcode.write(self._initcode.getvalue()) - # My comment: takes codeblocks and transforms it nicely to code- probably same as before now for exit code exitcode = CodeIOStream() for sd in self._global_sdfg.all_sdfgs_recursive(): if None in sd.exit_code: @@ -1069,7 +1105,6 @@ def get_generated_codeobjects(self): exitcode.write(codeblock_to_cpp(sd.exit_code['cuda']), sd) exitcode.write(self._exitcode.getvalue()) - # My comment: Uses GPU backend (NVIDIA or AMD) to get correct header files if self.backend == 'cuda': backend_header = 'cuda_runtime.h' elif self.backend == 'hip': @@ -1077,12 +1112,10 @@ def get_generated_codeobjects(self): else: raise NameError('GPU backend "%s" not recognized' % self.backend) - # My comment: Seems to get all function params, needed for later params_comma = self._global_sdfg.init_signature(free_symbols=self._frame.free_symbols(self._global_sdfg)) if params_comma: params_comma = ', ' + params_comma - #My comment looks life Memory information pool_header = '' if self.has_pool: poolcfg = Config.get('compiler', 'cuda', 'mempool_release_threshold') @@ -1093,17 +1126,10 @@ def get_generated_codeobjects(self): cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold); ''' - # My comment: Looks like a "base" template, where more details will probably be added later self._codeobject.code = """ #include <{backend_header}> #include -// New, cooperative groups and asnyc copy -#include -#include - -namespace cg = cooperative_groups; - {file_header} DACE_EXPORTED int __dace_init_experimental_cuda({sdfg_state_name} *__state{params}); @@ -1136,13 +1162,13 @@ def get_generated_codeobjects(self): __state->gpu_context = new dace::cuda::Context({nstreams}, {nevents}); + // Create {backend} streams and events for(int i = 0; i < {nstreams}; ++i) {{ - {other_gpustream_init}[i] = 0; + DACE_GPU_CHECK({backend}StreamCreateWithFlags(&__state->gpu_context->internal_streams[i], {backend}StreamNonBlocking)); + __state->gpu_context->streams[i] = __state->gpu_context->internal_streams[i]; // Allow for externals to modify streams }} - - // Create {backend} streams - for(int i = 0; i < {nstreams}; ++i) {{ - DACE_GPU_CHECK({backend}StreamCreateWithFlags(&{other_gpustream_init}[i], {backend}StreamNonBlocking)); + for(int i = 0; i < {nevents}; ++i) {{ + DACE_GPU_CHECK({backend}EventCreateWithFlags(&__state->gpu_context->events[i], {backend}EventDisableTiming)); }} {initcode} @@ -1158,9 +1184,12 @@ def get_generated_codeobjects(self): if (__err == 0) __err = 
static_cast({backend}DeviceSynchronize()); - // Destroy {backend} streams + // Destroy {backend} streams and events for(int i = 0; i < {nstreams}; ++i) {{ - DACE_GPU_CHECK({backend}StreamDestroy({other_gpustream_init}[i])); + DACE_GPU_CHECK({backend}StreamDestroy(__state->gpu_context->internal_streams[i])); + }} + for(int i = 0; i < {nevents}; ++i) {{ + DACE_GPU_CHECK({backend}EventDestroy(__state->gpu_context->events[i])); }} delete __state->gpu_context; @@ -1178,7 +1207,6 @@ def get_generated_codeobjects(self): file_header=fileheader.getvalue(), nstreams=self._gpu_stream_manager.num_gpu_streams, nevents=self._gpu_stream_manager.num_gpu_events, - other_gpustream_init=init_gpu_stream_vars, backend=self.backend, backend_header=backend_header, pool_header=pool_header, diff --git a/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py b/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py index 6582395027..b2be3f0872 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py +++ b/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py @@ -1,51 +1,89 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. from typing import Dict, Union from dace import SDFG, nodes class GPUStreamManager: """ - Manages GPU backend streams (e.g., CUDA or HIP streams) for nodes in an SDFG. - Assumes that the initialization inputs come from the NaiveGPUScheduler pass. + Manage GPU backend streams (e.g., CUDA or HIP) for nodes in an SDFG. - NOTE: "Stream" here refers to backend GPU streams, not DaCe data streams. + Nodes are assigned stream IDs by the NaiveGPUStreamScheduler Pass, and + this class provides their access expressions and tracks the number of streams + in use. GPU events are not (yet) supported. + + Note + ---- + "Stream" refers to backend GPU streams, not DaCe data streams. """ - def __init__(self, sdfg: SDFG, assigned_streams: Dict[nodes.Node, Union[int, str]], stream_access_template: str): + def __init__(self, sdfg: SDFG, gpustream_assignments: Dict[nodes.Node, int]): self.sdfg = sdfg - self.assigned_streams = assigned_streams - self.stream_access_template = stream_access_template - - # Placeholder for future support of backend events (e.g., CUDA events) - self.num_gpu_events = 0 + self._stream_access_template = "__state->gpu_context->streams[{gpu_stream}]" + self._gpustream_assignments = gpustream_assignments + self._num_gpu_streams = max(gpustream_assignments.values()) + 1 if gpustream_assignments else 0 + self._num_gpu_events = 0 + - # Determine the number of streams used (stream IDs start from 0) - # Only count integer stream IDs (ignore string values like "nullptr") - int_stream_ids = [v for v in assigned_streams.values() if isinstance(v, int)] - self.num_gpu_streams = max(int_stream_ids, default=0) + 1 def get_stream_node(self, node: nodes.Node) -> str: """ - Returns the GPU stream access expression for a given node. + Return the access expression for the GPU stream assigned to a node. - If the node has an assigned stream not equal the default "nullptr", returns - the formatted stream expression. Otherwise, returns "nullptr". - """ - if node in self.assigned_streams and self.assigned_streams[node] != "nullptr": - return self.stream_access_template.format(gpu_stream=self.assigned_streams[node]) - return "nullptr" + Parameters + ---------- + node : nodes.Node + The node for which to return the access expression of its assigned CUDA stream. 
+ Returns + ------- + str + The GPU stream access expression, e.g., + "__state->gpu_context->streams[0]". + + Raises + ------ + ValueError + If the given node does not have an assigned stream. + """ + if node in self.gpustream_assignments: + return self._stream_access_template.format( + gpu_stream=self.gpustream_assignments[node] + ) + else: + raise ValueError( + f"No GPU stream assigned to node {node}. " + "Check whether the node is relevant for GPU stream assignment and, if it is, " + "inspect the GPU stream pipeline to see why no stream was assigned." + ) + def get_stream_edge(self, src_node: nodes.Node, dst_node: nodes.Node) -> str: """ - Returns the stream access expression for an edge based on either the - source or destination node. If one of the nodes has an assigned stream not equal - to the default 'nullptr', that stream is returned (should be symmetric - when using the NaiveGPUStreamScheduler pass). Otherwise, returns 'nullptr'. + Returns the GPU stream access expression for an edge. + + Currently unused: edge-level streams were only needed for asynchronous + memory-copy operations (e.g., cudaMemcpyAsync). These copies are now + modeled via tasklets in the SDFG, so edges do not carry stream info. + Implement this if the design changes and edges need streams again. """ - if src_node in self.assigned_streams and self.assigned_streams[src_node] != "nullptr": - stream_id = self.assigned_streams[src_node] - return self.stream_access_template.format(gpu_stream=stream_id) - elif dst_node in self.assigned_streams and self.assigned_streams[dst_node] != "nullptr": - stream_id = self.assigned_streams[dst_node] - return self.stream_access_template.format(gpu_stream=stream_id) - else: - return "nullptr" + raise NotImplementedError( + "Edge-level GPU streams are not supported. " + "They were previously used for asynchronous memory copies (e.g., cudaMemcpyAsync), " + "but these are now modeled via tasklets in the SDFG. " + "Implement this if the design changes and edges must carry GPU stream information." + ) + + @property + def num_gpu_events(self) -> int: + """Number of GPU events (currently always 0, left here for potential future support).""" + return 0 + + @property + def num_gpu_streams(self) -> int: + """Number of GPU streams in use (stream IDs start at 0).""" + return self._num_gpu_streams + + @property + def gpustream_assignments(self) -> Dict[nodes.Node, int]: + """Mapping of nodes to assigned GPU stream IDs (not all nodes necessarily have a GPU stream ID).""" + return self._gpustream_assignments + \ No newline at end of file diff --git a/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py index de8913c176..8a068b0b9d 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py +++ b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py @@ -6,7 +6,7 @@ import dace from dace import Config, symbolic, data as dt, dtypes from dace.sdfg import nodes, SDFGState -from dace.codegen import cppunparse +from dace.codegen import common, cppunparse from dace.codegen.dispatcher import DefinedType from dace.codegen.prettycode import CodeIOStream from dace.transformation.helpers import get_parent_map @@ -79,7 +79,6 @@ def validate_block_size_limits(kernel_map_entry: nodes.MapEntry, block_size: Lis Raises: ValueError: If either limit is exceeded. 
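Returning to `GPUStreamManager` above, a minimal usage sketch; the nodes and stream IDs are hypothetical, only the access-expression format is fixed by the class:

    # Hypothetical assignments as produced by the NaiveGPUStreamScheduler pass
    assignments = {kernel_entry: 0, copy_tasklet: 1}
    manager = GPUStreamManager(sdfg, assignments)
    manager.num_gpu_streams                    # -> 2
    manager.get_stream_node(kernel_entry)      # -> "__state->gpu_context->streams[0]"
    manager.get_stream_node(unrelated_node)    # -> raises ValueError

The same `gpustream_assignments` mapping is what `copy_memory` hands to `CopyContext` when dispatching the copy strategies.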
""" - kernel_map_label = kernel_map_entry.map.label total_block_size = product(block_size) @@ -100,20 +99,36 @@ def validate_block_size_limits(kernel_map_entry: nodes.MapEntry, block_size: Lis 'thread-block size. To increase this limit, modify the ' '`compiler.cuda.block_size_lastdim_limit` configuration entry.') - -def emit_sync_debug_checks(backend: str, codestream: CodeIOStream): +def generate_sync_debug_call() -> str: """ - Emit backend sync and error-check calls if synchronous debugging is enabled. - - Args: - backend (str): Backend API prefix (e.g., 'cuda'). - codestream (CodeIOStream): Stream to write code to. + Generate backend sync and error-check calls as a string if + synchronous debugging is enabled. + + Parameters + ---------- + backend : str + Backend API prefix (e.g., 'cuda'). + + Returns + ------- + str + The generated debug call code, or an empty string if debugging is disabled. """ + backend: str = common.get_gpu_backend() + sync_call: str = "" if Config.get_bool('compiler', 'cuda', 'syncdebug'): - codestream.write(f"DACE_GPU_CHECK({backend}GetLastError());\n" - f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n") + sync_call = ( + f"DACE_GPU_CHECK({backend}GetLastError());\n" + f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n" + ) + + return sync_call def get_defined_type(data: dt.Data) -> DefinedType: + """ + Return the DefinedType for a data descriptor. + Currently supports only scalars and arrays; extend if others are needed. + """ if isinstance(data, dt.Scalar): return DefinedType.Scalar elif isinstance(data, dt.Array): @@ -127,12 +142,18 @@ def is_within_schedule_types(state: SDFGState, node: nodes.Node, schedules: Set[ Checks if the given node is enclosed within a Map whose schedule type matches any in the `schedules` set. - Args: - state (SDFGState): The State where the node resides - node (nodes.Node): The node to check. - schedules (set[dtypes.ScheduleType]): A set of schedule types to match (e.g., {dtypes.ScheduleType.GPU_Device}). - - Returns: + Parameters + ---------- + state : SDFGState + The State where the node resides + node : nodes.Node + The node to check. + schedules : set[dtypes.ScheduleType] + A set of schedule types to match (e.g., {dtypes.ScheduleType.GPU_Device}). + + Returns + ---------- + bool True if the node is enclosed by a Map with a schedule type in `schedules`, False otherwise. """ current = node diff --git a/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py index f8f10fa528..9ee6b398b1 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py @@ -8,7 +8,7 @@ from dace.codegen import common from dace.codegen.targets import cpp from dace.codegen.targets.cpp import unparse_cr -from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import symbolic_to_cpp +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import symbolic_to_cpp, generate_sync_debug_call from dace.config import Config from dace.dtypes import StorageType from dace.frontend import operations @@ -17,7 +17,12 @@ from dace.transformation import helpers class CopyContext: - + """ + Encapsulates inputs required for copy operations and exposes helper + methods to derive additional information. This keeps copy strategies + lightweight by letting them focus only on the relevant logic. 
+ """ + def __init__(self, sdfg: SDFG, state: SDFGState, src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge[mm.Memlet], gpustream_assignments: Dict[nodes.Node, Union[int, str]]): @@ -45,6 +50,13 @@ def __init__(self, sdfg: SDFG, state: SDFGState, src_node: nodes.Node, dst_node: self.dst_expr = dst_expr def get_storage_type(self, node: nodes.Node): + """ + Return the storage type associated with a given SDFG node. + + Tasklets are assumed to use register storage, while AccessNodes + return the storage type from their data descriptor. Raises + NotImplementedError for unsupported node types. + """ if isinstance(node, nodes.Tasklet): storage_type = StorageType.Register @@ -60,6 +72,17 @@ def get_storage_type(self, node: nodes.Node): return storage_type def get_assigned_gpustream(self) -> str: + """ + Return the GPU stream expression assigned to both source and destination nodes. + + Ensures that both nodes have a matching stream ID, then constructs the + variable name from the configured prefix and stream ID. Raises ValueError + if assignments are missing or inconsistent. + + Example: + If the configured prefix is 'gpu_stream' and the assigned stream ID is 0, + this method returns 'gpu_stream0'. + """ src_stream = self.gpustream_assignments.get(self.src_node) dst_stream = self.gpustream_assignments.get(self.dst_node) @@ -74,17 +97,26 @@ def get_assigned_gpustream(self) -> str: ) # 2. Generate GPU stream expression - gpustream = src_stream - if gpustream == 'nullptr': - raise NotImplementedError("nullptr GPU stream not supported yet.") - gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] gpustream_expr = f"{gpustream_var_name_prefix}{gpustream}" return gpustream_expr def get_memory_location(self) -> Tuple[str, str]: + """ + Determine whether the source and destination nodes reside in device or host memory. + + Uses the storage type of each node to classify it as either 'Device' + (GPU global memory) or 'Host' (all other storage types). + Used for GPU related copies outside the kernel (e.g. to construct + cudaMemcpyHostToDevice for example). + + Returns + ------- + Tuple[str, str] + (src_location, dst_location) where each is either 'Device' or 'Host'. + """ src_storage = self.get_storage_type(self.src_node) dst_storage = self.get_storage_type(self.dst_node) src_location = 'Device' if src_storage == dtypes.StorageType.GPU_Global else 'Host' @@ -93,6 +125,23 @@ def get_memory_location(self) -> Tuple[str, str]: return src_location, dst_location def get_ctype(self) -> Any: + """ + Determine the C data type (ctype) of the source or destination node. + + The ctype is resolved from the data descriptor of the first node + (source or destination) that is an AccessNode (assumed to be the same + if both are AccessNodes). + + Returns + ------- + Any + The C type string (e.g., "float*", "int32") associated with the node. + + Raises + ------ + NotImplementedError + If neither the source nor the destination node is an AccessNode. + """ sdfg = self.sdfg src_node, dst_node = self.src_node, self.dst_node @@ -109,22 +158,79 @@ def get_ctype(self) -> Any: ) def get_accessnode_to_accessnode_copy_info(self): + """ + Compute copy shape, absolute strides, and pointer expressions for a copy + between two AccessNodes. Tries to mimic + cpp.memlet_copy_to_absolute_strides without requiring a dispatcher. + + Returns + ------- + (copy_shape, src_strides, dst_strides, src_expr, dst_expr) + + Raises + ------ + TypeError + If either endpoint is not an AccessNode. 
+ NotImplementedError + If a descriptor is not Scalar or Array. + """ + + # ---------------------------- helpers ---------------------------- + def _collapse_strides(strides, subset): + """Remove size-1 dims; keep tile strides; default to [1] if none remain.""" + n = len(subset) + collapsed = [st for st, sz in zip(strides, subset.size()) if sz != 1] + collapsed.extend(strides[n:]) # include tiles + if len(collapsed) == 0: + return [1] + return collapsed + + def _ptr_name(desc, name): + if desc.transient and desc.lifetime in ( + dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External + ): + return f'__state->__{sdfg.cfg_id}_{name}' + return name + + def _expr_for(desc, name, subset): + ptr = _ptr_name(desc, name) + + if isinstance(desc, data.Scalar): + # GPU scalar special-case + if desc.storage in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN: + parent = state.sdfg.parent_nsdfg_node + if parent is not None and name in parent.in_connectors: + return f"&{ptr}" + return ptr + # CPU (or other) scalars + return f"&{ptr}" + + if isinstance(desc, data.Array): + offset = cpp.cpp_offset_expr(desc, subset) + return f"{ptr} + {offset}" if offset != "0" else ptr + + raise NotImplementedError( + f"Expected {name} to be either data.Scalar or data.Array, but got {type(desc).__name__}." + ) + + # ---------------------------- Get copy info ---------------------------- + # Get needed information src_node, dst_node = self.src_node, self.dst_node - sdfg = self.sdfg - edge = self.edge - memlet = self.edge.data - state = self.state - copy_shape = self.copy_shape + sdfg, edge, state = self.sdfg, self.edge, self.state + memlet, copy_shape = self.edge.data, self.copy_shape + # Guard - only applicable if src and dst are AccessNodes if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): raise TypeError( f"get_accessnode_to_accessnode_copy_info requires both source and destination " f"to be AccessNode instances, but got {type(src_node).__name__} and {type(dst_node).__name__}." 
) + # Get node descriptors src_nodedesc = src_node.desc(sdfg) dst_nodedesc = dst_node.desc(sdfg) + # Resolve subsets (fallback to full range) src_subset = memlet.get_src_subset(edge, state) dst_subset = memlet.get_dst_subset(edge, state) @@ -134,10 +240,11 @@ def get_accessnode_to_accessnode_copy_info(self): if dst_subset is None: dst_subset = subsets.Range.from_array(dst_nodedesc) + # Get strides src_strides = src_subset.absolute_strides(src_nodedesc.strides) dst_strides = dst_subset.absolute_strides(dst_nodedesc.strides) - # Try to turn into degenerate/strided ND copies + # Try to convert to a degenerate/strided ND copy first result = cpp.ndcopy_to_strided_copy( copy_shape, src_nodedesc.shape, @@ -148,25 +255,13 @@ def get_accessnode_to_accessnode_copy_info(self): src_subset, dst_subset, ) + if result is not None: copy_shape, src_strides, dst_strides = result else: - # If other_subset is defined, reduce its dimensionality by - # removing the "empty" dimensions (size = 1) and filter the - # corresponding strides out - src_strides = ([stride - for stride, s in zip(src_strides, src_subset.size()) if s != 1] + src_strides[len(src_subset):] - ) # Include tiles - if not src_strides: - src_strides = [1] - dst_strides = ([stride - for stride, s in zip(dst_strides, dst_subset.size()) if s != 1] + dst_strides[len(dst_subset):] - ) # Include tiles - if not dst_strides: - dst_strides = [1] - copy_shape = [s for s in copy_shape if s != 1] - if not copy_shape: - copy_shape = [1] + src_strides = _collapse_strides(src_strides, src_subset) + dst_strides = _collapse_strides(dst_strides, dst_subset) + copy_shape = [s for s in copy_shape if s != 1] or [1] # Extend copy shape to the largest among the data dimensions, # and extend other array with the appropriate strides @@ -176,58 +271,9 @@ def get_accessnode_to_accessnode_copy_info(self): elif memlet.data == dst_node.data: copy_shape, src_strides = cpp.reshape_strides(dst_subset, dst_strides, src_strides, copy_shape) - - src_name = src_node.data - if (src_nodedesc.transient and src_nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External)): - ptr_name = f'__state->__{sdfg.cfg_id}_{src_name}' - else: - ptr_name = src_name - - if isinstance(src_nodedesc, data.Scalar) and src_nodedesc.storage in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN: - parent_nsdfg_node = state.sdfg.parent_nsdfg_node - if parent_nsdfg_node is not None and src_name in parent_nsdfg_node.in_connectors: - src_expr = f"&{ptr_name}" - else: - src_expr = ptr_name - - elif isinstance(src_nodedesc, data.Scalar): - src_expr = f"&{ptr_name}" - - elif isinstance(src_nodedesc, data.Array): - src_offset = cpp.cpp_offset_expr(src_nodedesc, src_subset) - src_expr = f"{ptr_name} + {src_offset}" if src_offset != "0" else ptr_name - - else: - raise NotImplementedError( - f"Expected {src_name} to be either data.Scalar or data.Array, " - f"but got {type(src_nodedesc).__name__}." 
- ) - - dst_name = dst_node.data - if (dst_nodedesc.transient and dst_nodedesc.lifetime in (dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External)): - ptr_name = f'__state->__{sdfg.cfg_id}_{dst_name}' - else: - ptr_name = dst_name - - if isinstance(dst_nodedesc, data.Scalar) and dst_nodedesc.storage in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN: - parent_nsdfg_node = state.sdfg.parent_nsdfg_node - if parent_nsdfg_node is not None and dst_name in parent_nsdfg_node.in_connectors: - dst_expr = f"&{ptr_name}" - else: - dst_expr = ptr_name - - elif isinstance(dst_nodedesc, data.Scalar): - dst_expr = f"&{ptr_name}" - - elif isinstance(dst_nodedesc, data.Array): - dst_offset = cpp.cpp_offset_expr(dst_nodedesc, dst_subset) - dst_expr = f"{ptr_name} + {dst_offset}" if dst_offset != "0" else ptr_name - - else: - raise NotImplementedError( - f"Expected {dst_name} to be either data.Scalar or data.Array, " - f"but got {type(dst_nodedesc).__name__}." - ) + # Build final expressions + src_expr = _expr_for(src_nodedesc, src_node.data, src_subset) + dst_expr = _expr_for(dst_nodedesc, dst_node.data, dst_subset) return copy_shape, src_strides, dst_strides, src_expr, dst_expr @@ -370,6 +416,8 @@ def _generate_1d_copy(self, copy_context: CopyContext) -> str: call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + # Potentially snychronization required if syncdebug is set to true in configurations + call = call + generate_sync_debug_call() return call def _generate_2d_copy(self, copy_context: CopyContext) -> None: diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 903d15e26d..449e312efa 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -285,6 +285,10 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre """, sdfg) + for target in self._dispatcher.used_targets: + if target.has_initializer: + callsite_stream.write( + '__result |= __dace_init_%s(__state%s);' % (target.target_name, initparamnames_comma), sdfg) for env in self.environments: init_code = _get_or_eval_sdfg_first_arg(env.init_code, sdfg) if init_code: @@ -300,11 +304,6 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre callsite_stream.write(self._initcode.getvalue(), sdfg) - for target in self._dispatcher.used_targets: - if target.has_initializer: - callsite_stream.write( - '__result |= __dace_init_%s(__state%s);' % (target.target_name, initparamnames_comma), sdfg) - callsite_stream.write( f""" if (__result) {{ @@ -325,6 +324,14 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre callsite_stream.write( '__state->report.save("%s", __HASH_%s);' % (pathlib.Path(sdfg.build_folder) / "perf", sdfg.name), sdfg) + callsite_stream.write(self._exitcode.getvalue(), sdfg) + + for sd in sdfg.all_sdfgs_recursive(): + if None in sd.exit_code: + callsite_stream.write(codeblock_to_cpp(sd.exit_code[None]), sd) + if 'frame' in sd.exit_code: + callsite_stream.write(codeblock_to_cpp(sd.exit_code['frame']), sd) + for target in self._dispatcher.used_targets: if target.has_finalizer: callsite_stream.write( @@ -334,15 +341,6 @@ def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stre __err = __err_{target.target_name}; }} ''', sdfg) - - callsite_stream.write(self._exitcode.getvalue(), sdfg) - - for sd in sdfg.all_sdfgs_recursive(): - if None in sd.exit_code: - 
callsite_stream.write(codeblock_to_cpp(sd.exit_code[None]), sd) - if 'frame' in sd.exit_code: - callsite_stream.write(codeblock_to_cpp(sd.exit_code['frame']), sd) - for env in reversed(self.environments): finalize_code = _get_or_eval_sdfg_first_arg(env.finalize_code, sdfg) if finalize_code: diff --git a/dace/config_schema.yml b/dace/config_schema.yml index 954a3507d8..033cdd4555 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -396,9 +396,9 @@ required: type: bool title: Synchronous Debugging description: > - Enables Synchronous Debugging mode, where each library call - is followed by full-device synchronization and error checking. - default: false + Enables debugging mode where each asynchronous GPU call is followed by + device-wide synchronization and error checking. + default: False libs: type: str diff --git a/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py b/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py index 1f93f5559a..0421d02049 100644 --- a/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py +++ b/dace/transformation/passes/analysis/infer_gpu_grid_and_block_size.py @@ -1,3 +1,4 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. import warnings from typing import Dict, List, Set, Tuple diff --git a/dace/transformation/passes/fix_test.py b/dace/transformation/passes/fix_test.py new file mode 100644 index 0000000000..40c0ffa5b9 --- /dev/null +++ b/dace/transformation/passes/fix_test.py @@ -0,0 +1,84 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, Set, Type, Union + +import numpy as np +import sympy as sp + +import dace +from dace import dtypes, properties, SDFG +from dace.codegen import common +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + + +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import is_within_schedule_types + +@properties.make_properties +@transformation.explicit_cf_compatible +class Fix(ppl.Pass): + """ + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Descriptors | ppl.Modifies.Nodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict[str, dace.data.Data]: + + from dace.transformation.helpers import get_parent_map + + names: Dict = dict() + for node, parent_state in sdfg.all_nodes_recursive(): + if not isinstance(node, nodes.AccessNode): + continue + + + map_parent = None + state = parent_state + current = node + while current is not None: + if isinstance(current, nodes.MapEntry): + if current.map.schedule == dace.dtypes.ScheduleType.GPU_Device: + map_parent = current + break + + parent = get_parent_map(state, current) + if parent is None: + break + current, state = parent + + if map_parent is None: + continue + + data_desc = node.desc(parent_state) + if not data_desc.storage == dtypes.StorageType.Register: + continue + + shape = data_desc.shape + size_expr = np.prod(shape) + + # Try to evaluate the inequality + cmp = sp.simplify(size_expr > 
64) + + if cmp is sp.true: # definitely larger + move_out = True + elif cmp is sp.false: # definitely safe + move_out = False + else: + # undecidable case (symbolic expression) + move_out = True # or warn, depending on policy + + if move_out: + data_desc.storage = dtypes.StorageType.GPU_Global + data_desc.transient = True + names[node.data] = map_parent + + return names diff --git a/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py b/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py index 8ebe591699..7edeea4a6a 100644 --- a/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py +++ b/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py @@ -268,31 +268,3 @@ def example(A: dace.uint32[128], B: dace.uint32[128], state.remove_node(sink_stream) state.remove_node(passthrough_gpu_node) - - def _remove_passthrough_gpu_stream_access_node(self, sdfg: SDFG) -> None: - """ - Unused: This will need adaption at the codegen level. - It is mainly unused because I don't think it makes the final SDFG - visually nicer, which is the whole purpose of this Pass. - """ - - for node, state in sdfg.all_nodes_recursive(): - # remove only GPU Stream AccessNodes who have exactly one incoming and outgoing edge - if not (isinstance(node, nodes.AccessNode) and node.desc(state).dtype == dtypes.gpuStream_t): - continue - - if not (state.in_degree(node) == 1 and state.out_degree(node) == 1): - continue - - in_edge = state.in_edges(node)[0] - out_edge = state.out_edges(node)[0] - - # Unknown case: in and out edge carry different data. Skip - if in_edge.data.data != out_edge.data.data: - continue - - # Remove the passthrough GPU stream AccessNode and replace it by a single edge - state.add_edge(in_edge.src, in_edge.src_conn, out_edge.dst, out_edge.dst_conn, in_edge.data) - state.remove_edge(in_edge) - state.remove_edge(out_edge) - state.remove_node(node) diff --git a/dace/transformation/passes/gpustream/gpustream_scheduling.py b/dace/transformation/passes/gpustream/gpustream_scheduling.py index aeb9e3b9b7..7eac383f4e 100644 --- a/dace/transformation/passes/gpustream/gpustream_scheduling.py +++ b/dace/transformation/passes/gpustream/gpustream_scheduling.py @@ -8,6 +8,9 @@ from dace.sdfg.graph import Graph, NodeT from dace.transformation import pass_pipeline as ppl, transformation +# Placeholder for the GPU stream variable used in tasklet code +STREAM_PLACEHOLDER = "__dace_current_stream" + @properties.make_properties @transformation.explicit_cf_compatible class NaiveGPUStreamScheduler(ppl.Pass): @@ -109,6 +112,10 @@ def _assign_gpu_streams_in_state(self, sdfg: SDFG, in_nested_sdfg: bool, state: components = self._get_weakly_connected_nodes(state) for component in components: + + if not self._requires_gpu_stream(state, component): + continue + nodes_assigned_before = len(stream_assignments) for node in component: @@ -193,3 +200,52 @@ def _next_stream(self, gpu_stream: int) -> int: return 0 else: return (gpu_stream + 1) % self._max_concurrent_streams + + def _requires_gpu_stream(self, state: SDFGState, component: Set[NodeT]) -> bool: + """ + Check whether a connected component in an SDFG state should be assigned + a GPU stream. + + A component requires a GPU stream if it contains at least one of: + - An AccessNode with GPU global memory storage, + - A MapEntry scheduled on a GPU device, + - A Tasklet whose code includes the stream placeholder. + + Parameters + ---------- + state : SDFGState + The state containing the component. 
+ component : Set[NodeT] + The set of nodes that form the connected component. + + Returns + ------- + bool + True if the component requires a GPU stream, False otherwise. + """ + def gpu_relevant(node, parent) -> bool: + if (isinstance(node, nodes.AccessNode) + and node.desc(parent).storage == dace.dtypes.StorageType.GPU_Global): + return True + + elif (isinstance(node, nodes.MapEntry) + and node.map.schedule == dace.dtypes.ScheduleType.GPU_Device): + return True + + elif (isinstance(node, nodes.Tasklet) + and STREAM_PLACEHOLDER in node.code.as_string): + return True + + return False + + + for node in component: + if isinstance(node, nodes.NestedSDFG): + if any(gpu_relevant(node, parent) for node, parent in node.sdfg.all_nodes_recursive()): + return True + + else: + if gpu_relevant(node, state): + return True + + return False \ No newline at end of file diff --git a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py index 09063d2df1..3f6ec722b0 100644 --- a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py +++ b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py @@ -36,9 +36,8 @@ def should_reapply(self, modified: ppl.Modifies) -> bool: def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): """ - Inserts GPU stream synchronization tasklets at required locations: - - At the end of a state, for streams used in the state. - - After specific nodes, if synchronizeation is required before continuing. + Inserts GPU stream synchronization tasklets at required locations + at the end of a state, for GPU streams used in the state. """ stream_assignments: Dict[nodes.Node, int] = pipeline_results['NaiveGPUStreamScheduler'] @@ -47,20 +46,14 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): # Synchronize all used streams at the end of a state self._insert_gpu_stream_sync_at_state_end(sdfg, sync_state, stream_assignments) - - # Synchronize after specific nodes if required (e.g. After GPU->Non-GPU copies might be a case) - self._insert_gpu_stream_sync_after_node(sdfg, sync_node, stream_assignments) - return {} def _identify_sync_locations(self, sdfg: SDFG, stream_assignments: Dict[nodes.Node, int] ) -> Tuple[Dict[SDFGState, Set[int]], Dict[nodes.Node, SDFGState]]: """ Heuristically identifies GPU stream synchronization points in an SDFG. - - Synchronization is inserted in the following cases: - - **End of a state**: When copying to or from GPU AccessNodes. - - **After a specific node**: When data leaves GPU memory and is used afterwards. + Synchronization is inserted at the end of a state when it is required. + Parameters ---------- @@ -119,7 +112,6 @@ def edge_within_kernel(state, src, dst): elif (is_gpu_accessnode(src, state) and is_nongpu_accessnode(dst, state) and not is_sink_node(dst, state) and not edge_within_kernel(state, src, dst)): sync_state[state].add(stream_assignments[dst]) - sync_node[dst] = state elif (is_nongpu_accessnode(src, state) and is_gpu_accessnode(dst, state) and not edge_within_kernel(state, src, dst)): @@ -171,7 +163,6 @@ def _insert_gpu_stream_sync_at_state_end(self, sdfg: SDFG, sync_state: Dict[SDFG Mapping of nodes to their assigned GPU stream IDs. 
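For illustration, a sketch of the synchronization tasklet this pass adds for stream 0 at the end of a state (the CUDA backend and the `gpu_stream` variable prefix from `compiler.cuda.gpu_stream_name` are assumptions here):

    sync_call = "DACE_GPU_CHECK(cudaStreamSynchronize(gpu_stream0));\n"
    tasklet = state.add_tasklet(name="gpu_stream_0_synchronization",
                                inputs=set(), outputs=set(),
                                code=sync_call, language=dtypes.Language.CPP)
    # The tasklet is then wired to the GPU stream AccessNodes (in/out connectors of
    # dtype gpuStream_t), so "gpu_stream0" resolves to a real stream handle at codegen.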
""" # Prepare GPU stream info and backend - num_assigned_streams = max(stream_assignments.values(), default=0) + 1 stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') backend: str = common.get_gpu_backend() @@ -229,56 +220,4 @@ def _insert_gpu_stream_sync_at_state_end(self, sdfg: SDFG, sync_state: Dict[SDFG tasklet.add_out_connector(conn, dtypes.gpuStream_t, force=True) state.add_edge(combined_stream_node, None, tasklet, conn, dace.Memlet(accessed_gpu_stream)) state.add_edge(tasklet, conn, output_stream_node, None, dace.Memlet(accessed_gpu_stream)) - - def _insert_gpu_stream_sync_after_node(self, sdfg: SDFG, sync_node: Dict[nodes.Node, SDFGState], - stream_assignments: Dict[nodes.Node, int]) -> None: - """ - Insert a GPU stream synchronization tasklet immediately after specified nodes. - - Parameters - ---------- - sdfg : SDFG - The top level SDFG. - sync_node : Dict[nodes.Node, SDFGState] - Mapping of nodes to their parent state. After after the node a GPU stream synchronization should occur. - stream_assignments : Dict[nodes.Node, int] - Mapping of nodes to their assigned GPU stream IDs. - """ - # Prepare GPU stream info and backend - num_assigned_streams = max(stream_assignments.values(), default=0) + 1 - stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') - backend: str = common.get_gpu_backend() - - for node, state in sync_node.items(): - - #----------------- Generate GPU stream synchronization Tasklet ----------------- - - # Get assigned GPU stream - stream = stream_assignments.get(node, "nullptr") - if stream == "nullptr": - raise NotImplementedError("Using the default 'nullptr' gpu stream is not supported yet.") - - # Create the tasklet - stream_var_name = f"{stream_var_name_prefix}{stream}" - sync_call = f"DACE_GPU_CHECK({backend}StreamSynchronize({stream_var_name}));\n" - tasklet = state.add_tasklet( name=f"gpu_stream_{stream}_synchronization", - inputs=set(), outputs=set(), - code=sync_call, language=dtypes.Language.CPP) - - - #----------------- Place tasklet between node and successors, link GPU streams ---------------- - - # 1. Put the tasklet between the node and its successors - for succ in state.successors(node): - state.add_edge(tasklet, None, succ, None, dace.Memlet()) - state.add_edge(node, None, tasklet, None, dace.Memlet()) - - # 3. 
Connect tasklet to GPU stream AccessNodes - in_stream = state.add_access(stream_array_name) - out_stream = state.add_access(stream_array_name) - accessed_stream = f"{stream_array_name}[{stream}]" - state.add_edge(in_stream, None, tasklet, stream_var_name, dace.Memlet(accessed_stream)) - state.add_edge(tasklet, stream_var_name, out_stream, None, dace.Memlet(accessed_stream)) - tasklet.add_in_connector(stream_var_name, dtypes.gpuStream_t, force=True) - tasklet.add_out_connector(stream_var_name, dtypes.gpuStream_t, force=True) - \ No newline at end of file + \ No newline at end of file diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py index eb5d9e015d..f88cc449dc 100644 --- a/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py @@ -37,9 +37,6 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): # Retrieve GPU stream assignments for nodes stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] - # Determine the number of assigned GPU streams, needed for creating the GPU stream Array - num_assigned_streams = max(stream_assignments.values(), default=0) + 1 - # Link kernels to their assigned GPU streams for sub_sdfg in sdfg.all_sdfgs_recursive(): diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py index a8a39a549a..a8d6e143fe 100644 --- a/dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py @@ -1,21 +1,14 @@ # Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. from typing import Any, Dict, Set, Type, Union -import copy - import dace -from dace import dtypes, properties, SDFG, SDFGState -from dace.codegen import common +from dace import SDFG, dtypes, properties +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import is_within_schedule_types from dace.config import Config +from dace.sdfg import is_devicelevel_gpu +from dace.sdfg.nodes import AccessNode, MapEntry, MapExit, Node, Tasklet from dace.transformation import pass_pipeline as ppl, transformation from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler -from dace.sdfg.nodes import Node, AccessNode, MapEntry, MapExit, Tasklet -from dace.sdfg.state import ControlFlowBlock, ControlFlowRegion, SDFGState - -from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import is_within_schedule_types - - -from dace.sdfg import is_devicelevel_gpu STREAM_PLACEHOLDER = "__dace_current_stream" @@ -23,7 +16,12 @@ @transformation.explicit_cf_compatible class InsertGPUStreamsToSDFGs(ppl.Pass): """ - TODO + Inserts a GPU stream array into the top-level SDFG and propagates it to all + nested SDFGs that require it, including intermediate SDFGs along the hierarchy. + + This pass guarantees that every relevant SDFG has the array defined, avoiding + duplication and allowing subsequent passes in the GPU stream pipeline to rely + on its presence without redefining it. 
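+
+    Example
+    -------
+    For two assigned streams and a kernel nested one level deep (the array name is
+    illustrative and comes from ``compiler.cuda.gpu_stream_name``)::
+
+        top-level SDFG:  gpu_streams (transient, gpuStream_t[2])
+        parent state:    AccessNode(gpu_streams) --gpu_streams[0:2]--> NestedSDFG[gpu_streams]
+        nested SDFG:     gpu_streams (non-transient, gpuStream_t[2])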
""" def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: @@ -36,41 +34,57 @@ def should_reapply(self, modified: ppl.Modifies) -> bool: return False def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Ensure that a GPU stream array is available in all SDFGs that require it. + + The pass creates the array once at the top-level SDFG and propagates it + down the hierarchy by inserting matching arrays in child SDFGs and wiring + them through nested SDFG connectors. This way, all SDFGs share a consistent + reference to the same GPU stream array. + """ + # Extract stream array name and number of streams to allocate stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] stream_assignments: Dict[Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] num_assigned_streams = max(stream_assignments.values(), default=0) + 1 - # Add the GPU stream array as a transient to the top level SDFG + # Add the GPU stream array at the top level sdfg.add_transient(stream_array_name, (num_assigned_streams,), dtype=dace.dtypes.gpuStream_t, - storage=dace.dtypes.StorageType.CPU_Heap, lifetime=dace.dtypes.AllocationLifetime.Persistent) + storage=dace.dtypes.StorageType.Register) - gpu_stream_desc = sdfg.arrays[stream_array_name] + + # Ensure GPU stream array is defined where required for child_sdfg in self.find_child_sdfgs_requiring_gpu_stream(sdfg): - - # If GPU stream already defined (because a more inner child sdfg defined it all the way up) skip + + # Skip if this child already has the array (inserted higher up in the hierarchy) if stream_array_name in child_sdfg.arrays: continue + # Add the array to the child SDFG inner_sdfg = child_sdfg - outer_sdfg = inner_sdfg.parent_sdfg inner_sdfg.add_array(stream_array_name, (num_assigned_streams,), dtype=dace.dtypes.gpuStream_t, - storage=dace.dtypes.StorageType.CPU_Heap, lifetime=dace.dtypes.AllocationLifetime.Persistent) - + storage=dace.dtypes.StorageType.Register) + + # Walk up the hierarchy until the array is found, inserting it into each parent + outer_sdfg = inner_sdfg.parent_sdfg while stream_array_name not in outer_sdfg.arrays: + # Insert array in parent SDFG + outer_sdfg.add_array(stream_array_name, (num_assigned_streams,), dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register) + + # Connect parent SDFG array to nested SDFG node inner_nsdfg_node = inner_sdfg.parent_nsdfg_node inner_parent_state = inner_sdfg.parent inner_nsdfg_node.add_in_connector(stream_array_name, dtypes.gpuStream_t) - - outer_sdfg.add_array(stream_array_name, (num_assigned_streams,), dtype=dace.dtypes.gpuStream_t, - storage=dace.dtypes.StorageType.CPU_Heap, lifetime=dace.dtypes.AllocationLifetime.Persistent) inp_gpu_stream: AccessNode = inner_parent_state.add_access(stream_array_name) inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, dace.Memlet(stream_array_name)) + # Continue climbing up the hierarchy inner_sdfg = outer_sdfg outer_sdfg = outer_sdfg.parent_sdfg + # Ensure final connection from the first parent that had the array down to this SDFG inner_nsdfg_node = inner_sdfg.parent_nsdfg_node inner_parent_state = inner_sdfg.parent inner_nsdfg_node.add_in_connector(stream_array_name, dtypes.gpuStream_t) @@ -78,7 +92,7 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, dace.Memlet(f"{stream_array_name}[0:{num_assigned_streams}]")) outer_sdfg = 
inner_sdfg.parent_sdfg - + return {} def find_child_sdfgs_requiring_gpu_stream(self, sdfg) -> Set[SDFG]: diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py index f0c226a817..c7ef71abab 100644 --- a/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py @@ -10,6 +10,8 @@ from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs from dace.transformation.passes.gpustream.insert_gpu_streams_to_kernels import InsertGPUStreamsToKernels +# Placeholder for the GPU stream variable used in tasklet code +STREAM_PLACEHOLDER = "__dace_current_stream" @properties.make_properties @transformation.explicit_cf_compatible @@ -38,27 +40,16 @@ def should_reapply(self, modified: ppl.Modifies) -> bool: return False def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): - - # Placeholder for the GPU stream variable used in tasklet code - STREAM_PLACEHOLDER = "__dace_current_stream" - # Retrieve the GPU stream's array name stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] # Retrieve GPU stream assignments for nodes stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] - # Determine the number of assigned GPU streams, needed for creating the GPU stream Array - num_assigned_streams = max(stream_assignments.values(), default=0) + 1 - # Find all tasklets which use the GPU stream variable (STREAM_PLACEHOLDER) in the code # and provide them the needed GPU stream explicitly for sub_sdfg in sdfg.all_sdfgs_recursive(): - # Track whether the GPU stream array is in tge - # sub_sdfg's data descriptor store - gpustream_array_added: bool = stream_array_name in sub_sdfg.arrays - for state in sub_sdfg.states(): for node in state.nodes(): diff --git a/dace/transformation/passes/insert_gpu_copy_tasklets.py b/dace/transformation/passes/insert_gpu_copy_tasklets.py index 1bf5ceffac..b38d133814 100644 --- a/dace/transformation/passes/insert_gpu_copy_tasklets.py +++ b/dace/transformation/passes/insert_gpu_copy_tasklets.py @@ -68,7 +68,6 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: """ # Prepare GPU stream gpustream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] - num_assigned_streams = max(gpustream_assignments.values(), default=0) + 1 gpustream_array_name, gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') # Initialize the strategy for copies that occur outside of kernel execution @@ -110,7 +109,7 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: dst_node_pred, dst_node_conn, _, dst_conn, memlet = edge state.add_edge(dst_node_pred, dst_node_conn, tasklet, None, copy.deepcopy(memlet)) state.add_edge(tasklet, None, dst_node, dst_conn, copy.deepcopy(memlet)) - state.remove_edge(edge) + state.remove_edge(edge) return {} @@ -162,20 +161,4 @@ def find_all_data_copies(self, sdfg: SDFG) -> List[Tuple[SDFG, SDFGState, nodes. # Add copy to the worklist copy_worklist.append((sub_sdfg, state, src_node, dst_node, first_edge)) - """ - # NOTE: This is closer to what the cpu.py file does. 
Some copies could be missed - # in case someone wants to extend this pass with other copy tasklets- in this case, - # I would suggest to take a closer look into cpu.py how copies are dispatched. - - if (isinstance(dst_node, nodes.AccessNode) and scope_dict[src_node] != scope_dict[dst_node] - and scope_contains_scope(scope_dict, src_node, dst_node)): - copy_worklist.append((sub_sdfg, state, src_node, dst_node, last_edge)) - - elif (isinstance(src_node, nodes.AccessNode) and not isinstance(dst_node, nodes.Tasklet)): - copy_worklist.append((sub_sdfg, state, src_node, dst_node, first_edge)) - - elif (not isinstance(src_node, nodes.CodeNode) and isinstance(dst_node, nodes.Tasklet)): - copy_worklist.append((sub_sdfg, state, src_node, dst_node, last_edge)) - """ - return copy_worklist diff --git a/dace/transformation/passes/move_array_out_of_kernel.py b/dace/transformation/passes/move_array_out_of_kernel.py index a53677a2f2..0b91d671bf 100644 --- a/dace/transformation/passes/move_array_out_of_kernel.py +++ b/dace/transformation/passes/move_array_out_of_kernel.py @@ -1,3 +1,4 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. from typing import Dict, FrozenSet, Set, Tuple, List, Optional import copy import functools @@ -141,8 +142,8 @@ def move_array_out_of_kernel_flat(self, kernel_entry: nodes.MapEntry, array_name next_map_exit = parent_state.exit_node(next_map_entry) if in_connector not in next_map_exit.in_connectors: next_map_state = self._node_to_state_cache[next_map_exit] - next_map_exit.add_in_connector(in_connector) - next_map_exit.add_out_connector(out_connector) + next_map_exit.add_in_connector(in_connector, dtypes.pointer(array_desc.dtype)) + next_map_exit.add_out_connector(out_connector, dtypes.pointer(array_desc.dtype)) next_entries, _ = self.get_maps_between(kernel_entry, previous_node) memlet_subset = Range(self.get_memlet_subset(next_entries, previous_node) + old_subset) @@ -292,10 +293,10 @@ def lift_array_through_nested_sdfgs(self, array_name:str, kernel_entry: nodes.Ma # 1.1 Determine source connector name and register it based on src type if isinstance(src, nodes.NestedSDFG): src_conn = array_name - src.add_out_connector(src_conn) + src.add_out_connector(src_conn, dtypes.pointer(new_desc.dtype)) elif isinstance(src, nodes.MapExit): src_conn = f"OUT_{array_name}" - src.add_out_connector(src_conn) + src.add_out_connector(src_conn, dtypes.pointer(new_desc.dtype)) else: raise NotImplementedError(f"Unsupported source node type '{type(src).__name__}' — only NestedSDFG or MapExit are expected.") @@ -304,7 +305,7 @@ def lift_array_through_nested_sdfgs(self, array_name:str, kernel_entry: nodes.Ma dst_conn = None # AccessNodes use implicit connectors elif isinstance(dst, nodes.MapExit): # Assuming dst is the entry for parent scope dst_conn = f"IN_{array_name}" - dst.add_in_connector(dst_conn) + dst.add_in_connector(dst_conn, dtypes.pointer(new_desc.dtype)) else: raise NotImplementedError(f"Unsupported destination node type '{type(dst).__name__}' — expected AccessNode or MapEntry.") @@ -323,10 +324,10 @@ def lift_array_through_nested_sdfgs(self, array_name:str, kernel_entry: nodes.Ma if isinstance(src, nodes.NestedSDFG): src_conn = array_name - src.add_out_connector(src_conn) + src.add_out_connector(src_conn, dtypes.pointer(new_desc.dtype)) elif isinstance(src, nodes.MapExit): src_conn = f"OUT_{array_name}" - src.add_out_connector(src_conn) + src.add_out_connector(src_conn, dtypes.pointer(new_desc.dtype)) else: raise NotImplementedError(f"Unsupported source 
node type '{type(src).__name__}' — only NestedSDFG or MapExit are expected.") @@ -523,7 +524,7 @@ def replace_array_name(self, sdfgs: FrozenSet[SDFG], old_name: str, new_name: st if edge.src_conn == old_out_conn: edge.src_conn = new_out_conn src.remove_out_connector(old_out_conn) - src.add_out_connector(new_out_conn) + src.add_out_connector(new_out_conn, dtypes.pointer(array_desc.dtype)) # Update in connectors dst = edge.dst @@ -532,7 +533,7 @@ def replace_array_name(self, sdfgs: FrozenSet[SDFG], old_name: str, new_name: st if edge.dst_conn == old_in_conn: edge.dst_conn = new_in_conn dst.remove_in_connector(old_in_conn) - dst.add_in_connector(new_in_conn) + dst.add_in_connector(new_in_conn, dtypes.pointer(array_desc.dtype)) def update_symbols(self, map_entry_chain: List[nodes.MapEntry], top_sdfg: SDFG) -> None: """ diff --git a/tests/codegen/cuda_mempool_test.py b/tests/codegen/cuda_mempool_test.py index eccd97ee61..c70af71598 100644 --- a/tests/codegen/cuda_mempool_test.py +++ b/tests/codegen/cuda_mempool_test.py @@ -144,7 +144,7 @@ def tester(A: CudaArray, B: CudaArray): code = sdfg.generate_code()[0].clean_code assert code.count('cudaMallocAsync') == 1 - assert code.count('cudaFreeAsync(pooled, __state->gpu_context->streams[0]') == 1 + assert code.count('cudaFreeAsync(pooled, __state->gpu_context->streams[0]') == 1 or code.count('cudaFreeAsync(pooled, gpu_stream0') == 1 # Test code import cupy as cp @@ -198,7 +198,7 @@ def test_memory_pool_if_states(cnd): sdfg.validate() code = sdfg.generate_code()[0].clean_code assert code.count('cudaMallocAsync') == 1 - assert code.count(f'cudaFreeAsync({tmp}, __state->gpu_context->streams[0]') == 1 + assert code.count(f'cudaFreeAsync({tmp}, __state->gpu_context->streams[0]') == 1 or code.count(f'cudaFreeAsync({tmp}, gpu_stream0') == 1 # Test code import cupy as cp diff --git a/tests/npbench/misc/scattering_self_test.py b/tests/npbench/misc/scattering_self_test.py index 5b9a5ade62..2bb915afe9 100644 --- a/tests/npbench/misc/scattering_self_test.py +++ b/tests/npbench/misc/scattering_self_test.py @@ -115,7 +115,6 @@ def run_scattering_self_test(device_type: dace.dtypes.DeviceType): def test_cpu(): run_scattering_self_test(dace.dtypes.DeviceType.CPU) - @pytest.mark.gpu def test_gpu(): run_scattering_self_test(dace.dtypes.DeviceType.GPU) diff --git a/tests/parse_state_struct_test.py b/tests/parse_state_struct_test.py index f8553249ea..676012baad 100644 --- a/tests/parse_state_struct_test.py +++ b/tests/parse_state_struct_test.py @@ -10,7 +10,7 @@ import dace import dace.library -from dace import dtypes +from dace import dtypes, Config from dace.codegen import codeobject, targets, compiler, compiled_sdfg, common @@ -31,9 +31,13 @@ def _cuda_helper(): }} }} """ - program = codeobject.CodeObject("cuda_helper", helper_code, "cpp", targets.cpu.CPUCodeGen, "CudaHelper") - dummy_cuda_target = codeobject.CodeObject("dummy", "", "cu", targets.cuda.CUDACodeGen, "CudaDummy") + if Config.get('compiler', 'cuda', 'implementation') == 'experimental': + program = codeobject.CodeObject("cuda_helper", helper_code, "cpp",targets.cpu.CPUCodeGen, "CudaHelper") + dummy_cuda_target = codeobject.CodeObject("dummy", "", "cu", targets.experimental_cuda.ExperimentalCUDACodeGen, "CudaDummy") + else: + program = codeobject.CodeObject("cuda_helper", helper_code, "cpp", targets.cpu.CPUCodeGen, "CudaHelper") + dummy_cuda_target = codeobject.CodeObject("dummy", "", "cu", targets.cuda.CUDACodeGen, "CudaDummy") build_folder = dace.Config.get('default_build_folder') BUILD_PATH = 
os.path.join(build_folder, "cuda_helper") From 01e462a29fd802b2eca2c1ce5ae96ea30a444497 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Wed, 10 Sep 2025 15:18:22 +0200 Subject: [PATCH 76/94] Fixing GPU stream management and clean up --- dace/codegen/instrumentation/gpu_events.py | 9 +- dace/codegen/targets/cpu.py | 10 +- dace/codegen/targets/experimental_cuda.py | 192 ++-- .../copy_strategies.py | 927 ++++++++++-------- .../gpu_stream_manager.py | 33 +- .../experimental_cuda_helpers/gpu_utils.py | 32 +- .../new_copy_strategies.py | 763 -------------- .../scope_strategies.py | 106 +- dace/config_schema.yml | 10 +- dace/dtypes.py | 1 + dace/registry.py | 6 +- dace/sdfg/state.py | 6 +- .../dataflow/add_threadblock_map.py | 2 +- .../interstate/gpu_transform_sdfg.py | 11 +- dace/transformation/passes/fix_test.py | 15 +- .../gpu_stream_topology_simplification.py | 98 +- .../passes/gpustream/gpustream_scheduling.py | 46 +- .../insert_gpu_stream_sync_tasklets.py | 63 +- .../insert_gpu_streams_to_kernels.py | 13 +- .../gpustream/insert_gpu_streams_to_sdfgs.py | 53 +- .../insert_gpu_streams_to_tasklets.py | 7 +- .../passes/insert_gpu_copy_tasklets.py | 28 +- .../passes/move_array_out_of_kernel.py | 201 ++-- .../passes/shared_memory_synchronization.py | 651 +++++------- .../passes/shared_memory_synchronization2.py | 311 ------ tests/codegen/cuda_mempool_test.py | 6 +- tests/codegen/gpu_memcpy_test.py | 6 +- tests/npbench/misc/scattering_self_test.py | 1 + tests/parse_state_struct_test.py | 5 +- 29 files changed, 1204 insertions(+), 2408 deletions(-) delete mode 100644 dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py delete mode 100644 dace/transformation/passes/shared_memory_synchronization2.py diff --git a/dace/codegen/instrumentation/gpu_events.py b/dace/codegen/instrumentation/gpu_events.py index bf92ef16a7..99a91e3b3f 100644 --- a/dace/codegen/instrumentation/gpu_events.py +++ b/dace/codegen/instrumentation/gpu_events.py @@ -174,10 +174,10 @@ def _get_gpu_stream(self, state: SDFGState, node: nodes.Node) -> int: """ Return the GPU stream ID assigned to a given node. - - In the CUDACodeGen, the stream ID is stored as the private attribute + - In the CUDACodeGen, the stream ID is stored as the private attribute ``_cuda_stream`` on the node. - - In the ExperimentalCUDACodeGen, streams are explicitly assigned to tasklets - and GPU_Device-scheduled maps (kernels) via a GPU stream AccessNode. For + - In the ExperimentalCUDACodeGen, streams are explicitly assigned to tasklets + and GPU_Device-scheduled maps (kernels) via a GPU stream AccessNode. For other node types, no reliable stream assignment is available. 
Parameters @@ -194,7 +194,7 @@ def _get_gpu_stream(self, state: SDFGState, node: nodes.Node) -> int: """ if config.Config.get('compiler', 'cuda', 'implementation') == 'legacy': stream = getattr(node, '_cuda_stream', -1) - + else: stream = -1 for in_edge in state.in_edges(node): @@ -208,4 +208,3 @@ def _get_gpu_stream(self, state: SDFGState, node: nodes.Node) -> int: stream = int(out_edge.data.subset) return stream - \ No newline at end of file diff --git a/dace/codegen/targets/cpu.py b/dace/codegen/targets/cpu.py index 30d646a2e5..1905ba5c6d 100644 --- a/dace/codegen/targets/cpu.py +++ b/dace/codegen/targets/cpu.py @@ -515,11 +515,11 @@ def allocate_array(self, elif (nodedesc.storage == dtypes.StorageType.Register): if nodedesc.dtype == dtypes.gpuStream_t: - ctype = dtypes.gpuStream_t.ctype + ctype = dtypes.gpuStream_t.ctype allocation_stream.write(f"{ctype}* {name} = __state->gpu_context->streams;") - define_var(name, DefinedType.Pointer, ctype ) + define_var(name, DefinedType.Pointer, ctype) return - + ctypedef = dtypes.pointer(nodedesc.dtype).ctype if nodedesc.start_offset != 0: raise NotImplementedError('Start offset unsupported for registers') @@ -593,7 +593,7 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap return elif (nodedesc.storage == dtypes.StorageType.CPU_Heap or (nodedesc.storage == dtypes.StorageType.Register and symbolic.issymbolic(arrsize, sdfg.constants))): - + if nodedesc.dtype == dtypes.gpuStream_t: callsite_stream.write(f"{alloc_name} = nullptr;") return @@ -1018,7 +1018,7 @@ def process_out_memlets(self, # Special case: GPU Streams do not represent data flow - they assing GPU Streams to kernels/tasks # Thus, nothing needs to be written and out memlets of this kind should be ignored. continue - + # Target is neither a data nor a tasklet node if isinstance(node, nodes.AccessNode) and (not isinstance(dst_node, nodes.AccessNode) and not isinstance(dst_node, nodes.CodeNode)): diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 231ada370c..20898b6cc0 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -1,13 +1,7 @@ # Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
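The stream handling above (the instrumentation lookup, the gpuStream_t register alias in cpu.py, and the GPU-stream passes earlier in this series) all derive names from the single comma-separated `compiler.cuda.gpu_stream_name` setting: the first element is the stream-array name, the second the per-stream variable prefix. The following is only a minimal sketch of that naming convention; the concrete default value and the helper names are illustrative assumptions, but the resulting `gpu_stream0` form matches what the updated mempool tests expect.

# Sketch only: mirrors how the passes split Config.get('compiler', 'cuda', 'gpu_stream_name').
# The default value "gpu_streams,gpu_stream" is an assumption for illustration.
def stream_names(config_value: str = "gpu_streams,gpu_stream"):
    array_name, var_prefix = config_value.split(',')
    return array_name, var_prefix

def stream_expr(stream_id: int, config_value: str = "gpu_streams,gpu_stream") -> str:
    # e.g. stream id 0 -> "gpu_stream0", as asserted in cuda_mempool_test.py
    _, var_prefix = stream_names(config_value)
    return f"{var_prefix}{stream_id}"

assert stream_expr(0) == "gpu_stream0"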
-# Standard library imports -import warnings -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union - -# Third-party imports +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union import networkx as nx -import sympy -# DaCe core imports import dace from dace import data as dt, Memlet from dace import dtypes, registry, symbolic @@ -17,7 +11,6 @@ from dace.sdfg.graph import MultiConnectorEdge from dace.sdfg.state import ControlFlowRegion, StateSubgraphView -# DaCe codegen imports from dace.codegen import common from dace.codegen.codeobject import CodeObject from dace.codegen.dispatcher import DefinedType, TargetDispatcher @@ -37,15 +30,13 @@ from dace.transformation.passes.insert_gpu_copy_tasklets import InsertGPUCopyTasklets from dace.transformation.passes.gpustream.gpu_stream_topology_simplification import GPUStreamTopologySimplification from dace.transformation.passes.gpustream.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets -#from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync -from dace.transformation.passes.shared_memory_synchronization2 import DefaultSharedMemorySync +from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync from dace.transformation.dataflow.add_threadblock_map import AddThreadBlockMap from dace.transformation.passes.analysis.infer_gpu_grid_and_block_size import InferGPUGridAndBlockSize # Experimental CUDA helper imports from dace.codegen.targets.experimental_cuda_helpers.gpu_stream_manager import GPUStreamManager -from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import symbolic_to_cpp, generate_sync_debug_call, get_defined_type - +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import generate_sync_debug_call, get_defined_type from dace.codegen.targets import cpp @@ -54,8 +45,6 @@ from dace.codegen.targets.framecode import DaCeCodeGenerator from dace.codegen.targets.cpu import CPUCodeGen -# add symbolic_to_cpp ! 
- @registry.autoregister_params(name='experimental_cuda') class ExperimentalCUDACodeGen(TargetCodeGenerator): @@ -186,8 +175,6 @@ def preprocess(self, sdfg: SDFG) -> None: CopyToMap.apply_to(nsdfg, save=False, annotate=False, a=e.src, b=e.dst) except ValueError: # If transformation doesn't match, continue normally continue - - """ from dace.transformation.passes.fix_test import Fix from dace.transformation.passes.move_array_out_of_kernel import MoveArrayOutOfKernel @@ -199,8 +186,6 @@ def preprocess(self, sdfg: SDFG) -> None: sdfg.save("after.sdfg") """ - - #----------------- Add ThreadBlock Maps & Infer Kernel Grid & Block Sizes -------------------- # new_nodes - old_nodes gives us all Kernel Entry nodes that were created during the insertion @@ -227,19 +212,17 @@ def preprocess(self, sdfg: SDFG) -> None: # Prepare the Pipeline to make GPU streams explicit: Add and connect SDFG nodes # with GPU stream AccessNodes where used - stream_pipeline = Pipeline( - [ - NaiveGPUStreamScheduler(), - InsertGPUStreamsToSDFGs(), - InsertGPUStreamsToKernels(), - InsertGPUStreamsToTasklets(), - InsertGPUStreamSyncTasklets(), - InsertGPUCopyTasklets(), - GPUStreamTopologySimplification(), - ] - ) - - # TODO: Missed copies due to InsertGPUCopyTasklet -> maybe check wheter copies were + stream_pipeline = Pipeline([ + NaiveGPUStreamScheduler(), + InsertGPUStreamsToSDFGs(), + InsertGPUStreamsToKernels(), + InsertGPUStreamsToTasklets(), + InsertGPUStreamSyncTasklets(), + InsertGPUCopyTasklets(), + GPUStreamTopologySimplification(), + ]) + + # TODO: Missed copies due to InsertGPUCopyTasklet -> maybe check wheter copies were # handled above than just adding this codegen to used_targets by default self._dispatcher._used_targets.add(self) gpustream_assignments = stream_pipeline.apply_pass(sdfg, {})['NaiveGPUStreamScheduler'] @@ -387,7 +370,6 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub # Enter kernel context and recursively generate device code - state = cfg.state(state_id) scope_entry = dfg_scope.source_nodes()[0] scope_exit = dfg_scope.sink_nodes()[0] @@ -398,22 +380,17 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub instr = self._dispatcher.instrumentation[scope_entry.map.instrument] if instr is not None: instr.on_scope_entry(sdfg, cfg, state, scope_entry, callsite_stream, scope_entry_stream, - self._globalcode) + self._globalcode) outer_stream = CodeIOStream() instr.on_scope_exit(sdfg, cfg, state, scope_exit, outer_stream, scope_exit_stream, self._globalcode) - # New scope for defined variables (kernel functions scope) self._dispatcher.defined_vars.enter_scope(scope_entry) - # Store kernel metadata (name, dimensions, arguments, etc.) in a KernelSpec object + # Store kernel metadata (name, dimensions, arguments, etc.) 
in a KernelSpec object # and save it as an attribute - kernel_spec = KernelSpec(cudaCodeGen=self, - sdfg=sdfg, - cfg=cfg, - dfg_scope=dfg_scope, - state_id=state_id) - + kernel_spec = KernelSpec(cudaCodeGen=self, sdfg=sdfg, cfg=cfg, dfg_scope=dfg_scope, state_id=state_id) + self._current_kernel_spec = kernel_spec # (Re)define variables for the new scope @@ -435,7 +412,6 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub raise ValueError("Invalid kernel configuration: This strategy is only applicable if the " "outermost GPU schedule is of type GPU_Device (most likely cause).") - self._localcode.write(scope_entry_stream.getvalue()) # Append generated kernel code to localcode @@ -449,7 +425,7 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub # Generate kernel wrapper, i.e. function which will launch the kernel self._generate_kernel_wrapper(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) - # Exit scope for defined variables + # Exit scope for defined variables self._dispatcher.defined_vars.exit_scope(scope_entry) if instr is not None: @@ -485,11 +461,11 @@ def _define_variables_in_kernel_scope(self, sdfg: SDFG, dispatcher: TargetDispat Define kernel-visible variables in the dispatcher's scope. - Certain variables stored in the host-side ``__state`` struct (e.g., persistent or external - data) cannot be accessed directly in kernel code. They are passed as arguments instead, with - pointer names resolved via ``cpp.ptr(..)``. These must be registered in the dispatcher for use + data) cannot be accessed directly in kernel code. They are passed as arguments instead, with + pointer names resolved via ``cpp.ptr(..)``. These must be registered in the dispatcher for use in kernel context. - - KernelSpec may also mark certain variables/arguments as constants, which must be registered with + - KernelSpec may also mark certain variables/arguments as constants, which must be registered with the appropriate ``const`` qualifier in their ctype. 
""" # Extract argument and constant definitions from the KernelSpec @@ -498,9 +474,9 @@ def _define_variables_in_kernel_scope(self, sdfg: SDFG, dispatcher: TargetDispat kernel_arglist: Dict[str, dt.Data] = kernel_spec.arglist # Save current in_device_code value for restoration later - restore_in_device_code = self._in_device_code + restore_in_device_code = self._in_device_code for name, data_desc in kernel_arglist.items(): - + # Only arrays relevant if not name in sdfg.arrays: continue @@ -511,9 +487,9 @@ def _define_variables_in_kernel_scope(self, sdfg: SDFG, dispatcher: TargetDispat host_ptrname = cpp.ptr(name, data_desc, sdfg, self._frame) # Get defined type and ctype for the data (use host pointer name) - is_global: bool = data_desc.lifetime in (dtypes.AllocationLifetime.Global, - dtypes.AllocationLifetime.Persistent, - dtypes.AllocationLifetime.External) + is_global: bool = data_desc.lifetime in (dtypes.AllocationLifetime.Global, + dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External) defined_type, ctype = dispatcher.defined_vars.get(host_ptrname, is_global=is_global) # Get the inner/device pointer name @@ -527,12 +503,13 @@ def _define_variables_in_kernel_scope(self, sdfg: SDFG, dispatcher: TargetDispat # Register variable with the device pointer name for the kernel context dispatcher.defined_vars.add(device_ptrname, defined_type, ctype, allow_shadowing=True) - + # Restore in_device_code field self._in_device_code = restore_in_device_code - def _declare_and_invoke_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, - function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: + def _declare_and_invoke_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, + state_id: int, function_stream: CodeIOStream, + callsite_stream: CodeIOStream) -> None: scope_entry = dfg_scope.source_nodes()[0] @@ -556,12 +533,11 @@ def _declare_and_invoke_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, callsite_stream.write( self._cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]), cfg, state_id, scope_entry) - + # Calling the kernel wrapper function (in the CPU-side code) callsite_stream.write('__dace_runkernel_%s(%s);\n' % (kernel_name, ', '.join(kernel_wrapper_args_as_input)), cfg, state_id, scope_entry) - # If there are dynamic Map inputs, put the kernel invocation in its own scope to avoid redefinitions. 
if dace.sdfg.has_dynamic_map_inputs(state, scope_entry): callsite_stream.write('}', cfg, state_id, scope_entry) @@ -579,8 +555,8 @@ def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope # get kernel dimensions and transform into a c++ string grid_dims = kernel_spec.grid_dims block_dims = kernel_spec.block_dims - gdims = ', '.join(symbolic_to_cpp(grid_dims)) - bdims = ', '.join(symbolic_to_cpp(block_dims)) + gdims = ', '.join(sym2cpp(grid_dims)) + bdims = ', '.join(sym2cpp(block_dims)) # ----------------- Kernel Launch Function Declaration ----------------------- @@ -588,9 +564,7 @@ def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope f""" DACE_EXPORTED void __dace_runkernel_{kernel_name}({', '.join(kernel_launch_args_typed)}); void __dace_runkernel_{kernel_name}({', '.join(kernel_launch_args_typed)}) - """, - cfg, state_id, scope_entry - ) + """, cfg, state_id, scope_entry) # Open bracket self._localcode.write('{', cfg, state_id, scope_entry) @@ -602,7 +576,7 @@ def _generate_kernel_wrapper(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope for gdim in grid_dims: # Only emit a guard if we can't statically prove gdim > 0 if (gdim > 0) != True: - single_dimchecks.append(f'(({symbolic_to_cpp(gdim)}) <= 0)') + single_dimchecks.append(f'(({sym2cpp(gdim)}) <= 0)') dimcheck = ' || '.join(single_dimchecks) @@ -644,17 +618,18 @@ def copy_memory(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView edge: Tuple[nodes.Node, str, nodes.Node, str, Memlet], function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - from dace.codegen.targets.experimental_cuda_helpers.new_copy_strategies import ( - CopyContext, - OutOfKernelCopyStrategy, - SyncCollaboritveGPUCopyStrategy - ) + from dace.codegen.targets.experimental_cuda_helpers.copy_strategies import (CopyContext, + OutOfKernelCopyStrategy, + SyncCollaboritveGPUCopyStrategy) - context = CopyContext(sdfg, cfg.state(state_id), src_node, dst_node, edge, self._gpu_stream_manager.gpustream_assignments) + context = CopyContext(sdfg, cfg.state(state_id), src_node, dst_node, edge, + self._gpu_stream_manager.gpustream_assignments) if OutOfKernelCopyStrategy().applicable(context): + # Handled during the GPU stream pipeline in preprocess() + # in form of explicit tasklets return - + elif SyncCollaboritveGPUCopyStrategy().applicable(context): code = SyncCollaboritveGPUCopyStrategy().generate_copy(context, self._kernel_dimensions_map) callsite_stream.write(code, cfg, state_id, [src_node, dst_node]) @@ -676,7 +651,7 @@ def state_dispatch_predicate(self, sdfg, state): 2. The code generator is currently generating device/kernel code. 
""" return any(s is state for s, _ in self.pool_release.values()) or self._in_device_code - + def node_dispatch_predicate(self, sdfg, state, node): """ Determines whether a node should be handled by this @@ -775,7 +750,7 @@ def generate_nsdfg_call(self, sdfg, cfg, state, node, memlet_references, sdfg_la def generate_nsdfg_arguments(self, sdfg, cfg, dfg, state, node): args = self._cpu_codegen.generate_nsdfg_arguments(sdfg, cfg, dfg, state, node) return args - + def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.NestedSDFG, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: @@ -783,7 +758,7 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub self._toplevel_schedule = node.schedule old_codegen = self._cpu_codegen.calling_codegen self._cpu_codegen.calling_codegen = self - + # Determine and update ctype of new constant data and symbols within the NSDFG parent_state: SDFGState = cfg.state(state_id) nsdfg = node.sdfg @@ -797,16 +772,16 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub # update const data new_const_data = sdutil.get_constant_data(node, parent_state) - self._current_kernel_spec.kernel_constants for name in new_const_data: - desc = nsdfg.arrays[name] + desc = nsdfg.arrays[name] ptr_name = ptr(name, desc, nsdfg, self._frame) - try: + try: defined_type, ctype = dispatcher.defined_vars.get(ptr_name, is_global=True) if not "const " in desc.ctype: ctype = f"const {desc.ctype}" except: defined_type = get_defined_type(desc) if not "const " in desc.ctype: - ctype = f"const {desc.ctype}" + ctype = f"const {desc.ctype}" dispatcher.defined_vars.add(ptr_name, defined_type, ctype, allow_shadowing=True) # update const symbols @@ -816,7 +791,6 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub if not "const" in nsdfg.symbols[name].ctype: ctype = f"const {nsdfg.symbols[name].ctype}" - # Redirect rest to CPU codegen self._cpu_codegen._generate_NestedSDFG(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) @@ -880,7 +854,7 @@ def allocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphV if isinstance(nodedesc, dace.data.Stream): raise NotImplementedError("allocate_stream not implemented in ExperimentalCUDACodeGen") - + elif isinstance(nodedesc, dace.data.View): return self._cpu_codegen.allocate_view(sdfg, cfg, dfg, state_id, node, function_stream, declaration_stream, allocation_stream) @@ -920,7 +894,7 @@ def _prepare_GPU_Global_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta # ------------------- Allocation ------------------- arrsize = nodedesc.total_size - arrsize_malloc = f'{symbolic_to_cpp(arrsize)} * sizeof({nodedesc.dtype.ctype})' + arrsize_malloc = f'{sym2cpp(arrsize)} * sizeof({nodedesc.dtype.ctype})' if nodedesc.pool: gpu_stream_manager = self._gpu_stream_manager @@ -928,7 +902,7 @@ def _prepare_GPU_Global_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta allocation_stream.write( f'DACE_GPU_CHECK({self.backend}MallocAsync((void**)&{dataname}, {arrsize_malloc}, {gpu_stream}));\n', cfg, state_id, node) - + # Generate synchronization and error-check calls if sync debugging is enabled allocation_stream.write(generate_sync_debug_call()) @@ -943,7 +917,7 @@ def _prepare_GPU_Global_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta state_id, node) if isinstance(nodedesc, dt.Array) and nodedesc.start_offset != 0: - allocation_stream.write(f'{dataname} += 
{symbolic_to_cpp(nodedesc.start_offset)};\n', cfg, state_id, node) + allocation_stream.write(f'{dataname} += {sym2cpp(nodedesc.start_offset)};\n', cfg, state_id, node) def _prepare_CPU_Pinned_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, @@ -961,7 +935,7 @@ def _prepare_CPU_Pinned_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta # ------------------- Allocation ------------------- arrsize = nodedesc.total_size - arrsize_malloc = f'{symbolic_to_cpp(arrsize)} * sizeof({nodedesc.dtype.ctype})' + arrsize_malloc = f'{sym2cpp(arrsize)} * sizeof({nodedesc.dtype.ctype})' # Strides are left to the user's discretion allocation_stream.write(f'DACE_GPU_CHECK({self.backend}MallocHost(&{dataname}, {arrsize_malloc}));\n', cfg, @@ -970,7 +944,7 @@ def _prepare_CPU_Pinned_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta allocation_stream.write(f'memset({dataname}, 0, {arrsize_malloc});\n', cfg, state_id, node) if nodedesc.start_offset != 0: - allocation_stream.write(f'{dataname} += {symbolic_to_cpp(nodedesc.start_offset)};\n', cfg, state_id, node) + allocation_stream.write(f'{dataname} += {sym2cpp(nodedesc.start_offset)};\n', cfg, state_id, node) def _prepare_GPU_Shared_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode, nodedesc: dt.Data, function_stream: CodeIOStream, @@ -988,15 +962,15 @@ def _prepare_GPU_Shared_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: Sta # ------------------- Declaration ------------------- array_ctype = f'{nodedesc.dtype.ctype} *' - declaration_stream.write(f'__shared__ {nodedesc.dtype.ctype} {dataname}[{symbolic_to_cpp(arrsize)}];\n', cfg, - state_id, node) + declaration_stream.write(f'__shared__ {nodedesc.dtype.ctype} {dataname}[{sym2cpp(arrsize)}];\n', cfg, state_id, + node) self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) # ------------------- Initialization ------------------- if node.setzero: allocation_stream.write( - f'dace::ResetShared<{nodedesc.dtype.ctype}, {", ".join(symbolic_to_cpp(self._current_kernel_spec.block_dims))}, {symbolic_to_cpp(arrsize)}, ' + f'dace::ResetShared<{nodedesc.dtype.ctype}, {", ".join(sym2cpp(self._current_kernel_spec.block_dims))}, {sym2cpp(arrsize)}, ' f'1, false>::Reset({dataname});\n', cfg, state_id, node) def _prepare_Register_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, @@ -1016,7 +990,7 @@ def _prepare_Register_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: State array_ctype = '{nodedesc.dtype.ctype} *' init_clause = ' = {0}' if node.setzero else '' - declaration_stream.write(f'{nodedesc.dtype.ctype} {dataname}[{symbolic_to_cpp(arrsize)}]{init_clause};\n', cfg, + declaration_stream.write(f'{nodedesc.dtype.ctype} {dataname}[{sym2cpp(arrsize)}]{init_clause};\n', cfg, state_id, node) self._dispatcher.defined_vars.add(dataname, DefinedType.Pointer, array_ctype) @@ -1029,7 +1003,7 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap # Adjust offset if needed if isinstance(nodedesc, dt.Array) and nodedesc.start_offset != 0: - dataname = f'({dataname} - {symbolic_to_cpp(nodedesc.start_offset)})' + dataname = f'({dataname} - {sym2cpp(nodedesc.start_offset)})' # Remove declaration info if self._dispatcher.declared_arrays.has(dataname): @@ -1053,8 +1027,9 @@ def deallocate_array(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgrap if 
nodedesc.pool: if (sdfg, dataname) not in self.pool_release: gpu_stream = self._gpu_stream_manager.get_stream_node(node) - callsite_stream.write(f'DACE_GPU_CHECK({self.backend}FreeAsync({dataname}, {gpu_stream}));\n', cfg, state_id, node) - else: + callsite_stream.write(f'DACE_GPU_CHECK({self.backend}FreeAsync({dataname}, {gpu_stream}));\n', cfg, + state_id, node) + else: callsite_stream.write(f'DACE_GPU_CHECK({self.backend}Free({dataname}));\n', cfg, state_id, node) elif nodedesc.storage == dtypes.StorageType.CPU_Pinned: @@ -1076,7 +1051,7 @@ def get_generated_codeobjects(self): # The GPU stream array is set to have a persistent allocation lifetime (see preprocess GPU stream pipeline). # Thus the definition of the GPU stream array in the state struct and the access to it is handled elsewhere and - # in several different files (e.g., framecode.py, cpu.py, cpp.py). For the sake of consistency, we initialize it + # in several different files (e.g., framecode.py, cpu.py, cpp.py). For the sake of consistency, we initialize it # as it is expected in the other modules. I.e. prepend with an ID for all SDFGs it is defined. # Note that all the different variable names point to the same GPU stream array. cnt = 0 @@ -1087,7 +1062,6 @@ def get_generated_codeobjects(self): init_gpu_stream_vars = f"__state->__{csdfg.cfg_id}_{name}" break - # My comment: takes codeblocks and transforms it nicely to code initcode = CodeIOStream() for sd in self._global_sdfg.all_sdfgs_recursive(): @@ -1284,7 +1258,7 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro dfg_scope: ScopeSubgraphView, state_id: int): # Get kernel entry/exit nodes and current state - kernel_map_entry: nodes.MapEntry = dfg_scope.source_nodes()[0] + kernel_map_entry: nodes.MapEntry = dfg_scope.source_nodes()[0] kernel_parent_state: SDFGState = cfg.state(state_id) self._kernel_map_entry: nodes.MapEntry = kernel_map_entry @@ -1293,7 +1267,7 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro # Kernel name self._kernel_name: str = f'{kernel_map_entry.map.label}_{cfg.cfg_id}_{kernel_parent_state.block_id}_{kernel_parent_state.node_id(kernel_map_entry)}' - # Get and store kernel constants — needed for applying 'const' and updating defined + # Get and store kernel constants — needed for applying 'const' and updating defined # constant variable types in the dispatcher (handled at GPU codegen) kernel_const_data = sdutil.get_constant_data(kernel_map_entry, kernel_parent_state) kernel_const_symbols = sdutil.get_constant_symbols(kernel_map_entry, kernel_parent_state) @@ -1310,7 +1284,8 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro # Certain args are called in the CUDA/HIP file or kernel funcion, in which the pointer name of the args are different cudaCodeGen._in_device_code = True self._args_as_input = [ptr(name, data, sdfg, cudaCodeGen._frame) for name, data in arglist.items()] - self._args_typed = [('const ' if name in kernel_constants else '') + data.as_arg(name=name) for name, data in arglist.items()] + self._args_typed = [('const ' if name in kernel_constants else '') + data.as_arg(name=name) + for name, data in arglist.items()] # Args for the kernel wrapper function cudaCodeGen._in_device_code = False @@ -1320,20 +1295,25 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro # - Use the configured variable name (from Config) in the wrapper’s function signature # (this same name is also used when invoking 
{backend}LaunchKernel inside the wrapper) gpustream_var_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] - gpustream_input = [e for e in dace.sdfg.dynamic_map_inputs(kernel_parent_state, kernel_map_entry) if e.src.desc(sdfg).dtype == dtypes.gpuStream_t] + gpustream_input = [ + e for e in dace.sdfg.dynamic_map_inputs(kernel_parent_state, kernel_map_entry) + if e.src.desc(sdfg).dtype == dtypes.gpuStream_t + ] if len(gpustream_input) > 1: - raise ValueError(f"There can not be more than one GPU stream assigned to a kernel, but {len(gpustream_input)} were assigned.") - + raise ValueError( + f"There can not be more than one GPU stream assigned to a kernel, but {len(gpustream_input)} were assigned." + ) + # Final wrapper arguments: # - State struct (__state) # - Original kernel args # - GPU stream - self._kernel_wrapper_args_as_input = (['__state'] - + [ptr(name, data, sdfg, cudaCodeGen._frame) for name, data in arglist.items()] - + [str(gpustream_input[0].dst_conn)]) - self._kernel_wrapper_args_typed = ([f'{mangle_dace_state_struct_name(cudaCodeGen._global_sdfg)} *__state'] - + [('const ' if name in kernel_constants else '') + data.as_arg(name=name) for name, data in arglist.items()] - + [f"gpuStream_t {gpustream_var_name}"]) + self._kernel_wrapper_args_as_input = ( + ['__state'] + [ptr(name, data, sdfg, cudaCodeGen._frame) + for name, data in arglist.items()] + [str(gpustream_input[0].dst_conn)]) + self._kernel_wrapper_args_typed = ([f'{mangle_dace_state_struct_name(cudaCodeGen._global_sdfg)} *__state'] + + [('const ' if name in kernel_constants else '') + data.as_arg(name=name) + for name, data in arglist.items()] + [f"gpuStream_t {gpustream_var_name}"]) cudaCodeGen._in_device_code = restore_in_device_code @@ -1377,7 +1357,7 @@ def get_gpu_index_ctype(self, config_key='gpu_index_type') -> str: def kernel_constants(self) -> Set[str]: """Returns the kernel's constant data and symbols.""" return self._kernel_constants - + @property def kernel_name(self) -> list[str]: """Returns the kernel (function's) name.""" @@ -1395,11 +1375,11 @@ def kernel_map_entry(self) -> nodes.MapEntry: def kernel_map(self) -> nodes.Map: """Returns the kernel's map node.""" return self._kernel_map_entry.map - + @property def arglist(self) -> Dict[str, dt.Data]: """ - Returns a dictionary of arguments for the kernel's subgraph, + Returns a dictionary of arguments for the kernel's subgraph, mapping each data name to its corresponding data descriptor. """ return self._arglist diff --git a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py index 6d037c2ae9..3982f3a86d 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/copy_strategies.py @@ -1,68 +1,47 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
from abc import ABC, abstractmethod -from typing import Tuple +from typing import Any, Dict, List, Optional, Tuple, Union +from dace import SDFG, SDFGState, data, dtypes, subsets +from dace import memlet as mm from dace import symbolic -from dace import Memlet, dtypes +from dace.codegen import common +from dace.codegen.targets import cpp +from dace.codegen.targets.cpp import sym2cpp, unparse_cr +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import generate_sync_debug_call +from dace.config import Config from dace.dtypes import StorageType -from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen, GPUStreamManager, KernelSpec -from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import product, symbolic_to_cpp, emit_sync_debug_checks +from dace.frontend import operations +from dace.sdfg import nodes, scope_contains_scope +from dace.sdfg.graph import MultiConnectorEdge +from dace.transformation import helpers -from dace.codegen.prettycode import CodeIOStream -from dace.sdfg import SDFG, nodes -from dace.sdfg.nodes import Node -from dace.sdfg.state import ControlFlowRegion, StateSubgraphView -from dace.codegen.targets.cpp import memlet_copy_to_absolute_strides, unparse_cr - - -# TODO: Review Documentation once done here. And also, take care of the other -# two strategies below. class CopyContext: """ - Stores and derives all information required for memory copy operations on GPUs. - - This class exists because memory copy logic often requires a large amount of context, - including node references, expressions, layout, and backend details. Handling all this - ad hoc makes the code harder to follow and maintain. - - CopyContext centralizes this information and provides helper functions to clarify - what values are needed for code generation and why. This improves readability, - simplifies copy emission logic, and makes future extensions easier. + Encapsulates inputs required for copy operations and exposes helper + methods to derive additional information. This keeps copy strategies + lightweight by letting them focus only on the relevant logic. """ - def __init__(self, codegen: ExperimentalCUDACodeGen, gpu_stream_manager: GPUStreamManager, state_id: int, - src_node: Node, dst_node: Node, edge: Tuple[Node, str, Node, str, Memlet], sdfg: SDFG, - cfg: ControlFlowRegion, dfg: StateSubgraphView, callsite_stream: CodeIOStream): + def __init__(self, sdfg: SDFG, state: SDFGState, src_node: nodes.Node, dst_node: nodes.Node, + edge: MultiConnectorEdge[mm.Memlet], gpustream_assignments: Dict[nodes.Node, Union[int, str]]): - # Store general context information for the copy operation, such as: - # - which code generator is responsible, - # - which edge and SDFG/state context related to the copy, - # - and where the generated code is written (callsite stream). 
- self.codegen = codegen - self.state_id = state_id + # Store the basic context as attributes + self.sdfg = sdfg + self.state = state self.src_node = src_node self.dst_node = dst_node self.edge = edge - self.sdfg = sdfg - self.cfg = cfg - self.dfg = dfg - self.callsite_stream = callsite_stream + self.gpustream_assignments = gpustream_assignments - # Additional information frequently needed - self.backend = codegen.backend - self.state_dfg = cfg.state(state_id) - self.cudastream = gpu_stream_manager.get_stream_edge(src_node, dst_node) - self.src_storage = self.get_storage_type(src_node) - self.dst_storage = self.get_storage_type(dst_node) + memlet = edge.data + self.copy_shape = memlet.subset.size_exact() if isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode): - copy_shape, src_strides, dst_strides, src_expr, dst_expr = memlet_copy_to_absolute_strides( - codegen._dispatcher, sdfg, self.state_dfg, edge, src_node, dst_node, codegen._cpu_codegen._packed_types) + copy_shape, src_strides, dst_strides, src_expr, dst_expr = self.get_accessnode_to_accessnode_copy_info() else: - _, _, _, _, memlet = edge - copy_shape = [symbolic.overapproximate(s) for s in memlet.subset.bounding_box_size()] - - # if src and dst node are not AccessNodes, these are undefined + copy_shape = memlet.subset.size_exact() src_strides = dst_strides = src_expr = dst_expr = None self.copy_shape = copy_shape @@ -71,103 +50,229 @@ def __init__(self, codegen: ExperimentalCUDACodeGen, gpu_stream_manager: GPUStre self.src_expr = src_expr self.dst_expr = dst_expr - self.num_dims = len(copy_shape) - - def get_storage_type(self, node: Node): + def get_storage_type(self, node: nodes.Node): + """ + Return the storage type associated with a given SDFG node. + Tasklets are assumed to use register storage, while AccessNodes + return the storage type from their data descriptor. Raises + NotImplementedError for unsupported node types. + """ if isinstance(node, nodes.Tasklet): storage_type = StorageType.Register - else: + + elif isinstance(node, nodes.AccessNode): storage_type = node.desc(self.sdfg).storage + else: + raise NotImplementedError(f"Unsupported node type {type(node)} for storage type retrieval; " + "expected AccessNode or Tasklet. Please extend this method accordingly.") + return storage_type - def get_copy_call_parameters(self) -> Tuple[str, str, str, str, str, str, any]: + def get_assigned_gpustream(self) -> str: """ - Returns all essential parameters required to emit a backend memory copy call. + Return the GPU stream expression assigned to both source and destination nodes. - This method determines both structural and backend-specific information - needed to perform a memory copy, including memory locations, pointer - expressions, and data types. In cases where either the source or - destination is not a data access node, pointer expressions may be unavailable. + Ensures that both nodes have a matching stream ID, then constructs the + variable name from the configured prefix and stream ID. Raises ValueError + if assignments are missing or inconsistent. - Returns - ------- - Tuple[str, Optional[str], Optional[str], str, str, str, any] - A tuple containing: - - backend (str): Name of the backend used (e.g., 'cuda', 'hip'). - - src_expr (Optional[str]): Source pointer expression, or None if unavailable. - - dst_expr (Optional[str]): Destination pointer expression, or None if unavailable. - - src_location (str): Memory location of the source ('Host' or 'Device'). 
- - dst_location (str): Memory location of the destination ('Host' or 'Device'). - - cudastream (str): Backend-specific stream identifier. - - ctype (any): The C type of the data being copied. + Example: + If the configured prefix is 'gpu_stream' and the assigned stream ID is 0, + this method returns 'gpu_stream0'. """ - src_location = 'Device' if self.src_storage == dtypes.StorageType.GPU_Global else 'Host' - dst_location = 'Device' if self.dst_storage == dtypes.StorageType.GPU_Global else 'Host' + src_stream = self.gpustream_assignments.get(self.src_node) + dst_stream = self.gpustream_assignments.get(self.dst_node) - # Should be symmetric - ctype_src = self.src_node.desc(self.sdfg).dtype.ctype - ctype_dst = self.dst_node.desc(self.sdfg).dtype.ctype - ctype = ctype_dst - assert ctype_src == ctype_dst, (f"Source and destination data types must match for the memory copy: " - f"{ctype_src} != {ctype_dst}") + # 1. Catch unsupported cases + if src_stream is None or dst_stream is None: + raise ValueError("GPU stream assignment missing for source or destination node.") - return self.backend, self.src_expr, self.dst_expr, src_location, dst_location, self.cudastream, ctype + if src_stream != dst_stream: + raise ValueError(f"Mismatch in assigned GPU streams: src_node has '{src_stream}', " + f"dst_node has '{dst_stream}'. They must be the same.") - def get_transfer_layout(self) -> Tuple[list, list, list]: + # 2. Generate GPU stream expression + gpustream = src_stream + gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] + gpustream_expr = f"{gpustream_var_name_prefix}{gpustream}" + + return gpustream_expr + + def get_memory_location(self) -> Tuple[str, str]: """ - Returns layout information required for emitting a memory copy. + Determine whether the source and destination nodes reside in device or host memory. + + Uses the storage type of each node to classify it as either 'Device' + (GPU global memory) or 'Host' (all other storage types). + Used for GPU related copies outside the kernel (e.g. to construct + cudaMemcpyHostToDevice for example). Returns ------- - copy_shape : List - The size (extent) of each dimension to be copied. - Singleton dimensions (i.e., dimensions of size 1) are omitted. - Example: [J, K, 1] becomes [J, K] - src_strides : List or None - Stride values of the source expression, per dimension if - source and destination are of type AccessNode, else None. - dst_strides : List or None - Stride values of the destination expression, per dimension if - source and destination are of type AccessNode, else None. + Tuple[str, str] + (src_location, dst_location) where each is either 'Device' or 'Host'. """ - return self.copy_shape, self.src_strides, self.dst_strides + src_storage = self.get_storage_type(self.src_node) + dst_storage = self.get_storage_type(self.dst_node) + src_location = 'Device' if src_storage == dtypes.StorageType.GPU_Global else 'Host' + dst_location = 'Device' if dst_storage == dtypes.StorageType.GPU_Global else 'Host' - def get_write_context(self) -> Tuple[CodeIOStream, ControlFlowRegion, int, Node, Node]: + return src_location, dst_location + + def get_ctype(self) -> Any: """ - Returns all context required to emit code into the callsite stream with proper SDFG annotations. + Determine the C data type (ctype) of the source or destination node. + + The ctype is resolved from the data descriptor of the first node + (source or destination) that is an AccessNode (assumed to be the same + if both are AccessNodes). 
Returns ------- - callsite_stream : CodeIOStream - The output stream where backend code is written. - cfg : ControlFlowRegion - The control flow region containing the current state. - state_id : int - The ID of the SDFG state being generated. - src_node : Node - The source node involved in the copy. - dst_node : Node - The destination node involved in the copy. + Any + The C type string (e.g., "float*", "int32") associated with the node. + + Raises + ------ + NotImplementedError + If neither the source nor the destination node is an AccessNode. """ - return self.callsite_stream, self.cfg, self.state_id, self.src_node, self.dst_node + sdfg = self.sdfg + src_node, dst_node = self.src_node, self.dst_node + + if isinstance(src_node, nodes.AccessNode): + return src_node.desc(sdfg).ctype + + if isinstance(dst_node, nodes.AccessNode): + return dst_node.desc(sdfg).ctype + + raise NotImplementedError( + f"Cannot determine ctype: neither src nor dst node is an AccessNode. " + f"Got src_node type: {type(src_node).__name__}, dst_node type: {type(dst_node).__name__}. " + "Please extend this case or fix the issue.") - def is_contiguous_copy(self) -> bool: + def get_accessnode_to_accessnode_copy_info(self): """ - Returns True if the memory copy is contiguous in the last dimension - for both source and destination. + Compute copy shape, absolute strides, and pointer expressions for a copy + between two AccessNodes. Tries to mimic + cpp.memlet_copy_to_absolute_strides without requiring a dispatcher. + + Returns + ------- + (copy_shape, src_strides, dst_strides, src_expr, dst_expr) + + Raises + ------ + TypeError + If either endpoint is not an AccessNode. + NotImplementedError + If a descriptor is not Scalar or Array. """ - return (self.src_strides[-1] == 1) and (self.dst_strides[-1] == 1) - def get_memory_location(self) -> Tuple[str, str]: - src_location = 'Device' if self.src_storage == dtypes.StorageType.GPU_Global else 'Host' - dst_location = 'Device' if self.dst_storage == dtypes.StorageType.GPU_Global else 'Host' + # ---------------------------- helpers ---------------------------- + def _collapse_strides(strides, subset): + """Remove size-1 dims; keep tile strides; default to [1] if none remain.""" + n = len(subset) + collapsed = [st for st, sz in zip(strides, subset.size()) if sz != 1] + collapsed.extend(strides[n:]) # include tiles + if len(collapsed) == 0: + return [1] + return collapsed + + def _ptr_name(desc, name): + if desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External): + return f'__state->__{sdfg.cfg_id}_{name}' + return name + + def _expr_for(desc, name, subset): + ptr = _ptr_name(desc, name) + + if isinstance(desc, data.Scalar): + # GPU scalar special-case + if desc.storage in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN: + parent = state.sdfg.parent_nsdfg_node + if parent is not None and name in parent.in_connectors: + return f"&{ptr}" + return ptr + # CPU (or other) scalars + return f"&{ptr}" + + if isinstance(desc, data.Array): + offset = cpp.cpp_offset_expr(desc, subset) + return f"{ptr} + {offset}" if offset != "0" else ptr - return src_location, dst_location + raise NotImplementedError( + f"Expected {name} to be either data.Scalar or data.Array, but got {type(desc).__name__}.") + + # ---------------------------- Get copy info ---------------------------- + # Get needed information + src_node, dst_node = self.src_node, self.dst_node + sdfg, edge, state = self.sdfg, self.edge, self.state + memlet, copy_shape = 
self.edge.data, self.copy_shape + + # Guard - only applicable if src and dst are AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + raise TypeError( + f"get_accessnode_to_accessnode_copy_info requires both source and destination " + f"to be AccessNode instances, but got {type(src_node).__name__} and {type(dst_node).__name__}.") + + # Get node descriptors + src_nodedesc = src_node.desc(sdfg) + dst_nodedesc = dst_node.desc(sdfg) + + # Resolve subsets (fallback to full range) + src_subset = memlet.get_src_subset(edge, state) + dst_subset = memlet.get_dst_subset(edge, state) + + if src_subset is None: + src_subset = subsets.Range.from_array(src_nodedesc) + + if dst_subset is None: + dst_subset = subsets.Range.from_array(dst_nodedesc) + + # Get strides + src_strides = src_subset.absolute_strides(src_nodedesc.strides) + dst_strides = dst_subset.absolute_strides(dst_nodedesc.strides) + + # Try to convert to a degenerate/strided ND copy first + result = cpp.ndcopy_to_strided_copy( + copy_shape, + src_nodedesc.shape, + src_strides, + dst_nodedesc.shape, + dst_strides, + memlet.subset, + src_subset, + dst_subset, + ) + + if result is not None: + copy_shape, src_strides, dst_strides = result + else: + src_strides = _collapse_strides(src_strides, src_subset) + dst_strides = _collapse_strides(dst_strides, dst_subset) + copy_shape = [s for s in copy_shape if s != 1] or [1] + + # Extend copy shape to the largest among the data dimensions, + # and extend other array with the appropriate strides + if len(dst_strides) != len(copy_shape) or len(src_strides) != len(copy_shape): + if memlet.data == src_node.data: + copy_shape, dst_strides = cpp.reshape_strides(src_subset, src_strides, dst_strides, copy_shape) + elif memlet.data == dst_node.data: + copy_shape, src_strides = cpp.reshape_strides(dst_subset, dst_strides, src_strides, copy_shape) + + # Build final expressions + src_expr = _expr_for(src_nodedesc, src_node.data, src_subset) + dst_expr = _expr_for(dst_nodedesc, dst_node.data, dst_subset) + + return copy_shape, src_strides, dst_strides, src_expr, dst_expr class CopyStrategy(ABC): + """Abstract base class for memory copy strategies.""" @abstractmethod def applicable(self, copy_context: CopyContext) -> bool: @@ -177,14 +282,21 @@ def applicable(self, copy_context: CopyContext) -> bool: raise NotImplementedError('Abstract class') @abstractmethod - def generate_copy(self, copy_context: CopyContext) -> None: + def generate_copy(self, copy_context: CopyContext) -> str: """ - Generates the copy code for the supported pattern. + Generates and returns the copy code for the supported pattern. """ raise NotImplementedError('Abstract class') class OutOfKernelCopyStrategy(CopyStrategy): + """ + Copy strategy for memory transfers that occur outside of kernel execution. + + This pattern often occurs when generating host-to-device copies for kernel inputs + (since kernels cannot access host memory directly), and device-to-host copies + to retrieve results for further processing. + """ def applicable(self, copy_context: CopyContext) -> bool: """ @@ -193,316 +305,412 @@ def applicable(self, copy_context: CopyContext) -> bool: This function returns True if: - We are not currently generating kernel code - The copy occurs between two AccessNodes - - - - This check is used to detect and handle transfers between host and device memory spaces. + - The data descriptors of source and destination are not views. 
+ - The storage types of either src or dst is CPU_Pinned or GPU_Device + - We do not have a CPU-to-CPU copy """ + # Retrieve needed information + state = copy_context.state + src_node, dst_node = copy_context.src_node, copy_context.dst_node - # TODO: I don't understand why all of these conditions are needed, look into it + # 1. Ensure copy is not occuring within a kernel + scope_dict = state.scope_dict() + deeper_node = dst_node if scope_contains_scope(scope_dict, src_node, dst_node) else src_node - cpu_storage_types = [StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned] - not_in_device_code = not copy_context.codegen._in_device_code + parent_map_tuple = helpers.get_parent_map(state, deeper_node) + while parent_map_tuple is not None: + parent_map, parent_state = parent_map_tuple + if parent_map.map.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: + return False + else: + parent_map_tuple = helpers.get_parent_map(parent_state, parent_map) - is_between_access_nodes = (isinstance(copy_context.src_node, nodes.AccessNode) - and isinstance(copy_context.dst_node, nodes.AccessNode)) + # 2. Check whether copy is between two AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + return False - involves_gpu_or_pinned = (copy_context.src_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned) - or copy_context.dst_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned)) + # 3. The data descriptors of source and destination are not views + if isinstance(src_node.desc(state), data.View) or isinstance(dst_node.desc(state), data.View): + return False - is_not_cpu_to_cpu = not (copy_context.src_storage in cpu_storage_types - and copy_context.dst_storage in cpu_storage_types) + # 4. Check that one StorageType of either src or dst is CPU_Pinned or GPU_Device + src_storage = copy_context.get_storage_type(src_node) + dst_storage = copy_context.get_storage_type(dst_node) + if not (src_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned) + or dst_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned)): + return False - is_gpu_host_copy = (not_in_device_code and is_between_access_nodes and involves_gpu_or_pinned - and is_not_cpu_to_cpu) + # 5. Check that this is not a CPU to CPU copy + cpu_storage_types = [StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned] + if src_storage in cpu_storage_types and dst_storage in cpu_storage_types: + return False - return is_gpu_host_copy + return True - def generate_copy(self, copy_context: CopyContext) -> None: + def generate_copy(self, copy_context: CopyContext) -> str: """Execute host-device copy with CUDA memory operations""" - return - # guard - _, _, _, _, memlet = copy_context.edge + # Guard + memlet = copy_context.edge.data if memlet.wcr is not None: src_location, dst_location = copy_context.get_memory_location() raise NotImplementedError(f'Accumulate {src_location} to {dst_location} not implemented') - # call corresponding helper function - num_dims = copy_context.num_dims + # Based on the copy dimension, call appropiate helper function + num_dims = len(copy_context.copy_shape) if num_dims == 1: - self._generate_1d_copy(copy_context) + copy_call = self._generate_1d_copy(copy_context) + elif num_dims == 2: - self._generate_2d_copy(copy_context) + copy_call = self._generate_2d_copy(copy_context) + else: # sanity check assert num_dims > 2, f"Expected copy shape with more than 2 dimensions, but got {num_dims}." 
- self._generate_nd_copy(copy_context) + copy_call = self._generate_nd_copy(copy_context) - # We use library calls thus for debugging we provide sync option - emit_sync_debug_checks(copy_context.backend, copy_context.callsite_stream) + return copy_call - def _generate_1d_copy(self, copy_context: CopyContext) -> None: + def _generate_1d_copy(self, copy_context: CopyContext) -> str: """ - Emits code for a 1D memory copy between host and device using GPU backend. - Uses {backend}MemcpyAsync for contiguous memory and uses {backend}Memcpy2DAsync - for strided memory copies. + Generates a 1D memory copy between host and device using the GPU backend. + + Uses {backend}MemcpyAsync for contiguous memory. For strided memory, + {backend}Memcpy2DAsync is leveraged to efficiently handle the stride along one dimension. """ + # ----------- Retrieve relevant copy parameters -------------- + backend: str = common.get_gpu_backend() - # ----------- Extract relevant copy parameters -------------- - copy_shape, src_strides, dst_strides = copy_context.get_transfer_layout() + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() - backend, src_expr, dst_expr, src_location, dst_location, cudastream, ctype = \ - copy_context.get_copy_call_parameters() + src_location, dst_location = copy_context.get_memory_location() + is_contiguous_copy = (src_strides[-1] == 1) and (dst_strides[-1] == 1) + ctype = copy_context.get_ctype() + gpustream = copy_context.get_assigned_gpustream() # ----------------- Generate backend call -------------------- - if copy_context.is_contiguous_copy(): + + if is_contiguous_copy: # Memory is linear: can use {backend}MemcpyAsync - copysize = ' * '.join(symbolic_to_cpp(copy_shape)) + copysize = ' * '.join(sym2cpp(copy_shape)) copysize += f' * sizeof({ctype})' kind = f'{backend}Memcpy{src_location}To{dst_location}' - call = f'DACE_GPU_CHECK({backend}MemcpyAsync({dst_expr}, {src_expr}, {copysize}, {kind}, {cudastream}));\n' + call = f'DACE_GPU_CHECK({backend}MemcpyAsync({dst_expr}, {src_expr}, {copysize}, {kind}, {gpustream}));\n' else: # Memory is strided: use {backend}Memcpy2DAsync with dpitch/spitch # This allows copying a strided 1D region - dpitch = f'{dst_strides[0]} * sizeof({ctype})' - spitch = f'{src_strides[0]} * sizeof({ctype})' + dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})' width = f'sizeof({ctype})' - height = copy_shape[0] + height = sym2cpp(copy_shape[0]) kind = f'{backend}Memcpy{src_location}To{dst_location}' - call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {cudastream}));\n' + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' - # ----------------- Write copy call to code stream -------------------- - callsite_stream, cfg, state_id, src_node, dst_node = copy_context.get_write_context() - callsite_stream.write(call, cfg, state_id, [src_node, dst_node]) + # Potentially snychronization required if syncdebug is set to true in configurations + call = call + generate_sync_debug_call() + return call def _generate_2d_copy(self, copy_context: CopyContext) -> None: - """Generates code for a 2D copy, falling back to 1D flattening if applicable.""" + """ + Generates a 2D memory copy using {backend}Memcpy2DAsync. 
+ + Three main cases are handled: + - Copy between row-major stored arrays with contiguous rows. + - Copy between column-major stored arrays with contiguous columns. + - A special case where a 2D copy can still be represented. + + Raises: + NotImplementedError: Raised if the source and destination strides do not match any of the handled patterns. + Such cases indicate an unsupported 2D copy and should be examined separately. + They can be implemented if valid, or a more descriptive error should be raised if the path should not occur. + + Note: + {backend}Memcpy2DAsync supports strided copies along only one dimension (row or column), + but not both simultaneously. + """ # ----------- Extract relevant copy parameters -------------- - copy_shape, src_strides, dst_strides = copy_context.get_transfer_layout() + backend: str = common.get_gpu_backend() - backend, src_expr, dst_expr, src_location, dst_location, cudastream, ctype = \ - copy_context.get_copy_call_parameters() + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + src_location, dst_location = copy_context.get_memory_location() + ctype = copy_context.get_ctype() + gpustream = copy_context.get_assigned_gpustream() # ----------------- Generate backend call if supported -------------------- - if copy_context.is_contiguous_copy(): - dpitch = f'{dst_strides[0]} * sizeof({ctype})' - spitch = f'{src_strides[0]} * sizeof({ctype})' - width = f'{copy_shape[1]} * sizeof({ctype})' - height = f'{copy_shape[0]}' + # Case: Row-major layout, rows are not strided. + if (src_strides[1] == 1) and (dst_strides[1] == 1): + dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})' + width = f'{sym2cpp(copy_shape[1])} * sizeof({ctype})' + height = f'{sym2cpp(copy_shape[0])}' kind = f'{backend}Memcpy{src_location}To{dst_location}' - call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {cudastream}));\n' + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' - elif src_strides[-1] != 1 or dst_strides[-1] != 1: - # TODO: Checks this, I am not sure but the old code and its description - # seems to be more complicated here than necessary.. - # But worth to mention: we essentiall flatten + # Case: Column-major layout, no columns are strided. + elif (src_strides[0] == 1) and (dst_strides[0] == 1): + dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})' + width = f'{sym2cpp(copy_shape[0])} * sizeof({ctype})' + height = f'{sym2cpp(copy_shape[1])}' + kind = f'{backend}Memcpy{src_location}To{dst_location}' - # NOTE: Special case of continuous copy - # Example: dcol[0:I, 0:J, k] -> datacol[0:I, 0:J] - # with copy shape [I, J] and strides [J*K, K], [J, 1] + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' - dpitch = f'{dst_strides[1]} * sizeof({ctype})' - spitch = f'{src_strides[1]} * sizeof({ctype})' + # Special case + elif (src_strides[0] / src_strides[1] == copy_shape[1] and dst_strides[0] / dst_strides[1] == copy_shape[1]): + # Consider as an example this copy: A[0:I, 0:J, K] -> B[0:I, 0:J] with + # copy shape [I, J], src_strides[J*K, K], dst_strides[J, 1]. 
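To make the special-case stride condition above concrete, here is a small self-contained check of the arithmetic with assumed sizes I=4, J=8, K=16 (illustrative only, not patch code):

```python
# Worked example (assumed sizes) for the special-case test
#   src_strides[0] / src_strides[1] == copy_shape[1] == dst_strides[0] / dst_strides[1]
I, J, K = 4, 8, 16                 # hypothetical array extents
copy_shape = [I, J]                # copying A[0:I, 0:J, k] -> B[0:I, 0:J]
src_strides = [J * K, K]           # A is row-major with shape [I, J, K]
dst_strides = [J, 1]               # B is row-major with shape [I, J]

assert src_strides[0] / src_strides[1] == copy_shape[1]   # 128 / 16 == 8
assert dst_strides[0] / dst_strides[1] == copy_shape[1]   # 8 / 1 == 8
# The copy therefore maps onto a single Memcpy2DAsync with
#   width  = sizeof(element), height = I * J,
#   spitch = K * sizeof(element), dpitch = 1 * sizeof(element).
```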
This can be represented with a + # {backend}Memcpy2DAsync call! + + dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})' width = f'sizeof({ctype})' - height = copy_shape[0] * copy_shape[1] + height = sym2cpp(copy_shape[0] * copy_shape[1]) kind = f'{backend}Memcpy{src_location}To{dst_location}' - call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {cudastream}));\n' + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' else: raise NotImplementedError( f"Unsupported 2D memory copy: shape={copy_shape}, src_strides={src_strides}, dst_strides={dst_strides}." - " Please implement this case if it is valid, or raise a more descriptive error if this path should not be taken." + "Please implement this case if it is valid, or raise a more descriptive error if this path should not be taken." ) - # ----------------- Write copy call to code stream -------------------- - callsite_stream, cfg, state_id, src_node, dst_node = copy_context.get_write_context() - callsite_stream.write(call, cfg, state_id, [src_node, dst_node]) + return call def _generate_nd_copy(self, copy_context: CopyContext) -> None: + """ + Generates GPU code for copying N-dimensional arrays using 2D memory copies. - # ----------- Guard for unsupported Pattern -------------- - if not copy_context.is_contiguous_copy(): - raise NotImplementedError( - "Strided GPU memory copies for N-dimensional arrays are not currently supported.\n" - f" Source node: {copy_context.src_node} (storage: {copy_context.src_storage})\n" - f" Destination node: {copy_context.dst_node} (storage: {copy_context.dst_storage})\n" - f" Source strides: {copy_context.src_strides}\n" - f" Destination strides: {copy_context.dst_strides}\n") - + Uses {backend}Memcpy2DAsync for the last two dimensions, with nested loops + for any outer dimensions. Expects the copy to be contiguous and between + row-major storage locations. 
+ """ # ----------- Extract relevant copy parameters -------------- - copy_shape, src_strides, dst_strides = copy_context.get_transfer_layout() + backend: str = common.get_gpu_backend() - backend, src_expr, dst_expr, src_location, dst_location, cudastream, ctype = \ - copy_context.get_copy_call_parameters() + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() - num_dims = copy_context.num_dims - # ----------------- Generate and write backend call(s) -------------------- + src_location, dst_location = copy_context.get_memory_location() + ctype = copy_context.get_ctype() + gpustream = copy_context.get_assigned_gpustream() + num_dims = len(copy_shape) - callsite_stream, cfg, state_id, src_node, dst_node = copy_context.get_write_context() + # ----------- Guard for unsupported Pattern -------------- + if not (src_strides[-1] == 1) and (dst_strides[-1] == 1): + src_node, dst_node = copy_context.src_node, copy_context.dst_node + src_storage = copy_context.get_storage_type(src_node) + dst_storage = copy_context.get_storage_type(dst_node) + raise NotImplementedError( + "N-dimensional GPU memory copies, that are strided or contain column-major arrays, are currently not supported.\n" + f" Source node: {src_node} (storage: {src_storage})\n" + f" Destination node: {copy_context.dst_node} (storage: {dst_storage})\n" + f" Source strides: {src_strides}\n" + f" Destination strides: {dst_strides}\n" + f" copy shape: {copy_shape}\n") + # ----------------- Generate and write backend call(s) -------------------- + + call = "" # Write for-loop headers for dim in range(num_dims - 2): - callsite_stream.write( - f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{") + call += f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{\n" # Write Memcopy2DAsync - offset_src = ' + '.join(f'(__copyidx{d} * ({s}))' for d, s in enumerate(src_strides[:-2])) - offset_dst = ' + '.join(f'(__copyidx{d} * ({s}))' for d, s in enumerate(dst_strides[:-2])) + offset_src = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(src_strides[:-2])) + offset_dst = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(dst_strides[:-2])) src = f'{src_expr} + {offset_src}' dst = f'{dst_expr} + {offset_dst}' - dpitch = f'{dst_strides[-2]} * sizeof({ctype})' - spitch = f'{src_strides[-2]} * sizeof({ctype})' - width = f'{copy_shape[-1]} * sizeof({ctype})' - height = copy_shape[-2] + dpitch = f'{sym2cpp(dst_strides[-2])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[-2])} * sizeof({ctype})' + width = f'{sym2cpp(copy_shape[-1])} * sizeof({ctype})' + height = sym2cpp(copy_shape[-2]) kind = f'{backend}Memcpy{src_location}To{dst_location}' # Generate call and write it - call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dpitch}, {src}, {spitch}, {width}, {height}, {kind}, {cudastream}));\n' - callsite_stream.write(call, cfg, state_id, [src_node, dst_node]) + call += f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dpitch}, {src}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' # Write for-loop footers for dim in range(num_dims - 2): - callsite_stream.write("}") - + call += "\n}" -################ TODO, Might need to modified further ############# + # Return the code + return call -# Below: Does collaborative copy class SyncCollaboritveGPUCopyStrategy(CopyStrategy): + """ + Implements (synchronous) collaborative GPU 
copy operations. + + This strategy generates the appropriate code for copies performed + inside GPU kernels, where multiple threads cooperate to move data + between gpu memory spaces (e.g., global to shared memory). + """ def applicable(self, copy_context: CopyContext) -> bool: """ Checks if the copy is eligible for a collaborative GPU-to-GPU copy. Conditions: - 1. The copy is between GPU memory types (shared or global). - 2. The innermost non-sequential map is scheduled on GPU_Device. + 1. The copy is between two AccessNodes + 2. The copy is between GPU memory StorageTypes (shared or global). + 3. The innermost non-sequential map is a GPU_Device-scheduled map i.e. + the copy occurs within a kernel but is not within a GPU_ThreadBlock map. """ - from dace.sdfg import scope_contains_scope - from dace.transformation import helpers + # --- Condition 1: src and dst are AccessNodes --- + src_node, dst_node = copy_context.src_node, copy_context.dst_node + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + return False - # --- Condition 1: GPU to GPU memory transfer --- + # --- Condition 2: GPU to GPU memory transfer --- + src_storage, dst_storage = copy_context.get_storage_type(src_node), copy_context.get_storage_type(dst_node) gpu_storages = {dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared} - if not (copy_context.src_storage in gpu_storages and copy_context.dst_storage in gpu_storages): - return False - dst_node = copy_context.dst_node - if isinstance(dst_node, nodes.AccessNode) and dst_node.async_copy: + if not (src_storage in gpu_storages and dst_storage in gpu_storages): return False - # --- Condition 2: Inside a GPU_Device map scope --- - state = copy_context.state_dfg - scope_dict = state.scope_dict() - - # Determine which node (src or dst) is in the deeper scope - src, dst = copy_context.src_node, copy_context.dst_node - deeper_scope_node = dst if scope_contains_scope(scope_dict, src, dst) else src - - # Determine the schedule type of the innermost non-sequential map. - # If no such map exists, use the default schedule. - current_node = deeper_scope_node - while (current_node is None or not isinstance(current_node, nodes.MapEntry) - or current_node.map.schedule == dtypes.ScheduleType.Sequential): - - parent = helpers.get_parent_map(state, current_node) - if parent is None: - current_node = None - break - current_node, state = parent - - if current_node is None: - schedule_type = dtypes.SCOPEDEFAULT_SCHEDULE[None] + # --- Condition 3: Next non-sequential Map is a GPU_Device Map --- + next_nonseq_parent_map = self._next_non_seq_parent_map(copy_context) + if next_nonseq_parent_map is None: + return False else: - schedule_type = current_node.map.schedule + return next_nonseq_parent_map.map.schedule == dtypes.ScheduleType.GPU_Device + + def generate_copy(self, copy_context: CopyContext, kernel_dimensions_maps: Dict[nodes.MapEntry, + Tuple[List, List]]) -> str: + """ + Generates a GPU copy call as a string using DaCe's runtime CUDA copy functions. - return schedule_type == dtypes.ScheduleType.GPU_Device + The function determines the appropriate templated copy function from + `dace/libraries/runtime/include/dace/cuda/copy.cuh` and constructs + the call string with the necessary arguments, including kernel block + dimensions and optional accumulation/reduction information. - def generate_copy(self, copy_context: CopyContext) -> None: + Parameters + ---------- + copy_context : CopyContext + Helper object containing information about the copy. 
- from dace.frontend import operations + kernel_dimensions_maps : Dict[nodes.MapEntry, Tuple[List, List]] + Kernel map (GPU_Devie scheduled map) entry nodes to (grid_dims, block_dims); + block_dims needed in templating. - # Get required copy information - copy_shape, src_strides, dst_strides = copy_context.get_transfer_layout() - src_expr, dst_expr = copy_context.src_expr, copy_context.dst_expr + Returns + ------- + str + The GPU copy call in C++ as a string. + + Notes + ----- + - The kernel block size could be derived, but since this function is typically called + from `ExperimentalCUDACodeGen`, it is provided as input to avoid recomputation. + - The template functions use a parameter called 'is_async', which is set to True here + because `ExperimentalCUDACodeGen` inserts "__syncthreads()" explicitly in tasklets. + """ + # ----------- Retrieve relevant copy information -------------- + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() sdfg = copy_context.sdfg dtype = copy_context.src_node.desc(sdfg).dtype ctype = dtype.ctype # Get copy function name (defined in runtime library) - num_dims = copy_context.num_dims - src_storage_name = self._get_storagename(copy_context.src_storage) - dst_storage_name = self._get_storagename(copy_context.dst_storage) - + num_dims = len(copy_shape) + src_node, dst_node = copy_context.src_node, copy_context.dst_node + src_storage, dst_storage = copy_context.get_storage_type(src_node), copy_context.get_storage_type(dst_node) + src_storage_name = self._get_storagename(src_storage) + dst_storage_name = self._get_storagename(dst_storage) function_name = f"dace::{src_storage_name}To{dst_storage_name}{num_dims}D" - # Check for write-conflict resolution (WCR), it affects function call - accum = '' - custom_reduction = [] - _, _, _, _, memlet = copy_context.edge - wcr = memlet.wcr - - if wcr is not None: - reduction_type = operations.detect_reduction_type(wcr) - - if reduction_type != dtypes.ReductionType.Custom: - # Use predefined reduction - reduction_type_str = str(reduction_type).split('.')[-1] # e.g., "Sum" - reduction_template = f"" - else: - custom_reduction = [unparse_cr(sdfg, wcr, dtype)] - reduction_template = "" - - accum = f"::template Accum{reduction_template}" - - # Dispatch to the correct backend copy template based on copy characteristics + # Extract WCR info (accumulation template + optional custom reduction) + accum, custom_reduction = self._get_accumulation_info(copy_context) + custom_reduction = [custom_reduction] if custom_reduction else [] - # get always used stuff - callsite_stream, cfg, state_id, src_node, dst_node = copy_context.get_write_context() + # Get parent kernel block dimensions (guaranteed GPU_Device) and sync flag + parent_kernel = self._next_non_seq_parent_map(copy_context) + block_dims = ", ".join(sym2cpp(kernel_dimensions_maps[parent_kernel][1])) + synchronized = "true" # Legacy 'is_async'; sync barriers handled by passes (see docstring) - # Retrieve kernel specs from the ExperimentalCUDACodegen instance (held in a dedicated class) - # Only there block_dims is stored, which is needed in this case - kernel_specifications: KernelSpec = copy_context.codegen._current_kernel_spec - block_dims = ', '.join(symbolic_to_cpp(kernel_specifications.block_dims)) - - # was called "is_async" previously. It determines whether a "__syncthreads()" is called at the - # end of the copy. 
In ExperimentalCUDACodegen, a pass is responsible to insert such sync barriers, - # so it is synchronized and we do not need "implicit" synchronization - synchronized = "true" + # ------------------------- Generate copy call ---------------------------- if any(symbolic.issymbolic(s, copy_context.sdfg.constants) for s in copy_shape): args_list = ([src_expr] + src_strides + [dst_expr] + custom_reduction + dst_strides + copy_shape) - args = ", ".join(symbolic_to_cpp(args_list)) - callsite_stream.write(f"{function_name}Dynamic<{ctype}, {block_dims}, {synchronized}>{accum}({args});", cfg, - state_id, [src_node, dst_node]) + args = ", ".join(sym2cpp(args_list)) + call = f"{function_name}Dynamic<{ctype}, {block_dims}, {synchronized}>{accum}({args});" elif function_name == "dace::SharedToGlobal1D": - # special case: use a new template struct that provides functions for copy and reduction - copy_size = ', '.join(symbolic_to_cpp(copy_shape)) + copy_size = ', '.join(sym2cpp(copy_shape)) accum = accum or '::Copy' args_list = ([src_expr] + src_strides + [dst_expr] + dst_strides + custom_reduction) - args = ", ".join(symbolic_to_cpp(args_list)) - callsite_stream.write( - f"{function_name}<{ctype}, {block_dims}, {copy_size}, {synchronized}>{accum}({args});", cfg, state_id, - [src_node, dst_node]) + args = ", ".join(sym2cpp(args_list)) + call = f"{function_name}<{ctype}, {block_dims}, {copy_size}, {synchronized}>{accum}({args});" else: - copy_size = ', '.join(symbolic_to_cpp(copy_shape)) + copy_size = ', '.join(sym2cpp(copy_shape)) args_list = ([src_expr] + src_strides + [dst_expr] + custom_reduction) - args = ", ".join(symbolic_to_cpp(args_list)) - dst_strides_unpacked = ", ".join(symbolic_to_cpp(dst_strides)) - callsite_stream.write( - f"{function_name}<{ctype}, {block_dims}, {copy_size}, {dst_strides_unpacked}, {synchronized}>{accum}({args});", - cfg, state_id, [src_node, dst_node]) + args = ", ".join(sym2cpp(args_list)) + dst_strides_unpacked = ", ".join(sym2cpp(dst_strides)) + call = f"{function_name}<{ctype}, {block_dims}, {copy_size}, {dst_strides_unpacked}, {synchronized}>{accum}({args});" + + return call + + def _get_accumulation_info(self, copy_context: CopyContext) -> Tuple[str, str]: + """ + Extracts write-conflict resolution (WCR) information from the copy context + and returns the accumulation/reduction template components needed for the + final templated function call in `generate_copy()`. + + This method processes WCR information from the memlet and generates the + appropriate C++ template strings for both predefined and custom reductions. + + Parameters + ---------- + copy_context : CopyContext + Copy context containing the copy operation details, including + the memlet with WCR information. + + Returns + ------- + Tuple[str, str] + A tuple containing: + - accum : str + Template accumulation string for the function call. Empty string if no WCR, + `"::template Accum"` for predefined reductions, or `"::template Accum"` for custom reductions. + - custom_reduction : str + C++ formatted custom reduction code string. Empty string for no WCR or predefined reductions, + unparsed custom reduction code for custom reductions. 
+ """ + sdfg = copy_context.sdfg + dtype = copy_context.src_node.desc(sdfg).dtype + memlet = copy_context.edge.data + wcr = memlet.wcr + reduction_type = operations.detect_reduction_type(wcr) + + if wcr is None: + accum, custom_reduction = "", "" + + elif reduction_type != dtypes.ReductionType.Custom: + # Use predefined reduction + reduction_type_str = str(reduction_type).split(".")[-1] # e.g., "Sum" + accum = f"::template Accum" + custom_reduction = "" + + else: + accum = "::template Accum" + custom_reduction = unparse_cr(sdfg, wcr, dtype) + + return accum, custom_reduction def _get_storagename(self, storage: dtypes.StorageType): """ @@ -513,117 +721,36 @@ def _get_storagename(self, storage: dtypes.StorageType): storage_name = str(storage) return storage_name[storage_name.rindex('_') + 1:] + def _next_non_seq_parent_map(self, copy_context: CopyContext) -> Optional[nodes.MapEntry]: + """ + Traverse up the parent map chain from the deeper of src_node or dst_node + in `copy_context` and return the first parent MapEntry whose schedule + is not sequential. -class AsyncCollaboritveGPUCopyStrategy(CopyStrategy): - - def applicable(self, copy_context: CopyContext) -> bool: - - from dace.sdfg import scope_contains_scope - from dace.transformation import helpers - - # --- Condition 1: GPU to GPU memory transfer --- - gpu_storages = {dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared} - if not (copy_context.src_storage in gpu_storages and copy_context.dst_storage in gpu_storages): - return False + Parameters + ---------- + copy_context : CopyContext + Context information about the memory copy. - dst_node = copy_context.dst_node - if not (isinstance(dst_node, nodes.AccessNode) and dst_node.async_copy): - return False - - # --- Condition 2: Inside a GPU_Device map scope --- - state = copy_context.state_dfg + Returns + ------- + Optional[nodes.MapEntry] + The first non-sequential parent MapEntry encountered, or None if no + such parent exists. + """ + src_node, dst_node = copy_context.src_node, copy_context.dst_node + state = copy_context.state scope_dict = state.scope_dict() # Determine which node (src or dst) is in the deeper scope - src, dst = copy_context.src_node, copy_context.dst_node - deeper_scope_node = dst if scope_contains_scope(scope_dict, src, dst) else src - - # Determine the schedule type of the innermost non-sequential map. - # If no such map exists, use the default schedule. 
- current_node = deeper_scope_node + deeper_node = dst_node if scope_contains_scope(scope_dict, src_node, dst_node) else src_node + current_node = deeper_node while (current_node is None or not isinstance(current_node, nodes.MapEntry) or current_node.map.schedule == dtypes.ScheduleType.Sequential): - parent = helpers.get_parent_map(state, current_node) if parent is None: current_node = None break current_node, state = parent - if current_node is None: - schedule_type = dtypes.SCOPEDEFAULT_SCHEDULE[None] - else: - schedule_type = current_node.map.schedule - - return schedule_type == dtypes.ScheduleType.GPU_Device - - def generate_copy(self, copy_context: CopyContext): - - # Show Yakup: - # Asynchronous memory copies are only allowed if they are contiguous - if not copy_context.is_contiguous_copy(): - raise NotImplementedError("Asynchronous memory copies are not supported for not contigous memory copies") - - # Get required copy information - copy_shape, src_strides, dst_strides = copy_context.get_transfer_layout() - src_expr, dst_expr = copy_context.src_expr, copy_context.dst_expr - - sdfg = copy_context.sdfg - dtype = copy_context.src_node.desc(sdfg).dtype - ctype = dtype.ctype - - # Get write context: - callsite_stream, cfg, state_id, src_node, dst_node = copy_context.get_write_context() - # copy dimension - num_dims = len(copy_shape) - - if num_dims == 1: - pipeline = dst_node.async_pipeline - size = f'{product(copy_shape)} *sizeof({ctype})' - callsite_stream.write(f"cuda::memcpy_async(block, {dst_expr}, {src_expr}, {size}, {pipeline});\n", cfg, - state_id, [src_node, dst_node]) - - elif num_dims > 1: - - # No built-in functionality for higher dimension copies- - # But solvable looping and doing 1D copies - - # write for-loop header: - for dim in range(num_dims - 1): - callsite_stream.write( - f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{") - - offset_src = ' + '.join(f'(__copyidx{d} * ({s}))' for d, s in enumerate(src_strides[:-1])) - offset_dst = ' + '.join(f'(__copyidx{d} * ({s}))' for d, s in enumerate(dst_strides[:-1])) - - size = f'{copy_shape[-1]} *sizeof({ctype})' - src = f'{src_expr} + {offset_src}' - dst = f'{dst_expr} + {offset_dst}' - - callsite_stream.write(f"cuda::memcpy_async(block, {dst}, {src}, {size}, {pipeline});\n", cfg, state_id, - [src_node, dst_node]) - - # Write for-loop footers - for dim in range(num_dims - 2): - callsite_stream.write("}") - - else: - # Should not be possible- otherwise, doing nothing is also okay - # because a empty copy shape means we don't copy anything - pass - - emit_sync_debug_checks(copy_context.backend, copy_context.callsite_stream) - - -class FallBackGPUCopyStrategy(CopyStrategy): - - def applicable(self, copy_context: CopyContext) -> bool: - return True - - def generate_copy(self, copy_context: CopyContext): - callsite_stream, cfg, state_id, src_node, dst_node = copy_context.get_write_context() - sdfg = copy_context.sdfg - dfg = copy_context.dfg - edge = copy_context.edge - cpu_codegen = copy_context.codegen._cpu_codegen - cpu_codegen.copy_memory(sdfg, cfg, dfg, state_id, src_node, dst_node, edge, None, callsite_stream) + return current_node diff --git a/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py b/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py index b2be3f0872..329547331a 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py +++ b/dace/codegen/targets/experimental_cuda_helpers/gpu_stream_manager.py @@ -7,8 +7,8 @@ class 
GPUStreamManager: """ Manage GPU backend streams (e.g., CUDA or HIP) for nodes in an SDFG. - Nodes are assigned stream IDs by the NaiveGPUStreamScheduler Pass, and - this class provides their access expressions and tracks the number of streams + Nodes are assigned stream IDs by the NaiveGPUStreamScheduler Pass, and + this class provides their access expressions and tracks the number of streams in use. GPU events are not (yet) supported. Note @@ -22,8 +22,6 @@ def __init__(self, sdfg: SDFG, gpustream_assignments: Dict[nodes.Node, int]): self._gpustream_assignments = gpustream_assignments self._num_gpu_streams = max(gpustream_assignments.values()) + 1 if gpustream_assignments else 0 self._num_gpu_events = 0 - - def get_stream_node(self, node: nodes.Node) -> str: """ @@ -46,16 +44,12 @@ def get_stream_node(self, node: nodes.Node) -> str: If the given node does not have an assigned stream. """ if node in self.gpustream_assignments: - return self._stream_access_template.format( - gpu_stream=self.gpustream_assignments[node] - ) + return self._stream_access_template.format(gpu_stream=self.gpustream_assignments[node]) else: - raise ValueError( - f"No GPU stream assigned to node {node}. " - "Check whether the node is relevant for GPU stream assignment and, if it is, " - "inspect the GPU stream pipeline to see why no stream was assigned." - ) - + raise ValueError(f"No GPU stream assigned to node {node}. " + "Check whether the node is relevant for GPU stream assignment and, if it is, " + "inspect the GPU stream pipeline to see why no stream was assigned.") + def get_stream_edge(self, src_node: nodes.Node, dst_node: nodes.Node) -> str: """ Returns the GPU stream access expression for an edge. @@ -65,12 +59,10 @@ def get_stream_edge(self, src_node: nodes.Node, dst_node: nodes.Node) -> str: modeled via tasklets in the SDFG, so edges do not carry stream info. Implement this if the design changes and edges need streams again. """ - raise NotImplementedError( - "Edge-level GPU streams are not supported. " - "They were previously used for asynchronous memory copies (e.g., cudaMemcpyAsync), " - "but these are now modeled via tasklets in the SDFG. " - "Implement this if the design changes and edges must carry GPU stream information." - ) + raise NotImplementedError("Edge-level GPU streams are not supported. " + "They were previously used for asynchronous memory copies (e.g., cudaMemcpyAsync), " + "but these are now modeled via tasklets in the SDFG. " + "Implement this if the design changes and edges must carry GPU stream information.") @property def num_gpu_events(self) -> int: @@ -81,9 +73,8 @@ def num_gpu_events(self) -> int: def num_gpu_streams(self) -> int: """Number of GPU streams in use (stream IDs start at 0).""" return self._num_gpu_streams - + @property def gpustream_assignments(self) -> Dict[nodes.Node, int]: """Mapping of nodes to assigned GPU stream IDs (not all nodes necessarily have a GPU stream ID).""" return self._gpustream_assignments - \ No newline at end of file diff --git a/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py index 8a068b0b9d..27c073afc8 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py +++ b/dace/codegen/targets/experimental_cuda_helpers/gpu_utils.py @@ -1,24 +1,17 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
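As a rough usage sketch of the stream-manager idea described above (the node names and the access-expression template below are assumptions; the real assignments come from the NaiveGPUStreamScheduler pass and the real expression from the DaCe runtime state struct):

```python
# Sketch only: nodes mapped to stream IDs, with a per-node access expression.
from typing import Dict


class _TinyStreamManager:
    """Illustrative stand-in; the access template is an assumed placeholder."""

    def __init__(self, assignments: Dict[str, int],
                 template: str = "__state->gpu_context->streams[{gpu_stream}]"):
        self._assignments = assignments
        self._template = template
        self.num_streams = max(assignments.values()) + 1 if assignments else 0

    def get_stream_node(self, node: str) -> str:
        if node not in self._assignments:
            raise ValueError(f"No GPU stream assigned to node {node}.")
        return self._template.format(gpu_stream=self._assignments[node])


mgr = _TinyStreamManager({"kernel_map": 0, "copy_out": 1})
print(mgr.get_stream_node("copy_out"))  # __state->gpu_context->streams[1]
print(mgr.num_streams)                  # 2
```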
import functools import sympy -from typing import Set, List, Optional +from typing import Set, List import dace -from dace import Config, symbolic, data as dt, dtypes +from dace import Config, data as dt, dtypes from dace.sdfg import nodes, SDFGState -from dace.codegen import common, cppunparse +from dace.codegen import common from dace.codegen.dispatcher import DefinedType -from dace.codegen.prettycode import CodeIOStream from dace.transformation.helpers import get_parent_map -def symbolic_to_cpp(arr): - """ Converts an array of symbolic variables (or one) to C++ strings. """ - if not isinstance(arr, list): - return cppunparse.pyexpr2cpp(symbolic.symstr(arr, cpp_mode=True)) - return [cppunparse.pyexpr2cpp(symbolic.symstr(d, cpp_mode=True)) for d in arr] - - def get_cuda_dim(idx): """ Converts 0 to x, 1 to y, 2 to z, or raises an exception. """ if idx < 0 or idx > 2: @@ -99,9 +92,10 @@ def validate_block_size_limits(kernel_map_entry: nodes.MapEntry, block_size: Lis 'thread-block size. To increase this limit, modify the ' '`compiler.cuda.block_size_lastdim_limit` configuration entry.') + def generate_sync_debug_call() -> str: """ - Generate backend sync and error-check calls as a string if + Generate backend sync and error-check calls as a string if synchronous debugging is enabled. Parameters @@ -117,13 +111,12 @@ def generate_sync_debug_call() -> str: backend: str = common.get_gpu_backend() sync_call: str = "" if Config.get_bool('compiler', 'cuda', 'syncdebug'): - sync_call = ( - f"DACE_GPU_CHECK({backend}GetLastError());\n" - f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n" - ) + sync_call = (f"DACE_GPU_CHECK({backend}GetLastError());\n" + f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n") return sync_call + def get_defined_type(data: dt.Data) -> DefinedType: """ Return the DefinedType for a data descriptor. @@ -136,7 +129,8 @@ def get_defined_type(data: dt.Data) -> DefinedType: else: raise NotImplementedError(f"Data type '{type(data).__name__}' is not supported for defined type inference." "Only Scalars and Arrays are expected for Kernels.") - + + def is_within_schedule_types(state: SDFGState, node: nodes.Node, schedules: Set[dtypes.ScheduleType]) -> bool: """ Checks if the given node is enclosed within a Map whose schedule type @@ -146,7 +140,7 @@ def is_within_schedule_types(state: SDFGState, node: nodes.Node, schedules: Set[ ---------- state : SDFGState The State where the node resides - node : nodes.Node + node : nodes.Node The node to check. schedules : set[dtypes.ScheduleType] A set of schedule types to match (e.g., {dtypes.ScheduleType.GPU_Device}). @@ -166,4 +160,4 @@ def is_within_schedule_types(state: SDFGState, node: nodes.Node, schedules: Set[ parent = get_parent_map(state, current) if parent is None: return False - current, state = parent \ No newline at end of file + current, state = parent diff --git a/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py deleted file mode 100644 index 9ee6b398b1..0000000000 --- a/dace/codegen/targets/experimental_cuda_helpers/new_copy_strategies.py +++ /dev/null @@ -1,763 +0,0 @@ -# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
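For illustration, the string returned by `generate_sync_debug_call` when `compiler.cuda.syncdebug` is enabled would look like the following sketch (backend assumed to be `cuda`; HIP would use a different prefix):

```python
# Sketch of the sync-debug string; backend value is an assumption.
backend = "cuda"
sync_call = (f"DACE_GPU_CHECK({backend}GetLastError());\n"
             f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n")
print(sync_call)
# DACE_GPU_CHECK(cudaGetLastError());
# DACE_GPU_CHECK(cudaDeviceSynchronize());
```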
-from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional, Tuple, Union - -from dace import SDFG, SDFGState, data, dtypes, subsets -from dace import memlet as mm -from dace import symbolic -from dace.codegen import common -from dace.codegen.targets import cpp -from dace.codegen.targets.cpp import unparse_cr -from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import symbolic_to_cpp, generate_sync_debug_call -from dace.config import Config -from dace.dtypes import StorageType -from dace.frontend import operations -from dace.sdfg import nodes, scope_contains_scope -from dace.sdfg.graph import MultiConnectorEdge -from dace.transformation import helpers - -class CopyContext: - """ - Encapsulates inputs required for copy operations and exposes helper - methods to derive additional information. This keeps copy strategies - lightweight by letting them focus only on the relevant logic. - """ - - def __init__(self, sdfg: SDFG, state: SDFGState, src_node: nodes.Node, dst_node: nodes.Node, - edge: MultiConnectorEdge[mm.Memlet], gpustream_assignments: Dict[nodes.Node, Union[int, str]]): - - # Store the basic context as attributes - self.sdfg = sdfg - self.state = state - self.src_node = src_node - self.dst_node = dst_node - self.edge = edge - self.gpustream_assignments = gpustream_assignments - - memlet = edge.data - - self.copy_shape = memlet.subset.size_exact() - if isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode): - copy_shape, src_strides, dst_strides, src_expr, dst_expr = self.get_accessnode_to_accessnode_copy_info() - else: - copy_shape = memlet.subset.size_exact() - src_strides = dst_strides = src_expr = dst_expr = None - - self.copy_shape = copy_shape - self.src_strides = src_strides - self.dst_strides = dst_strides - self.src_expr = src_expr - self.dst_expr = dst_expr - - def get_storage_type(self, node: nodes.Node): - """ - Return the storage type associated with a given SDFG node. - - Tasklets are assumed to use register storage, while AccessNodes - return the storage type from their data descriptor. Raises - NotImplementedError for unsupported node types. - """ - if isinstance(node, nodes.Tasklet): - storage_type = StorageType.Register - - elif isinstance(node, nodes.AccessNode): - storage_type = node.desc(self.sdfg).storage - - else: - raise NotImplementedError( - f"Unsupported node type {type(node)} for storage type retrieval; " - "expected AccessNode or Tasklet. Please extend this method accordingly." - ) - - return storage_type - - def get_assigned_gpustream(self) -> str: - """ - Return the GPU stream expression assigned to both source and destination nodes. - - Ensures that both nodes have a matching stream ID, then constructs the - variable name from the configured prefix and stream ID. Raises ValueError - if assignments are missing or inconsistent. - - Example: - If the configured prefix is 'gpu_stream' and the assigned stream ID is 0, - this method returns 'gpu_stream0'. - """ - src_stream = self.gpustream_assignments.get(self.src_node) - dst_stream = self.gpustream_assignments.get(self.dst_node) - - # 1. Catch unsupported cases - if src_stream is None or dst_stream is None: - raise ValueError("GPU stream assignment missing for source or destination node.") - - if src_stream != dst_stream: - raise ValueError( - f"Mismatch in assigned GPU streams: src_node has '{src_stream}', " - f"dst_node has '{dst_stream}'. They must be the same." - ) - - # 2. 
Generate GPU stream expression - gpustream = src_stream - gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] - gpustream_expr = f"{gpustream_var_name_prefix}{gpustream}" - - return gpustream_expr - - def get_memory_location(self) -> Tuple[str, str]: - """ - Determine whether the source and destination nodes reside in device or host memory. - - Uses the storage type of each node to classify it as either 'Device' - (GPU global memory) or 'Host' (all other storage types). - Used for GPU related copies outside the kernel (e.g. to construct - cudaMemcpyHostToDevice for example). - - Returns - ------- - Tuple[str, str] - (src_location, dst_location) where each is either 'Device' or 'Host'. - """ - src_storage = self.get_storage_type(self.src_node) - dst_storage = self.get_storage_type(self.dst_node) - src_location = 'Device' if src_storage == dtypes.StorageType.GPU_Global else 'Host' - dst_location = 'Device' if dst_storage == dtypes.StorageType.GPU_Global else 'Host' - - return src_location, dst_location - - def get_ctype(self) -> Any: - """ - Determine the C data type (ctype) of the source or destination node. - - The ctype is resolved from the data descriptor of the first node - (source or destination) that is an AccessNode (assumed to be the same - if both are AccessNodes). - - Returns - ------- - Any - The C type string (e.g., "float*", "int32") associated with the node. - - Raises - ------ - NotImplementedError - If neither the source nor the destination node is an AccessNode. - """ - sdfg = self.sdfg - src_node, dst_node = self.src_node, self.dst_node - - if isinstance(src_node, nodes.AccessNode): - return src_node.desc(sdfg).ctype - - if isinstance(dst_node, nodes.AccessNode): - return dst_node.desc(sdfg).ctype - - raise NotImplementedError( - f"Cannot determine ctype: neither src nor dst node is an AccessNode. " - f"Got src_node type: {type(src_node).__name__}, dst_node type: {type(dst_node).__name__}. " - "Please extend this case or fix the issue." - ) - - def get_accessnode_to_accessnode_copy_info(self): - """ - Compute copy shape, absolute strides, and pointer expressions for a copy - between two AccessNodes. Tries to mimic - cpp.memlet_copy_to_absolute_strides without requiring a dispatcher. - - Returns - ------- - (copy_shape, src_strides, dst_strides, src_expr, dst_expr) - - Raises - ------ - TypeError - If either endpoint is not an AccessNode. - NotImplementedError - If a descriptor is not Scalar or Array. 
- """ - - # ---------------------------- helpers ---------------------------- - def _collapse_strides(strides, subset): - """Remove size-1 dims; keep tile strides; default to [1] if none remain.""" - n = len(subset) - collapsed = [st for st, sz in zip(strides, subset.size()) if sz != 1] - collapsed.extend(strides[n:]) # include tiles - if len(collapsed) == 0: - return [1] - return collapsed - - def _ptr_name(desc, name): - if desc.transient and desc.lifetime in ( - dtypes.AllocationLifetime.Persistent, dtypes.AllocationLifetime.External - ): - return f'__state->__{sdfg.cfg_id}_{name}' - return name - - def _expr_for(desc, name, subset): - ptr = _ptr_name(desc, name) - - if isinstance(desc, data.Scalar): - # GPU scalar special-case - if desc.storage in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN: - parent = state.sdfg.parent_nsdfg_node - if parent is not None and name in parent.in_connectors: - return f"&{ptr}" - return ptr - # CPU (or other) scalars - return f"&{ptr}" - - if isinstance(desc, data.Array): - offset = cpp.cpp_offset_expr(desc, subset) - return f"{ptr} + {offset}" if offset != "0" else ptr - - raise NotImplementedError( - f"Expected {name} to be either data.Scalar or data.Array, but got {type(desc).__name__}." - ) - - # ---------------------------- Get copy info ---------------------------- - # Get needed information - src_node, dst_node = self.src_node, self.dst_node - sdfg, edge, state = self.sdfg, self.edge, self.state - memlet, copy_shape = self.edge.data, self.copy_shape - - # Guard - only applicable if src and dst are AccessNodes - if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): - raise TypeError( - f"get_accessnode_to_accessnode_copy_info requires both source and destination " - f"to be AccessNode instances, but got {type(src_node).__name__} and {type(dst_node).__name__}." 
- ) - - # Get node descriptors - src_nodedesc = src_node.desc(sdfg) - dst_nodedesc = dst_node.desc(sdfg) - - # Resolve subsets (fallback to full range) - src_subset = memlet.get_src_subset(edge, state) - dst_subset = memlet.get_dst_subset(edge, state) - - if src_subset is None: - src_subset = subsets.Range.from_array(src_nodedesc) - - if dst_subset is None: - dst_subset = subsets.Range.from_array(dst_nodedesc) - - # Get strides - src_strides = src_subset.absolute_strides(src_nodedesc.strides) - dst_strides = dst_subset.absolute_strides(dst_nodedesc.strides) - - # Try to convert to a degenerate/strided ND copy first - result = cpp.ndcopy_to_strided_copy( - copy_shape, - src_nodedesc.shape, - src_strides, - dst_nodedesc.shape, - dst_strides, - memlet.subset, - src_subset, - dst_subset, - ) - - if result is not None: - copy_shape, src_strides, dst_strides = result - else: - src_strides = _collapse_strides(src_strides, src_subset) - dst_strides = _collapse_strides(dst_strides, dst_subset) - copy_shape = [s for s in copy_shape if s != 1] or [1] - - # Extend copy shape to the largest among the data dimensions, - # and extend other array with the appropriate strides - if len(dst_strides) != len(copy_shape) or len(src_strides) != len(copy_shape): - if memlet.data == src_node.data: - copy_shape, dst_strides = cpp.reshape_strides(src_subset, src_strides, dst_strides, copy_shape) - elif memlet.data == dst_node.data: - copy_shape, src_strides = cpp.reshape_strides(dst_subset, dst_strides, src_strides, copy_shape) - - # Build final expressions - src_expr = _expr_for(src_nodedesc, src_node.data, src_subset) - dst_expr = _expr_for(dst_nodedesc, dst_node.data, dst_subset) - - return copy_shape, src_strides, dst_strides, src_expr, dst_expr - - -class CopyStrategy(ABC): - """Abstract base class for memory copy strategies.""" - - @abstractmethod - def applicable(self, copy_context: CopyContext) -> bool: - """ - Return True if this strategy can handle the given memory copy. - """ - raise NotImplementedError('Abstract class') - - @abstractmethod - def generate_copy(self, copy_context: CopyContext) -> str: - """ - Generates and returns the copy code for the supported pattern. - """ - raise NotImplementedError('Abstract class') - - -class OutOfKernelCopyStrategy(CopyStrategy): - """ - Copy strategy for memory transfers that occur outside of kernel execution. - - This pattern often occurs when generating host-to-device copies for kernel inputs - (since kernels cannot access host memory directly), and device-to-host copies - to retrieve results for further processing. - """ - - def applicable(self, copy_context: CopyContext) -> bool: - """ - Determines whether the data movement is a host<->device memory copy. - - This function returns True if: - - We are not currently generating kernel code - - The copy occurs between two AccessNodes - - The data descriptors of source and destination are not views. - - The storage types of either src or dst is CPU_Pinned or GPU_Device - - We do not have a CPU-to-CPU copy - """ - # Retrieve needed information - state = copy_context.state - src_node, dst_node = copy_context.src_node, copy_context.dst_node - - # 1. 
Ensure copy is not occuring within a kernel - scope_dict = state.scope_dict() - deeper_node = dst_node if scope_contains_scope(scope_dict, src_node, dst_node) else src_node - - parent_map_tuple = helpers.get_parent_map(state, deeper_node) - while parent_map_tuple is not None: - parent_map, parent_state = parent_map_tuple - if parent_map.map.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: - return False - else: - parent_map_tuple = helpers.get_parent_map(parent_state, parent_map) - - # 2. Check whether copy is between two AccessNodes - if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): - return False - - # 3. The data descriptors of source and destination are not views - if isinstance(src_node.desc(state), data.View) or isinstance(dst_node.desc(state), data.View): - return False - - # 4. Check that one StorageType of either src or dst is CPU_Pinned or GPU_Device - src_storage = copy_context.get_storage_type(src_node) - dst_storage = copy_context.get_storage_type(dst_node) - if not (src_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned) or - dst_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned)): - return False - - # 5. Check that this is not a CPU to CPU copy - cpu_storage_types = [StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned] - if src_storage in cpu_storage_types and dst_storage in cpu_storage_types: - return False - - return True - - def generate_copy(self, copy_context: CopyContext) -> str: - """Execute host-device copy with CUDA memory operations""" - - # Guard - memlet = copy_context.edge.data - if memlet.wcr is not None: - src_location, dst_location = copy_context.get_memory_location() - raise NotImplementedError(f'Accumulate {src_location} to {dst_location} not implemented') - - # Based on the copy dimension, call appropiate helper function - num_dims = len(copy_context.copy_shape) - if num_dims == 1: - copy_call = self._generate_1d_copy(copy_context) - - elif num_dims == 2: - copy_call = self._generate_2d_copy(copy_context) - - else: - # sanity check - assert num_dims > 2, f"Expected copy shape with more than 2 dimensions, but got {num_dims}." - copy_call = self._generate_nd_copy(copy_context) - - return copy_call - - def _generate_1d_copy(self, copy_context: CopyContext) -> str: - """ - Generates a 1D memory copy between host and device using the GPU backend. - - Uses {backend}MemcpyAsync for contiguous memory. For strided memory, - {backend}Memcpy2DAsync is leveraged to efficiently handle the stride along one dimension. 
- """ - # ----------- Retrieve relevant copy parameters -------------- - backend: str = common.get_gpu_backend() - - # Due to applicable(), src and dst node must be AccessNodes - copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() - - src_location, dst_location = copy_context.get_memory_location() - is_contiguous_copy = (src_strides[-1] == 1) and (dst_strides[-1] == 1) - ctype = copy_context.get_ctype() - gpustream = copy_context.get_assigned_gpustream() - - # ----------------- Generate backend call -------------------- - - if is_contiguous_copy: - # Memory is linear: can use {backend}MemcpyAsync - copysize = ' * '.join(symbolic_to_cpp(copy_shape)) - copysize += f' * sizeof({ctype})' - kind = f'{backend}Memcpy{src_location}To{dst_location}' - call = f'DACE_GPU_CHECK({backend}MemcpyAsync({dst_expr}, {src_expr}, {copysize}, {kind}, {gpustream}));\n' - - else: - # Memory is strided: use {backend}Memcpy2DAsync with dpitch/spitch - # This allows copying a strided 1D region - dpitch = f'{symbolic_to_cpp(dst_strides[0])} * sizeof({ctype})' - spitch = f'{symbolic_to_cpp(src_strides[0])} * sizeof({ctype})' - width = f'sizeof({ctype})' - height = symbolic_to_cpp(copy_shape[0]) - kind = f'{backend}Memcpy{src_location}To{dst_location}' - - call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' - - # Potentially snychronization required if syncdebug is set to true in configurations - call = call + generate_sync_debug_call() - return call - - def _generate_2d_copy(self, copy_context: CopyContext) -> None: - """ - Generates a 2D memory copy using {backend}Memcpy2DAsync. - - Three main cases are handled: - - Copy between row-major stored arrays with contiguous rows. - - Copy between column-major stored arrays with contiguous columns. - - A special case where a 2D copy can still be represented. - - Raises: - NotImplementedError: Raised if the source and destination strides do not match any of the handled patterns. - Such cases indicate an unsupported 2D copy and should be examined separately. - They can be implemented if valid, or a more descriptive error should be raised if the path should not occur. - - Note: - {backend}Memcpy2DAsync supports strided copies along only one dimension (row or column), - but not both simultaneously. - """ - - # ----------- Extract relevant copy parameters -------------- - backend: str = common.get_gpu_backend() - - # Due to applicable(), src and dst node must be AccessNodes - copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() - src_location, dst_location = copy_context.get_memory_location() - ctype = copy_context.get_ctype() - gpustream = copy_context.get_assigned_gpustream() - - # ----------------- Generate backend call if supported -------------------- - - # Case: Row-major layout, rows are not strided. - if (src_strides[1] == 1) and (dst_strides[1] == 1): - dpitch = f'{symbolic_to_cpp(dst_strides[0])} * sizeof({ctype})' - spitch = f'{symbolic_to_cpp(src_strides[0])} * sizeof({ctype})' - width = f'{symbolic_to_cpp(copy_shape[1])} * sizeof({ctype})' - height = f'{symbolic_to_cpp(copy_shape[0])}' - kind = f'{backend}Memcpy{src_location}To{dst_location}' - - call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' - - # Case: Column-major layout, no columns are strided. 
- elif (src_strides[0] == 1) and (dst_strides[0] == 1): - dpitch = f'{symbolic_to_cpp(dst_strides[1])} * sizeof({ctype})' - spitch = f'{symbolic_to_cpp(src_strides[1])} * sizeof({ctype})' - width = f'{symbolic_to_cpp(copy_shape[0])} * sizeof({ctype})' - height = f'{symbolic_to_cpp(copy_shape[1])}' - kind = f'{backend}Memcpy{src_location}To{dst_location}' - - call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' - - # Special case - elif (src_strides[0] / src_strides[1] == copy_shape[1] and dst_strides[0] / dst_strides[1] == copy_shape[1]): - # Consider as an example this copy: A[0:I, 0:J, K] -> B[0:I, 0:J] with - # copy shape [I, J], src_strides[J*K, K], dst_strides[J, 1]. This can be represented with a - # {backend}Memcpy2DAsync call! - - dpitch = f'{symbolic_to_cpp(dst_strides[1])} * sizeof({ctype})' - spitch = f'{symbolic_to_cpp(src_strides[1])} * sizeof({ctype})' - width = f'sizeof({ctype})' - height = symbolic_to_cpp(copy_shape[0] * copy_shape[1]) - kind = f'{backend}Memcpy{src_location}To{dst_location}' - - call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' - - else: - raise NotImplementedError( - f"Unsupported 2D memory copy: shape={copy_shape}, src_strides={src_strides}, dst_strides={dst_strides}." - "Please implement this case if it is valid, or raise a more descriptive error if this path should not be taken." - ) - - return call - - def _generate_nd_copy(self, copy_context: CopyContext) -> None: - """ - Generates GPU code for copying N-dimensional arrays using 2D memory copies. - - Uses {backend}Memcpy2DAsync for the last two dimensions, with nested loops - for any outer dimensions. Expects the copy to be contiguous and between - row-major storage locations. 
- """ - # ----------- Extract relevant copy parameters -------------- - backend: str = common.get_gpu_backend() - - # Due to applicable(), src and dst node must be AccessNodes - copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() - - src_location, dst_location = copy_context.get_memory_location() - ctype = copy_context.get_ctype() - gpustream = copy_context.get_assigned_gpustream() - num_dims = len(copy_shape) - - # ----------- Guard for unsupported Pattern -------------- - if not (src_strides[-1] == 1) and (dst_strides[-1] == 1): - src_node, dst_node = copy_context.src_node, copy_context.dst_node - src_storage = copy_context.get_storage_type(src_node) - dst_storage = copy_context.get_storage_type(dst_node) - raise NotImplementedError( - "N-dimensional GPU memory copies, that are strided or contain column-major arrays, are currently not supported.\n" - f" Source node: {src_node} (storage: {src_storage})\n" - f" Destination node: {copy_context.dst_node} (storage: {dst_storage})\n" - f" Source strides: {src_strides}\n" - f" Destination strides: {dst_strides}\n" - f" copy shape: {copy_shape}\n" - ) - - # ----------------- Generate and write backend call(s) -------------------- - - call = "" - # Write for-loop headers - for dim in range(num_dims - 2): - call += f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{\n" - - # Write Memcopy2DAsync - offset_src = ' + '.join(f'(__copyidx{d} * ({symbolic_to_cpp(s)}))' for d, s in enumerate(src_strides[:-2])) - offset_dst = ' + '.join(f'(__copyidx{d} * ({symbolic_to_cpp(s)}))' for d, s in enumerate(dst_strides[:-2])) - - src = f'{src_expr} + {offset_src}' - dst = f'{dst_expr} + {offset_dst}' - - dpitch = f'{symbolic_to_cpp(dst_strides[-2])} * sizeof({ctype})' - spitch = f'{symbolic_to_cpp(src_strides[-2])} * sizeof({ctype})' - width = f'{symbolic_to_cpp(copy_shape[-1])} * sizeof({ctype})' - height = symbolic_to_cpp(copy_shape[-2]) - kind = f'{backend}Memcpy{src_location}To{dst_location}' - - # Generate call and write it - call += f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dpitch}, {src}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' - - # Write for-loop footers - for dim in range(num_dims - 2): - call += "\n}" - - # Return the code - return call - - -class SyncCollaboritveGPUCopyStrategy(CopyStrategy): - """ - Implements (synchronous) collaborative GPU copy operations. - - This strategy generates the appropriate code for copies performed - inside GPU kernels, where multiple threads cooperate to move data - between gpu memory spaces (e.g., global to shared memory). - """ - - def applicable(self, copy_context: CopyContext) -> bool: - """ - Checks if the copy is eligible for a collaborative GPU-to-GPU copy. - - Conditions: - 1. The copy is between two AccessNodes - 2. The copy is between GPU memory StorageTypes (shared or global). - 3. The innermost non-sequential map is a GPU_Device-scheduled map i.e. - the copy occurs within a kernel but is not within a GPU_ThreadBlock map. 
- """ - # --- Condition 1: src and dst are AccessNodes --- - src_node, dst_node = copy_context.src_node, copy_context.dst_node - if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): - return False - - # --- Condition 2: GPU to GPU memory transfer --- - src_storage, dst_storage = copy_context.get_storage_type(src_node), copy_context.get_storage_type(dst_node) - gpu_storages = {dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared} - - if not (src_storage in gpu_storages and dst_storage in gpu_storages): - return False - - # --- Condition 3: Next non-sequential Map is a GPU_Device Map --- - next_nonseq_parent_map = self._next_non_seq_parent_map(copy_context) - if next_nonseq_parent_map is None: - return False - else: - return next_nonseq_parent_map.map.schedule == dtypes.ScheduleType.GPU_Device - - def generate_copy(self, copy_context: CopyContext, kernel_dimensions_maps: Dict[nodes.MapEntry, Tuple[List, List]]) -> str: - """ - Generates a GPU copy call as a string using DaCe's runtime CUDA copy functions. - - The function determines the appropriate templated copy function from - `dace/libraries/runtime/include/dace/cuda/copy.cuh` and constructs - the call string with the necessary arguments, including kernel block - dimensions and optional accumulation/reduction information. - - Parameters - ---------- - copy_context : CopyContext - Helper object containing information about the copy. - - kernel_dimensions_maps : Dict[nodes.MapEntry, Tuple[List, List]] - Kernel map (GPU_Devie scheduled map) entry nodes to (grid_dims, block_dims); - block_dims needed in templating. - - Returns - ------- - str - The GPU copy call in C++ as a string. - - Notes - ----- - - The kernel block size could be derived, but since this function is typically called - from `ExperimentalCUDACodeGen`, it is provided as input to avoid recomputation. - - The template functions use a parameter called 'is_async', which is set to True here - because `ExperimentalCUDACodeGen` inserts "__syncthreads()" explicitly in tasklets. 
- """ - # ----------- Retrieve relevant copy information -------------- - - # Due to applicable(), src and dst node must be AccessNodes - copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() - sdfg = copy_context.sdfg - dtype = copy_context.src_node.desc(sdfg).dtype - ctype = dtype.ctype - - # Get copy function name (defined in runtime library) - num_dims = len(copy_shape) - src_node, dst_node = copy_context.src_node, copy_context.dst_node - src_storage, dst_storage = copy_context.get_storage_type(src_node), copy_context.get_storage_type(dst_node) - src_storage_name = self._get_storagename(src_storage) - dst_storage_name = self._get_storagename(dst_storage) - function_name = f"dace::{src_storage_name}To{dst_storage_name}{num_dims}D" - - # Extract WCR info (accumulation template + optional custom reduction) - accum, custom_reduction = self._get_accumulation_info(copy_context) - custom_reduction = [custom_reduction] if custom_reduction else [] - - # Get parent kernel block dimensions (guaranteed GPU_Device) and sync flag - parent_kernel = self._next_non_seq_parent_map(copy_context) - block_dims = ", ".join(symbolic_to_cpp(kernel_dimensions_maps[parent_kernel][1])) - synchronized = "true" # Legacy 'is_async'; sync barriers handled by passes (see docstring) - - # ------------------------- Generate copy call ---------------------------- - - if any(symbolic.issymbolic(s, copy_context.sdfg.constants) for s in copy_shape): - args_list = ([src_expr] + src_strides + [dst_expr] + custom_reduction + dst_strides + copy_shape) - args = ", ".join(symbolic_to_cpp(args_list)) - call = f"{function_name}Dynamic<{ctype}, {block_dims}, {synchronized}>{accum}({args});" - - elif function_name == "dace::SharedToGlobal1D": - copy_size = ', '.join(symbolic_to_cpp(copy_shape)) - accum = accum or '::Copy' - args_list = ([src_expr] + src_strides + [dst_expr] + dst_strides + custom_reduction) - args = ", ".join(symbolic_to_cpp(args_list)) - call = f"{function_name}<{ctype}, {block_dims}, {copy_size}, {synchronized}>{accum}({args});" - - else: - copy_size = ', '.join(symbolic_to_cpp(copy_shape)) - args_list = ([src_expr] + src_strides + [dst_expr] + custom_reduction) - args = ", ".join(symbolic_to_cpp(args_list)) - dst_strides_unpacked = ", ".join(symbolic_to_cpp(dst_strides)) - call = f"{function_name}<{ctype}, {block_dims}, {copy_size}, {dst_strides_unpacked}, {synchronized}>{accum}({args});" - - return call - - def _get_accumulation_info(self, copy_context: CopyContext) -> Tuple[str, str]: - """ - Extracts write-conflict resolution (WCR) information from the copy context - and returns the accumulation/reduction template components needed for the - final templated function call in `generate_copy()`. - - This method processes WCR information from the memlet and generates the - appropriate C++ template strings for both predefined and custom reductions. - - Parameters - ---------- - copy_context : CopyContext - Copy context containing the copy operation details, including - the memlet with WCR information. - - Returns - ------- - Tuple[str, str] - A tuple containing: - - accum : str - Template accumulation string for the function call. Empty string if no WCR, - `"::template Accum"` for predefined reductions, or `"::template Accum"` for custom reductions. - - custom_reduction : str - C++ formatted custom reduction code string. Empty string for no WCR or predefined reductions, - unparsed custom reduction code for custom reductions. 
- """ - sdfg = copy_context.sdfg - dtype = copy_context.src_node.desc(sdfg).dtype - memlet = copy_context.edge.data - wcr = memlet.wcr - reduction_type = operations.detect_reduction_type(wcr) - - if wcr is None: - accum, custom_reduction = "", "" - - elif reduction_type != dtypes.ReductionType.Custom: - # Use predefined reduction - reduction_type_str = str(reduction_type).split(".")[-1] # e.g., "Sum" - accum = f"::template Accum" - custom_reduction = "" - - else: - accum = "::template Accum" - custom_reduction = unparse_cr(sdfg, wcr, dtype) - - return accum, custom_reduction - - def _get_storagename(self, storage: dtypes.StorageType): - """ - Returns a string containing the name of the storage location. - - Example: dtypes.StorageType.GPU_Shared will return "Shared". - """ - storage_name = str(storage) - return storage_name[storage_name.rindex('_') + 1:] - - def _next_non_seq_parent_map(self, copy_context: CopyContext) -> Optional[nodes.MapEntry]: - """ - Traverse up the parent map chain from the deeper of src_node or dst_node - in `copy_context` and return the first parent MapEntry whose schedule - is not sequential. - - Parameters - ---------- - copy_context : CopyContext - Context information about the memory copy. - - Returns - ------- - Optional[nodes.MapEntry] - The first non-sequential parent MapEntry encountered, or None if no - such parent exists. - """ - src_node, dst_node = copy_context.src_node, copy_context.dst_node - state = copy_context.state - scope_dict = state.scope_dict() - - # Determine which node (src or dst) is in the deeper scope - deeper_node = dst_node if scope_contains_scope(scope_dict, src_node, dst_node) else src_node - current_node = deeper_node - while (current_node is None or not isinstance(current_node, nodes.MapEntry) - or current_node.map.schedule == dtypes.ScheduleType.Sequential): - parent = helpers.get_parent_map(state, current_node) - if parent is None: - current_node = None - break - current_node, state = parent - - return current_node \ No newline at end of file diff --git a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py index e72da00828..1e690248fa 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py @@ -1,28 +1,17 @@ -# Standard library imports +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Dict, Type -# DaCe core imports import dace from dace import dtypes, subsets, symbolic - -from dace.config import Config - -# DaCe SDFG imports from dace.sdfg import SDFG, ScopeSubgraphView, nodes, SDFGState from dace.sdfg.state import ControlFlowRegion - -# DaCe codegen imports from dace.codegen.prettycode import CodeIOStream from dace.codegen.targets.framecode import DaCeCodeGenerator from dace.codegen.dispatcher import DefinedType, TargetDispatcher - -# DaCe transformation imports from dace.transformation import helpers - -# Experimental CUDA imports +from dace.codegen.targets.cpp import sym2cpp from dace.codegen.targets.experimental_cuda import ExperimentalCUDACodeGen, KernelSpec -from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import (symbolic_to_cpp, get_cuda_dim, product) +from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import (get_cuda_dim, product) #---------------------------------------------------------------------------------- # GPU Scope Generation Strategies @@ -80,10 +69,6 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV callsite_stream=callsite_stream, comment="Kernel scope") as scope_manager: - # ----------------- Initialize Kernel Scope Constructs ----------------------- - - self._generate_kernel_initialization(sdfg, cfg, dfg_scope, state_id, function_stream, callsite_stream) - # ----------------- Retrieve kernel configuration ----------------------- kernel_spec = self._current_kernel_spec @@ -120,17 +105,15 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV # Delinearize third dimension if more than 3D (used in 3D+ mapping) if dim == 2 and kernel_dimensions > 3: tail_prod = product(kernel_dim_sizes[3:]) - index_expr = f"({index_expr} / ({symbolic_to_cpp(tail_prod)}))" + index_expr = f"({index_expr} / ({sym2cpp(tail_prod)}))" else: # Handle dimensions beyond the third (delinearize and modulo) index_expr = f'blockIdx.z' tail_prod = product(kernel_dim_sizes[dim + 1:]) - index_expr = ( - f"(({index_expr} / ({symbolic_to_cpp(tail_prod)})) % ({symbolic_to_cpp(kernel_dim_sizes[dim])}))" - ) + index_expr = (f"(({index_expr} / ({sym2cpp(tail_prod)})) % ({sym2cpp(kernel_dim_sizes[dim])}))") # Define thread/Block index - var_def = symbolic_to_cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', index_expr) + var_def = sym2cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', index_expr) callsite_stream.write(f'{thread_id_ctype} {var_name} = {var_def};', cfg, state_id, kernel_entry_node) self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, thread_id_ctype) @@ -145,8 +128,9 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV function_stream, callsite_stream, skip_entry_node=True) - - self.codegen._frame.deallocate_arrays_in_scope(sdfg, cfg, kernel_entry_node, function_stream, callsite_stream) + + self.codegen._frame.deallocate_arrays_in_scope(sdfg, cfg, kernel_entry_node, function_stream, + callsite_stream) def _generate_kernel_signature(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream): @@ -169,60 +153,6 @@ def _generate_kernel_signature(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco callsite_stream.write(f'__global__ void {launch_bounds} {kernel_name}({", ".join(kernel_args)}) ', cfg, state_id, node) - def _generate_kernel_initialization(self, sdfg: SDFG, cfg: 
ControlFlowRegion, dfg_scope: ScopeSubgraphView, - state_id: int, function_stream: CodeIOStream, callsite_stream: CodeIOStream): - """ - NOTE: Under construction - Tell yakup: - 1. This is as far as I know really cuda specific- maybe I should raise an error if wrong backend (HIP) is used - 2. What about the shared state allocation? Is it correct to tell about this allocation? generally, did I - tell the dispatcher everything correctly? - """ - - # Skip this if there are no metada, nothing to initialize - metadata = sdfg.metadata - if metadata == None: - return - - node = dfg_scope.source_nodes()[0] - - callsite_stream.write(f"\n", cfg, state_id, node) - # initialize block group using coopertive groups - tblock_obj_name = Config.get('compiler', 'cuda', 'current_thread_block_name') - tblock_obj_ctype = "auto" - callsite_stream.write(f"{tblock_obj_ctype} {tblock_obj_name} = cg::this_thread_block();\n", cfg, state_id, node) - self._dispatcher.defined_vars.add(tblock_obj_name, DefinedType.Object, tblock_obj_ctype) - - # initialize pipeline - pipelines = dict() - for node_guid, node_meta in metadata.items(): - pipelines = node_meta.get("pipelines", {}) - for pipeline_name, pipeline_info in pipelines.items(): - pipelines[pipeline_name] = pipeline_info["pipeline_depth"] - - for pipeline_name, pipeline_depth in pipelines.items(): - callsite_stream.write(f"\n", cfg, state_id, node) - # initialize pipeline depth scalar - depth_name = f"pipeline_depth_{pipeline_name}" - depth_ctype = "const uint" - callsite_stream.write(f"{depth_ctype} {depth_name} = {pipeline_depth};\n", cfg, state_id, node) - self._dispatcher.defined_vars.add(depth_name, DefinedType.Scalar, depth_ctype) - - # allocate shared pipeline state - shared_state_name = f"shared_state_{pipeline_name}" - shared_state_ctype = f"cuda::pipeline_shared_state" - callsite_stream.write(f" __shared__ {shared_state_ctype} {shared_state_name};\n") - self._dispatcher.declared_arrays.add(shared_state_name, DefinedType.Pointer, shared_state_ctype) - - # intialize the pipeline - pipeline_ctype = "auto" - callsite_stream.write( - f"{pipeline_ctype} {pipeline_name} = cuda::make_pipeline({tblock_obj_name}, &{shared_state_name});\n", - cfg, state_id, node) - self._dispatcher.defined_vars.add(pipeline_name, DefinedType.Object, pipeline_ctype) - - callsite_stream.write(f"\n", cfg, state_id, node) - class ThreadBlockScopeGenerator(ScopeGenerationStrategy): @@ -286,16 +216,15 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV # First three dimensions: direct mapping or partial delinearization if dim == 2 and map_dimensions > 3: tail_prod = product(map_dim_sizes[3:]) - base_expr = f"(threadIdx.z / ({symbolic_to_cpp(tail_prod)}))" + base_expr = f"(threadIdx.z / ({sym2cpp(tail_prod)}))" else: base_expr = f"threadIdx.{get_cuda_dim(dim)}" else: # Dimensions beyond the third: full delinearization tail_prod = product(map_dim_sizes[dim + 1:]) - base_expr = ( - f"((threadIdx.z / ({symbolic_to_cpp(tail_prod)})) % ({symbolic_to_cpp(map_dim_sizes[dim])}))") + base_expr = (f"((threadIdx.z / ({sym2cpp(tail_prod)})) % ({sym2cpp(map_dim_sizes[dim])}))") - var_def = symbolic_to_cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', base_expr) + var_def = sym2cpp(symbolic_coordinates[dim]).replace(f'__SYM_IDX{dim}', base_expr) callsite_stream.write(f'{block_id_ctype} {var_name} = {var_def};', cfg, state_id, node) self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, block_id_ctype) @@ -316,7 +245,7 @@ def generate(self, sdfg: SDFG, cfg: 
ControlFlowRegion, dfg_scope: ScopeSubgraphV # Block range start if dim >= 3 or (symbolic_indices[dim] >= start) != True: - condition += f'{var_name} >= {symbolic_to_cpp(start)}' + condition += f'{var_name} >= {sym2cpp(start)}' # Special case: block size is exactly the range of the map (0:b) if dim >= 3: @@ -328,7 +257,7 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV if dim >= 3 or (not skipcond and (symbolic_index_bounds[dim] < end) != True): if len(condition) > 0: condition += ' && ' - condition += f'{var_name} < {symbolic_to_cpp(end + 1)}' + condition += f'{var_name} < {sym2cpp(end + 1)}' # Emit condition in code if any if len(condition) > 0: @@ -343,7 +272,7 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV function_stream, callsite_stream, skip_entry_node=True) - + self.codegen._frame.deallocate_arrays_in_scope(sdfg, cfg, node, function_stream, callsite_stream) @@ -436,8 +365,7 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV callsite_stream.write(f"{ids_ctype} {var_name} = {expr};", cfg, state_id, node) self._dispatcher.defined_vars.add(var_name, DefinedType.Scalar, ids_ctype) - - + self.codegen._frame.allocate_arrays_in_scope(sdfg, cfg, node, function_stream, callsite_stream) # ----------------- Guard Conditions for Warp Execution ----------------------- @@ -472,7 +400,7 @@ def generate(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSubgraphV function_stream, callsite_stream, skip_entry_node=True) - + self.codegen._frame.deallocate_arrays_in_scope(sdfg, cfg, node, function_stream, callsite_stream) def _handle_GPU_Warp_scope_guards(self, state_dfg: SDFGState, node: nodes.MapEntry, map_range: subsets.Range, diff --git a/dace/config_schema.yml b/dace/config_schema.yml index 033cdd4555..d1c54d627b 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -457,7 +457,7 @@ required: Yakup Koray Budanaz for Berkays master-thesis. enum: [legacy, experimental] default: experimental - + gpu_index_type: type: str title: Thread/block/warp index data type @@ -516,10 +516,10 @@ required: type: str title: Name for the GPU stream object description: > - GPU streams allow GPU operations, such as kernel execution or memory transfers, to run asynchronously - and in parallel. This field specifies the naming convention for the hpu stream array and its connectors - in the SDFG. For example: 'gpu_streams,gpu_stream' means 'gpu_streams' is the array containing the - stream objects, and 'gpu_stream0' (prefix derived from the second name + stream id) is used as a + GPU streams allow GPU operations, such as kernel execution or memory transfers, to run asynchronously + and in parallel. This field specifies the naming convention for the hpu stream array and its connectors + in the SDFG. For example: 'gpu_streams,gpu_stream' means 'gpu_streams' is the array containing the + stream objects, and 'gpu_stream0' (prefix derived from the second name + stream id) is used as a connector for gpu_streams[0]. 
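
As a quick illustration of this naming convention (a sketch assuming the default value that follows), the GPU-stream passes split the setting into the stream array name and the per-stream connector prefix:

    value = "gpu_streams,gpu_stream"      # Config.get('compiler', 'cuda', 'gpu_stream_name')
    stream_array_name, stream_var_name_prefix = value.split(',')

    stream_id = 0
    connector = f"{stream_var_name_prefix}{stream_id}"           # 'gpu_stream0'
    accessed_gpu_stream = f"{stream_array_name}[{stream_id}]"    # 'gpu_streams[0]'
    print(connector, "->", accessed_gpu_stream)
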
default: gpu_streams,gpu_stream diff --git a/dace/dtypes.py b/dace/dtypes.py index f7d2f754d5..b527689966 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -1275,6 +1275,7 @@ def isconstant(var): MPI_Request = opaque('MPI_Request') gpuStream_t = opaque('gpuStream_t') + @undefined_safe_enum @extensible_enum class Typeclasses(aenum.AutoNumberEnum): diff --git a/dace/registry.py b/dace/registry.py index 69d4958a0e..bab0fa4ade 100644 --- a/dace/registry.py +++ b/dace/registry.py @@ -46,10 +46,8 @@ def autoregister(cls: Type, **kwargs): valid_impls = {'legacy', 'experimental'} if impl not in valid_impls: - raise ValueError( - f"Invalid CUDA implementation: {impl}. " - f"Please select one of {valid_impls} under compiler.cuda.implementation in the configs." - ) + raise ValueError(f"Invalid CUDA implementation: {impl}. " + f"Please select one of {valid_impls} under compiler.cuda.implementation in the configs.") # Only the CUDA implementation selected in Config is registered if name in {'cuda', 'experimental_cuda'}: diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index 717190e193..2f656111f2 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -404,12 +404,12 @@ def memlet_path(self, edge: MultiConnectorEdge[mm.Memlet]) -> List[MultiConnecto # If empty memlet, return itself as the path if (edge.src_conn is None and edge.dst_conn is None and edge.data.is_empty()): return result - + # For the (new) gpu stream handling we can have dynamic out connectors, e.g. # KernelExit: stream -> None: AccessNode, where AccessNode accesses a Stream array # Memlets are used but its not about seing how data flows if (isinstance(edge.src, nd.MapExit) and edge.src.map.schedule == dtypes.ScheduleType.GPU_Device - and isinstance(edge.dst, nd.AccessNode) and edge.dst.desc(state).dtype == dtypes.gpuStream_t): + and isinstance(edge.dst, nd.AccessNode) and edge.dst.desc(state).dtype == dtypes.gpuStream_t): return result # Prepend incoming edges until reaching the source node @@ -983,7 +983,7 @@ def unordered_arglist(self, defined_syms = defined_syms or self.defined_symbols() scalar_args.update({ k: dt.Scalar(defined_syms[k]) if k in defined_syms else sdfg.arrays[k] - for k in self.used_symbols(all_symbols=False) + for k in self.used_symbols(all_symbols=False) if not k.startswith('__dace') and k not in sdfg.constants and (k in defined_syms or k in sdfg.arrays) }) diff --git a/dace/transformation/dataflow/add_threadblock_map.py b/dace/transformation/dataflow/add_threadblock_map.py index 87274b0afa..4db89ac992 100644 --- a/dace/transformation/dataflow/add_threadblock_map.py +++ b/dace/transformation/dataflow/add_threadblock_map.py @@ -1,4 +1,4 @@ -# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
""" This module contains classes and functions that implement the grid-strided map tiling transformation.""" import warnings diff --git a/dace/transformation/interstate/gpu_transform_sdfg.py b/dace/transformation/interstate/gpu_transform_sdfg.py index 31cbdb45e4..fe0ed80e41 100644 --- a/dace/transformation/interstate/gpu_transform_sdfg.py +++ b/dace/transformation/interstate/gpu_transform_sdfg.py @@ -618,7 +618,6 @@ def _create_copy_out(arrays_used: Set[str]) -> Dict[str, str]: for devicename, hostname in mapping.items(): block.replace_meta_accesses({devicename: hostname}) - # Step 9: Simplify if self.simplify: sdfg.simplify() @@ -629,7 +628,7 @@ def _create_copy_out(arrays_used: Set[str]) -> Dict[str, str]: from dace.config import Config if not Config.get('compiler', 'cuda', 'implementation') == 'experimental': return - + # import needed modules from dace.transformation import helpers from dace.transformation.passes.move_array_out_of_kernel import MoveArrayOutOfKernel @@ -657,12 +656,11 @@ def _create_copy_out(arrays_used: Set[str]) -> Dict[str, str]: parent_map_info = helpers.get_parent_map(state=parent, node=node) while parent_map_info is not None: map_entry, map_state = parent_map_info - if (isinstance(map_entry, nodes.MapEntry) and - map_entry.map.schedule == dtypes.ScheduleType.GPU_Device): + if (isinstance(map_entry, nodes.MapEntry) and map_entry.map.schedule == dtypes.ScheduleType.GPU_Device): in_kernel = True break parent_map_info = helpers.get_parent_map(map_state, map_entry) - + if in_kernel: transients_in_kernels.add((node.data, desc, map_entry)) else: @@ -686,6 +684,5 @@ def _create_copy_out(arrays_used: Set[str]) -> Dict[str, str]: "As a best-effort fix, the array will be lifted outside the kernel as a non-transient GPU_Global array. " "Any naming conflicts are resolved automatically. " "Please avoid this pattern, as it is strongly discouraged and may lead to undefined behavior. " - "Note that this fix provides no guarantees, especially for unusual or complex use cases." 
- ) + "Note that this fix provides no guarantees, especially for unusual or complex use cases.") MoveArrayOutOfKernel().apply_pass(sdfg, kernel_entry, data_name) diff --git a/dace/transformation/passes/fix_test.py b/dace/transformation/passes/fix_test.py index 40c0ffa5b9..6a44728d35 100644 --- a/dace/transformation/passes/fix_test.py +++ b/dace/transformation/passes/fix_test.py @@ -13,9 +13,9 @@ from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs - from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import is_within_schedule_types + @properties.make_properties @transformation.explicit_cf_compatible class Fix(ppl.Pass): @@ -26,21 +26,20 @@ def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs} def modifies(self) -> ppl.Modifies: - return ppl.Modifies.Descriptors | ppl.Modifies.Nodes | ppl.Modifies.Memlets + return ppl.Modifies.Descriptors | ppl.Modifies.Nodes | ppl.Modifies.Memlets def should_reapply(self, modified: ppl.Modifies) -> bool: return False - + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict[str, dace.data.Data]: - + from dace.transformation.helpers import get_parent_map - names: Dict = dict() + names: Dict = dict() for node, parent_state in sdfg.all_nodes_recursive(): if not isinstance(node, nodes.AccessNode): continue - map_parent = None state = parent_state current = node @@ -68,9 +67,9 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict[str, # Try to evaluate the inequality cmp = sp.simplify(size_expr > 64) - if cmp is sp.true: # definitely larger + if cmp is sp.true: # definitely larger move_out = True - elif cmp is sp.false: # definitely safe + elif cmp is sp.false: # definitely safe move_out = False else: # undecidable case (symbolic expression) diff --git a/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py b/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py index 7edeea4a6a..bd913ae469 100644 --- a/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py +++ b/dace/transformation/passes/gpustream/gpu_stream_topology_simplification.py @@ -14,6 +14,7 @@ from dace.transformation.passes.insert_gpu_copy_tasklets import InsertGPUCopyTasklets from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + @properties.make_properties @transformation.explicit_cf_compatible class GPUStreamTopologySimplification(ppl.Pass): @@ -26,18 +27,17 @@ class GPUStreamTopologySimplification(ppl.Pass): def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: depending_passes = { - NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, InsertGPUStreamsToKernels, - InsertGPUStreamsToTasklets, InsertGPUStreamSyncTasklets, - InsertGPUCopyTasklets - } - + NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, InsertGPUStreamsToKernels, InsertGPUStreamsToTasklets, + InsertGPUStreamSyncTasklets, InsertGPUCopyTasklets + } + return depending_passes def modifies(self) -> ppl.Modifies: return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets def should_reapply(self, modified: ppl.Modifies) -> bool: - return False + return False def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): """ @@ -47,7 +47,7 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): self._merge_gpustreams_special_case(sdfg) return {} - + def 
_merge_close_gpustream_nodes(self, sdfg: SDFG) -> None: """ Merge "close" GPU stream AccessNodes in the SDFG. @@ -73,13 +73,15 @@ def _merge_close_gpustream_nodes(self, sdfg: SDFG) -> None: # Skip AccessNodes if isinstance(node, nodes.AccessNode): continue - - # Find GPU stream AccessNode predecessors with no incoming edges + + # Find GPU stream AccessNode predecessors with no incoming edges # (i.e. source GPU stream AccessNodes) node_predecessors = state.predecessors(node) - preceeding_gpustream_sources = [pre for pre in node_predecessors if isinstance(pre, nodes.AccessNode) - and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0] - + preceeding_gpustream_sources = [ + pre for pre in node_predecessors if isinstance(pre, nodes.AccessNode) + and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0 + ] + # Skip if there are no preceding GPU stream sources if len(preceeding_gpustream_sources) == 0: continue @@ -100,14 +102,12 @@ def _merge_close_gpustream_nodes(self, sdfg: SDFG) -> None: # Merge grand-predecessors' successors sink GPU streams with predecessor source GPU stream node_grand_predecessors = [ - grand_pred for pred in node_predecessors - for grand_pred in state.predecessors(pred) + grand_pred for pred in node_predecessors for grand_pred in state.predecessors(pred) ] node_gp_successors_streams = [ - succ_of_gp for gp in node_grand_predecessors - for succ_of_gp in state.successors(gp) - if isinstance(succ_of_gp, nodes.AccessNode) and succ_of_gp.desc(state).dtype == dtypes.gpuStream_t - and state.out_degree(succ_of_gp) == 0 + succ_of_gp for gp in node_grand_predecessors for succ_of_gp in state.successors(gp) + if isinstance(succ_of_gp, nodes.AccessNode) + and succ_of_gp.desc(state).dtype == dtypes.gpuStream_t and state.out_degree(succ_of_gp) == 0 ] # remove duplicates @@ -118,7 +118,7 @@ def _merge_close_gpustream_nodes(self, sdfg: SDFG) -> None: src, src_conn, _, dst_conn, data = edge state.add_edge(src, src_conn, combined_stream_node, dst_conn, data) state.remove_edge(edge) - # Note: the grand-predecessor's successor GPU stream is a sink node and has no + # Note: the grand-predecessor's successor GPU stream is a sink node and has no # outgoing edges state.remove_node(gp_succ_stream) @@ -129,7 +129,7 @@ def _merge_gpustreams_special_case(self, sdfg: SDFG) -> None: This pass detects the following pattern: - A GPU stream AccessNode `X` has a predecessor and a successor (i.e. at least one of both). - Between the predecessor and successor lie one or more tasklets. - - These tasklets use their own distinct GPU stream AccessNodes (not `X`), + - These tasklets use their own distinct GPU stream AccessNodes (not `X`), which are connected only to the tasklet itself. 
To simplify the topology, redundant streams are merged: @@ -161,9 +161,9 @@ def example(A: dace.uint32[128], B: dace.uint32[128], #------------------------- Preprocess: Gather Information ---------------------------- - # For each GPU Stream AccessNode having a predecessor and a successor: + # For each GPU Stream AccessNode having a predecessor and a successor: # Determine with which Tasklet Source and which Tasklet sink nodes lie between its predecessor - # and its successor + # and its successor merge_source_gpustream: Dict[Tuple[nodes.AccessNode, SDFGState], List[nodes.AccessNode]] = dict() merge_sink_gpustream: Dict[Tuple[nodes.AccessNode, SDFGState], List[nodes.AccessNode]] = dict() @@ -177,13 +177,18 @@ def example(A: dace.uint32[128], B: dace.uint32[128], # If not, we skip node_predecessors = state.predecessors(node) node_successors = state.successors(node) - downstream_gpustream_sinks = [succ for succ in node_successors if isinstance(succ, nodes.AccessNode) - and succ.desc(state).dtype == dtypes.gpuStream_t and state.out_degree(succ) == 0] - upstream_gpustream_sources = [pre for pre in node_predecessors if isinstance(pre, nodes.AccessNode) - and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0] - - # Skip not considered case - if not (len(upstream_gpustream_sources) == len(downstream_gpustream_sinks) and len(upstream_gpustream_sources) == 1): + downstream_gpustream_sinks = [ + succ for succ in node_successors if isinstance(succ, nodes.AccessNode) + and succ.desc(state).dtype == dtypes.gpuStream_t and state.out_degree(succ) == 0 + ] + upstream_gpustream_sources = [ + pre for pre in node_predecessors if isinstance(pre, nodes.AccessNode) + and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0 + ] + + # Skip not considered case + if not (len(upstream_gpustream_sources) == len(downstream_gpustream_sinks) + and len(upstream_gpustream_sources) == 1): continue # Look for potential predecessor of a "passthrough" GPU Stream AccessNode @@ -195,19 +200,18 @@ def example(A: dace.uint32[128], B: dace.uint32[128], # Current nodes grand pred is a candidate of a predecessor of a "passthrough" GPU Stream AccessNode candidate = grand_pred - # A PassThrough GPU stream node can only have MapExits and Tasklets as candidate predecessors - if not (isinstance(candidate, nodes.MapExit) and candidate.map.schedule == dtypes.ScheduleType.GPU_Device - or isinstance(candidate, nodes.Tasklet)): + # A PassThrough GPU stream node can only have MapExits and Tasklets as candidate predecessors + if not (isinstance(candidate, nodes.MapExit) and candidate.map.schedule + == dtypes.ScheduleType.GPU_Device or isinstance(candidate, nodes.Tasklet)): continue - + has_passthrough_gpustream = any( - (isinstance(succ, nodes.AccessNode) and succ.desc(state).dtype == dtypes.gpuStream_t) - and (state.in_degree(succ) > 0 and state.out_degree(succ) > 0) - for succ in state.successors(candidate) - ) + (isinstance(succ, nodes.AccessNode) and succ.desc(state).dtype == dtypes.gpuStream_t) and ( + state.in_degree(succ) > 0 and state.out_degree(succ) > 0) + for succ in state.successors(candidate)) if has_passthrough_gpustream: - candidate_predecessor.append(candidate) + candidate_predecessor.append(candidate) # Not "close" passthrough GPU node exists if no candidate predecessor exists if len(candidate_predecessor) == 0: @@ -221,13 +225,14 @@ def example(A: dace.uint32[128], B: dace.uint32[128], # Get the Kernel Exits GPU stream candidate_predecessor = candidate_predecessor[0] - 
passthrough_gpu_node = [succ for succ in state.successors(candidate_predecessor) if isinstance(succ, nodes.AccessNode) - and succ.desc(state).dtype == dtypes.gpuStream_t][0] - - - # Collect and store the GPU stream merging information - pre_gpustream: nodes.AccessNode = upstream_gpustream_sources[0] # Note: Len is 1 - succ_gpustream: nodes.AccessNode = downstream_gpustream_sinks[0] # Note: Len is 1 + passthrough_gpu_node = [ + succ for succ in state.successors(candidate_predecessor) + if isinstance(succ, nodes.AccessNode) and succ.desc(state).dtype == dtypes.gpuStream_t + ][0] + + # Collect and store the GPU stream merging information + pre_gpustream: nodes.AccessNode = upstream_gpustream_sources[0] # Note: Len is 1 + succ_gpustream: nodes.AccessNode = downstream_gpustream_sinks[0] # Note: Len is 1 if (passthrough_gpu_node, state) in merge_source_gpustream: merge_source_gpustream[(passthrough_gpu_node, state)].append(pre_gpustream) merge_sink_gpustream[(passthrough_gpu_node, state)].append(succ_gpustream) @@ -235,7 +240,6 @@ def example(A: dace.uint32[128], B: dace.uint32[128], merge_source_gpustream[(passthrough_gpu_node, state)] = [pre_gpustream] merge_sink_gpustream[(passthrough_gpu_node, state)] = [succ_gpustream] - #------------------------- Merge the GPU Stream AccessNodes ---------------------------- for passthrough_gpu_node, state in merge_sink_gpustream.keys(): @@ -251,7 +255,7 @@ def example(A: dace.uint32[128], B: dace.uint32[128], for out_edge in state.out_edges(passthrough_gpu_node): _, src_conn, dst, dst_conn, memlet = out_edge state.add_edge(unified_out_stream, src_conn, dst, dst_conn, copy.deepcopy(memlet)) - state.remove_edge(out_edge) + state.remove_edge(out_edge) for source_stream in merge_source_gpustream[passthrough_gpu_node, state]: for out_edge in state.out_edges(source_stream): diff --git a/dace/transformation/passes/gpustream/gpustream_scheduling.py b/dace/transformation/passes/gpustream/gpustream_scheduling.py index 7eac383f4e..0ad3c2e7c0 100644 --- a/dace/transformation/passes/gpustream/gpustream_scheduling.py +++ b/dace/transformation/passes/gpustream/gpustream_scheduling.py @@ -11,11 +11,12 @@ # Placeholder for the GPU stream variable used in tasklet code STREAM_PLACEHOLDER = "__dace_current_stream" + @properties.make_properties @transformation.explicit_cf_compatible class NaiveGPUStreamScheduler(ppl.Pass): """ - Assigns GPU streams to nodes and stores the assignments in a dictionary. + Assigns GPU streams to nodes and stores the assignments in a dictionary. This can be useful for enabling asynchronous and parallel GPU computation using GPU streams. Strategy Overview: @@ -60,7 +61,7 @@ def modifies(self) -> ppl.Modifies: def should_reapply(self, modified: ppl.Modifies) -> bool: return False - + def apply_pass(self, sdfg: SDFG, _) -> Dict[nodes.Node, int]: """ Assigns GPU streams to nodes within the given SDFG. @@ -83,13 +84,13 @@ def apply_pass(self, sdfg: SDFG, _) -> Dict[nodes.Node, int]: return stream_assignments - def _assign_gpu_streams_in_state(self, sdfg: SDFG, in_nested_sdfg: bool, state: SDFGState, + def _assign_gpu_streams_in_state(self, sdfg: SDFG, in_nested_sdfg: bool, state: SDFGState, stream_assignments: Dict[nodes.Node, int], gpu_stream: int) -> None: """ Assigns GPU streams to nodes in a single state. - If inside a nested SDFG, components inherit the parent's stream. - Otherwise, each connected component gets a different stream. + If inside a nested SDFG, components inherit the parent's stream. 
+ Otherwise, each connected component gets a different stream. Nested SDFGs are processed recursively. Parameters @@ -124,7 +125,7 @@ def _assign_gpu_streams_in_state(self, sdfg: SDFG, in_nested_sdfg: bool, state: for nested_state in node.sdfg.states(): self._assign_gpu_streams_in_state(node.sdfg, True, nested_state, stream_assignments, gpu_stream) - # Move to the next stream if we have assigned streams to any node in this component + # Move to the next stream if we have assigned streams to any node in this component # (careful: if nested, states are in same component) if not in_nested_sdfg and len(stream_assignments) > nodes_assigned_before: gpu_stream = self._next_stream(gpu_stream) @@ -145,7 +146,7 @@ def _get_weakly_connected_nodes(self, graph: Graph) -> List[Set[NodeT]]: ------- List[Set[Node_T]] - A list containing sets of nodes, with each set corresponding to a weakly + A list containing sets of nodes, with each set corresponding to a weakly connected component. """ visited: Set[NodeT] = set() @@ -200,7 +201,7 @@ def _next_stream(self, gpu_stream: int) -> int: return 0 else: return (gpu_stream + 1) % self._max_concurrent_streams - + def _requires_gpu_stream(self, state: SDFGState, component: Set[NodeT]) -> bool: """ Check whether a connected component in an SDFG state should be assigned @@ -223,29 +224,26 @@ def _requires_gpu_stream(self, state: SDFGState, component: Set[NodeT]) -> bool: bool True if the component requires a GPU stream, False otherwise. """ + def gpu_relevant(node, parent) -> bool: - if (isinstance(node, nodes.AccessNode) - and node.desc(parent).storage == dace.dtypes.StorageType.GPU_Global): - return True - - elif (isinstance(node, nodes.MapEntry) - and node.map.schedule == dace.dtypes.ScheduleType.GPU_Device): - return True - - elif (isinstance(node, nodes.Tasklet) - and STREAM_PLACEHOLDER in node.code.as_string): - return True - + if (isinstance(node, nodes.AccessNode) and node.desc(parent).storage == dace.dtypes.StorageType.GPU_Global): + return True + + elif (isinstance(node, nodes.MapEntry) and node.map.schedule == dace.dtypes.ScheduleType.GPU_Device): + return True + + elif (isinstance(node, nodes.Tasklet) and STREAM_PLACEHOLDER in node.code.as_string): + return True + return False - for node in component: if isinstance(node, nodes.NestedSDFG): if any(gpu_relevant(node, parent) for node, parent in node.sdfg.all_nodes_recursive()): return True - + else: if gpu_relevant(node, state): return True - - return False \ No newline at end of file + + return False diff --git a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py index 3f6ec722b0..7f1b081010 100644 --- a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py +++ b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py @@ -14,6 +14,7 @@ from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + @properties.make_properties @transformation.explicit_cf_compatible class InsertGPUStreamSyncTasklets(ppl.Pass): @@ -24,16 +25,16 @@ class InsertGPUStreamSyncTasklets(ppl.Pass): that require synchronization. Additional locations can be added easily if new cases are discovered. 
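
A rough, standalone sketch of the per-component round-robin strategy described for NaiveGPUStreamScheduler above; the component discovery and the handling of the max-concurrent-streams setting are simplified assumptions here.

    from typing import Dict, List

    def assign_streams(components: List[List[str]], num_streams: int) -> Dict[str, int]:
        # Every weakly connected component of a top-level state gets its own stream,
        # cycling through the available stream ids.
        assignments: Dict[str, int] = {}
        stream = 0
        for component in components:
            for node in component:
                assignments[node] = stream
            stream = (stream + 1) % num_streams
        return assignments

    print(assign_streams([["copy_A_in", "kernel_1"], ["copy_B_in", "kernel_2"]], num_streams=4))
    # {'copy_A_in': 0, 'kernel_1': 0, 'copy_B_in': 1, 'kernel_2': 1}
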
""" + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: - return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, - InsertGPUStreamsToKernels, InsertGPUStreamsToTasklets} + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, InsertGPUStreamsToKernels, InsertGPUStreamsToTasklets} def modifies(self) -> ppl.Modifies: return ppl.Modifies.Tasklets | ppl.Modifies.Memlets def should_reapply(self, modified: ppl.Modifies) -> bool: return False - + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): """ Inserts GPU stream synchronization tasklets at required locations @@ -48,12 +49,13 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): self._insert_gpu_stream_sync_at_state_end(sdfg, sync_state, stream_assignments) return {} - def _identify_sync_locations(self, sdfg: SDFG, stream_assignments: Dict[nodes.Node, int] - ) -> Tuple[Dict[SDFGState, Set[int]], Dict[nodes.Node, SDFGState]]: + def _identify_sync_locations( + self, sdfg: SDFG, + stream_assignments: Dict[nodes.Node, int]) -> Tuple[Dict[SDFGState, Set[int]], Dict[nodes.Node, SDFGState]]: """ Heuristically identifies GPU stream synchronization points in an SDFG. Synchronization is inserted at the end of a state when it is required. - + Parameters ---------- @@ -65,11 +67,12 @@ def _identify_sync_locations(self, sdfg: SDFG, stream_assignments: Dict[nodes.No Returns ------- Tuple[Dict[SDFGState, Set[int]], Dict[nodes.Node, SDFGState]] - - **sync_state**: Maps each state to the set of stream IDs that should be + - **sync_state**: Maps each state to the set of stream IDs that should be synchronized at the end of the state. - - **sync_node**: The keys of this dictionary are nodes after which synchronization + - **sync_node**: The keys of this dictionary are nodes after which synchronization is needed, and their corresponding value is the state they belong to. 
""" + # ------------------ Helper predicates ----------------------------- def is_gpu_accessnode(node, state): @@ -85,7 +88,7 @@ def is_kernel_exit(node): def is_sink_node(node, state): return state.out_degree(node) == 0 - + def edge_within_kernel(state, src, dst): gpu_schedules = dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN src_in_kernel = is_within_schedule_types(state, src, gpu_schedules) @@ -105,25 +108,23 @@ def edge_within_kernel(state, src, dst): sync_state[state] = set() # --- Heuristics for when to sync --- - if (is_gpu_accessnode(src, state) and is_nongpu_accessnode(dst, state) and - is_sink_node(dst, state) and not edge_within_kernel(state, src, dst)): + if (is_gpu_accessnode(src, state) and is_nongpu_accessnode(dst, state) and is_sink_node(dst, state) + and not edge_within_kernel(state, src, dst)): sync_state[state].add(stream_assignments[dst]) - elif (is_gpu_accessnode(src, state) and is_nongpu_accessnode(dst, state) and - not is_sink_node(dst, state) and not edge_within_kernel(state, src, dst)): + elif (is_gpu_accessnode(src, state) and is_nongpu_accessnode(dst, state) and not is_sink_node(dst, state) + and not edge_within_kernel(state, src, dst)): sync_state[state].add(stream_assignments[dst]) - elif (is_nongpu_accessnode(src, state) and is_gpu_accessnode(dst, state) and - not edge_within_kernel(state, src, dst)): + elif (is_nongpu_accessnode(src, state) and is_gpu_accessnode(dst, state) + and not edge_within_kernel(state, src, dst)): sync_state[state].add(stream_assignments[dst]) - elif (is_kernel_exit(src) and is_gpu_accessnode(dst, state) and - not is_sink_node(dst, state)): + elif (is_kernel_exit(src) and is_gpu_accessnode(dst, state) and not is_sink_node(dst, state)): + sync_state[state].add(stream_assignments[src]) sync_state[state].add(stream_assignments[src]) - sync_state[state].add(stream_assignments[src]) - elif (is_kernel_exit(src) and is_gpu_accessnode(dst, state) and - is_sink_node(dst, state)): + elif (is_kernel_exit(src) and is_gpu_accessnode(dst, state) and is_sink_node(dst, state)): sync_state[state].add(stream_assignments[dst]) else: @@ -139,18 +140,18 @@ def edge_within_kernel(state, src, dst): return sync_state, sync_node - def _insert_gpu_stream_sync_at_state_end(self, sdfg: SDFG, sync_state: Dict[SDFGState, Set[int]], + def _insert_gpu_stream_sync_at_state_end(self, sdfg: SDFG, sync_state: Dict[SDFGState, Set[int]], stream_assignments: Dict[nodes.Node, int]) -> None: """ Inserts GPU stream synchronization tasklets at the end of SDFG states. For each state that requires synchronization, this method: - 1. Generates a tasklet that synchronizes all assigned GPU streams using + 1. Generates a tasklet that synchronizes all assigned GPU streams using the appropriate backend (e.g., CUDA). - 2. Ensures all other operations in the state complete before synchronization + 2. Ensures all other operations in the state complete before synchronization by connecting all sink nodes to the tasklet. - 3. Guarantees that only a single GPU stream AccessNode connects to the sync + 3. Guarantees that only a single GPU stream AccessNode connects to the sync tasklet, creating one if needed. 
Parameters @@ -179,8 +180,11 @@ def _insert_gpu_stream_sync_at_state_end(self, sdfg: SDFG, sync_state: Dict[SDFG sync_code = "\n".join(sync_code_lines) # Create the tasklet - tasklet = state.add_tasklet(name=f"gpu_stream_{stream}_synchronization", inputs=set(), outputs=set(), - code=sync_code, language=dtypes.Language.CPP) + tasklet = state.add_tasklet(name=f"gpu_stream_{stream}_synchronization", + inputs=set(), + outputs=set(), + code=sync_code, + language=dtypes.Language.CPP) # ----------------- Connect sink nodes to the synchronization tasklet ----------------- @@ -193,11 +197,11 @@ def _insert_gpu_stream_sync_at_state_end(self, sdfg: SDFG, sync_state: Dict[SDFG elif sink_node != tasklet: non_stream_sink_nodes.append(sink_node) - + # 2. Connect non-stream sink nodes to the sync tasklet for sink_node in non_stream_sink_nodes: state.add_edge(sink_node, None, tasklet, None, dace.Memlet()) - + # 3. Connect a single GPU stream sink node (create or merge if needed) if len(stream_sink_nodes) == 0: combined_stream_node = state.add_access(stream_array_name) @@ -214,10 +218,9 @@ def _insert_gpu_stream_sync_at_state_end(self, sdfg: SDFG, sync_state: Dict[SDFG output_stream_node = state.add_access(combined_stream_node.data) for stream in streams: accessed_gpu_stream = f"{stream_array_name}[{stream}]" - conn = f"{stream_var_name_prefix}{stream}" # Note: Same as "gpu_stream_var_name" from tasklet + conn = f"{stream_var_name_prefix}{stream}" # Note: Same as "gpu_stream_var_name" from tasklet tasklet.add_in_connector(conn, dtypes.gpuStream_t) tasklet.add_out_connector(conn, dtypes.gpuStream_t, force=True) state.add_edge(combined_stream_node, None, tasklet, conn, dace.Memlet(accessed_gpu_stream)) state.add_edge(tasklet, conn, output_stream_node, None, dace.Memlet(accessed_gpu_stream)) - \ No newline at end of file diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py index f88cc449dc..23bb4c7c94 100644 --- a/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_kernels.py @@ -10,13 +10,14 @@ from dace.transformation.passes.gpustream.gpustream_scheduling import NaiveGPUStreamScheduler from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + @properties.make_properties @transformation.explicit_cf_compatible class InsertGPUStreamsToKernels(ppl.Pass): """ This Pass attaches GPU streams to kernels (i.e., dtypes.ScheduleType.GPU_Device scheduled maps). - Adds GPU stream AccessNodes and connects them to kernel entry and exit nodes, + Adds GPU stream AccessNodes and connects them to kernel entry and exit nodes, indicating which GPU stream each kernel is assigned to. These assignments are e.g. used when launching the kernels. 
""" @@ -29,7 +30,7 @@ def modifies(self) -> ppl.Modifies: def should_reapply(self, modified: ppl.Modifies) -> bool: return False - + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): # Retrieve the GPU stream array name and the prefix for individual stream variables stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') @@ -56,12 +57,14 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): kernel_entry = node kernel_entry.add_in_connector(gpu_stream_var_name, dtypes.gpuStream_t) stream_array_in = state.add_access(stream_array_name) - state.add_edge(stream_array_in, None, kernel_entry, gpu_stream_var_name, dace.Memlet(accessed_gpu_stream)) + state.add_edge(stream_array_in, None, kernel_entry, gpu_stream_var_name, + dace.Memlet(accessed_gpu_stream)) # Assign the GPU stream to the kernel exit kernel_exit = state.exit_node(kernel_entry) kernel_exit.add_out_connector(gpu_stream_var_name, dtypes.gpuStream_t) stream_array_out = state.add_access(stream_array_name) - state.add_edge(kernel_exit, gpu_stream_var_name, stream_array_out, None, dace.Memlet(accessed_gpu_stream)) - + state.add_edge(kernel_exit, gpu_stream_var_name, stream_array_out, None, + dace.Memlet(accessed_gpu_stream)) + return {} diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py index a8d6e143fe..b55e4889a1 100644 --- a/dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_sdfgs.py @@ -12,15 +12,16 @@ STREAM_PLACEHOLDER = "__dace_current_stream" + @properties.make_properties @transformation.explicit_cf_compatible class InsertGPUStreamsToSDFGs(ppl.Pass): """ - Inserts a GPU stream array into the top-level SDFG and propagates it to all - nested SDFGs that require it, including intermediate SDFGs along the hierarchy. + Inserts a GPU stream array into the top-level SDFG and propagates it to all + nested SDFGs that require it, including intermediate SDFGs along the hierarchy. - This pass guarantees that every relevant SDFG has the array defined, avoiding - duplication and allowing subsequent passes in the GPU stream pipeline to rely + This pass guarantees that every relevant SDFG has the array defined, avoiding + duplication and allowing subsequent passes in the GPU stream pipeline to rely on its presence without redefining it. """ @@ -32,14 +33,14 @@ def modifies(self) -> ppl.Modifies: def should_reapply(self, modified: ppl.Modifies) -> bool: return False - + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): """ Ensure that a GPU stream array is available in all SDFGs that require it. - The pass creates the array once at the top-level SDFG and propagates it - down the hierarchy by inserting matching arrays in child SDFGs and wiring - them through nested SDFG connectors. This way, all SDFGs share a consistent + The pass creates the array once at the top-level SDFG and propagates it + down the hierarchy by inserting matching arrays in child SDFGs and wiring + them through nested SDFG connectors. This way, all SDFGs share a consistent reference to the same GPU stream array. 
""" @@ -49,10 +50,10 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): num_assigned_streams = max(stream_assignments.values(), default=0) + 1 # Add the GPU stream array at the top level - sdfg.add_transient(stream_array_name, (num_assigned_streams,), dtype=dace.dtypes.gpuStream_t, + sdfg.add_transient(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, storage=dace.dtypes.StorageType.Register) - # Ensure GPU stream array is defined where required for child_sdfg in self.find_child_sdfgs_requiring_gpu_stream(sdfg): @@ -62,23 +63,26 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): # Add the array to the child SDFG inner_sdfg = child_sdfg - inner_sdfg.add_array(stream_array_name, (num_assigned_streams,), dtype=dace.dtypes.gpuStream_t, + inner_sdfg.add_array(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, storage=dace.dtypes.StorageType.Register) - + # Walk up the hierarchy until the array is found, inserting it into each parent outer_sdfg = inner_sdfg.parent_sdfg while stream_array_name not in outer_sdfg.arrays: # Insert array in parent SDFG - outer_sdfg.add_array(stream_array_name, (num_assigned_streams,), dtype=dace.dtypes.gpuStream_t, + outer_sdfg.add_array(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, storage=dace.dtypes.StorageType.Register) - + # Connect parent SDFG array to nested SDFG node inner_nsdfg_node = inner_sdfg.parent_nsdfg_node inner_parent_state = inner_sdfg.parent inner_nsdfg_node.add_in_connector(stream_array_name, dtypes.gpuStream_t) inp_gpu_stream: AccessNode = inner_parent_state.add_access(stream_array_name) - inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, dace.Memlet(stream_array_name)) + inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, + dace.Memlet(stream_array_name)) # Continue climbing up the hierarchy inner_sdfg = outer_sdfg @@ -89,7 +93,8 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): inner_parent_state = inner_sdfg.parent inner_nsdfg_node.add_in_connector(stream_array_name, dtypes.gpuStream_t) inp_gpu_stream: AccessNode = inner_parent_state.add_access(stream_array_name) - inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, dace.Memlet(f"{stream_array_name}[0:{num_assigned_streams}]")) + inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, + dace.Memlet(f"{stream_array_name}[0:{num_assigned_streams}]")) outer_sdfg = inner_sdfg.parent_sdfg @@ -101,9 +106,9 @@ def find_child_sdfgs_requiring_gpu_stream(self, sdfg) -> Set[SDFG]: array descriptor store. A child SDFG requires a GPU stream if: - It launches GPU kernels (MapEntry/MapExit with GPU_Device schedule). - - It contains special Tasklets (e.g., from library node expansion) that + - It contains special Tasklets (e.g., from library node expansion) that use the GPU stream they are assigned to in the code. - - It accesses GPU global memory outside device-level GPU scopes, which + - It accesses GPU global memory outside device-level GPU scopes, which implies memory copies or kernel data feeds. 
Parameters @@ -122,7 +127,7 @@ def find_child_sdfgs_requiring_gpu_stream(self, sdfg) -> Set[SDFG]: # Skip the root SDFG itself if child_sdfg is sdfg: - continue + continue for state in child_sdfg.states(): for node in state.nodes(): @@ -138,11 +143,8 @@ def find_child_sdfgs_requiring_gpu_stream(self, sdfg) -> Set[SDFG]: break # Case 3: Accessing GPU global memory outside device-level scopes - if ( - isinstance(node, AccessNode) - and node.desc(state).storage == dtypes.StorageType.GPU_Global - and not is_devicelevel_gpu(state.sdfg, state, node) - ): + if (isinstance(node, AccessNode) and node.desc(state).storage == dtypes.StorageType.GPU_Global + and not is_devicelevel_gpu(state.sdfg, state, node)): requiring_gpu_stream.add(child_sdfg) break @@ -151,6 +153,3 @@ def find_child_sdfgs_requiring_gpu_stream(self, sdfg) -> Set[SDFG]: break return requiring_gpu_stream - - - diff --git a/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py b/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py index c7ef71abab..1438472da0 100644 --- a/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py +++ b/dace/transformation/passes/gpustream/insert_gpu_streams_to_tasklets.py @@ -13,6 +13,7 @@ # Placeholder for the GPU stream variable used in tasklet code STREAM_PLACEHOLDER = "__dace_current_stream" + @properties.make_properties @transformation.explicit_cf_compatible class InsertGPUStreamsToTasklets(ppl.Pass): @@ -20,8 +21,8 @@ class InsertGPUStreamsToTasklets(ppl.Pass): This pass ensures that tasklets which require access to their assigned GPU stream are provided with it explicitly. - Such tasklets typically originate from expanded LibraryNodes targeting GPUs. - These nodes may reference the special placeholder variable `__dace_current_stream`, + Such tasklets typically originate from expanded LibraryNodes targeting GPUs. + These nodes may reference the special placeholder variable `__dace_current_stream`, which is expected to be defined during unparsing in `cpp.py`. 
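
For intuition, the rewrite presumably amounts to replacing the placeholder in the tasklet body with an explicit stream connector that the pass then wires to the stream array; the connector name and the tasklet code below are illustrative only.

    STREAM_PLACEHOLDER = "__dace_current_stream"

    tasklet_code = "cublasSetStream(handle, __dace_current_stream);"   # illustrative body
    stream_connector = "gpu_stream0"                                    # connector added by the pass
    print(tasklet_code.replace(STREAM_PLACEHOLDER, stream_connector))
    # cublasSetStream(handle, gpu_stream0);
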
To avoid relying on this "hidden" mechanism, the pass rewrites tasklets to use @@ -38,7 +39,7 @@ def modifies(self) -> ppl.Modifies: def should_reapply(self, modified: ppl.Modifies) -> bool: return False - + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): # Retrieve the GPU stream's array name stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] diff --git a/dace/transformation/passes/insert_gpu_copy_tasklets.py b/dace/transformation/passes/insert_gpu_copy_tasklets.py index b38d133814..447adc7767 100644 --- a/dace/transformation/passes/insert_gpu_copy_tasklets.py +++ b/dace/transformation/passes/insert_gpu_copy_tasklets.py @@ -5,7 +5,7 @@ import dace from dace import SDFG, SDFGState, dtypes, properties from dace import memlet as mm -from dace.codegen.targets.experimental_cuda_helpers.new_copy_strategies import CopyContext, OutOfKernelCopyStrategy +from dace.codegen.targets.experimental_cuda_helpers.copy_strategies import CopyContext, OutOfKernelCopyStrategy from dace.config import Config from dace.sdfg import nodes, scope_contains_scope from dace.sdfg.graph import MultiConnectorEdge @@ -16,6 +16,7 @@ from dace.transformation.passes.gpustream.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + @properties.make_properties @transformation.explicit_cf_compatible class InsertGPUCopyTasklets(ppl.Pass): @@ -34,12 +35,12 @@ class InsertGPUCopyTasklets(ppl.Pass): from connectors and describing in memlets how data will be moved, since currently tasklets only support value inputs. """ - + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: depending_passes = { - NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, InsertGPUStreamsToKernels, - InsertGPUStreamsToTasklets, InsertGPUStreamSyncTasklets - } + NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, InsertGPUStreamsToKernels, InsertGPUStreamsToTasklets, + InsertGPUStreamSyncTasklets + } return depending_passes def modifies(self) -> ppl.Modifies: @@ -87,7 +88,7 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: # Generatae the copy call code = out_of_kernel_copy.generate_copy(copy_context) - + # Prepare GPU ustream connectors and the stream to be accessed from the # GPU stream array gpustream_id = gpustream_assignments[dst_node] @@ -99,9 +100,9 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: tasklet.add_in_connector(gpustream_var_name, dtypes.gpuStream_t, True) tasklet.add_out_connector(gpustream_var_name, dtypes.gpuStream_t, True) - # Add incoming and outgoing GPU stream accessNodes to the tasklet + # Add incoming and outgoing GPU stream accessNodes to the tasklet in_gpustream = state.add_access(gpustream_array_name) - out_gpustream= state.add_access(gpustream_array_name) + out_gpustream = state.add_access(gpustream_array_name) state.add_edge(in_gpustream, None, tasklet, gpustream_var_name, dace.Memlet(accessed_gpustream)) state.add_edge(tasklet, gpustream_var_name, out_gpustream, None, dace.Memlet(accessed_gpustream)) @@ -109,13 +110,14 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: dst_node_pred, dst_node_conn, _, dst_conn, memlet = edge state.add_edge(dst_node_pred, dst_node_conn, tasklet, None, copy.deepcopy(memlet)) state.add_edge(tasklet, None, dst_node, dst_conn, copy.deepcopy(memlet)) - state.remove_edge(edge) + state.remove_edge(edge) return {} - - def 
find_all_data_copies(self, sdfg: SDFG) -> List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]]: + + def find_all_data_copies( + self, sdfg: SDFG) -> List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]]: """ - Finds and returns all data copies in the SDFG as tuples containing the SDFG, state, source node, + Finds and returns all data copies in the SDFG as tuples containing the SDFG, state, source node, destination node, and the first memlet edge of in the memlet path between source and destination node. Parameters @@ -139,7 +141,7 @@ def find_all_data_copies(self, sdfg: SDFG) -> List[Tuple[SDFG, SDFGState, nodes. for sub_sdfg in sdfg.all_sdfgs_recursive(): for state in sub_sdfg.states(): for edge in state.edges(): - + # Skip edges that were already processed if edge in visited_edges: continue diff --git a/dace/transformation/passes/move_array_out_of_kernel.py b/dace/transformation/passes/move_array_out_of_kernel.py index 0b91d671bf..cfa3c2090b 100644 --- a/dace/transformation/passes/move_array_out_of_kernel.py +++ b/dace/transformation/passes/move_array_out_of_kernel.py @@ -17,11 +17,12 @@ from dace.memlet import Memlet from dace.symbolic import symbol + @make_properties @transformation.explicit_cf_compatible class MoveArrayOutOfKernel(Pass): """ - This pass supports a legacy use case in the 'ExperimentalCUDACodeGen' backend: the use of + This pass supports a legacy use case in the 'ExperimentalCUDACodeGen' backend: the use of transient arrays with dtypes.StorageType.GPU_Global inside GPU_Device scheduled maps (kernels). Previously, the old 'CUDACodeGen' moved such arrays outside the kernel during codegen, which caused: @@ -30,12 +31,12 @@ class MoveArrayOutOfKernel(Pass): 3. Incorrect semantics — a single shared array was reused instead of per-iteration replication, leading to race conditions. - This pass fixes these issues by explicitly lifting such arrays out of GPU_Device maps - and creating disjoint arrays per map iteration. Unlike the legacy approach, the transformation + This pass fixes these issues by explicitly lifting such arrays out of GPU_Device maps + and creating disjoint arrays per map iteration. Unlike the legacy approach, the transformation is now visible and consistent at the SDFG level, avoiding naming collisions and improving clarity. - NOTE: There is no true "local device (GPU_Device) memory" on GPUs, but DaCe supports this - pattern for legacy reasons. This pass exists purely for backward compatibility, and its use + NOTE: There is no true "local device (GPU_Device) memory" on GPUs, but DaCe supports this + pattern for legacy reasons. This pass exists purely for backward compatibility, and its use is strongly discouraged. """ @@ -56,9 +57,9 @@ def apply_pass(self, root_sdfg: SDFG, kernel_entry: nodes.MapEntry, array_name: Args: root_sdfg: The top-level SDFG to operate on. - kernel_entry: The MapEntry node representing the GPU_Device scheduled map (i.e., the kernel) + kernel_entry: The MapEntry node representing the GPU_Device scheduled map (i.e., the kernel) that contains the transient array. - array_name: The name of the transient array to move. Note that multiple arrays with the + array_name: The name of the transient array to move. Note that multiple arrays with the same name may exist within the kernel. All will be lifted. 
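
A sketch of the (discouraged) pattern this pass exists for and of how it is invoked, following the call made from GPUTransformSDFG. The program is illustrative, and whether the transient keeps the name 'gpu_A' in the generated SDFG is an assumption.

    import dace
    from dace.sdfg import nodes
    from dace.transformation.passes.move_array_out_of_kernel import MoveArrayOutOfKernel

    @dace.program
    def discouraged(A: dace.float64[128, 32] @ dace.dtypes.StorageType.GPU_Global):
        for x, y in dace.map[0:128, 0:32] @ dace.dtypes.ScheduleType.GPU_Device:
            # transient GPU_Global array defined *inside* the kernel map
            gpu_A = dace.define_local([64], dace.float64,
                                      storage=dace.dtypes.StorageType.GPU_Global)
            gpu_A[0] = A[x, y]
            A[x, y] = gpu_A[0] + 1

    sdfg = discouraged.to_sdfg()
    for state in sdfg.states():
        for node in state.nodes():
            if isinstance(node, nodes.MapEntry) and node.map.schedule == dace.dtypes.ScheduleType.GPU_Device:
                MoveArrayOutOfKernel().apply_pass(sdfg, node, 'gpu_A')
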
""" # Cache every nodes parent state and parent sdfg @@ -67,7 +68,7 @@ def apply_pass(self, root_sdfg: SDFG, kernel_entry: nodes.MapEntry, array_name: assert isinstance(parent, SDFGState) self._node_to_state_cache[node] = parent self._node_to_sdfg_cache[node] = parent.sdfg - + # Check if all access nodes to 'array_name' within the kernel are defined in the same SDFG as the map kernel_parent_sdfg = self._node_to_sdfg_cache[kernel_entry] simple_case = True @@ -86,13 +87,13 @@ def apply_pass(self, root_sdfg: SDFG, kernel_entry: nodes.MapEntry, array_name: self.move_array_out_of_kernel_nested(kernel_entry, array_name) # Main transformation algorithms and helpers - def move_array_out_of_kernel_flat(self, kernel_entry: nodes.MapEntry, array_name: str, - access_nodes: List[nodes.AccessNode]) -> None: + def move_array_out_of_kernel_flat(self, kernel_entry: nodes.MapEntry, array_name: str, + access_nodes: List[nodes.AccessNode]) -> None: """ Moves a transient GPU_Global array out of a GPU_Device map (kernel) in the flat case. This function handles the simpler case where all access nodes to the array are in the same - SDFG and state as the kernel map. Therefore, there are no nested SDFGs or naming conflicts + SDFG and state as the kernel map. Therefore, there are no nested SDFGs or naming conflicts (since an SDFG cannot define multiple descriptors with the same name). The array is reshaped to allocate a disjoint slice per map iteration. For example, given: @@ -100,12 +101,12 @@ def move_array_out_of_kernel_flat(self, kernel_entry: nodes.MapEntry, array_name for x, y in dace.map[0:128, 0:32] @ GPU_Device: gpu_A = dace.define_local([64], dtype, storage=GPU_Global) - the array shape will be updated to [128, 32, 64], and memlets will ensure each thread + the array shape will be updated to [128, 32, 64], and memlets will ensure each thread accesses [x, y, 0:64]. - Additionally, this method inserts the necessary access nodes and edges to correctly move + Additionally, this method inserts the necessary access nodes and edges to correctly move the array out of the map scope and maintain correctness. - + Args: kernel_entry: The MapEntry node representing the GPU kernel. array_name: Name of the transient array to move. @@ -118,7 +119,7 @@ def move_array_out_of_kernel_flat(self, kernel_entry: nodes.MapEntry, array_name array_desc = closest_an.desc(parent_state) # Get the chain of MapEntries from the AccessNode up to and including the kernel map entry - map_entry_chain, _= self.get_maps_between(kernel_entry, closest_an) + map_entry_chain, _ = self.get_maps_between(kernel_entry, closest_an) # Store the original full-range subset of the array. # Needed to define correct memlets when moving the array out of the kernel. 
@@ -131,7 +132,6 @@ def move_array_out_of_kernel_flat(self, kernel_entry: nodes.MapEntry, array_name # Update all memlets self.update_memlets(kernel_entry, array_name, closest_an, access_nodes) - # add new edges to move access Node out of map in_connector: str = 'IN_' + array_name out_connector: str = 'OUT_' + array_name @@ -147,28 +147,30 @@ def move_array_out_of_kernel_flat(self, kernel_entry: nodes.MapEntry, array_name next_entries, _ = self.get_maps_between(kernel_entry, previous_node) memlet_subset = Range(self.get_memlet_subset(next_entries, previous_node) + old_subset) - - next_map_state.add_edge(previous_node, previous_out_connector, next_map_exit, in_connector, Memlet(data= array_name, subset=memlet_subset)) - + + next_map_state.add_edge(previous_node, previous_out_connector, next_map_exit, in_connector, + Memlet(data=array_name, subset=memlet_subset)) + previous_node = next_map_exit previous_out_connector = out_connector # New Access Node outside of the target map, connected to the exit access_node_outside = parent_state.add_access(array_name) - parent_state.add_edge(kernel_exit, out_connector, access_node_outside, None, Memlet.from_array(array_name, array_desc)) + parent_state.add_edge(kernel_exit, out_connector, access_node_outside, None, + Memlet.from_array(array_name, array_desc)) def move_array_out_of_kernel_nested(self, kernel_entry: nodes.MapEntry, array_name: str) -> None: """ Moves a transient GPU_Global array out of a GPU_Device map (kernel) in the nested case. - This function handles the more complex scenario where access nodes to the array may be - defined inside nested SDFGs within the kernel's parent SDFG. It moves the array out of - all nested maps and SDFGs, updating shapes and memlets accordingly, and resolves naming - conflicts that arise from multiple descriptors with the same name in different scopes + This function handles the more complex scenario where access nodes to the array may be + defined inside nested SDFGs within the kernel's parent SDFG. It moves the array out of + all nested maps and SDFGs, updating shapes and memlets accordingly, and resolves naming + conflicts that arise from multiple descriptors with the same name in different scopes (by renaming). - The method also ensures that the array is correctly lifted through all nested SDFGs - between its original definition and the kernel map, updating symbols and connectors + The method also ensures that the array is correctly lifted through all nested SDFGs + between its original definition and the kernel map, updating symbols and connectors along the way. 
Args: @@ -188,7 +190,7 @@ def move_array_out_of_kernel_nested(self, kernel_entry: nodes.MapEntry, array_na self.move_array_out_of_kernel_flat(kernel_entry, original_array_name, list(access_nodes)) continue - # The outermost node + # The outermost node nsdfg_node = outermost_sdfg.parent_nsdfg_node map_entry_chain, _ = self.get_maps_between(kernel_entry, nsdfg_node) @@ -198,7 +200,10 @@ def move_array_out_of_kernel_nested(self, kernel_entry: nodes.MapEntry, array_na # Update array_descriptor new_shape, new_strides, new_total_size, new_offsets = self.get_new_shape_info(array_desc, map_entry_chain) - array_desc.set_shape(new_shape=new_shape, strides=new_strides, total_size=new_total_size, offset=new_offsets) + array_desc.set_shape(new_shape=new_shape, + strides=new_strides, + total_size=new_total_size, + offset=new_offsets) array_desc.transient = False # Update memlets data movement @@ -225,23 +230,21 @@ def move_array_out_of_kernel_nested(self, kernel_entry: nodes.MapEntry, array_na # Validate depth: must include at least outer + target SDFG if len(sdfg_hierarchy) < 2: - raise ValueError( - f"Invalid SDFG hierarchy: only one SDFG found. " - f"Expected at least two levels, since {outermost_sdfg} is not equal to " - "the kernel map's SDFG and is contained within it — the last entry should " - "be the kernel's parent SDFG." - ) - + raise ValueError(f"Invalid SDFG hierarchy: only one SDFG found. " + f"Expected at least two levels, since {outermost_sdfg} is not equal to " + "the kernel map's SDFG and is contained within it — the last entry should " + "be the kernel's parent SDFG.") + self.lift_array_through_nested_sdfgs(array_name, kernel_entry, sdfg_hierarchy, old_subset) - - def lift_array_through_nested_sdfgs(self, array_name:str, kernel_entry: nodes.MapEntry, - sdfg_hierarchy: List[SDFG], old_subset: List) -> None: + + def lift_array_through_nested_sdfgs(self, array_name: str, kernel_entry: nodes.MapEntry, sdfg_hierarchy: List[SDFG], + old_subset: List) -> None: """ Lifts a transient array through nested SDFGs. - For each SDFG in the hierarchy (from inner to outer), this deepcopies the array descriptor + For each SDFG in the hierarchy (from inner to outer), this deepcopies the array descriptor and adds edges from the NestedSDFG node through any enclosing maps to a new access node. - This is done until the kernel is exited. + This is done until the kernel is exited. Memlets are updated using `old_subset` and enclosing map parameters. Args: @@ -251,7 +254,7 @@ def lift_array_through_nested_sdfgs(self, array_name:str, kernel_entry: nodes.Ma old_subset: Inner array subset used for memlet construction. """ # Move array out ouf the kernel map entry through nested SDFGs - outer_sdfg = sdfg_hierarchy.pop(0) + outer_sdfg = sdfg_hierarchy.pop(0) while sdfg_hierarchy: inner_sdfg = outer_sdfg outer_sdfg = sdfg_hierarchy.pop(0) @@ -298,7 +301,9 @@ def lift_array_through_nested_sdfgs(self, array_name:str, kernel_entry: nodes.Ma src_conn = f"OUT_{array_name}" src.add_out_connector(src_conn, dtypes.pointer(new_desc.dtype)) else: - raise NotImplementedError(f"Unsupported source node type '{type(src).__name__}' — only NestedSDFG or MapExit are expected.") + raise NotImplementedError( + f"Unsupported source node type '{type(src).__name__}' — only NestedSDFG or MapExit are expected." 
+ ) # 1.2 Determine destination connector name and register it based on dst type if isinstance(dst, nodes.AccessNode): @@ -307,18 +312,18 @@ def lift_array_through_nested_sdfgs(self, array_name:str, kernel_entry: nodes.Ma dst_conn = f"IN_{array_name}" dst.add_in_connector(dst_conn, dtypes.pointer(new_desc.dtype)) else: - raise NotImplementedError(f"Unsupported destination node type '{type(dst).__name__}' — expected AccessNode or MapEntry.") - + raise NotImplementedError( + f"Unsupported destination node type '{type(dst).__name__}' — expected AccessNode or MapEntry.") + # 2. Add the edge using the connector names determined in Step 1. next_entries, _ = self.get_maps_between(kernel_entry, src) memlet_subset = Range(self.get_memlet_subset(next_entries, src) + old_subset) - nsdfg_parent_state.add_edge(src, src_conn, dst, dst_conn, Memlet(data= array_name, subset=memlet_subset)) - + nsdfg_parent_state.add_edge(src, src_conn, dst, dst_conn, Memlet(data=array_name, subset=memlet_subset)) + # Continue by setting the dst as source src = dst - - # After processing all scopes, the last src (which is either the last MapExit or the intial nsdfg if there are no parent scope) + # After processing all scopes, the last src (which is either the last MapExit or the intial nsdfg if there are no parent scope) # needs to be connected to the exit access node added before dst = exit_access_node @@ -329,24 +334,25 @@ def lift_array_through_nested_sdfgs(self, array_name:str, kernel_entry: nodes.Ma src_conn = f"OUT_{array_name}" src.add_out_connector(src_conn, dtypes.pointer(new_desc.dtype)) else: - raise NotImplementedError(f"Unsupported source node type '{type(src).__name__}' — only NestedSDFG or MapExit are expected.") - + raise NotImplementedError( + f"Unsupported source node type '{type(src).__name__}' — only NestedSDFG or MapExit are expected.") + next_entries, _ = self.get_maps_between(kernel_entry, src) memlet_subset = Range(self.get_memlet_subset(next_entries, src) + old_subset) - nsdfg_parent_state.add_edge(src, src_conn, dst, None, Memlet(data= array_name, subset=memlet_subset)) + nsdfg_parent_state.add_edge(src, src_conn, dst, None, Memlet(data=array_name, subset=memlet_subset)) # At the outermost sdfg we set the array descriptor to be transient again, # Since it is not needed beyond it. Furthermore, this ensures that the codegen # allocates the array and does not expect it as input to the kernel new_desc.transient = True - + # Memlet related helper functions def get_memlet_subset(self, map_chain: List[nodes.MapEntry], node: nodes.Node): """ Compute the memlet subset to access an array based on the position of a node within nested GPU maps. For each GPU_Device or GPU_ThreadBlock map in the chain: - - If the node lies inside the map (but is not the map entry or exit itself), + - If the node lies inside the map (but is not the map entry or exit itself), the subset is the single index corresponding to the map parameter (symbolic). - Otherwise, the full range of the map dimension is used. 
@@ -365,13 +371,11 @@ def get_memlet_subset(self, map_chain: List[nodes.MapEntry], node: nodes.Node): if not next_map.map.schedule in [dtypes.ScheduleType.GPU_Device, dtypes.ScheduleType.GPU_ThreadBlock]: continue - map_parent_state = self._node_to_state_cache[next_map] + map_parent_state = self._node_to_state_cache[next_map] for param, (start, end, stride) in zip(next_map.map.params, next_map.map.range.ndrange()): - node_is_map = ( - (isinstance(node, nodes.MapEntry) and node == next_map) or - (isinstance(node, nodes.MapExit) and map_parent_state.exit_node(next_map) == node) - ) + node_is_map = ((isinstance(node, nodes.MapEntry) and node == next_map) + or (isinstance(node, nodes.MapExit) and map_parent_state.exit_node(next_map) == node)) node_state = self._node_to_state_cache[node] if helpers.contained_in(node_state, node, next_map) and not node_is_map: index = symbol(param) @@ -379,18 +383,17 @@ def get_memlet_subset(self, map_chain: List[nodes.MapEntry], node: nodes.Node): else: subset.append((start, end, stride)) - return subset def update_memlets(self, kernel_entry: nodes.MapEntry, array_name: str, outermost_node: nodes.Node, access_nodes: Set[nodes.AccessNode]) -> None: """ - Updates all memlets related to a given transient array to reflect correct data + Updates all memlets related to a given transient array to reflect correct data movement when moving array out of the kernel entry. - Any map enclosing the `outermost_node` also encloses all access nodes and is - used to determine which maps are strictly above the access nodes. Based on this, - we compute the correct memlet subset that includes the additional dimensions + Any map enclosing the `outermost_node` also encloses all access nodes and is + used to determine which maps are strictly above the access nodes. Based on this, + we compute the correct memlet subset that includes the additional dimensions from the GPU map hierarchy. 
Args: @@ -418,7 +421,7 @@ def update_memlets(self, kernel_entry: nodes.MapEntry, array_name: str, outermos new_range = params_as_ranges + old_range edge.data.subset = Range(new_range) visited.add(edge) - + elif edge.data.data != array_name and edge.dst is access_node and edge.data.dst_subset is not None: old_range = edge.data.dst_subset.ndrange() new_range = params_as_ranges + old_range @@ -428,7 +431,6 @@ def update_memlets(self, kernel_entry: nodes.MapEntry, array_name: str, outermos else: continue - # out paths for path in self.out_paths(access_node): for edge in path: @@ -441,7 +443,8 @@ def update_memlets(self, kernel_entry: nodes.MapEntry, array_name: str, outermos edge.data.subset = Range(new_range) visited.add(edge) - elif(edge.data.data != array_name) and edge.src is access_node and edge.data.src_subset is not None: + elif (edge.data.data + != array_name) and edge.src is access_node and edge.data.src_subset is not None: old_range = edge.data.src_subset.ndrange() new_range = params_as_ranges + old_range edge.data.src_subset = Range(new_range) @@ -486,14 +489,14 @@ def get_new_shape_info(self, array_desc: dt.Array, map_exit_chain: List[nodes.Ma range_size = [max_elem + 1 - min_elem for max_elem, min_elem in zip(max_elements, min_elements)] extended_size = range_size + extended_size - new_strides = [1 for _ in next_map.map.params] + new_strides # add 1 per dimension - new_offsets = [0 for _ in next_map.map.params] + new_offsets # add 0 per dimension + new_strides = [1 for _ in next_map.map.params] + new_strides # add 1 per dimension + new_offsets = [0 for _ in next_map.map.params] + new_offsets # add 0 per dimension new_shape = extended_size + list(array_desc.shape) new_total_size = functools.reduce(sympy.Mul, extended_size, 1) * array_desc.total_size return new_shape, new_strides, new_total_size, new_offsets - + # TODO: Ask Yakup -> No states test but this should be alright def replace_array_name(self, sdfgs: FrozenSet[SDFG], old_name: str, new_name: str, array_desc: dt.Array) -> None: """ @@ -517,7 +520,7 @@ def replace_array_name(self, sdfgs: FrozenSet[SDFG], old_name: str, new_name: st for state in sdfg.states(): for edge in state.edges(): - # Update out connectors + # Update out connectors src = edge.src old_out_conn = f"OUT_{old_name}" new_out_conn = f"OUT_{new_name}" @@ -526,7 +529,7 @@ def replace_array_name(self, sdfgs: FrozenSet[SDFG], old_name: str, new_name: st src.remove_out_connector(old_out_conn) src.add_out_connector(new_out_conn, dtypes.pointer(array_desc.dtype)) - # Update in connectors + # Update in connectors dst = edge.dst old_in_conn = f"IN_{old_name}" new_in_conn = f"IN_{new_name}" @@ -534,7 +537,7 @@ def replace_array_name(self, sdfgs: FrozenSet[SDFG], old_name: str, new_name: st edge.dst_conn = new_in_conn dst.remove_in_connector(old_in_conn) dst.add_in_connector(new_in_conn, dtypes.pointer(array_desc.dtype)) - + def update_symbols(self, map_entry_chain: List[nodes.MapEntry], top_sdfg: SDFG) -> None: """ Ensures symbols from GPU maps are defined in all nested SDFGs. 
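To illustrate how memlet subsets are extended with the enclosing map parameters (see get_memlet_subset and update_memlets above), a small illustrative sketch with hypothetical parameters x, y and a 64-element inner subset:

from dace.subsets import Range
from dace.symbolic import symbol

x, y = symbol('x'), symbol('y')            # enclosing GPU map parameters (hypothetical)
inner_subset = [(0, 63, 1)]                # original access to the 64-element array, i.e. 0:64
param_indices = [(x, x, 1), (y, y, 1)]     # one symbolic index per enclosing map dimension
extended = Range(param_indices + inner_subset)   # corresponds to an access such as A[x, y, 0:64]

Nodes that sit outside a given map keep that map's full range instead of the symbolic index.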
@@ -550,11 +553,11 @@ def update_symbols(self, map_entry_chain: List[nodes.MapEntry], top_sdfg: SDFG) """ all_symbols = set() for next_map in map_entry_chain: - if not next_map.map.schedule in [dace.dtypes.ScheduleType.GPU_Device, dace.dtypes.ScheduleType.GPU_ThreadBlock]: + if not next_map.map.schedule in [ + dace.dtypes.ScheduleType.GPU_Device, dace.dtypes.ScheduleType.GPU_ThreadBlock + ]: continue all_symbols = all_symbols | next_map.used_symbols_within_scope(self._node_to_state_cache[next_map]) - - for sdfg in top_sdfg.all_sdfgs_recursive(): nsdfg_node = sdfg.parent_nsdfg_node @@ -568,8 +571,9 @@ def update_symbols(self, map_entry_chain: List[nodes.MapEntry], top_sdfg: SDFG) nsdfg_node.symbol_mapping[symbol] = dace.symbol(symbol) # Array analysis and metadata functions - def collect_array_descriptor_usage(self, map_entry: nodes.MapEntry, - array_name: str) -> Set[Tuple[dt.Array, SDFG, FrozenSet[SDFG], FrozenSet[nodes.AccessNode]]]: + def collect_array_descriptor_usage( + self, map_entry: nodes.MapEntry, + array_name: str) -> Set[Tuple[dt.Array, SDFG, FrozenSet[SDFG], FrozenSet[nodes.AccessNode]]]: """ Tracks usage of a transient array across nested SDFGs within the scope of a map. @@ -592,7 +596,8 @@ def collect_array_descriptor_usage(self, map_entry: nodes.MapEntry, - a frozenset of all involved SDFGs, - a frozenset of all AccessNodes using this array. """ - access_nodes_info: List[Tuple[nodes.AccessNode, SDFGState, SDFG]] = self.get_access_nodes_within_map(map_entry, array_name) + access_nodes_info: List[Tuple[nodes.AccessNode, SDFGState, + SDFG]] = self.get_access_nodes_within_map(map_entry, array_name) last_sdfg: SDFG = self._node_to_sdfg_cache[map_entry] result: Set[Tuple[dt.Array, SDFG, Set[SDFG], Set[nodes.AccessNode]]] = set() @@ -608,7 +613,7 @@ def collect_array_descriptor_usage(self, map_entry: nodes.MapEntry, # we are only interested in the information thus this is fine) array_desc = access_node.desc(state) - # Collect all sdfgs and access nodes which refer to the same array + # Collect all sdfgs and access nodes which refer to the same array # (we determine this by inspecting if the array name is passed via connectors) sdfg_set: Set[SDFG] = set() access_nodes_set: Set[nodes.AccessNode] = set() @@ -646,7 +651,7 @@ def collect_array_descriptor_usage(self, map_entry: nodes.MapEntry, if array_name in nsdfg_node.in_connectors or array_name in nsdfg_node.out_connectors: queue.append(nsdfg_node.sdfg) sdfg_set.add(nsdfg_node.sdfg) - + # Get all access nodes with the array name used in the sdfgs we found for current_sdfg in sdfg_set: for current_state in current_sdfg.states(): @@ -662,7 +667,8 @@ def collect_array_descriptor_usage(self, map_entry: nodes.MapEntry, return result - def new_name_required(self, map_entry: nodes.MapEntry, array_name: str, sdfg_defined: FrozenSet[SDFG]) -> Tuple[bool, str]: + def new_name_required(self, map_entry: nodes.MapEntry, array_name: str, + sdfg_defined: FrozenSet[SDFG]) -> Tuple[bool, str]: """ Returns whether the array_name is also used at an SDFG which is not in the sdfg_defined set. This means that the array_name at that SDFG refers to another data descriptor. 
@@ -697,7 +703,6 @@ def new_name_required(self, map_entry: nodes.MapEntry, array_name: str, sdfg_def taken_names.update(sdfg.arrays.keys()) taken_names.update(sdfg.used_symbols(True)) - if array_name in taken_names: counter = 0 new_name = f"local_{counter}_{array_name}" @@ -710,9 +715,10 @@ def new_name_required(self, map_entry: nodes.MapEntry, array_name: str, sdfg_def return False, array_name # Utility functions - basic building blocks - def get_access_nodes_within_map(self, map_entry: nodes.MapEntry, data_name: str) -> List[Tuple[nodes.AccessNode, SDFGState, SDFG]]: + def get_access_nodes_within_map(self, map_entry: nodes.MapEntry, + data_name: str) -> List[Tuple[nodes.AccessNode, SDFGState, SDFG]]: """ - Finds all AccessNodes that refer to the given `data_name` and are located inside + Finds all AccessNodes that refer to the given `data_name` and are located inside the scope of the specified MapEntry. Returns: @@ -726,8 +732,8 @@ def get_access_nodes_within_map(self, map_entry: nodes.MapEntry, data_name: str) for node, parent_state in starting_sdfg.all_nodes_recursive(): - if (isinstance(node, nodes.AccessNode) and node.data == data_name and - helpers.contained_in(parent_state, node, map_entry)): + if (isinstance(node, nodes.AccessNode) and node.data == data_name + and helpers.contained_in(parent_state, node, map_entry)): parent_sdfg = self._node_to_sdfg_cache[node] matching_access_nodes.append((node, parent_state, parent_sdfg)) @@ -739,7 +745,7 @@ def get_maps_between(self, stop_map_entry: nodes.MapEntry, """ Returns all MapEntry/MapExit pairs between `node` and `stop_map_entry`, inclusive. - Maps are returned from innermost to outermost, starting at the scope of `node` and + Maps are returned from innermost to outermost, starting at the scope of `node` and ending at `stop_map_entry`. Assumes that `node` is (directly or indirectly via a nestedSDFG) contained within the `stop_map_entry`'s scope. @@ -777,7 +783,7 @@ def get_maps_between(self, stop_map_entry: nodes.MapEntry, parent_info = helpers.get_parent_map(state, entry) return entries, exits - + def get_nearest_access_node(self, access_nodes: List[nodes.AccessNode], node: nodes.Node) -> nodes.AccessNode: """ Finds the closest access node (by graph distance) to the given node @@ -808,7 +814,7 @@ def get_nearest_access_node(self, access_nodes: List[nodes.AccessNode], node: no queue.append(neighbor) raise RuntimeError(f"No access node found connected to the given node {node}.") - + def in_paths(self, access_node: nodes.AccessNode) -> List[List[MultiConnectorEdge[Memlet]]]: """ Traces all incoming dataflow paths to the given AccessNode. @@ -823,7 +829,7 @@ def in_paths(self, access_node: nodes.AccessNode) -> List[List[MultiConnectorEdg initial_paths = [[edge] for edge in state.in_edges(access_node)] queue = deque(initial_paths) complete_paths = [] - + while queue: # Get current path and see whether the starting node has in-edges carrying the access nodes data current_path = queue.popleft() @@ -836,12 +842,12 @@ def in_paths(self, access_node: nodes.AccessNode) -> List[List[MultiConnectorEdg complete_paths.append(current_path) continue - + # Otherwise, extend the current path and add it to the queue for further processing for edge in incoming_edges: if edge in current_path: raise ValueError("Unexpected cycle detected") - + extended_path = [edge] + current_path queue.append(extended_path) @@ -856,11 +862,11 @@ def out_paths(self, access_node: nodes.AccessNode) -> List[List[MultiConnectorEd A list of edge paths (each a list of edges). 
""" state: SDFGState = self._node_to_state_cache[access_node] - + initial_paths = [[edge] for edge in state.out_edges(access_node)] queue = deque(initial_paths) complete_paths = [] - + while queue: # Get current path and see whether the last node has out-edges carrying the access nodes data current_path = queue.popleft() @@ -872,15 +878,14 @@ def out_paths(self, access_node: nodes.AccessNode) -> List[List[MultiConnectorEd if len(outgoing_edges) == 0: complete_paths.append(current_path) continue - + # Otherwise, extend the current path and add it to the queue for further processing for edge in outgoing_edges: if edge in current_path: raise ValueError("Unexpected cycle detected") - + extended_path = current_path + [edge] queue.append(extended_path) - + return complete_paths - \ No newline at end of file diff --git a/dace/transformation/passes/shared_memory_synchronization.py b/dace/transformation/passes/shared_memory_synchronization.py index 8a45993bb5..8a8eca9842 100644 --- a/dace/transformation/passes/shared_memory_synchronization.py +++ b/dace/transformation/passes/shared_memory_synchronization.py @@ -1,460 +1,299 @@ -from typing import Union, Dict, Set - -import functools -import sympy +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +import warnings +from typing import Dict, Set, Tuple import dace -from dace import SDFG, properties, SDFGState -from dace import dtypes -from dace.codegen import common -from dace.config import Config -from dace.transformation import pass_pipeline as ppl, transformation -from dace.sdfg import nodes, InterstateEdge -from dace.sdfg.graph import Edge - -from dace.sdfg.state import LoopRegion, ConditionalBlock, ControlFlowBlock -from dace.sdfg.nodes import AccessNode, Map, MapEntry, MapExit - -from dace.transformation.passes import analysis as ap +from dace import SDFG, SDFGState, dtypes, properties +from dace.sdfg.nodes import AccessNode, MapEntry, MapExit, NestedSDFG, Node +from dace.sdfg.state import LoopRegion +from dace.transformation import helpers, pass_pipeline as ppl, transformation @properties.make_properties @transformation.explicit_cf_compatible class DefaultSharedMemorySync(ppl.Pass): """ - A DaCe transformation pass that automatically inserts GPU synchronization barriers - (__syncthreads()) for shared memory access patterns. + This pass inserts synchronization tasklets that call "__syncthreads()". - This pass ensures proper synchronization in two scenarios: - 1. Pre-synchronization: Before consuming shared memory data (AccessNode -> CodeNode/MapEntry) - 2. Post-synchronization: After shared memory reuse in sequential loops/maps within GPU kernels + Synchronization is added after ThreadBlock (TB) MapExits if the TB map + writes to shared memory. - The pass traverses the SDFG hierarchy and identifies shared memory access patterns - that require synchronization to prevent race conditions in GPU code. + Important notes: + - Users are expected to **not** write to shared memory inside a Sequential + map or LoopRegion **within** a TB map. Calling "__syncthreads()" inside + a TB map can cause deadlocks, e.g., when only a subset of threads + participates (thread divergence). - NOTE: This implementation handles commonly observed patterns. Unsupported cases - raise NotImplementedError with context for extending the implementation once comming across - another constellation which was not observed in the used common examples. 
+ - If shared memory is still written sequentially within a TB map, the missing + intermediate synchronizations may lead to race conditions and incorrect results. + Since deadlocks are worse than race conditions, this pass avoids inserting + synchronization inside TB maps, but it will warn the user of the race condition risk. + + - In nested TB maps (e.g., GPU_Device map -> TB map -> TB map ...), + synchronization is only inserted at the outermost TB map's exit. The reason is again + the previously described deadlock danger. """ def __init__(self): """Initialize the synchronization pass.""" - - # Track which scopes (sequential maps and Loops) have already been - # synchronized to avoid duplicate barriers - self._synchronized_scopes: Set[Union[MapExit, LoopRegion]] = set() - - # Map from MapExit nodes to their containing states for post-synchronization - self._map_exit_to_state: Dict[MapExit, SDFGState] = dict() - - # Keep track of processed nested sdfgs - self._processed_nsdfg = set() + # Cache each node's parent state during apply_pass() + self._node_to_parent_state: Dict[Node, SDFGState] = dict() def apply_pass(self, sdfg: SDFG, _) -> None: """ - Apply the synchronization pass to the entire SDFG. + Apply this pass to insert synchronization barriers for GPU ThreadBlock maps. - Args: - sdfg: The SDFG to process (expected to be top-level) - _: Unused pass pipeline argument + The pass: + - Finds all ThreadBlock-scheduled maps in the SDFG, + - Analyzes them for shared memory usage and race-condition risks, and + - Inserts synchronization barriers (`__syncthreads()`) after the + corresponding ThreadBlock-scheduled MapExits where needed. """ - # Start processing from the top-level with empty scope stack - # The scope stack tracks nested execution contexts (maps, loops) - enclosing_scopes = [] - self._process_sdfg(sdfg, enclosing_scopes) - def _process_sdfg(self, sdfg: SDFG, enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: - """ - Recursively traverse all nodes in an SDFG, handling different node types. + # 1. Find all GPU_ThreadBlock schedules Maps and + # cache each node's parent state for convenience + tb_map_exits: Dict[MapExit, SDFGState] = dict() + for node, parent_state in sdfg.all_nodes_recursive(): + self._node_to_parent_state[node] = parent_state + if isinstance(node, MapExit) and node.schedule == dtypes.ScheduleType.GPU_ThreadBlock: + tb_map_exits[node] = parent_state - Args: - sdfg: The SDFG to traverse - enclosing_scopes: Stack of execution scopes (e.g., maps, loops) enclosing the SDFG as a whole. - """ - for sdfg_elem in sdfg.nodes(): - self._process_sdfg_element(sdfg, sdfg_elem, enclosing_scopes) + # 2. Identify TB MapExits requiring a synchronization barrier + sync_requiring_exits = self.identify_synchronization_tb_exits(tb_map_exits) - def _process_sdfg_element(self, sdfg: SDFG, element: any, enclosing_scopes: list[Union[MapExit, - LoopRegion]]) -> None: - """ - Identifies the type of the SDFG element and processes it using the corresponding handler. + # 3. 
Insert synchronization barriers for previous TB MapExits + self.insert_synchronization_after_tb_exits(sync_requiring_exits) - Args: - sdfg: The current SDFG we are in (innermost if nested) - enclosing_scopes: Stack of enclosing execution scopes (maps, loops) wrapping the current SDFG + def identify_synchronization_tb_exits(self, tb_map_exits: Dict[MapExit, SDFGState]) -> Dict[MapExit, SDFGState]: """ - if isinstance(element, LoopRegion): - self._process_loop_region(sdfg, element, enclosing_scopes) - - elif isinstance(element, SDFGState): - self._process_state(sdfg, element, enclosing_scopes) - - elif isinstance(element, ConditionalBlock): - self._process_conditionalBlock(sdfg, element, enclosing_scopes) - - else: - raise NotImplementedError( - f"{self.__class__.__name__}: Unsupported node type '{type(element).__name__}' " - f"encountered during SDFG traversal. Please extend the implementation to handle this case.") - - def _process_loop_region(self, sdfg: SDFG, loop_region: LoopRegion, - enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: + Identify ThreadBlock exits after which "__syncthread()" should be called. + + Parameters + ---------- + tb_map_exits : Dict[MapExit, SDFGState] + Mapping from GPU_ThreadBlock - scheduled MapExit nodes to their parent SDFGState. + + Returns + ------- + Dict[MapExit, SDFGState] + Subset of `tb_map_exits` where any AccessNode between the entry and exit + uses GPU shared memory, indicating a synchronization barrier is needed. """ - Process a loop region by adding it to the scope stack and traversing its contents. + #------------------------- helper function ------------------------- + sync_requiring_exits: Dict[MapExit, SDFGState] = {} - Args: - sdfg: The containing SDFG - loop_region: The loop region to process - enclosing_scopes: Current scope stack which wraps around state - """ - # Create a new scope stack with this loop region added - nested_scopes = enclosing_scopes.copy() - nested_scopes.insert(0, loop_region) # Not append! :) careful - - # Process all states within the loop region - for node in loop_region.nodes(): - if isinstance(node, SDFGState): - self._process_state(sdfg, node, nested_scopes) - else: - raise NotImplementedError(f"{self.__class__.__name__}: Unexpected node type '{type(node).__name__}' " - f"found inside LoopRegion. SDFGState nodes were expected. Extend if you think" - "the node type is also valid") - - def _process_state(self, sdfg: SDFG, state: SDFGState, enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: - """ - Process a single SDFG state, analyzing edges for shared memory access patterns. 
+ for map_exit, state in tb_map_exits.items(): - Args: - sdfg: The containing SDFG - state: The state to process - enclosing_scopes: Current scope stack which wrapp around state (NOT of each individual node) - """ - # Track destination nodes that already have synchronization tasklets - # This prevents creating duplicate barriers for the same consumer - nodes_with_sync: Dict[nodes.Node, nodes.Tasklet] = {} + # process + map_entry = state.entry_node(map_exit) + writes_to_smem, race_cond_danger, has_tb_parent = self.tb_exits_analysis(map_entry, map_exit, state) - # Analyze each edge in the state for shared memory access patterns - for edge in state.edges(): - source_node, dest_node = edge.src, edge.dst - - # Skip edges that don't involve shared memory reads - # (either source is not shared memory, or it's a memory-to-memory copy) - if not self._is_shared_memory_access_node(sdfg, source_node) or isinstance(dest_node, nodes.AccessNode): - continue - - # Handle different types of shared memory consumers - if isinstance(dest_node, (nodes.CodeNode, nodes.MapEntry)): - # Direct consumption by computation or map entry - self._insert_pre_synchronization_barrier(source_node, dest_node, state, nodes_with_sync) - - elif isinstance(dest_node, nodes.NestedSDFG): - # Consumption by nested SDFG - synchronize and recurse - # NOTE: For nesting, we append all scopes which wrap around the nestedSDFG - self._insert_pre_synchronization_barrier(source_node, dest_node, state, nodes_with_sync) - nested_scopes = self._build_nested_scope_stack(state, dest_node, enclosing_scopes) - self._process_sdfg(dest_node.sdfg, nested_scopes) - self._processed_nsdfg.add(dest_node) - else: - raise NotImplementedError( - f"{self.__class__.__name__}: Unsupported destination node type '{type(dest_node).__name__}' " - f"for shared memory access. Currently supported: CodeNode, MapEntry, AccessNode, NestedSDFG.") - - # Check if post-synchronization is needed and apply shared - self._handle_shared_memory_post_synchronization(state, source_node, enclosing_scopes) - - # It may be the case that nestedSDFG were not recursed previously. Process them in that case - for node in state.nodes(): - - # Guards - if not isinstance(node, nodes.NestedSDFG): - continue - if node in self._processed_nsdfg: + # Skip: if this TB map is nested inside another TB map in the same kernel + # (i.e., before reaching the GPU_Device map), synchronization responsibility belongs + # to the outermost such TB map in the kernel. + if has_tb_parent: continue - # not yet processed NestedSDFG - nested_scopes = self._build_nested_scope_stack(state, node, enclosing_scopes) - self._process_sdfg(node.sdfg, nested_scopes) - self._processed_nsdfg.add(node) - - def _process_conditionalBlock(self, sdfg: SDFG, cond_block: ConditionalBlock, - enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: - """ - Processes a ConditionalBlock by visiting each clause body and its elements. - - Args: - sdfg: The current SDFG context. - cond_block: The ConditionalBlock to process (e.g., if-elif-else structure). - enclosing_scopes: Stack of execution scopes (e.g., maps, loops) enclosing the SDFG as a whole. - """ - clause_bodies: list[ControlFlowBlock] = cond_block.nodes() - - for body in clause_bodies: - for sdfg_elem in body.nodes(): - self._process_sdfg_element(sdfg, sdfg_elem, enclosing_scopes) - - def _is_shared_memory_access_node(self, sdfg: SDFG, node: nodes.Node) -> bool: - """ - Check if a node represents a GPU shared memory access. 
- - Args: - sdfg: The containing SDFG - node: The node to check + # Warn user: potential race condition detected. + elif race_cond_danger and writes_to_smem: + warnings.warn( + f"Race condition danger: LoopRegion or Sequential Map inside ThreadBlock map {map_entry} " + "writes to GPU shared memory. No synchronization occurs for intermediate steps, " + "because '__syncthreads()' is only called outside the ThreadBlock map to avoid potential deadlocks." + "Please consider moving the LoopRegion or Sequential Map outside the ThreadBlock map.") + sync_requiring_exits[map_exit] = state - Returns: - True if the node is an AccessNode with GPU_Shared storage - """ - return (isinstance(node, nodes.AccessNode) and node.desc(sdfg).storage == dtypes.StorageType.GPU_Shared) - - def _insert_pre_synchronization_barrier(self, source_node: nodes.Node, dest_node: nodes.Node, state: SDFGState, - nodes_with_sync: Dict[nodes.Node, nodes.Tasklet]) -> None: - """ - Insert a __syncthreads() barrier before shared memory consumption. - Reuses existing barriers when multiple shared memory sources feed the same destination. - - Args: - source_node: The shared memory AccessNode - dest_node: The consuming node - state: The containing state - nodes_with_sync: Map tracking existing synchronization tasklets - """ - if dest_node in nodes_with_sync: - # Reuse existing synchronization barrier for this destination - existing_barrier = nodes_with_sync[dest_node] - state.add_edge(source_node, None, existing_barrier, None, dace.Memlet()) - else: - # Create a new synchronization barrier - sync_barrier = state.add_tasklet(name="pre_sync_barrier", - inputs=set(), - outputs=set(), - code="__syncthreads();\n", - language=dtypes.Language.CPP) + # TB map writes to shared memory: synchronization is needed + elif writes_to_smem: + sync_requiring_exits[map_exit] = state - # Connect: shared_memory -> sync_barrier -> consumer - state.add_edge(source_node, None, sync_barrier, None, dace.Memlet()) - state.add_edge(sync_barrier, None, dest_node, None, dace.Memlet()) - nodes_with_sync[dest_node] = sync_barrier + return sync_requiring_exits - def _build_nested_scope_stack( - self, state: SDFGState, nested_sdfg_node: nodes.NestedSDFG, - enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> list[Union[MapExit, LoopRegion]]: + def tb_exits_analysis(self, map_entry: MapEntry, map_exit: MapExit, state: SDFGState) -> Tuple[bool, bool, bool]: """ - Copy the 'enclosing_scopes' stack and extend it with all maps in 'state' that enclose 'nested_sdfg_node'. - It is assumed that the 'enclosing_scopes' stack contains all maps and loops that wrap around 'state', but - not individual nodes within 'state'. - - Args: - state: The state containing the nested SDFG - nested_sdfg_node: The NestedSDFG node - enclosing_scopes: Current scope stack - - Returns: - Updated scope stack including maps enclosing the nested SDFG + Analyze a GPU_ThreadBlock-scheduled map to determine: + - whether it writes to shared memory, + - whether such writes may cause race conditions, and + - whether it is nested within another GPU_ThreadBlock map inside the kernel. + + Returns a tuple of three booleans: + + 1. `writes_to_shared_memory`: + True if the map writes to GPU shared memory. This includes writes + directly at the MapExit or within the map scope. + + 2. `race_cond_danger`: + True if there is a potential race condition due to shared memory writes + inside either: + - a sequentially scheduled map, or + - a loop region. 
+ (Note: single-iteration loops/sequential maps are not treated differently; + they are still marked as dangerous, even though they cannot cause races.) + + 3. `has_parent_tb_map`: + True if this ThreadBlock map is nested inside another ThreadBlock map + (i.e., there exists another TB map between the enclosing GPU_Device + map and the current TB map). + + Parameters + ---------- + map_entry : MapEntry + The entry node of the ThreadBlock map. + map_exit : MapExit + The exit node of the ThreadBlock map. + state : SDFGState + The parent state containing the map. + + Returns + ------- + Tuple[bool, bool, bool] + A tuple: + `(writes_to_shared_memory, race_cond_danger, has_parent_tb_map)` """ - scope_dict = state.scope_dict() - updated_scopes = enclosing_scopes.copy() - - # Walk up the scope hierarchy, adding all enclosing maps - current_map = scope_dict[nested_sdfg_node] - while current_map is not None: - - # Add MapExit node to scope, since it is only needed - # for post synchronization anyways - map_exit = state.exit_node(current_map) - updated_scopes.append(map_exit) - - # add the current state in which the map_exit is contained, - # needed for potential post synchronization barriers - self._map_exit_to_state[map_exit] = state - - # move up in the nested map hierarchy - current_map = scope_dict[current_map] - - return updated_scopes - - def _handle_shared_memory_post_synchronization(self, state: SDFGState, shared_mem_node: nodes.Node, - enclosing_scopes: list[Union[MapExit, LoopRegion]]) -> None: + # Initially, the flags are all set to False + writes_to_shared_memory = False + race_cond_danger = False + has_parent_tb_map = False + + # 1. Check if the ThreadBlock (TB) map writes to shared memory + for edge in state.out_edges(map_exit): + is_smem: bool = (isinstance(edge.dst, AccessNode) + and edge.dst.desc(state).storage == dtypes.StorageType.GPU_Shared) + if is_smem and not edge.data.is_empty(): + writes_to_shared_memory = True + break + + # 2. Search between map entry and exit: + # - Detect writes to shared memory (unless already found) + # - Collect nested SDFGs for later analysis + nested_sdfgs: Set[NestedSDFG] = set() + + for node in state.all_nodes_between(map_entry, map_exit): + if not writes_to_shared_memory and isinstance(node, AccessNode): + # Check if this AccessNode writes to shared memory + if (node.desc(state).storage == dtypes.StorageType.GPU_Shared + and any(not edge.data.is_empty() for edge in state.in_edges(node))): + writes_to_shared_memory = True + + elif isinstance(node, NestedSDFG): + nested_sdfgs.add(node) + + # 3. Recursively analyze nested SDFGs: + # - Detect shared memory writes (only if not already found) + # - Check for potential race conditions in loop regions (only if not already flagged) + for nsdfg in nested_sdfgs: + subs_sdfg = nsdfg.sdfg + if not writes_to_shared_memory: + writes_to_shared_memory = self.sdfg_writes_to_smem(subs_sdfg) + + if not race_cond_danger: + race_cond_danger = self.writes_to_smem_inside_loopregion(subs_sdfg) + + # 4. Check for race condition danger in sequential maps that use shared memory + # (only if not already flagged) + if not race_cond_danger: + race_cond_danger = any( + inner_scope.map.schedule == dtypes.ScheduleType.Sequential and self.map_writes_to_smem(inner_scope) + for _, inner_scope in helpers.get_internal_scopes(state, map_entry)) + + # 5. 
Check if this TB map is nested within another TB map + parent = helpers.get_parent_map(state, map_entry) + + while parent: + parent_map, parent_state = parent + if parent_map.map.schedule == dtypes.ScheduleType.GPU_ThreadBlock: + has_parent_tb_map = True + break + if parent_map.map.schedule == dtypes.ScheduleType.GPU_Device: + break + parent = helpers.get_parent_map(parent_state, parent_map) + + # 6. Return the results + return writes_to_shared_memory, race_cond_danger, has_parent_tb_map + + def writes_to_smem_inside_loopregion(self, sdfg: SDFG) -> bool: """ - Handle post-synchronization for shared memory reuse in sequential execution contexts. - - When shared memory is reused across iterations in a for loop or sequential map within - a GPU kernel, we need post-synchronization barriers to prevent race conditions. - - Args: - state: The state containing the shared memory access - shared_mem_node: The shared memory AccessNode - enclosing_scopes: Current scope stack + Return True if the SDFG writes to GPU shared memory (smem) inside + a LoopRegion. This check is recursive and includes nested SDFGs. """ - scope_dict = state.scope_dict() - complete_scope_stack = enclosing_scopes.copy() - - # Build complete scope stack including maps inside the current state - # enclosing the shared memory node. Analogous as in _build_nested_scope_stack() - current_map = scope_dict[shared_mem_node] - while current_map is not None: - - map_exit = state.exit_node(current_map) - complete_scope_stack.append(map_exit) - self._map_exit_to_state[map_exit] = state - current_map = scope_dict[current_map] - - # Analyze scope stack to find synchronization requirements - inside_gpu_kernel = False - innermost_sequential_scope = None - - # Process scopes from outermost to innermost - while complete_scope_stack: - scope = complete_scope_stack.pop(0) - - if isinstance(scope, MapExit): - schedule = scope.schedule - if schedule == dtypes.ScheduleType.Sequential and innermost_sequential_scope is None: - - # Special: Skip if there is only one iteration - size_per_dim = scope.map.range.size() - number_total_iterations = functools.reduce(sympy.Mul, size_per_dim, 1) - if number_total_iterations.is_number and number_total_iterations <= 1: - continue - - innermost_sequential_scope = scope - - elif schedule == dtypes.ScheduleType.GPU_Device: - inside_gpu_kernel = True - break - elif isinstance(scope, LoopRegion) and innermost_sequential_scope is None: - - # Special: Skip if there is only one iteration - start = ap.get_init_assignment(scope) - end = ap.get_loop_end(scope) - stride = ap.get_loop_stride(scope) - nr_iter = (end - start) / stride - - if nr_iter.is_number and nr_iter <= 1: - continue - - innermost_sequential_scope = scope - - # Validate that shared memory is used within GPU kernel context - if not inside_gpu_kernel: - raise ValueError("Shared memory usage detected outside GPU kernel context. 
" - "GPU shared memory is only valid within GPU_Device scheduled maps.") - - # No post synchronization needed if there's no sequential iteration context - if innermost_sequential_scope is None: - return - - # Apply appropriate post-synchronization based on scope type - if isinstance(innermost_sequential_scope, MapExit): - self._add_post_sync_for_sequential_map(innermost_sequential_scope) - elif isinstance(innermost_sequential_scope, LoopRegion): - # two options, see docstrings - self._add_post_sync_tasklets_for_loop_region(innermost_sequential_scope) - # self._add_post_sync_state_for_loop_region(innermost_sequential_scope) - - def _add_post_sync_for_sequential_map(self, seq_map_exit: MapExit) -> None: + for node in sdfg.nodes(): + if isinstance(node, LoopRegion): + # Traverse all nodes inside the loop region + for subnode, parent in node.all_nodes_recursive(): + if (isinstance(subnode, AccessNode) + and subnode.desc(parent).storage == dtypes.StorageType.GPU_Shared + and any(not edge.data.is_empty() for edge in parent.in_edges(node))): + return True + + elif isinstance(node, NestedSDFG): + # Recurse into nested SDFGs + if self.writes_to_smem_inside_loopregion(node.sdfg): + return True + + return False + + def sdfg_writes_to_smem(self, sdfg: SDFG) -> bool: """ - Add post-synchronization barrier after a sequential map that may reuse shared memory. - - Args: - seq_map_exit: The MapExit node of the sequential map + Return True if the SDFG writes to GPU shared memory (smem), + i.e., contains an AccessNode with GPU_Shared storage that has + at least one non-empty incoming edge. """ - # Avoid duplicate synchronization - if seq_map_exit in self._synchronized_scopes: - return - - # Find the state containing this map - containing_state = self._map_exit_to_state[seq_map_exit] - - # Create post-synchronization barrier - post_sync_barrier = containing_state.add_tasklet(name="post_sync_barrier", - inputs=set(), - outputs=set(), - code="__syncthreads();\n", - language=dtypes.Language.CPP) + for node, state in sdfg.all_nodes_recursive(): + if (isinstance(node, AccessNode) and node.desc(state).storage == dtypes.StorageType.GPU_Shared + and any(not edge.data.is_empty() for edge in state.in_edges(node))): + return True + return False - # Insert barrier before the map exit and all other predecessors - incoming_edges = containing_state.in_edges(seq_map_exit) - for edge in incoming_edges: - - predecessor = edge.src - containing_state.add_edge(predecessor, None, post_sync_barrier, None, dace.Memlet()) - containing_state.add_edge(post_sync_barrier, None, seq_map_exit, None, dace.Memlet()) - - # Mark as synchronized - self._synchronized_scopes.add(seq_map_exit) - - def _add_post_sync_state_for_loop_region(self, loop_region: LoopRegion) -> None: + def map_writes_to_smem(self, map_entry: MapEntry) -> bool: """ - Add post-synchronization barrier for a loop region that reuses shared memory arrays. - It adds a new state, which contains only a synchronization tasklet that connects - to all sink blocks of the loop region. + Return True if the map writes to GPU shared memory (smem). - Args: - loop_region: The LoopRegion that needs post-synchronization + A map is considered to write to smem if: + - Any AccessNode with GPU_Shared storage is written to at the MapExit, or + - Such writes occur within the map scope, or + - A nested SDFG within the map writes to smem. 
""" + state = self._node_to_parent_state[map_entry] + map_exit = state.exit_node(map_entry) - sink_blocks: list[ControlFlowBlock] = [] - for block in loop_region.nodes(): - - if loop_region.out_degree(block) == 0: - sink_blocks.append(block) + # 1. Check if MapExit writes directly to shared memory + for edge in state.out_edges(map_exit): + if (isinstance(edge.dst, AccessNode) and edge.dst.desc(state).storage == dtypes.StorageType.GPU_Shared + and not edge.data.is_empty()): + return True - # No sync needed - if len(sink_blocks) < 0: - return + # 2. Inspect nodes inside the map scope + for node in state.all_nodes_between(map_entry, map_exit): + if (isinstance(node, AccessNode) and node.desc(state).storage == dtypes.StorageType.GPU_Shared + and any(not edge.data.is_empty() for edge in state.in_edges(node))): + return True - # Add new state which synchronizates all sink nodes of the loop - syn_block = loop_region.add_state("sync_state") - syn_block.add_tasklet(name="post_sync_barrier", - inputs=set(), - outputs=set(), - code="__syncthreads();\n", - language=dtypes.Language.CPP) + if isinstance(node, NestedSDFG) and self.sdfg_writes_to_smem(node.sdfg): + return True - for block in sink_blocks: - loop_region.add_edge(block, syn_block, InterstateEdge()) + # No writes to shared memory found + return False - # Mark as synchronized - self._synchronized_scopes.add(loop_region) - - def _add_post_sync_tasklets_for_loop_region(self, loop_region: LoopRegion) -> None: + def insert_synchronization_after_tb_exits(self, tb_map_exits: Dict[MapExit, SDFGState]) -> None: """ - Add post-synchronization barrier for a loop region that reuses shared memory arrays. - Determines all sink blocks in the LoopRegion, and then, for each sink block, adds a new synchronization - tasklet that connects to all sink nodes within that sink block. - - Args: - loop_region: The LoopRegion that needs post-synchronization + Insert synchronization tasklets (calling `__syncthreads()`) after the given + GPU ThreadBlock MapExit nodes. + + Parameters + ---------- + tb_map_exits : Dict[MapExit, SDFGState] + Mapping from ThreadBlock MapExit nodes to their parent states after which a synchronization + tasklet should be inserted. """ + for map_exit, state in tb_map_exits.items(): - sink_blocks: list[SDFGState] = [] - for block in loop_region.nodes(): - - if not isinstance(block, SDFGState): - raise NotImplementedError( - f"Block {block} is expected to be an SDFG state. But it is of type {type(block)}. 
" - "Extend use case if this should be valid.") - - if loop_region.out_degree(block) == 0: - sink_blocks.append(block) - - # No sync needed - if len(sink_blocks) < 0: - return - - # For each sink block, synchronize at the end - for block in sink_blocks: - - sink_nodes: list[nodes.Node] = block.sink_nodes() - - # All sink nodes in the same block (= state) get the same sync tasklet - post_sync_barrier = block.add_tasklet(name="post_sync_barrier", - inputs=set(), - outputs=set(), - code="__syncthreads();\n", - language=dtypes.Language.CPP) + sync_tasklet = state.add_tasklet(name="sync_threads", + inputs=set(), + outputs=set(), + code="__syncthreads();\n", + language=dtypes.Language.CPP) - for snode in sink_nodes: - block.add_edge(snode, None, post_sync_barrier, None, dace.Memlet()) + for succ in state.successors(map_exit): + state.add_edge(sync_tasklet, None, succ, None, dace.Memlet()) - # Mark as synchronized - self._synchronized_scopes.add(loop_region) + state.add_edge(map_exit, None, sync_tasklet, None, dace.Memlet()) diff --git a/dace/transformation/passes/shared_memory_synchronization2.py b/dace/transformation/passes/shared_memory_synchronization2.py deleted file mode 100644 index d0f8d70340..0000000000 --- a/dace/transformation/passes/shared_memory_synchronization2.py +++ /dev/null @@ -1,311 +0,0 @@ -# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. -import warnings -from typing import Dict, Set, Tuple - -import dace -from dace import SDFG, SDFGState, dtypes, properties -from dace.sdfg.nodes import AccessNode, MapEntry, MapExit, NestedSDFG, Node -from dace.sdfg.state import LoopRegion -from dace.transformation import helpers, pass_pipeline as ppl, transformation - -@properties.make_properties -@transformation.explicit_cf_compatible -class DefaultSharedMemorySync(ppl.Pass): - """ - This pass inserts synchronization tasklets that call "__syncthreads()". - - Synchronization is added after ThreadBlock (TB) MapExits if the TB map - writes to shared memory. - - Important notes: - - Users are expected to **not** write to shared memory inside a Sequential - map or LoopRegion **within** a TB map. Calling "__syncthreads()" inside - a TB map can cause deadlocks, e.g., when only a subset of threads - participates (thread divergence). - - - If shared memory is still written sequentially within a TB map, the missing - intermediate synchronizations may lead to race conditions and incorrect results. - Since deadlocks are worse than race conditions, this pass avoids inserting - synchronization inside TB maps, but it will warn the user of the race condition risk. - - - In nested TB maps (e.g., GPU_Device map -> TB map -> TB map ...), - synchronization is only inserted at the outermost TB map's exit. The reason is again - the previously described deadlock danger. - """ - - def __init__(self): - """Initialize the synchronization pass.""" - # Cache each node's parent state during apply_pass() - self._node_to_parent_state: Dict[Node, SDFGState] = dict() - - def apply_pass(self, sdfg: SDFG, _) -> None: - """ - Apply this pass to insert synchronization barriers for GPU ThreadBlock maps. - - The pass: - - Finds all ThreadBlock-scheduled maps in the SDFG, - - Analyzes them for shared memory usage and race-condition risks, and - - Inserts synchronization barriers (`__syncthreads()`) after the - corresponding ThreadBlock-scheduled MapExits where needed. - """ - - - # 1. 
Find all GPU_ThreadBlock schedules Maps and - # cache each node's parent state for convenience - tb_map_exits: Dict[MapExit, SDFGState] = dict() - for node, parent_state in sdfg.all_nodes_recursive(): - self._node_to_parent_state[node] = parent_state - if isinstance(node, MapExit) and node.schedule == dtypes.ScheduleType.GPU_ThreadBlock: - tb_map_exits[node] = parent_state - - # 2. Identify TB MapExits requiring a synchronization barrier - sync_requiring_exits = self.identify_synchronization_tb_exits(tb_map_exits) - - # 3. Insert synchronization barriers for previous TB MapExits - self.insert_synchronization_after_tb_exits(sync_requiring_exits) - - def identify_synchronization_tb_exits(self, tb_map_exits: Dict[MapExit, SDFGState]) -> Dict[MapExit, SDFGState]: - """ - Identify ThreadBlock exits after which "__syncthread()" should be called. - - Parameters - ---------- - tb_map_exits : Dict[MapExit, SDFGState] - Mapping from GPU_ThreadBlock - scheduled MapExit nodes to their parent SDFGState. - - Returns - ------- - Dict[MapExit, SDFGState] - Subset of `tb_map_exits` where any AccessNode between the entry and exit - uses GPU shared memory, indicating a synchronization barrier is needed. - """ - #------------------------- helper function ------------------------- - sync_requiring_exits: Dict[MapExit, SDFGState] = {} - - for map_exit, state in tb_map_exits.items(): - - # process - map_entry = state.entry_node(map_exit) - writes_to_smem, race_cond_danger, has_tb_parent = self.tb_exits_analysis(map_entry, map_exit, state) - - # Skip: if this TB map is nested inside another TB map in the same kernel - # (i.e., before reaching the GPU_Device map), synchronization responsibility belongs - # to the outermost such TB map in the kernel. - if has_tb_parent: - continue - - # Warn user: potential race condition detected. - elif race_cond_danger and writes_to_smem: - warnings.warn( - f"Race condition danger: LoopRegion or Sequential Map inside ThreadBlock map {map_entry} " - "writes to GPU shared memory. No synchronization occurs for intermediate steps, " - "because '__syncthreads()' is only called outside the ThreadBlock map to avoid potential deadlocks." - "Please consider moving the LoopRegion or Sequential Map outside the ThreadBlock map." - ) - sync_requiring_exits[map_exit] = state - - # TB map writes to shared memory: synchronization is needed - elif writes_to_smem: - sync_requiring_exits[map_exit] = state - - return sync_requiring_exits - - def tb_exits_analysis(self, map_entry: MapEntry, map_exit: MapExit, - state: SDFGState) -> Tuple[bool, bool, bool]: - """ - Analyze a GPU_ThreadBlock-scheduled map to determine: - - whether it writes to shared memory, - - whether such writes may cause race conditions, and - - whether it is nested within another GPU_ThreadBlock map inside the kernel. - - Returns a tuple of three booleans: - - 1. `writes_to_shared_memory`: - True if the map writes to GPU shared memory. This includes writes - directly at the MapExit or within the map scope. - - 2. `race_cond_danger`: - True if there is a potential race condition due to shared memory writes - inside either: - - a sequentially scheduled map, or - - a loop region. - (Note: single-iteration loops/sequential maps are not treated differently; - they are still marked as dangerous, even though they cannot cause races.) - - 3. `has_parent_tb_map`: - True if this ThreadBlock map is nested inside another ThreadBlock map - (i.e., there exists another TB map between the enclosing GPU_Device - map and the current TB map). 
- - Parameters - ---------- - map_entry : MapEntry - The entry node of the ThreadBlock map. - map_exit : MapExit - The exit node of the ThreadBlock map. - state : SDFGState - The parent state containing the map. - - Returns - ------- - Tuple[bool, bool, bool] - A tuple: - `(writes_to_shared_memory, race_cond_danger, has_parent_tb_map)` - """ - # Initially, the flags are all set to False - writes_to_shared_memory = False - race_cond_danger = False - has_parent_tb_map = False - - # 1. Check if the ThreadBlock (TB) map writes to shared memory - for edge in state.out_edges(map_exit): - is_smem: bool = (isinstance(edge.dst, AccessNode) - and edge.dst.desc(state).storage == dtypes.StorageType.GPU_Shared) - if is_smem and not edge.data.is_empty(): - writes_to_shared_memory = True - break - - # 2. Search between map entry and exit: - # - Detect writes to shared memory (unless already found) - # - Collect nested SDFGs for later analysis - nested_sdfgs: Set[NestedSDFG] = set() - - for node in state.all_nodes_between(map_entry, map_exit): - if not writes_to_shared_memory and isinstance(node, AccessNode): - # Check if this AccessNode writes to shared memory - if (node.desc(state).storage == dtypes.StorageType.GPU_Shared - and any(not edge.data.is_empty() for edge in state.in_edges(node))): - writes_to_shared_memory = True - - elif isinstance(node, NestedSDFG): - nested_sdfgs.add(node) - - # 3. Recursively analyze nested SDFGs: - # - Detect shared memory writes (only if not already found) - # - Check for potential race conditions in loop regions (only if not already flagged) - for nsdfg in nested_sdfgs: - subs_sdfg = nsdfg.sdfg - if not writes_to_shared_memory: - writes_to_shared_memory = self.sdfg_writes_to_smem(subs_sdfg) - - if not race_cond_danger: - race_cond_danger = self.writes_to_smem_inside_loopregion(subs_sdfg) - - # 4. Check for race condition danger in sequential maps that use shared memory - # (only if not already flagged) - if not race_cond_danger: - race_cond_danger = any( - inner_scope.map.schedule == dtypes.ScheduleType.Sequential - and self.map_writes_to_smem(inner_scope) - for _, inner_scope in helpers.get_internal_scopes(state, map_entry) - ) - - # 5. Check if this TB map is nested within another TB map - parent = helpers.get_parent_map(state, map_entry) - - while parent: - parent_map, parent_state = parent - if parent_map.map.schedule == dtypes.ScheduleType.GPU_ThreadBlock: - has_parent_tb_map = True - break - if parent_map.map.schedule == dtypes.ScheduleType.GPU_Device: - break - parent = helpers.get_parent_map(parent_state, parent_map) - - # 6. Return the results - return writes_to_shared_memory, race_cond_danger, has_parent_tb_map - - def writes_to_smem_inside_loopregion(self, sdfg: SDFG) -> bool: - """ - Return True if the SDFG writes to GPU shared memory (smem) inside - a LoopRegion. This check is recursive and includes nested SDFGs. 
- """ - for node in sdfg.nodes(): - if isinstance(node, LoopRegion): - # Traverse all nodes inside the loop region - for subnode, parent in node.all_nodes_recursive(): - if ( - isinstance(subnode, AccessNode) - and subnode.desc(parent).storage == dtypes.StorageType.GPU_Shared - and any(not edge.data.is_empty() for edge in parent.in_edges(node)) - ): - return True - - elif isinstance(node, NestedSDFG): - # Recurse into nested SDFGs - if self.writes_to_smem_inside_loopregion(node.sdfg): - return True - - return False - - def sdfg_writes_to_smem(self, sdfg: SDFG) -> bool: - """ - Return True if the SDFG writes to GPU shared memory (smem), - i.e., contains an AccessNode with GPU_Shared storage that has - at least one non-empty incoming edge. - """ - for node, state in sdfg.all_nodes_recursive(): - if ( - isinstance(node, AccessNode) - and node.desc(state).storage == dtypes.StorageType.GPU_Shared - and any(not edge.data.is_empty() for edge in state.in_edges(node)) - ): - return True - return False - - def map_writes_to_smem(self, map_entry: MapEntry) -> bool: - """ - Return True if the map writes to GPU shared memory (smem). - - A map is considered to write to smem if: - - Any AccessNode with GPU_Shared storage is written to at the MapExit, or - - Such writes occur within the map scope, or - - A nested SDFG within the map writes to smem. - """ - state = self._node_to_parent_state[map_entry] - map_exit = state.exit_node(map_entry) - - # 1. Check if MapExit writes directly to shared memory - for edge in state.out_edges(map_exit): - if ( - isinstance(edge.dst, AccessNode) - and edge.dst.desc(state).storage == dtypes.StorageType.GPU_Shared - and not edge.data.is_empty() - ): - return True - - # 2. Inspect nodes inside the map scope - for node in state.all_nodes_between(map_entry, map_exit): - if ( - isinstance(node, AccessNode) - and node.desc(state).storage == dtypes.StorageType.GPU_Shared - and any(not edge.data.is_empty() for edge in state.in_edges(node)) - ): - return True - - if isinstance(node, NestedSDFG) and self.sdfg_writes_to_smem(node.sdfg): - return True - - # No writes to shared memory found - return False - - def insert_synchronization_after_tb_exits(self, tb_map_exits: Dict[MapExit, SDFGState]) -> None: - """ - Insert synchronization tasklets (calling `__syncthreads()`) after the given - GPU ThreadBlock MapExit nodes. - - Parameters - ---------- - tb_map_exits : Dict[MapExit, SDFGState] - Mapping from ThreadBlock MapExit nodes to their parent states after which a synchronization - tasklet should be inserted. 
- """ - for map_exit, state in tb_map_exits.items(): - - sync_tasklet = state.add_tasklet(name="sync_threads", inputs=set(), outputs=set(), - code="__syncthreads();\n", language=dtypes.Language.CPP) - - for succ in state.successors(map_exit): - state.add_edge(sync_tasklet, None, succ, None, dace.Memlet()) - - state.add_edge(map_exit, None, sync_tasklet, None, dace.Memlet()) \ No newline at end of file diff --git a/tests/codegen/cuda_mempool_test.py b/tests/codegen/cuda_mempool_test.py index c70af71598..128634720c 100644 --- a/tests/codegen/cuda_mempool_test.py +++ b/tests/codegen/cuda_mempool_test.py @@ -144,7 +144,8 @@ def tester(A: CudaArray, B: CudaArray): code = sdfg.generate_code()[0].clean_code assert code.count('cudaMallocAsync') == 1 - assert code.count('cudaFreeAsync(pooled, __state->gpu_context->streams[0]') == 1 or code.count('cudaFreeAsync(pooled, gpu_stream0') == 1 + assert code.count('cudaFreeAsync(pooled, __state->gpu_context->streams[0]') == 1 or code.count( + 'cudaFreeAsync(pooled, gpu_stream0') == 1 # Test code import cupy as cp @@ -198,7 +199,8 @@ def test_memory_pool_if_states(cnd): sdfg.validate() code = sdfg.generate_code()[0].clean_code assert code.count('cudaMallocAsync') == 1 - assert code.count(f'cudaFreeAsync({tmp}, __state->gpu_context->streams[0]') == 1 or code.count(f'cudaFreeAsync({tmp}, gpu_stream0') == 1 + assert code.count(f'cudaFreeAsync({tmp}, __state->gpu_context->streams[0]') == 1 or code.count( + f'cudaFreeAsync({tmp}, gpu_stream0') == 1 # Test code import cupy as cp diff --git a/tests/codegen/gpu_memcpy_test.py b/tests/codegen/gpu_memcpy_test.py index e310ff3727..1cc650ffaa 100644 --- a/tests/codegen/gpu_memcpy_test.py +++ b/tests/codegen/gpu_memcpy_test.py @@ -20,10 +20,8 @@ def count_node(sdfg: dace.SDFG, node_type, ignore_gpustream_nodes=True): for rsdfg in sdfg.all_sdfgs_recursive(): for state in sdfg.states(): for node in state.nodes(): - if (ignore_gpustream_nodes and - isinstance(node, dace_nodes.AccessNode) - and node.desc(state).dtype == dace.dtypes.gpuStream_t - ): + if (ignore_gpustream_nodes and isinstance(node, dace_nodes.AccessNode) + and node.desc(state).dtype == dace.dtypes.gpuStream_t): continue if isinstance(node, node_type): nb_nodes += 1 diff --git a/tests/npbench/misc/scattering_self_test.py b/tests/npbench/misc/scattering_self_test.py index 2bb915afe9..5b9a5ade62 100644 --- a/tests/npbench/misc/scattering_self_test.py +++ b/tests/npbench/misc/scattering_self_test.py @@ -115,6 +115,7 @@ def run_scattering_self_test(device_type: dace.dtypes.DeviceType): def test_cpu(): run_scattering_self_test(dace.dtypes.DeviceType.CPU) + @pytest.mark.gpu def test_gpu(): run_scattering_self_test(dace.dtypes.DeviceType.GPU) diff --git a/tests/parse_state_struct_test.py b/tests/parse_state_struct_test.py index 676012baad..2a64cd2255 100644 --- a/tests/parse_state_struct_test.py +++ b/tests/parse_state_struct_test.py @@ -33,8 +33,9 @@ def _cuda_helper(): """ if Config.get('compiler', 'cuda', 'implementation') == 'experimental': - program = codeobject.CodeObject("cuda_helper", helper_code, "cpp",targets.cpu.CPUCodeGen, "CudaHelper") - dummy_cuda_target = codeobject.CodeObject("dummy", "", "cu", targets.experimental_cuda.ExperimentalCUDACodeGen, "CudaDummy") + program = codeobject.CodeObject("cuda_helper", helper_code, "cpp", targets.cpu.CPUCodeGen, "CudaHelper") + dummy_cuda_target = codeobject.CodeObject("dummy", "", "cu", targets.experimental_cuda.ExperimentalCUDACodeGen, + "CudaDummy") else: program = codeobject.CodeObject("cuda_helper", 
helper_code, "cpp", targets.cpu.CPUCodeGen, "CudaHelper") dummy_cuda_target = codeobject.CodeObject("dummy", "", "cu", targets.cuda.CUDACodeGen, "CudaDummy") From 983d80d15e56b88da2b2893e885d89b413eded41 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Wed, 10 Sep 2025 15:46:07 +0200 Subject: [PATCH 77/94] set back to legacy CUDACodeGen --- dace/config_schema.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/config_schema.yml b/dace/config_schema.yml index d1c54d627b..c7e2d47528 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -456,7 +456,7 @@ required: "legacy" is stable, "experimental" is used by Berkay Aydogdu and Yakup Koray Budanaz for Berkays master-thesis. enum: [legacy, experimental] - default: experimental + default: legacy gpu_index_type: type: str From 85f2cb1c9af1a9a18a8491327dafe78399d8a16c Mon Sep 17 00:00:00 2001 From: aydogdub Date: Wed, 10 Sep 2025 22:56:43 +0200 Subject: [PATCH 78/94] reset to normal cuda.py file --- dace/codegen/targets/cuda.py | 39 +++++++++++++++--------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/dace/codegen/targets/cuda.py b/dace/codegen/targets/cuda.py index b355912c0a..6205b17700 100644 --- a/dace/codegen/targets/cuda.py +++ b/dace/codegen/targets/cuda.py @@ -30,7 +30,6 @@ from dace.sdfg.state import ControlFlowRegion, StateSubgraphView from dace.transformation import helpers as xfh from dace.transformation.passes import analysis as ap -from dace.transformation.dataflow.add_threadblock_map import AddThreadBlockMap if TYPE_CHECKING: from dace.codegen.targets.framecode import DaCeCodeGenerator @@ -137,9 +136,6 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): # End of dispatcher registration ###################################### - # new - self._kernels_with_inserted_tb_maps: Set[nodes.MapEntry] = set() - def _emit_sync(self, codestream: CodeIOStream): if Config.get_bool('compiler', 'cuda', 'syncdebug'): codestream.write('''DACE_GPU_CHECK({backend}GetLastError()); @@ -157,16 +153,6 @@ def preprocess(self, sdfg: SDFG) -> None: 'CUDA', target_type=target_type) - old_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) - - sdfg.apply_transformations_once_everywhere(AddThreadBlockMap, ) - - new_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) - old_nodes - self._kernels_with_inserted_tb_maps = { - n - for n in new_nodes if isinstance(n, nodes.MapEntry) and n.schedule == dtypes.ScheduleType.GPU_Device - } - # Find GPU<->GPU strided copies that cannot be represented by a single copy command for e, state in list(sdfg.all_edges_recursive()): if isinstance(e.src, nodes.AccessNode) and isinstance(e.dst, nodes.AccessNode) and (not e.data.is_empty()): @@ -1401,6 +1387,7 @@ def generate_state(self, ptrname = cpp.ptr(name, desc, sd, self._frame) if isinstance(desc, dt.Array) and desc.start_offset != 0: ptrname = f'({ptrname} - {cpp.sym2cpp(desc.start_offset)})' + callsite_stream.write(f'DACE_GPU_CHECK({backend}Free({ptrname}));\n', sd) self._emit_sync(callsite_stream) to_remove.add((sd, name)) @@ -2067,12 +2054,8 @@ def get_kernel_dimensions(self, dfg_scope): if len(detected_block_sizes) > 1: - # Error when both user has manually set gpu_block_size and thread-block maps were defined and conflict in block size - preset_block_size = kernelmap_entry.map.gpu_block_size - conflicting_block_sizes = (preset_block_size - is not None) and not (kernelmap_entry in self._kernels_with_inserted_tb_maps) - - if conflicting_block_sizes: + # Error when both 
gpu_block_size and thread-block maps were defined and conflict + if kernelmap_entry.map.gpu_block_size is not None: raise ValueError('Both the `gpu_block_size` property and internal thread-block ' 'maps were defined with conflicting sizes for kernel ' f'"{kernelmap_entry.map.label}" (sizes detected: {detected_block_sizes}). ' @@ -2372,6 +2355,9 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco f'auto {scope_map.params[0]} = {scope_map.range[0][0]} + {dynmap_step} * {dynmap_var};', cfg, state_id, scope_entry) + # Emit internal array allocation (deallocation handled at MapExit) + self._frame.allocate_arrays_in_scope(sdfg, cfg, scope_entry, function_stream, callsite_stream) + elif scope_map.schedule == dtypes.ScheduleType.GPU_Device: dfg_kernel = self._kernel_state.scope_subgraph(self._kernel_map) grid_dims, block_dims, has_tbmap, has_dtbmap, extra_gdim_offsets = self.get_kernel_dimensions(dfg_kernel) @@ -2491,6 +2477,10 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco varname=varname, expr=expr, ), cfg, state_id, node) + + # Emit internal array allocation here for GPU_ThreadBlock (deallocation handled at MapExit) + self._frame.allocate_arrays_in_scope(sdfg, cfg, scope_entry, function_stream, callsite_stream) + else: # Device map in Device map brange = subsets.Range(scope_map.range[::-1]) kdims = brange.size() @@ -2517,6 +2507,9 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco callsite_stream.write('int %s = %s;' % (varname, expr), cfg, state_id, scope_entry) self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int') + # Emit internal array allocation (deallocation handled at MapExit) + self._frame.allocate_arrays_in_scope(sdfg, cfg, scope_entry, function_stream, callsite_stream) + # Generate conditions for this subgrid's execution using min and max # element, e.g. skipping out-of-bounds threads minels = brange.min_element() @@ -2555,9 +2548,6 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco for dim in range(len(scope_map.range)): callsite_stream.write('{', cfg, state_id, scope_entry) - # Emit internal array allocation (deallocation handled at MapExit) - self._frame.allocate_arrays_in_scope(sdfg, cfg, scope_entry, function_stream, callsite_stream) - # Generate all index arguments for block if scope_map.schedule == dtypes.ScheduleType.GPU_ThreadBlock: if self._scope_has_collaborative_copy: @@ -2600,6 +2590,9 @@ def generate_devicelevel_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_sco callsite_stream.write('int %s = %s;' % (varname, expr), cfg, state_id, scope_entry) self._dispatcher.defined_vars.add(varname, DefinedType.Scalar, 'int') + # Emit internal array allocation here (deallocation handled at MapExit) + self._frame.allocate_arrays_in_scope(sdfg, cfg, scope_entry, function_stream, callsite_stream) + # Generate conditions for this block's execution using min and max # element, e.g. 
skipping out-of-bounds threads in trailing block minels = brange.min_element() From 3d76c3ea7cc22d8d881dd7a8bb7d78b09705041f Mon Sep 17 00:00:00 2001 From: aydogdub Date: Thu, 11 Sep 2025 13:18:08 +0200 Subject: [PATCH 79/94] start of new pipeline --- dace/codegen/targets/experimental_cuda.py | 8 ++++---- dace/config_schema.yml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 20898b6cc0..95e4ef4b16 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -175,16 +175,16 @@ def preprocess(self, sdfg: SDFG) -> None: CopyToMap.apply_to(nsdfg, save=False, annotate=False, a=e.src, b=e.dst) except ValueError: # If transformation doesn't match, continue normally continue - """ + from dace.transformation.passes.fix_test import Fix from dace.transformation.passes.move_array_out_of_kernel import MoveArrayOutOfKernel - sdfg.save("before.sdfg") + #sdfg.save("before.sdfg") names = Fix().apply_pass(sdfg, {}) for name, map_parent in names.items(): MoveArrayOutOfKernel().apply_pass(sdfg, map_parent, name) - sdfg.save("after.sdfg") - """ + #sdfg.save("after.sdfg") + #----------------- Add ThreadBlock Maps & Infer Kernel Grid & Block Sizes -------------------- diff --git a/dace/config_schema.yml b/dace/config_schema.yml index c7e2d47528..d1c54d627b 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -456,7 +456,7 @@ required: "legacy" is stable, "experimental" is used by Berkay Aydogdu and Yakup Koray Budanaz for Berkays master-thesis. enum: [legacy, experimental] - default: legacy + default: experimental gpu_index_type: type: str From f149b0196bb902a02dedf6d600b94efc249df756 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 15 Sep 2025 17:43:55 +0200 Subject: [PATCH 80/94] fix --- .../dataflow/add_threadblock_map.py | 20 ++----------------- 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/dace/transformation/dataflow/add_threadblock_map.py b/dace/transformation/dataflow/add_threadblock_map.py index 4db89ac992..cb57e42eaa 100644 --- a/dace/transformation/dataflow/add_threadblock_map.py +++ b/dace/transformation/dataflow/add_threadblock_map.py @@ -152,24 +152,8 @@ def apply(self, state: SDFGState, sdfg: SDFG): gpu_block_size = self.preprocess_default_dims() kernel_map_entry = self.map_entry - # Reverse for map tiling to prioritize later dimensions for better memory/performance - reversed_block_size = gpu_block_size[::-1] - - # TODO: Update this once MapTiling accounts for existing strides when applying tile sizes. - # The code below is a workaround that manually adjusts tile sizes to account for existing strides. 
- num_dims = len(kernel_map_entry.map.params) - existing_strides = kernel_map_entry.range.strides() - - len_diff = num_dims - len(reversed_block_size) - if len_diff > 0: - # More dimensions than block size elements - pad with 1s - adjusted_block_size = [1] * len_diff + reversed_block_size - else: - # Fewer or equal dimensions - truncate from the beginning - adjusted_block_size = reversed_block_size[-num_dims:] - - tile_sizes = [stride * block for stride, block in zip(existing_strides, adjusted_block_size)] - + tile_sizes = kernel_map_entry.range.strides() + # Apply map tiling transformation MapTiling.apply_to(sdfg=sdfg, options={ From e2ad61c14ecd1dbd204663e13aab6eb103fcf7c6 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 15 Sep 2025 17:44:53 +0200 Subject: [PATCH 81/94] fixes --- dace/codegen/targets/experimental_cuda.py | 35 +++++++++++++++++------ 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 95e4ef4b16..b02f02cb59 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -120,6 +120,7 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG): self._current_kernel_spec: Optional[KernelSpec] = None self._gpu_stream_manager: Optional[GPUStreamManager] = None self._kernel_dimensions_map: Set[nodes.MapEntry] = set() + self._kernel_arglists: Dict[nodes.MapEntry, Dict[str, dt.Data]] = {} def preprocess(self, sdfg: SDFG) -> None: """ @@ -176,6 +177,7 @@ def preprocess(self, sdfg: SDFG) -> None: except ValueError: # If transformation doesn't match, continue normally continue + """ from dace.transformation.passes.fix_test import Fix from dace.transformation.passes.move_array_out_of_kernel import MoveArrayOutOfKernel #sdfg.save("before.sdfg") @@ -184,6 +186,7 @@ def preprocess(self, sdfg: SDFG) -> None: MoveArrayOutOfKernel().apply_pass(sdfg, map_parent, name) #sdfg.save("after.sdfg") + """ #----------------- Add ThreadBlock Maps & Infer Kernel Grid & Block Sizes -------------------- @@ -241,6 +244,14 @@ def preprocess(self, sdfg: SDFG) -> None: # Find points where memory should be released to the memory pool self._compute_pool_release(sdfg) + # Retrieve arguments required for the kernels subgraph + shared_transients = {} + for state, node, defined_syms in sdutil.traverse_sdfg_with_defined_symbols(sdfg, recursive=True): + if (isinstance(node, nodes.MapEntry)and node.map.schedule == dtypes.ScheduleType.GPU_Device): + if state.parent not in shared_transients: + shared_transients[state.parent] = state.parent.shared_transients() + self._kernel_arglists[node] = state.scope_subgraph(node).arglist(defined_syms, shared_transients[state.parent]) + def _compute_pool_release(self, top_sdfg: SDFG): """ Computes positions in the code generator where a memory pool array is no longer used and @@ -768,28 +779,35 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub dispatcher.defined_vars.enter_scope(node) # Add the const qualifier to any constants not marked as such - + """ # update const data - new_const_data = sdutil.get_constant_data(node, parent_state) - self._current_kernel_spec.kernel_constants + new_const_data = sdutil.get_constant_data(node, nsdfg) for name in new_const_data: desc = nsdfg.arrays[name] ptr_name = ptr(name, desc, nsdfg, self._frame) try: defined_type, ctype = dispatcher.defined_vars.get(ptr_name, is_global=True) - if not "const " in desc.ctype: - ctype = f"const {desc.ctype}" except: 
defined_type = get_defined_type(desc) - if not "const " in desc.ctype: - ctype = f"const {desc.ctype}" + if defined_type == DefinedType.Pointer: + ctype = f'{desc.ctype} *' + elif defined_type == DefinedType.Scalar: + ctype = desc.ctype + else: + raise NotImplementedError("Not expected Type") + + if not "const " in ctype: + ctype = f"const {ctype}" dispatcher.defined_vars.add(ptr_name, defined_type, ctype, allow_shadowing=True) # update const symbols - new_const_symbols = sdutil.get_constant_symbols(node, parent_state) - self._current_kernel_spec.kernel_constants + new_const_symbols = sdutil.get_constant_symbols(node, nsdfg) for name in new_const_symbols: defined_type = DefinedType.Scalar if not "const" in nsdfg.symbols[name].ctype: ctype = f"const {nsdfg.symbols[name].ctype}" + """ + # Redirect rest to CPU codegen self._cpu_codegen._generate_NestedSDFG(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) @@ -1274,8 +1292,7 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro kernel_constants = kernel_const_data | kernel_const_symbols self._kernel_constants: Set[str] = kernel_constants - # Retrieve arguments required for the kernels subgraph - arglist: Dict[str, dt.Data] = kernel_parent_state.scope_subgraph(kernel_map_entry).arglist() + arglist: Dict[str, dt.Data] = cudaCodeGen._kernel_arglists[kernel_map_entry] self._arglist = arglist # save _in_device_code value for restoring later From db07e4c048de8b990edffa826b9c7c087eb68499 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 15 Sep 2025 18:11:11 +0200 Subject: [PATCH 82/94] trying fix --- dace/codegen/targets/experimental_cuda.py | 12 +++++++ dace/transformation/passes/fix_test.py | 38 ++++++++++++++++++++--- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index b02f02cb59..93ae83ce38 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -444,6 +444,18 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub return + + import copy + from dace.transformation.passes.fix_test import Fix + from dace.transformation.passes.move_array_out_of_kernel import MoveArrayOutOfKernel + from dace.sdfg import infer_types + + names = Fix().apply_pass(sdfg, {}) + for name, map_parent in names.items(): + MoveArrayOutOfKernel().apply_pass(sdfg, map_parent, name) + infer_types.infer_connector_types(sdfg) + + #--------------- Nested GPU Scope -------------------- supported_strategies: List[ScopeGenerationStrategy] = [ ThreadBlockScopeGenerator(codegen=self), diff --git a/dace/transformation/passes/fix_test.py b/dace/transformation/passes/fix_test.py index 6a44728d35..2c6d2c316d 100644 --- a/dace/transformation/passes/fix_test.py +++ b/dace/transformation/passes/fix_test.py @@ -35,6 +35,8 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict[str, from dace.transformation.helpers import get_parent_map + skip = set() + to_be_moved = set() names: Dict = dict() for node, parent_state in sdfg.all_nodes_recursive(): if not isinstance(node, nodes.AccessNode): @@ -56,11 +58,29 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict[str, if map_parent is None: continue + + if node.data not in parent_state.sdfg.arrays: + continue data_desc = node.desc(parent_state) if not data_desc.storage == dtypes.StorageType.Register: continue + if isinstance(data_desc, dace.data.View) or data_desc.lifetime == 
dtypes.AllocationLifetime.Persistent: + continue + + break_cond = False + for edge, parent in sdfg.all_edges_recursive(): + if not isinstance(parent, dace.SDFGState): + continue + src = edge.src + if edge.dst_conn == node.data and isinstance(src, nodes.AccessNode) and src.data != node.data: + break_cond = True + skip.add(src.data) + + if break_cond: + continue + shape = data_desc.shape size_expr = np.prod(shape) @@ -72,12 +92,22 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict[str, elif cmp is sp.false: # definitely safe move_out = False else: + # TODO: explain yakup and myself # undecidable case (symbolic expression) - move_out = True # or warn, depending on policy + move_out = False # or warn, depending on policy if move_out: - data_desc.storage = dtypes.StorageType.GPU_Global - data_desc.transient = True - names[node.data] = map_parent + to_be_moved.add((node.data, data_desc, map_parent)) + + + for name, desc, map_parent in to_be_moved: + if name in skip: + continue + + desc.storage = dtypes.StorageType.GPU_Global + desc.transient = True + names[name] = map_parent + + return names From abe3e7b4523128c88ffd92ecc3a0b2e0f5fdb0ef Mon Sep 17 00:00:00 2001 From: aydogdub Date: Mon, 15 Sep 2025 18:19:31 +0200 Subject: [PATCH 83/94] fix of yakup --- dace/codegen/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/codegen/CMakeLists.txt b/dace/codegen/CMakeLists.txt index 1187ce9be1..0988cdc2fe 100644 --- a/dace/codegen/CMakeLists.txt +++ b/dace/codegen/CMakeLists.txt @@ -122,7 +122,7 @@ if(DACE_ENABLE_CUDA) set(CUDAToolkit_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) find_package(CUDAToolkit REQUIRED) - set(CMAKE_CUDA_STANDARD 14) + set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) # CMake 3.24: set_property(TARGET tgt PROPERTY CUDA_ARCHITECTURES native) From cb7f48c511bea0cb82c12305c91eea5ff051acd9 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Tue, 16 Sep 2025 12:11:12 +0200 Subject: [PATCH 84/94] fix --- .../passes/move_array_out_of_kernel.py | 43 ++++++++++++------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/dace/transformation/passes/move_array_out_of_kernel.py b/dace/transformation/passes/move_array_out_of_kernel.py index cfa3c2090b..2ab0a67bf0 100644 --- a/dace/transformation/passes/move_array_out_of_kernel.py +++ b/dace/transformation/passes/move_array_out_of_kernel.py @@ -17,6 +17,7 @@ from dace.memlet import Memlet from dace.symbolic import symbol +import dace.sdfg.utils as sdutil @make_properties @transformation.explicit_cf_compatible @@ -142,14 +143,13 @@ def move_array_out_of_kernel_flat(self, kernel_entry: nodes.MapEntry, array_name next_map_exit = parent_state.exit_node(next_map_entry) if in_connector not in next_map_exit.in_connectors: next_map_state = self._node_to_state_cache[next_map_exit] - next_map_exit.add_in_connector(in_connector, dtypes.pointer(array_desc.dtype)) - next_map_exit.add_out_connector(out_connector, dtypes.pointer(array_desc.dtype)) + next_map_exit.add_in_connector(in_connector) + next_map_exit.add_out_connector(out_connector) next_entries, _ = self.get_maps_between(kernel_entry, previous_node) - memlet_subset = Range(self.get_memlet_subset(next_entries, previous_node) + old_subset) next_map_state.add_edge(previous_node, previous_out_connector, next_map_exit, in_connector, - Memlet(data=array_name, subset=memlet_subset)) + Memlet.from_array(array_name, array_desc)) previous_node = next_map_exit previous_out_connector = out_connector @@ -237,6 +237,7 @@ def 
move_array_out_of_kernel_nested(self, kernel_entry: nodes.MapEntry, array_na self.lift_array_through_nested_sdfgs(array_name, kernel_entry, sdfg_hierarchy, old_subset) + def lift_array_through_nested_sdfgs(self, array_name: str, kernel_entry: nodes.MapEntry, sdfg_hierarchy: List[SDFG], old_subset: List) -> None: """ @@ -266,6 +267,7 @@ def lift_array_through_nested_sdfgs(self, array_name: str, kernel_entry: nodes.M new_desc = copy.deepcopy(old_desc) outer_sdfg.add_datadesc(array_name, new_desc) + # Get all parent scopes to detect how the data needs to flow. # E.g. nsdfg_node -> MapExit needs to be nsdfg_node -> MapExit -> AccessNode (new) parent_scopes: List[nodes.MapEntry] = [] @@ -296,10 +298,10 @@ def lift_array_through_nested_sdfgs(self, array_name: str, kernel_entry: nodes.M # 1.1 Determine source connector name and register it based on src type if isinstance(src, nodes.NestedSDFG): src_conn = array_name - src.add_out_connector(src_conn, dtypes.pointer(new_desc.dtype)) + src.add_out_connector(src_conn) elif isinstance(src, nodes.MapExit): src_conn = f"OUT_{array_name}" - src.add_out_connector(src_conn, dtypes.pointer(new_desc.dtype)) + src.add_out_connector(src_conn) else: raise NotImplementedError( f"Unsupported source node type '{type(src).__name__}' — only NestedSDFG or MapExit are expected." @@ -310,7 +312,7 @@ def lift_array_through_nested_sdfgs(self, array_name: str, kernel_entry: nodes.M dst_conn = None # AccessNodes use implicit connectors elif isinstance(dst, nodes.MapExit): # Assuming dst is the entry for parent scope dst_conn = f"IN_{array_name}" - dst.add_in_connector(dst_conn, dtypes.pointer(new_desc.dtype)) + dst.add_in_connector(dst_conn) else: raise NotImplementedError( f"Unsupported destination node type '{type(dst).__name__}' — expected AccessNode or MapEntry.") @@ -318,7 +320,7 @@ def lift_array_through_nested_sdfgs(self, array_name: str, kernel_entry: nodes.M # 2. Add the edge using the connector names determined in Step 1. next_entries, _ = self.get_maps_between(kernel_entry, src) memlet_subset = Range(self.get_memlet_subset(next_entries, src) + old_subset) - nsdfg_parent_state.add_edge(src, src_conn, dst, dst_conn, Memlet(data=array_name, subset=memlet_subset)) + nsdfg_parent_state.add_edge(src, src_conn, dst, dst_conn, Memlet.from_array(array_name, new_desc)) # Continue by setting the dst as source src = dst @@ -329,17 +331,17 @@ def lift_array_through_nested_sdfgs(self, array_name: str, kernel_entry: nodes.M if isinstance(src, nodes.NestedSDFG): src_conn = array_name - src.add_out_connector(src_conn, dtypes.pointer(new_desc.dtype)) + src.add_out_connector(src_conn) elif isinstance(src, nodes.MapExit): src_conn = f"OUT_{array_name}" - src.add_out_connector(src_conn, dtypes.pointer(new_desc.dtype)) + src.add_out_connector(src_conn) else: raise NotImplementedError( f"Unsupported source node type '{type(src).__name__}' — only NestedSDFG or MapExit are expected.") next_entries, _ = self.get_maps_between(kernel_entry, src) memlet_subset = Range(self.get_memlet_subset(next_entries, src) + old_subset) - nsdfg_parent_state.add_edge(src, src_conn, dst, None, Memlet(data=array_name, subset=memlet_subset)) + nsdfg_parent_state.add_edge(src, src_conn, dst, None, Memlet.from_array(array_name, new_desc)) # At the outermost sdfg we set the array descriptor to be transient again, # Since it is not needed beyond it. 
Furthermore, this ensures that the codegen @@ -488,8 +490,16 @@ def get_new_shape_info(self, array_desc: dt.Array, map_exit_chain: List[nodes.Ma min_elements = map_range.min_element() range_size = [max_elem + 1 - min_elem for max_elem, min_elem in zip(max_elements, min_elements)] + #TODO: check this / clean (maybe support packed C and packed Fortran layouts separately for code readability future) + old_total_size = array_desc.total_size + accumulator = old_total_size + new_strides.insert(0, old_total_size) + for cur_range_size in range_size[:-1]: + new_strides.insert(0, accumulator) # insert before (mult with volumes) + accumulator = accumulator * cur_range_size + extended_size = range_size + extended_size - new_strides = [1 for _ in next_map.map.params] + new_strides # add 1 per dimension + #new_strides = [1 for _ in next_map.map.params] + new_strides # add 1 per dimension new_offsets = [0 for _ in next_map.map.params] + new_offsets # add 0 per dimension new_shape = extended_size + list(array_desc.shape) @@ -527,7 +537,7 @@ def replace_array_name(self, sdfgs: FrozenSet[SDFG], old_name: str, new_name: st if edge.src_conn == old_out_conn: edge.src_conn = new_out_conn src.remove_out_connector(old_out_conn) - src.add_out_connector(new_out_conn, dtypes.pointer(array_desc.dtype)) + src.add_out_connector(new_out_conn) # Update in connectors dst = edge.dst @@ -536,7 +546,7 @@ def replace_array_name(self, sdfgs: FrozenSet[SDFG], old_name: str, new_name: st if edge.dst_conn == old_in_conn: edge.dst_conn = new_in_conn dst.remove_in_connector(old_in_conn) - dst.add_in_connector(new_in_conn, dtypes.pointer(array_desc.dtype)) + dst.add_in_connector(new_in_conn) def update_symbols(self, map_entry_chain: List[nodes.MapEntry], top_sdfg: SDFG) -> None: """ @@ -598,6 +608,7 @@ def collect_array_descriptor_usage( """ access_nodes_info: List[Tuple[nodes.AccessNode, SDFGState, SDFG]] = self.get_access_nodes_within_map(map_entry, array_name) + last_sdfg: SDFG = self._node_to_sdfg_cache[map_entry] result: Set[Tuple[dt.Array, SDFG, Set[SDFG], Set[nodes.AccessNode]]] = set() @@ -813,7 +824,7 @@ def get_nearest_access_node(self, access_nodes: List[nodes.AccessNode], node: no if neighbor not in visited: queue.append(neighbor) - raise RuntimeError(f"No access node found connected to the given node {node}.") + raise RuntimeError(f"No access node found connected to the given node {node}. 
") def in_paths(self, access_node: nodes.AccessNode) -> List[List[MultiConnectorEdge[Memlet]]]: """ @@ -888,4 +899,4 @@ def out_paths(self, access_node: nodes.AccessNode) -> List[List[MultiConnectorEd extended_path = current_path + [edge] queue.append(extended_path) - return complete_paths + return complete_paths \ No newline at end of file From f46bd46f15befd8416ebb3c083f99f3bef2155dc Mon Sep 17 00:00:00 2001 From: aydogdub Date: Wed, 17 Sep 2025 01:45:20 +0200 Subject: [PATCH 85/94] quick --- dace/codegen/targets/experimental_cuda.py | 91 ++++++++++- .../dataflow/add_threadblock_map.py | 21 ++- .../passes/location_specialization.py | 144 ++++++++++++++++++ tests/codegen/nested_kernel_transient_test.py | 30 +++- tests/cuda_block_test.py | 1 + 5 files changed, 275 insertions(+), 12 deletions(-) create mode 100644 dace/transformation/passes/location_specialization.py diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 93ae83ce38..6aeeb53ae6 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -4,7 +4,7 @@ import dace from dace import data as dt, Memlet -from dace import dtypes, registry, symbolic +from dace import dtypes, registry, symbolic, subsets from dace.config import Config from dace.sdfg import SDFG, ScopeSubgraphView, SDFGState, nodes from dace.sdfg import utils as sdutil @@ -1272,6 +1272,77 @@ def process_out_memlets(self, *args, **kwargs): # Call CPU implementation with this code generator as callback self._cpu_codegen.process_out_memlets(*args, codegen=self, **kwargs) + def _get_thread_id(self) -> str: + result = 'threadIdx.x' + if self._current_kernel_spec.block_dims[1] != 1: + result += f' + ({sym2cpp(self._current_kernel_spec.block_dims[0])}) * threadIdx.y' + if self._current_kernel_spec.block_dims[2] != 1: + result += f' + ({sym2cpp(self._current_kernel_spec.block_dims[0] * self._current_kernel_spec.block_dims[1])}) * threadIdx.z' + return result + + def _get_warp_id(self) -> str: + return f'(({self._get_thread_id()}) / warpSize)' + + def _get_block_id(self) -> str: + result = 'blockIdx.x' + if self._current_kernel_spec.block_dims[1] != 1: + result += f' + gridDim.x * blockIdx.y' + if self._current_kernel_spec.block_dims[2] != 1: + result += f' + gridDim.x * gridDim.y * blockIdx.z' + return result + + def _generate_condition_from_location(self, name: str, index_expr: str, node: nodes.Tasklet, + callsite_stream: CodeIOStream) -> str: + if name not in node.location: + return 0 + + location: Union[int, str, subsets.Range] = node.location[name] + if isinstance(location, str) and ':' in location: + location = subsets.Range.from_string(location) + elif symbolic.issymbolic(location): + location = sym2cpp(location) + + if isinstance(location, subsets.Range): + # Range of indices + if len(location) != 1: + raise ValueError(f'Only one-dimensional ranges are allowed for {name} specialization, {location} given') + begin, end, stride = location[0] + rb, re, rs = sym2cpp(begin), sym2cpp(end), sym2cpp(stride) + cond = '' + cond += f'(({index_expr}) >= {rb}) && (({index_expr}) <= {re})' + if stride != 1: + cond += f' && ((({index_expr}) - {rb}) % {rs} == 0)' + + callsite_stream.write(f'if ({cond}) {{') + else: + # Single-element + callsite_stream.write(f'if (({index_expr}) == {location}) {{') + + return 1 + + def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.Tasklet, function_stream: CodeIOStream, callsite_stream: 
CodeIOStream) -> None: + generated_preamble_scopes = 0 + if self._in_device_code: + # If location dictionary prescribes that the code should run on a certain group of threads/blocks, + # add condition + generated_preamble_scopes += self._generate_condition_from_location('gpu_thread', self._get_thread_id(), + node, callsite_stream) + generated_preamble_scopes += self._generate_condition_from_location('gpu_warp', self._get_warp_id(), node, + callsite_stream) + generated_preamble_scopes += self._generate_condition_from_location('gpu_block', self._get_block_id(), node, + callsite_stream) + + # Call standard tasklet generation + old_codegen = self._cpu_codegen.calling_codegen + self._cpu_codegen.calling_codegen = self + self._cpu_codegen._generate_Tasklet(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + self._cpu_codegen.calling_codegen = old_codegen + + if generated_preamble_scopes > 0: + # Generate appropriate postamble + for i in range(generated_preamble_scopes): + callsite_stream.write('}', cfg, state_id, node) ######################################################################### # helper class @@ -1313,8 +1384,17 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro # Certain args are called in the CUDA/HIP file or kernel funcion, in which the pointer name of the args are different cudaCodeGen._in_device_code = True self._args_as_input = [ptr(name, data, sdfg, cudaCodeGen._frame) for name, data in arglist.items()] - self._args_typed = [('const ' if name in kernel_constants else '') + data.as_arg(name=name) - for name, data in arglist.items()] + + args_typed = [] + for name, data in arglist.items(): + if data.lifetime == dtypes.AllocationLifetime.Persistent: + arg_name = ptr(name, data, sdfg, cudaCodeGen._frame) + else: + arg_name = name + args_typed.append(('const ' if name in kernel_constants else '') + data.as_arg(name=arg_name)) + + self._args_typed = args_typed + # Args for the kernel wrapper function cudaCodeGen._in_device_code = False @@ -1340,9 +1420,10 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro self._kernel_wrapper_args_as_input = ( ['__state'] + [ptr(name, data, sdfg, cudaCodeGen._frame) for name, data in arglist.items()] + [str(gpustream_input[0].dst_conn)]) + self._kernel_wrapper_args_typed = ([f'{mangle_dace_state_struct_name(cudaCodeGen._global_sdfg)} *__state'] + - [('const ' if name in kernel_constants else '') + data.as_arg(name=name) - for name, data in arglist.items()] + [f"gpuStream_t {gpustream_var_name}"]) + args_typed + + [f"gpuStream_t {gpustream_var_name}"]) cudaCodeGen._in_device_code = restore_in_device_code diff --git a/dace/transformation/dataflow/add_threadblock_map.py b/dace/transformation/dataflow/add_threadblock_map.py index cb57e42eaa..1a67534f4a 100644 --- a/dace/transformation/dataflow/add_threadblock_map.py +++ b/dace/transformation/dataflow/add_threadblock_map.py @@ -152,8 +152,23 @@ def apply(self, state: SDFGState, sdfg: SDFG): gpu_block_size = self.preprocess_default_dims() kernel_map_entry = self.map_entry - tile_sizes = kernel_map_entry.range.strides() - + # Reverse for map tiling to prioritize later dimensions for better memory/performance + reversed_block_size = gpu_block_size[::-1] + + # Get tile size + num_dims = len(kernel_map_entry.map.params) + + # Reverse for map tiling to prioritize later dimensions for better memory/performance + reversed_block_size = gpu_block_size[::-1] + + len_diff = num_dims - len(reversed_block_size) + if len_diff > 0: + # 
More dimensions than block size elements - pad with 1s + tile_sizes = [1] * len_diff + reversed_block_size + else: + # Fewer or equal dimensions - truncate from the beginning + tile_sizes = reversed_block_size[-num_dims:] + # Apply map tiling transformation MapTiling.apply_to(sdfg=sdfg, options={ @@ -182,8 +197,6 @@ def apply(self, state: SDFGState, sdfg: SDFG): f"({tb_size}) is not enclosed by the derived block size ({gpu_block_size}). " "They are expected to be equal or the derived block size to be larger.") - def update_names(): - pass @staticmethod def annotates_memlets(): diff --git a/dace/transformation/passes/location_specialization.py b/dace/transformation/passes/location_specialization.py new file mode 100644 index 0000000000..936c19a2a9 --- /dev/null +++ b/dace/transformation/passes/location_specialization.py @@ -0,0 +1,144 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Tuple, List, Union, Dict + +import dace +from dace.properties import make_properties, DictProperty, ShapeProperty +from dace import subsets, symbolic +from dace import Config, dtypes, symbolic +from dace.properties import make_properties +from dace.sdfg import SDFG, SDFGState, nodes, utils as sdutil +from dace.codegen.targets.experimental_cuda_helpers import gpu_utils +from dace.transformation import helpers, transformation +from dace.transformation import helpers, pass_pipeline as ppl, transformation + +from dace.codegen.targets.cpp import sym2cpp + + +@make_properties +class LocationSpecialization(ppl.Pass): + + + def apply_pass(self, sdfg: SDFG, kernel_dimensions_map: Dict) -> None: + + for node, state in sdfg.all_nodes_recursive(): + + if not isinstance(node, dace.nodes.Tasklet): + continue + + if not self._applicable(state, node): + continue + + tasklet = node + block_dims = self._get_block_dims(state, tasklet, kernel_dimensions_map) + + # Generate preambles for thread/warp/block + preamble_thread = self._generate_condition_from_location("gpu_thread", self._get_thread_id(block_dims), tasklet) + preamble_warp = self._generate_condition_from_location("gpu_warp", self._get_warp_id(block_dims), tasklet) + preamble_block = self._generate_condition_from_location("gpu_block", self._get_block_id(block_dims), tasklet) + + # Keep only non-empty preambles + preambles = [p for p in (preamble_thread, preamble_warp, preamble_block) if p] + + + for preamble in preambles: + if tasklet.code.language == dace.dtypes.Language.Python: + cond = preamble.strip()[3:-1].strip() + cond = cond.replace("&&", "and").replace("||", "or") + + pre_tasklet = state.add_tasklet(f"specialization", {}, {}, cond) + + state.add_edge(pre_tasklet, None, tasklet, None, dace.Memlet()) + for pred in state.predecessors(tasklet): + state.add_edge(pred, None, pre_tasklet, None, dace.Memlet()) + + + import textwrap + # Wrap tasklet code with preambles and closing braces + for preamble in preambles: + original = tasklet.code.as_string or "" + if tasklet.code.language == dace.dtypes.Language.Python: + # Turn CUDA-style preamble into a Python if-statement + cond = preamble.strip()[3:-1].strip() # strip "if (" at start and "{" + cond = cond.replace("&&", "and").replace("||", "or") + tasklet.code.as_string = f"if {cond}:\n" + textwrap.indent(original, " ") + else: + # Leave CUDA/C++ unchanged + tasklet.code.as_string = preamble + original + "}\n" + + + def _applicable(self, state: SDFGState, tasklet: dace.nodes.Tasklet) -> bool: + """ + Check if this transformation is applicable. 
+ + Applicable if: + * The tasklet is scheduled to run within a GPU kernel, and + * Its location dictionary contains at least one of: + - "gpu_block" + - "gpu_thread" + - "gpu_warp" + """ + + # Not within the kernel - skip + if not gpu_utils.is_within_schedule_types(state, tasklet, dtypes.GPU_SCHEDULES): + return False + + # return if the location dictionary contain block, thread of warp specialization + return any(k in tasklet.location for k in ("gpu_block", "gpu_thread", "gpu_warp")) + + def _generate_condition_from_location(self, name: str, index_expr: str, node: nodes.Tasklet) -> str: + if name not in node.location: + return '' + + location: Union[int, str, subsets.Range] = node.location[name] + if isinstance(location, str) and ':' in location: + location = subsets.Range.from_string(location) + elif symbolic.issymbolic(location): + location = sym2cpp(location) + + if isinstance(location, subsets.Range): + # Range of indices + if len(location) != 1: + raise ValueError(f'Only one-dimensional ranges are allowed for {name} specialization, {location} given') + begin, end, stride = location[0] + rb, re, rs = sym2cpp(begin), sym2cpp(end), sym2cpp(stride) + cond = '' + cond += f'(({index_expr}) >= {rb}) && (({index_expr}) <= {re})' + if stride != 1: + cond += f' && ((({index_expr}) - {rb}) % {rs} == 0)' + + return (f'if ({cond}) {{\n') + else: + # Single-element + return(f'if (({index_expr}) == {location}) {{\n') + + def _get_thread_id(self, block_dims: List) -> str: + result = 'threadIdx.x' + if block_dims[1] != 1: + result += f' + ({sym2cpp(block_dims[0])}) * threadIdx.y' + if block_dims[2] != 1: + result += f' + ({sym2cpp(block_dims[0] * block_dims[1])}) * threadIdx.z' + return result + + def _get_warp_id(self, block_dims: List) -> str: + return f'(({self._get_thread_id(block_dims)}) / warpSize)' + + def _get_block_id(self, block_dims: List) -> str: + result = 'blockIdx.x' + if block_dims[1] != 1: + result += f' + gridDim.x * blockIdx.y' + if block_dims[2] != 1: + result += f' + gridDim.x * gridDim.y * blockIdx.z' + return result + + def _get_block_dims(self, state, tasklet, kernel_dimensions_map) -> List: + + parent_map, parent_map_state = gpu_utils.get_parent_map(state, tasklet) + while parent_map.map.schedule != dtypes.ScheduleType.GPU_Device: + parent_map, parent_map_state = gpu_utils.get_parent_map(parent_map_state, parent_map) + + _, block_size = kernel_dimensions_map[parent_map] + return block_size + + @staticmethod + def annotates_memlets(): + return False diff --git a/tests/codegen/nested_kernel_transient_test.py b/tests/codegen/nested_kernel_transient_test.py index 54488a3aac..d4c3182c16 100644 --- a/tests/codegen/nested_kernel_transient_test.py +++ b/tests/codegen/nested_kernel_transient_test.py @@ -24,7 +24,15 @@ def nested(A: dace.float64[128, 64]): state.add_edge(n, 'A', w, None, dace.Memlet('A')) if persistent: - sdfg.arrays['gpu_A'].lifetime = dace.AllocationLifetime.Persistent + arrays = sdfg.cfg_list[-1].arrays + if dace.Config.get('compiler', 'cuda', 'implementation') == 'experimental': + # Special case for ExperimentalCUDACodeGen, where transient GPU_Global arrays + # Are moved out of the kernel, name is not equal to "gpu_A" anymore, but has the + # form local_{counter}_gpuA + target_name = next(k for k in arrays if "gpu_A" in k) + else: + target_name = "gpu_A" + arrays[target_name].lifetime = dace.AllocationLifetime.Persistent a = np.random.rand(128, 64) expected = np.copy(a) @@ -50,7 +58,15 @@ def transient(A: dace.float64[128, 64]): sdfg.apply_gpu_transformations() if 
persistent: - sdfg.cfg_list[-1].arrays['gpu_A'].lifetime = dace.AllocationLifetime.Persistent + arrays = sdfg.cfg_list[-1].arrays + if dace.Config.get('compiler', 'cuda', 'implementation') == 'experimental': + # Special case for ExperimentalCUDACodeGen, where transient GPU_Global arrays + # Are moved out of the kernel, name is not equal to "gpu_A" anymore, but has the + # form local_{counter}_gpuA + target_name = next(k for k in arrays if "gpu_A" in k) + else: + target_name = "gpu_A" + arrays[target_name].lifetime = dace.AllocationLifetime.Persistent a = np.random.rand(128, 64) expected = np.copy(a) @@ -87,7 +103,15 @@ def transient(A: dace.float64[128, 64]): sdfg.apply_gpu_transformations() if persistent: - sdfg.cfg_list[-1].arrays['gpu_A'].lifetime = dace.AllocationLifetime.Persistent + arrays = sdfg.cfg_list[-1].arrays + if dace.Config.get('compiler', 'cuda', 'implementation') == 'experimental': + # Special case for ExperimentalCUDACodeGen, where transient GPU_Global arrays + # Are moved out of the kernel, name is not equal to "gpu_A" anymore, but has the + # form local_{counter}_gpuA + target_name = next(k for k in arrays if "gpu_A" in k) + else: + target_name = "gpu_A" + arrays[target_name].lifetime = dace.AllocationLifetime.Persistent a = np.random.rand(128, 64) expected = np.copy(a) diff --git a/tests/cuda_block_test.py b/tests/cuda_block_test.py index 8b75376a00..74ee21fd90 100644 --- a/tests/cuda_block_test.py +++ b/tests/cuda_block_test.py @@ -181,6 +181,7 @@ def tester(A: dace.float64[200]): tasklet.location['gpu_block'] = 1 code = sdfg.generate_code()[1].clean_code # Get GPU code (second file) + sdfg.compile() assert '>= 2' in code and '<= 8' in code assert ' == 1' in code From fa85b760461b2fd601d424aa4e015a2eb6a1fe3f Mon Sep 17 00:00:00 2001 From: aydogdub Date: Wed, 17 Sep 2025 13:29:00 +0200 Subject: [PATCH 86/94] enable spezialization via location --- dace/codegen/targets/experimental_cuda.py | 166 +++++++++++----------- 1 file changed, 81 insertions(+), 85 deletions(-) diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 6aeeb53ae6..22069446a7 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -177,18 +177,6 @@ def preprocess(self, sdfg: SDFG) -> None: except ValueError: # If transformation doesn't match, continue normally continue - """ - from dace.transformation.passes.fix_test import Fix - from dace.transformation.passes.move_array_out_of_kernel import MoveArrayOutOfKernel - #sdfg.save("before.sdfg") - names = Fix().apply_pass(sdfg, {}) - for name, map_parent in names.items(): - MoveArrayOutOfKernel().apply_pass(sdfg, map_parent, name) - - #sdfg.save("after.sdfg") - """ - - #----------------- Add ThreadBlock Maps & Infer Kernel Grid & Block Sizes -------------------- # new_nodes - old_nodes gives us all Kernel Entry nodes that were created during the insertion @@ -695,7 +683,7 @@ def node_dispatch_predicate(self, sdfg, state, node): return False ############################################################################# - # Nested SDFG related, testing phase + # Nested SDFGs & tasklets def generate_state(self, sdfg: SDFG, @@ -830,6 +818,85 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub self._cpu_codegen.calling_codegen = old_codegen self._toplevel_schedule = old_schedule + def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, + node: nodes.Tasklet, function_stream: CodeIOStream, 
callsite_stream: CodeIOStream) -> None: + # import ScopeManager which opens and closes brackets for conditions, useful here + # because the location dictionary might prescribe which threads/blocks run tasklet code + from dace.codegen.targets.experimental_cuda_helpers.scope_strategies import ScopeManager + + tasklet: nodes.Tasklet = node + with ScopeManager(self, sdfg, cfg, dfg, state_id, function_stream, callsite_stream) as scope_manager: + + if 'gpu_thread' in tasklet.location: + name = 'gpu_thread' + index_expr = self._get_thread_id() + location: Union[int, str, subsets.Range] = tasklet.location[name] + cond = self._generate_condition_from_location(name, index_expr, location) + scope_manager.open(condition=cond) + + if 'gpu_warp' in tasklet.location: + name = 'gpu_warp' + index_expr = self._get_warp_id() + location: Union[int, str, subsets.Range] = tasklet.location[name] + cond = self._generate_condition_from_location(name, index_expr, location) + scope_manager.open(condition=cond) + + if 'gpu_block' in tasklet.location: + name = 'gpu_block' + index_expr = self._get_block_id() + location: Union[int, str, subsets.Range] = tasklet.location[name] + cond = self._generate_condition_from_location(name, index_expr, location) + scope_manager.open(condition=cond) + + # Call CPU codegen + self._cpu_codegen._generate_Tasklet(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + + def _generate_condition_from_location(self, name:str, index_expr:str, + location: Union[int, str, subsets.Range]) -> str: + + # 1. Normalize location + if isinstance(location, str) and ':' in location: + location = subsets.Range.from_string(location) + if len(location) != 1: + raise ValueError(f'Only one-dimensional ranges are allowed for {name} specialization, {location} given') + elif symbolic.issymbolic(location): + location = sym2cpp(location) + + # 2. 
Build condition + if isinstance(location, subsets.Range): + # Range of indices + begin, end, stride = location[0] + rb, re, rs = sym2cpp(begin), sym2cpp(end), sym2cpp(stride) + cond = f'(({index_expr}) >= {rb}) && (({index_expr}) <= {re})' + if stride != 1: + cond += f' && ((({index_expr}) - {rb}) % {rs} == 0)' + else: + # Single-element + cond = f'({index_expr}) == {location}' + + return cond + + def _get_thread_id(self) -> str: + kernel_block_dims: List = self._current_kernel_spec.block_dims + result = 'threadIdx.x' + if kernel_block_dims[1] != 1: + result += f' + ({sym2cpp(kernel_block_dims[0])}) * threadIdx.y' + if kernel_block_dims[2] != 1: + result += f' + ({sym2cpp(kernel_block_dims[0] * kernel_block_dims[1])}) * threadIdx.z' + return result + + def _get_warp_id(self) -> str: + return f'(({self._get_thread_id()}) / warpSize)' + + def _get_block_id(self) -> str: + kernel_block_dims: List = self._current_kernel_spec.block_dims + result = 'blockIdx.x' + if kernel_block_dims[1] != 1: + result += f' + gridDim.x * blockIdx.y' + if kernel_block_dims[2] != 1: + result += f' + gridDim.x * gridDim.y * blockIdx.z' + return result + ####################################################################### # Array Declaration, Allocation and Deallocation @@ -1272,78 +1339,6 @@ def process_out_memlets(self, *args, **kwargs): # Call CPU implementation with this code generator as callback self._cpu_codegen.process_out_memlets(*args, codegen=self, **kwargs) - def _get_thread_id(self) -> str: - result = 'threadIdx.x' - if self._current_kernel_spec.block_dims[1] != 1: - result += f' + ({sym2cpp(self._current_kernel_spec.block_dims[0])}) * threadIdx.y' - if self._current_kernel_spec.block_dims[2] != 1: - result += f' + ({sym2cpp(self._current_kernel_spec.block_dims[0] * self._current_kernel_spec.block_dims[1])}) * threadIdx.z' - return result - - def _get_warp_id(self) -> str: - return f'(({self._get_thread_id()}) / warpSize)' - - def _get_block_id(self) -> str: - result = 'blockIdx.x' - if self._current_kernel_spec.block_dims[1] != 1: - result += f' + gridDim.x * blockIdx.y' - if self._current_kernel_spec.block_dims[2] != 1: - result += f' + gridDim.x * gridDim.y * blockIdx.z' - return result - - def _generate_condition_from_location(self, name: str, index_expr: str, node: nodes.Tasklet, - callsite_stream: CodeIOStream) -> str: - if name not in node.location: - return 0 - - location: Union[int, str, subsets.Range] = node.location[name] - if isinstance(location, str) and ':' in location: - location = subsets.Range.from_string(location) - elif symbolic.issymbolic(location): - location = sym2cpp(location) - - if isinstance(location, subsets.Range): - # Range of indices - if len(location) != 1: - raise ValueError(f'Only one-dimensional ranges are allowed for {name} specialization, {location} given') - begin, end, stride = location[0] - rb, re, rs = sym2cpp(begin), sym2cpp(end), sym2cpp(stride) - cond = '' - cond += f'(({index_expr}) >= {rb}) && (({index_expr}) <= {re})' - if stride != 1: - cond += f' && ((({index_expr}) - {rb}) % {rs} == 0)' - - callsite_stream.write(f'if ({cond}) {{') - else: - # Single-element - callsite_stream.write(f'if (({index_expr}) == {location}) {{') - - return 1 - - def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgraphView, state_id: int, - node: nodes.Tasklet, function_stream: CodeIOStream, callsite_stream: CodeIOStream) -> None: - generated_preamble_scopes = 0 - if self._in_device_code: - # If location dictionary prescribes that the code should run 
on a certain group of threads/blocks, - # add condition - generated_preamble_scopes += self._generate_condition_from_location('gpu_thread', self._get_thread_id(), - node, callsite_stream) - generated_preamble_scopes += self._generate_condition_from_location('gpu_warp', self._get_warp_id(), node, - callsite_stream) - generated_preamble_scopes += self._generate_condition_from_location('gpu_block', self._get_block_id(), node, - callsite_stream) - - # Call standard tasklet generation - old_codegen = self._cpu_codegen.calling_codegen - self._cpu_codegen.calling_codegen = self - self._cpu_codegen._generate_Tasklet(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) - self._cpu_codegen.calling_codegen = old_codegen - - if generated_preamble_scopes > 0: - # Generate appropriate postamble - for i in range(generated_preamble_scopes): - callsite_stream.write('}', cfg, state_id, node) - ######################################################################### # helper class # This one is closely linked to the ExperimentalCUDACodeGen. In fact, @@ -1385,6 +1380,7 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro cudaCodeGen._in_device_code = True self._args_as_input = [ptr(name, data, sdfg, cudaCodeGen._frame) for name, data in arglist.items()] + # Special: Persistent arguments args_typed = [] for name, data in arglist.items(): if data.lifetime == dtypes.AllocationLifetime.Persistent: From c28655889a6641424e8439d8ff95b885c301f0fe Mon Sep 17 00:00:00 2001 From: aydogdub Date: Wed, 17 Sep 2025 13:29:46 +0200 Subject: [PATCH 87/94] missed synchronization now added --- .../passes/gpustream/insert_gpu_stream_sync_tasklets.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py index 7f1b081010..ccb68e3d6b 100644 --- a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py +++ b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py @@ -14,6 +14,7 @@ from dace.transformation.passes.gpustream.insert_gpu_streams_to_tasklets import InsertGPUStreamsToTasklets from dace.transformation.passes.gpustream.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs +STREAM_PLACEHOLDER = "__dace_current_stream" @properties.make_properties @transformation.explicit_cf_compatible @@ -94,6 +95,9 @@ def edge_within_kernel(state, src, dst): src_in_kernel = is_within_schedule_types(state, src, gpu_schedules) dst_in_kernel = is_within_schedule_types(state, dst, gpu_schedules) return src_in_kernel and dst_in_kernel + + def is_tasklet_with_stream_use(src): + return isinstance(src, nodes.Tasklet) and STREAM_PLACEHOLDER in src.code.as_string # ------------------ Sync detection logic ----------------------------- @@ -127,6 +131,9 @@ def edge_within_kernel(state, src, dst): elif (is_kernel_exit(src) and is_gpu_accessnode(dst, state) and is_sink_node(dst, state)): sync_state[state].add(stream_assignments[dst]) + elif is_tasklet_with_stream_use(src): + sync_state[state].add(stream_assignments[src]) + else: continue From 15fc13bbb8ba74eecc6a65bdc5344fe4c6ec7f43 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Wed, 17 Sep 2025 13:38:25 +0200 Subject: [PATCH 88/94] fix, clean-up and pre-commit --- dace/codegen/targets/experimental_cuda.py | 35 ++--- .../dataflow/add_threadblock_map.py | 1 - dace/transformation/passes/fix_test.py | 7 +- .../insert_gpu_stream_sync_tasklets.py | 3 +- 
.../passes/location_specialization.py | 144 ------------------ .../passes/move_array_out_of_kernel.py | 9 +- 6 files changed, 24 insertions(+), 175 deletions(-) delete mode 100644 dace/transformation/passes/location_specialization.py diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 22069446a7..49daf2debc 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -235,10 +235,11 @@ def preprocess(self, sdfg: SDFG) -> None: # Retrieve arguments required for the kernels subgraph shared_transients = {} for state, node, defined_syms in sdutil.traverse_sdfg_with_defined_symbols(sdfg, recursive=True): - if (isinstance(node, nodes.MapEntry)and node.map.schedule == dtypes.ScheduleType.GPU_Device): + if (isinstance(node, nodes.MapEntry) and node.map.schedule == dtypes.ScheduleType.GPU_Device): if state.parent not in shared_transients: shared_transients[state.parent] = state.parent.shared_transients() - self._kernel_arglists[node] = state.scope_subgraph(node).arglist(defined_syms, shared_transients[state.parent]) + self._kernel_arglists[node] = state.scope_subgraph(node).arglist(defined_syms, + shared_transients[state.parent]) def _compute_pool_release(self, top_sdfg: SDFG): """ @@ -432,7 +433,6 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub return - import copy from dace.transformation.passes.fix_test import Fix from dace.transformation.passes.move_array_out_of_kernel import MoveArrayOutOfKernel @@ -443,7 +443,6 @@ def generate_scope(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg_scope: ScopeSub MoveArrayOutOfKernel().apply_pass(sdfg, map_parent, name) infer_types.infer_connector_types(sdfg) - #--------------- Nested GPU Scope -------------------- supported_strategies: List[ScopeGenerationStrategy] = [ ThreadBlockScopeGenerator(codegen=self), @@ -781,7 +780,7 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub # Add the const qualifier to any constants not marked as such """ # update const data - new_const_data = sdutil.get_constant_data(node, nsdfg) + new_const_data = sdutil.get_constant_data(node, nsdfg) for name in new_const_data: desc = nsdfg.arrays[name] ptr_name = ptr(name, desc, nsdfg, self._frame) @@ -801,13 +800,12 @@ def _generate_NestedSDFG(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSub dispatcher.defined_vars.add(ptr_name, defined_type, ctype, allow_shadowing=True) # update const symbols - new_const_symbols = sdutil.get_constant_symbols(node, nsdfg) + new_const_symbols = sdutil.get_constant_symbols(node, nsdfg) for name in new_const_symbols: defined_type = DefinedType.Scalar if not "const" in nsdfg.symbols[name].ctype: ctype = f"const {nsdfg.symbols[name].ctype}" - """ - + """ # Redirect rest to CPU codegen self._cpu_codegen._generate_NestedSDFG(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) @@ -833,7 +831,7 @@ def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgra location: Union[int, str, subsets.Range] = tasklet.location[name] cond = self._generate_condition_from_location(name, index_expr, location) scope_manager.open(condition=cond) - + if 'gpu_warp' in tasklet.location: name = 'gpu_warp' index_expr = self._get_warp_id() @@ -848,22 +846,22 @@ def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgra cond = self._generate_condition_from_location(name, index_expr, location) scope_manager.open(condition=cond) - # Call CPU codegen - 
self._cpu_codegen._generate_Tasklet(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) + # Call CPU codegen + self._cpu_codegen._generate_Tasklet(sdfg, cfg, dfg, state_id, node, function_stream, callsite_stream) - def _generate_condition_from_location(self, name:str, index_expr:str, - location: Union[int, str, subsets.Range]) -> str: + def _generate_condition_from_location(self, name: str, index_expr: str, location: Union[int, str, + subsets.Range]) -> str: # 1. Normalize location if isinstance(location, str) and ':' in location: location = subsets.Range.from_string(location) if len(location) != 1: - raise ValueError(f'Only one-dimensional ranges are allowed for {name} specialization, {location} given') + raise ValueError(f'Only one-dimensional ranges are allowed for {name} specialization, {location} given') elif symbolic.issymbolic(location): location = sym2cpp(location) # 2. Build condition - if isinstance(location, subsets.Range): + if isinstance(location, subsets.Range): # Range of indices begin, end, stride = location[0] rb, re, rs = sym2cpp(begin), sym2cpp(end), sym2cpp(stride) @@ -873,7 +871,7 @@ def _generate_condition_from_location(self, name:str, index_expr:str, else: # Single-element cond = f'({index_expr}) == {location}' - + return cond def _get_thread_id(self) -> str: @@ -1339,6 +1337,7 @@ def process_out_memlets(self, *args, **kwargs): # Call CPU implementation with this code generator as callback self._cpu_codegen.process_out_memlets(*args, codegen=self, **kwargs) + ######################################################################### # helper class # This one is closely linked to the ExperimentalCUDACodeGen. In fact, @@ -1391,7 +1390,6 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro self._args_typed = args_typed - # Args for the kernel wrapper function cudaCodeGen._in_device_code = False @@ -1418,8 +1416,7 @@ def __init__(self, cudaCodeGen: ExperimentalCUDACodeGen, sdfg: SDFG, cfg: Contro for name, data in arglist.items()] + [str(gpustream_input[0].dst_conn)]) self._kernel_wrapper_args_typed = ([f'{mangle_dace_state_struct_name(cudaCodeGen._global_sdfg)} *__state'] + - args_typed + - [f"gpuStream_t {gpustream_var_name}"]) + args_typed + [f"gpuStream_t {gpustream_var_name}"]) cudaCodeGen._in_device_code = restore_in_device_code diff --git a/dace/transformation/dataflow/add_threadblock_map.py b/dace/transformation/dataflow/add_threadblock_map.py index 1a67534f4a..6efa4d5090 100644 --- a/dace/transformation/dataflow/add_threadblock_map.py +++ b/dace/transformation/dataflow/add_threadblock_map.py @@ -197,7 +197,6 @@ def apply(self, state: SDFGState, sdfg: SDFG): f"({tb_size}) is not enclosed by the derived block size ({gpu_block_size}). 
" "They are expected to be equal or the derived block size to be larger.") - @staticmethod def annotates_memlets(): return False diff --git a/dace/transformation/passes/fix_test.py b/dace/transformation/passes/fix_test.py index 2c6d2c316d..80caa2d563 100644 --- a/dace/transformation/passes/fix_test.py +++ b/dace/transformation/passes/fix_test.py @@ -58,7 +58,7 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict[str, if map_parent is None: continue - + if node.data not in parent_state.sdfg.arrays: continue @@ -77,7 +77,7 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict[str, if edge.dst_conn == node.data and isinstance(src, nodes.AccessNode) and src.data != node.data: break_cond = True skip.add(src.data) - + if break_cond: continue @@ -99,7 +99,6 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict[str, if move_out: to_be_moved.add((node.data, data_desc, map_parent)) - for name, desc, map_parent in to_be_moved: if name in skip: continue @@ -108,6 +107,4 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict[str, desc.transient = True names[name] = map_parent - - return names diff --git a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py index ccb68e3d6b..b90faa22a5 100644 --- a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py +++ b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py @@ -16,6 +16,7 @@ STREAM_PLACEHOLDER = "__dace_current_stream" + @properties.make_properties @transformation.explicit_cf_compatible class InsertGPUStreamSyncTasklets(ppl.Pass): @@ -95,7 +96,7 @@ def edge_within_kernel(state, src, dst): src_in_kernel = is_within_schedule_types(state, src, gpu_schedules) dst_in_kernel = is_within_schedule_types(state, dst, gpu_schedules) return src_in_kernel and dst_in_kernel - + def is_tasklet_with_stream_use(src): return isinstance(src, nodes.Tasklet) and STREAM_PLACEHOLDER in src.code.as_string diff --git a/dace/transformation/passes/location_specialization.py b/dace/transformation/passes/location_specialization.py deleted file mode 100644 index 936c19a2a9..0000000000 --- a/dace/transformation/passes/location_specialization.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
-from typing import Tuple, List, Union, Dict - -import dace -from dace.properties import make_properties, DictProperty, ShapeProperty -from dace import subsets, symbolic -from dace import Config, dtypes, symbolic -from dace.properties import make_properties -from dace.sdfg import SDFG, SDFGState, nodes, utils as sdutil -from dace.codegen.targets.experimental_cuda_helpers import gpu_utils -from dace.transformation import helpers, transformation -from dace.transformation import helpers, pass_pipeline as ppl, transformation - -from dace.codegen.targets.cpp import sym2cpp - - -@make_properties -class LocationSpecialization(ppl.Pass): - - - def apply_pass(self, sdfg: SDFG, kernel_dimensions_map: Dict) -> None: - - for node, state in sdfg.all_nodes_recursive(): - - if not isinstance(node, dace.nodes.Tasklet): - continue - - if not self._applicable(state, node): - continue - - tasklet = node - block_dims = self._get_block_dims(state, tasklet, kernel_dimensions_map) - - # Generate preambles for thread/warp/block - preamble_thread = self._generate_condition_from_location("gpu_thread", self._get_thread_id(block_dims), tasklet) - preamble_warp = self._generate_condition_from_location("gpu_warp", self._get_warp_id(block_dims), tasklet) - preamble_block = self._generate_condition_from_location("gpu_block", self._get_block_id(block_dims), tasklet) - - # Keep only non-empty preambles - preambles = [p for p in (preamble_thread, preamble_warp, preamble_block) if p] - - - for preamble in preambles: - if tasklet.code.language == dace.dtypes.Language.Python: - cond = preamble.strip()[3:-1].strip() - cond = cond.replace("&&", "and").replace("||", "or") - - pre_tasklet = state.add_tasklet(f"specialization", {}, {}, cond) - - state.add_edge(pre_tasklet, None, tasklet, None, dace.Memlet()) - for pred in state.predecessors(tasklet): - state.add_edge(pred, None, pre_tasklet, None, dace.Memlet()) - - - import textwrap - # Wrap tasklet code with preambles and closing braces - for preamble in preambles: - original = tasklet.code.as_string or "" - if tasklet.code.language == dace.dtypes.Language.Python: - # Turn CUDA-style preamble into a Python if-statement - cond = preamble.strip()[3:-1].strip() # strip "if (" at start and "{" - cond = cond.replace("&&", "and").replace("||", "or") - tasklet.code.as_string = f"if {cond}:\n" + textwrap.indent(original, " ") - else: - # Leave CUDA/C++ unchanged - tasklet.code.as_string = preamble + original + "}\n" - - - def _applicable(self, state: SDFGState, tasklet: dace.nodes.Tasklet) -> bool: - """ - Check if this transformation is applicable. 
- - Applicable if: - * The tasklet is scheduled to run within a GPU kernel, and - * Its location dictionary contains at least one of: - - "gpu_block" - - "gpu_thread" - - "gpu_warp" - """ - - # Not within the kernel - skip - if not gpu_utils.is_within_schedule_types(state, tasklet, dtypes.GPU_SCHEDULES): - return False - - # return if the location dictionary contain block, thread of warp specialization - return any(k in tasklet.location for k in ("gpu_block", "gpu_thread", "gpu_warp")) - - def _generate_condition_from_location(self, name: str, index_expr: str, node: nodes.Tasklet) -> str: - if name not in node.location: - return '' - - location: Union[int, str, subsets.Range] = node.location[name] - if isinstance(location, str) and ':' in location: - location = subsets.Range.from_string(location) - elif symbolic.issymbolic(location): - location = sym2cpp(location) - - if isinstance(location, subsets.Range): - # Range of indices - if len(location) != 1: - raise ValueError(f'Only one-dimensional ranges are allowed for {name} specialization, {location} given') - begin, end, stride = location[0] - rb, re, rs = sym2cpp(begin), sym2cpp(end), sym2cpp(stride) - cond = '' - cond += f'(({index_expr}) >= {rb}) && (({index_expr}) <= {re})' - if stride != 1: - cond += f' && ((({index_expr}) - {rb}) % {rs} == 0)' - - return (f'if ({cond}) {{\n') - else: - # Single-element - return(f'if (({index_expr}) == {location}) {{\n') - - def _get_thread_id(self, block_dims: List) -> str: - result = 'threadIdx.x' - if block_dims[1] != 1: - result += f' + ({sym2cpp(block_dims[0])}) * threadIdx.y' - if block_dims[2] != 1: - result += f' + ({sym2cpp(block_dims[0] * block_dims[1])}) * threadIdx.z' - return result - - def _get_warp_id(self, block_dims: List) -> str: - return f'(({self._get_thread_id(block_dims)}) / warpSize)' - - def _get_block_id(self, block_dims: List) -> str: - result = 'blockIdx.x' - if block_dims[1] != 1: - result += f' + gridDim.x * blockIdx.y' - if block_dims[2] != 1: - result += f' + gridDim.x * gridDim.y * blockIdx.z' - return result - - def _get_block_dims(self, state, tasklet, kernel_dimensions_map) -> List: - - parent_map, parent_map_state = gpu_utils.get_parent_map(state, tasklet) - while parent_map.map.schedule != dtypes.ScheduleType.GPU_Device: - parent_map, parent_map_state = gpu_utils.get_parent_map(parent_map_state, parent_map) - - _, block_size = kernel_dimensions_map[parent_map] - return block_size - - @staticmethod - def annotates_memlets(): - return False diff --git a/dace/transformation/passes/move_array_out_of_kernel.py b/dace/transformation/passes/move_array_out_of_kernel.py index 2ab0a67bf0..bd7e401187 100644 --- a/dace/transformation/passes/move_array_out_of_kernel.py +++ b/dace/transformation/passes/move_array_out_of_kernel.py @@ -19,6 +19,7 @@ import dace.sdfg.utils as sdutil + @make_properties @transformation.explicit_cf_compatible class MoveArrayOutOfKernel(Pass): @@ -237,7 +238,6 @@ def move_array_out_of_kernel_nested(self, kernel_entry: nodes.MapEntry, array_na self.lift_array_through_nested_sdfgs(array_name, kernel_entry, sdfg_hierarchy, old_subset) - def lift_array_through_nested_sdfgs(self, array_name: str, kernel_entry: nodes.MapEntry, sdfg_hierarchy: List[SDFG], old_subset: List) -> None: """ @@ -267,7 +267,6 @@ def lift_array_through_nested_sdfgs(self, array_name: str, kernel_entry: nodes.M new_desc = copy.deepcopy(old_desc) outer_sdfg.add_datadesc(array_name, new_desc) - # Get all parent scopes to detect how the data needs to flow. # E.g. 
nsdfg_node -> MapExit needs to be nsdfg_node -> MapExit -> AccessNode (new) parent_scopes: List[nodes.MapEntry] = [] @@ -495,7 +494,7 @@ def get_new_shape_info(self, array_desc: dt.Array, map_exit_chain: List[nodes.Ma accumulator = old_total_size new_strides.insert(0, old_total_size) for cur_range_size in range_size[:-1]: - new_strides.insert(0, accumulator) # insert before (mult with volumes) + new_strides.insert(0, accumulator) # insert before (mult with volumes) accumulator = accumulator * cur_range_size extended_size = range_size + extended_size @@ -608,7 +607,7 @@ def collect_array_descriptor_usage( """ access_nodes_info: List[Tuple[nodes.AccessNode, SDFGState, SDFG]] = self.get_access_nodes_within_map(map_entry, array_name) - + last_sdfg: SDFG = self._node_to_sdfg_cache[map_entry] result: Set[Tuple[dt.Array, SDFG, Set[SDFG], Set[nodes.AccessNode]]] = set() @@ -899,4 +898,4 @@ def out_paths(self, access_node: nodes.AccessNode) -> List[List[MultiConnectorEd extended_path = current_path + [edge] queue.append(extended_path) - return complete_paths \ No newline at end of file + return complete_paths From 0f84a4a312badf1127b3acba622438ed82444311 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Wed, 17 Sep 2025 14:19:22 +0200 Subject: [PATCH 89/94] fix --- dace/codegen/targets/experimental_cuda.py | 2 +- .../experimental_cuda_helpers/scope_strategies.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/dace/codegen/targets/experimental_cuda.py b/dace/codegen/targets/experimental_cuda.py index 49daf2debc..0d3dce577c 100644 --- a/dace/codegen/targets/experimental_cuda.py +++ b/dace/codegen/targets/experimental_cuda.py @@ -823,7 +823,7 @@ def _generate_Tasklet(self, sdfg: SDFG, cfg: ControlFlowRegion, dfg: StateSubgra from dace.codegen.targets.experimental_cuda_helpers.scope_strategies import ScopeManager tasklet: nodes.Tasklet = node - with ScopeManager(self, sdfg, cfg, dfg, state_id, function_stream, callsite_stream) as scope_manager: + with ScopeManager(self, sdfg, cfg, dfg, state_id, function_stream, callsite_stream, brackets_on_enter=False) as scope_manager: if 'gpu_thread' in tasklet.location: name = 'gpu_thread' diff --git a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py index 1e690248fa..800b6ab4c8 100644 --- a/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py +++ b/dace/codegen/targets/experimental_cuda_helpers/scope_strategies.py @@ -486,6 +486,7 @@ def __init__(self, function_stream: CodeIOStream, callsite_stream: CodeIOStream, comment: str = None, + brackets_on_enter: bool = True, debug: bool = False): """ Initializes the KernelScopeManager. @@ -498,6 +499,7 @@ def __init__(self, :param function_stream: The CodeIOStream for function-level code. :param callsite_stream: The CodeIOStream for callsite-level code. :param comment: A descriptive comment explaining the purpose of the code block being opened. Default is None. + :param brackets_on_enter: Whether on entering (i.e. when using "with", there should be a bracket opened). Default is True. :param debug: Whether to include debug comments in the output. Defaults to False. 
""" self.frame_codegen = frame_codegen @@ -508,6 +510,7 @@ def __init__(self, self.function_stream = function_stream self.callsite_stream = callsite_stream self.comment = comment + self.brackets_on_enter = brackets_on_enter self.debug = debug self._opened = 0 @@ -516,9 +519,11 @@ def __init__(self, def __enter__(self): """ - Writes the opening bracket. + Writes the opening bracket in case self.brackets_on_enter + is set to true, which it is by default. """ - self.open() + if self.brackets_on_enter: + self.open() return self def __exit__(self, exc_type, exc_value, traceback): From 7bc1226cb207a6cab504170583714f8911db93a2 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Wed, 24 Sep 2025 16:21:07 +0200 Subject: [PATCH 90/94] fix missed case in default shared memory synchornization --- .../passes/shared_memory_synchronization.py | 120 +++++++++++++----- 1 file changed, 86 insertions(+), 34 deletions(-) diff --git a/dace/transformation/passes/shared_memory_synchronization.py b/dace/transformation/passes/shared_memory_synchronization.py index 8a8eca9842..47c5bea0a8 100644 --- a/dace/transformation/passes/shared_memory_synchronization.py +++ b/dace/transformation/passes/shared_memory_synchronization.py @@ -4,6 +4,7 @@ import dace from dace import SDFG, SDFGState, dtypes, properties +from dace.codegen.targets.experimental_cuda_helpers import gpu_utils from dace.sdfg.nodes import AccessNode, MapEntry, MapExit, NestedSDFG, Node from dace.sdfg.state import LoopRegion from dace.transformation import helpers, pass_pipeline as ppl, transformation @@ -14,24 +15,32 @@ class DefaultSharedMemorySync(ppl.Pass): """ This pass inserts synchronization tasklets that call "__syncthreads()". + This is for GPUs. - Synchronization is added after ThreadBlock (TB) MapExits if the TB map - writes to shared memory. + Synchronization is added after GPU_ThreadBlock (TB) MapExits if the TB map + writes to shared memory or after collaborative writes to shared memory (smem). Important notes: - - Users are expected to **not** write to shared memory inside a Sequential - map or LoopRegion **within** a TB map. Calling "__syncthreads()" inside - a TB map can cause deadlocks, e.g., when only a subset of threads - participates (thread divergence). - - - If shared memory is still written sequentially within a TB map, the missing - intermediate synchronizations may lead to race conditions and incorrect results. - Since deadlocks are worse than race conditions, this pass avoids inserting - synchronization inside TB maps, but it will warn the user of the race condition risk. - - - In nested TB maps (e.g., GPU_Device map -> TB map -> TB map ...), - synchronization is only inserted at the outermost TB map's exit. The reason is again - the previously described deadlock danger. + - Calling "__syncthreads()" inside a TB map can lead to deadlocks, + for example when only a subset of threads participates (thread divergence). + Therefore, users must **not** write to shared memory inside a Sequential + map or LoopRegion that is nested within a TB map. + + - If shared memory is still written sequentially within a TB map, the missing + intermediate synchronizations may lead to race conditions and incorrect results. + Because deadlocks are worse than race conditions, this pass avoids inserting + synchronization inside TB maps, but it will warn the user about potential risks. 
+ + - When writing to and reading from shared memory within the same TB map, + users must ensure that no synchronization is required, since barriers + are not inserted automatically in this case (again, to avoid deadlocks). + If synchronization is needed, the computation should instead be split + across sequential TB maps. There is no warning for race conditions in this + case for misbehavior. + + - In nested TB maps (e.g., GPU_Device map -> TB map -> TB map ...), + synchronization is only inserted at the outermost TB map's exit, + again to avoid deadlocks. """ def __init__(self): @@ -41,32 +50,75 @@ def __init__(self): def apply_pass(self, sdfg: SDFG, _) -> None: """ - Apply this pass to insert synchronization barriers for GPU ThreadBlock maps. - - The pass: - - Finds all ThreadBlock-scheduled maps in the SDFG, - - Analyzes them for shared memory usage and race-condition risks, and - - Inserts synchronization barriers (`__syncthreads()`) after the - corresponding ThreadBlock-scheduled MapExits where needed. + Insert synchronization barriers (`__syncthreads()`) where needed to ensure + shared memory writes are synchronied for potential subsequent reads. + + This pass performs the following steps: + 1. Collect all ThreadBlock-scheduled MapExits and candidate collaborative + shared-memory writes (AccessNodes). + 2. Analyze ThreadBlock MapExits for synchronization requirements. + 3. Insert synchronization barriers after both MapExits and collaborative + shared-memory writes as needed. """ - # 1. Find all GPU_ThreadBlock schedules Maps and - # cache each node's parent state for convenience + # 1. Find all GPU_ThreadBlock-scheduled Maps and all collaborative writes to + # GPU shared memory, and cache each node's parent state for convenience. tb_map_exits: Dict[MapExit, SDFGState] = dict() + collaborative_smem_copies: Dict[AccessNode, SDFGState] = dict() for node, parent_state in sdfg.all_nodes_recursive(): self._node_to_parent_state[node] = parent_state if isinstance(node, MapExit) and node.schedule == dtypes.ScheduleType.GPU_ThreadBlock: tb_map_exits[node] = parent_state + elif isinstance(node, AccessNode) and self.is_collaborative_smem_write(node, parent_state): + collaborative_smem_copies[node] = parent_state + # 2. Identify TB MapExits requiring a synchronization barrier sync_requiring_exits = self.identify_synchronization_tb_exits(tb_map_exits) # 3. Insert synchronization barriers for previous TB MapExits - self.insert_synchronization_after_tb_exits(sync_requiring_exits) + self.insert_synchronization_after_nodes(sync_requiring_exits) + + # 4. Insert synchronization after collaborative shared memory writes + self.insert_synchronization_after_nodes(collaborative_smem_copies) + + def is_collaborative_smem_write(self, node: AccessNode, state: SDFGState) -> bool: + """ + Determine whether the given AccessNode corresponds to a collaborative + shared-memory (smem) write, i.e., whether it is written cooperatively + by GPU threads at the device level but not within a thread block map. + + Parameters + ---------- + node : AccessNode + The candidate access node. + state : SDFGState + The state in which the node resides. + + Returns + ------- + bool + True if the node is a collaborative smem write, False otherwise. + """ + # 1. node is not stored in shared memory - skip + if node.desc(state).storage != dtypes.StorageType.GPU_Shared: + return False + + # 2. No writes to the shared memory - skip + if state.in_degree(node) == 0: + return False + + # 3. 
It is a collaborative copy if it is within a kernel but not within a GPU_ThreadBlock map + if (not gpu_utils.is_within_schedule_types(state, node, [dtypes.ScheduleType.GPU_Device]) + or gpu_utils.is_within_schedule_types(state, node, [dtypes.ScheduleType.GPU_ThreadBlock])): + return False + + return True + def identify_synchronization_tb_exits(self, tb_map_exits: Dict[MapExit, SDFGState]) -> Dict[MapExit, SDFGState]: """ - Identify ThreadBlock exits after which "__syncthread()" should be called. + Identify ThreadBlock exits after which "__syncthreads()" should be called. Parameters ---------- @@ -274,18 +326,18 @@ def map_writes_to_smem(self, map_entry: MapEntry) -> bool: # No writes to shared memory found return False - def insert_synchronization_after_tb_exits(self, tb_map_exits: Dict[MapExit, SDFGState]) -> None: + def insert_synchronization_after_nodes(self, nodes: Dict[Node, SDFGState]) -> None: """ Insert synchronization tasklets (calling `__syncthreads()`) after the given - GPU ThreadBlock MapExit nodes. + GPU-related nodes. Parameters ---------- - tb_map_exits : Dict[MapExit, SDFGState] - Mapping from ThreadBlock MapExit nodes to their parent states after which a synchronization - tasklet should be inserted. + nodes : Dict[Node, SDFGState] + Mapping from SDFG nodes to their parent states after which a + synchronization tasklet should be inserted. """ - for map_exit, state in tb_map_exits.items(): + for node, state in nodes.items(): sync_tasklet = state.add_tasklet(name="sync_threads", inputs=set(), @@ -293,7 +345,7 @@ def insert_synchronization_after_tb_exits(self, tb_map_exits: Dict[MapExit, SDFG code="__syncthreads();\n", language=dtypes.Language.CPP) - for succ in state.successors(map_exit): + for succ in state.successors(node): state.add_edge(sync_tasklet, None, succ, None, dace.Memlet()) - state.add_edge(map_exit, None, sync_tasklet, None, dace.Memlet()) + state.add_edge(node, None, sync_tasklet, None, dace.Memlet()) From 0ffbc831471479dd6e4454212d94f065688ae819 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Tue, 7 Oct 2025 14:03:35 +0200 Subject: [PATCH 91/94] avoid unnecessary smem sync, and add additional case where stream sync may be needed (was removed before as the case never occured) --- .../insert_gpu_stream_sync_tasklets.py | 72 ++++++++++++++++--- .../passes/shared_memory_synchronization.py | 14 ++-- 2 files changed, 72 insertions(+), 14 deletions(-) diff --git a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py index b90faa22a5..b4a7b9a65d 100644 --- a/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py +++ b/dace/transformation/passes/gpustream/insert_gpu_stream_sync_tasklets.py @@ -40,13 +40,16 @@ def should_reapply(self, modified: ppl.Modifies) -> bool: def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): """ Inserts GPU stream synchronization tasklets at required locations - at the end of a state, for GPU streams used in the state. + after certain nodes and at the end of a state, for GPU streams used in the state. 
""" stream_assignments: Dict[nodes.Node, int] = pipeline_results['NaiveGPUStreamScheduler'] # Get sync locations sync_state, sync_node = self._identify_sync_locations(sdfg, stream_assignments) + # Synchronize after a node when required + self._insert_gpu_stream_sync_after_node(sdfg, sync_node, stream_assignments) + # Synchronize all used streams at the end of a state self._insert_gpu_stream_sync_at_state_end(sdfg, sync_state, stream_assignments) return {} @@ -56,8 +59,6 @@ def _identify_sync_locations( stream_assignments: Dict[nodes.Node, int]) -> Tuple[Dict[SDFGState, Set[int]], Dict[nodes.Node, SDFGState]]: """ Heuristically identifies GPU stream synchronization points in an SDFG. - Synchronization is inserted at the end of a state when it is required. - Parameters ---------- @@ -77,7 +78,7 @@ def _identify_sync_locations( # ------------------ Helper predicates ----------------------------- - def is_gpu_accessnode(node, state): + def is_gpu_global_accessnode(node, state): return isinstance(node, nodes.AccessNode) and node.desc( state.parent).storage == dtypes.StorageType.GPU_Global @@ -113,23 +114,24 @@ def is_tasklet_with_stream_use(src): sync_state[state] = set() # --- Heuristics for when to sync --- - if (is_gpu_accessnode(src, state) and is_nongpu_accessnode(dst, state) and is_sink_node(dst, state) + if (is_gpu_global_accessnode(src, state) and is_nongpu_accessnode(dst, state) and is_sink_node(dst, state) and not edge_within_kernel(state, src, dst)): sync_state[state].add(stream_assignments[dst]) - elif (is_gpu_accessnode(src, state) and is_nongpu_accessnode(dst, state) and not is_sink_node(dst, state) + elif (is_gpu_global_accessnode(src, state) and is_nongpu_accessnode(dst, state) and not is_sink_node(dst, state) and not edge_within_kernel(state, src, dst)): + sync_node[dst] = state sync_state[state].add(stream_assignments[dst]) - elif (is_nongpu_accessnode(src, state) and is_gpu_accessnode(dst, state) + elif (is_nongpu_accessnode(src, state) and is_gpu_global_accessnode(dst, state) and not edge_within_kernel(state, src, dst)): sync_state[state].add(stream_assignments[dst]) - elif (is_kernel_exit(src) and is_gpu_accessnode(dst, state) and not is_sink_node(dst, state)): + elif (is_kernel_exit(src) and is_gpu_global_accessnode(dst, state) and not is_sink_node(dst, state)): sync_state[state].add(stream_assignments[src]) sync_state[state].add(stream_assignments[src]) - elif (is_kernel_exit(src) and is_gpu_accessnode(dst, state) and is_sink_node(dst, state)): + elif (is_kernel_exit(src) and is_gpu_global_accessnode(dst, state) and is_sink_node(dst, state)): sync_state[state].add(stream_assignments[dst]) elif is_tasklet_with_stream_use(src): @@ -232,3 +234,55 @@ def _insert_gpu_stream_sync_at_state_end(self, sdfg: SDFG, sync_state: Dict[SDFG tasklet.add_out_connector(conn, dtypes.gpuStream_t, force=True) state.add_edge(combined_stream_node, None, tasklet, conn, dace.Memlet(accessed_gpu_stream)) state.add_edge(tasklet, conn, output_stream_node, None, dace.Memlet(accessed_gpu_stream)) + + def _insert_gpu_stream_sync_after_node(self, sdfg: SDFG, sync_node: Dict[nodes.Node, SDFGState], + stream_assignments: Dict[nodes.Node, int]) -> None: + """ + Insert a GPU stream synchronization tasklet immediately after specified nodes. + + Parameters + ---------- + sdfg : SDFG + The top level SDFG. + sync_node : Dict[nodes.Node, SDFGState] + Mapping of nodes to their parent state. After after the node a GPU stream synchronization should occur. 
+ stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream IDs. + """ + # Prepare GPU stream info and backend + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + backend: str = common.get_gpu_backend() + + for node, state in sync_node.items(): + + #----------------- Generate GPU stream synchronization Tasklet ----------------- + + # Get assigned GPU stream + stream = stream_assignments.get(node, "nullptr") + if stream == "nullptr": + raise NotImplementedError("Using the default 'nullptr' gpu stream is not supported yet.") + + # Create the tasklet + stream_var_name = f"{stream_var_name_prefix}{stream}" + sync_call = f"DACE_GPU_CHECK({backend}StreamSynchronize({stream_var_name}));\n" + tasklet = state.add_tasklet( name=f"gpu_stream_{stream}_synchronization", + inputs=set(), outputs=set(), + code=sync_call, language=dtypes.Language.CPP) + + + #----------------- Place tasklet between node and successors, link GPU streams ---------------- + + # 1. Put the tasklet between the node and its successors + for succ in state.successors(node): + state.add_edge(tasklet, None, succ, None, dace.Memlet()) + state.add_edge(node, None, tasklet, None, dace.Memlet()) + + # 2. Connect tasklet to GPU stream AccessNodes + in_stream = state.add_access(stream_array_name) + out_stream = state.add_access(stream_array_name) + accessed_stream = f"{stream_array_name}[{stream}]" + state.add_edge(in_stream, None, tasklet, stream_var_name, dace.Memlet(accessed_stream)) + state.add_edge(tasklet, stream_var_name, out_stream, None, dace.Memlet(accessed_stream)) + tasklet.add_in_connector(stream_var_name, dtypes.gpuStream_t, force=True) + tasklet.add_out_connector(stream_var_name, dtypes.gpuStream_t, force=True) + \ No newline at end of file diff --git a/dace/transformation/passes/shared_memory_synchronization.py b/dace/transformation/passes/shared_memory_synchronization.py index 47c5bea0a8..4f73d41ef9 100644 --- a/dace/transformation/passes/shared_memory_synchronization.py +++ b/dace/transformation/passes/shared_memory_synchronization.py @@ -82,7 +82,6 @@ def apply_pass(self, sdfg: SDFG, _) -> None: # 4. Insert synchronization after collaborative shared memory writes self.insert_synchronization_after_nodes(collaborative_smem_copies) - def is_collaborative_smem_write(self, node: AccessNode, state: SDFGState) -> bool: """ Determine whether the given AccessNode corresponds to a collaborative @@ -105,11 +104,16 @@ def is_collaborative_smem_write(self, node: AccessNode, state: SDFGState) -> boo if node.desc(state).storage != dtypes.StorageType.GPU_Shared: return False - # 2. No writes to the shared memory - skip - if state.in_degree(node) == 0: - return False + # 2. To my knowledge, it is not a collaborative write if the result comes from a ThreadBlock map. + if all(isinstance(pred, MapExit) and pred.map.schedule == dtypes.ScheduleType.GPU_ThreadBlock + for pred in state.predecessors(node)): + return False + + # 3. If all in edges are empty, there is no write - and no sync necessary + if all(edge.data.is_empty() for edge in state.in_edges(node)): + return False - # 3. It is a collaborative copy if it is within a kernel but not within a GPU_ThreadBlock map + # 4. 
It is a collaborative copy if it is within a kernel but not within a GPU_ThreadBlock map if (not gpu_utils.is_within_schedule_types(state, node, [dtypes.ScheduleType.GPU_Device]) or gpu_utils.is_within_schedule_types(state, node, [dtypes.ScheduleType.GPU_ThreadBlock])): return False From 7a080cb7e8a7e60ae3d582c3f56ee5dce80cbc88 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Fri, 21 Nov 2025 18:15:29 +0100 Subject: [PATCH 92/94] add GPU stream pipeline passes and necessary helpers --- .../targets/gpu_helpers/copy_strategies.py | 554 ++++++++++++++++++ dace/codegen/targets/gpu_helpers/gpu_utils.py | 27 + dace/transformation/helpers.py | 31 + .../connect_gpu_streams_to_kernels.py | 70 +++ .../connect_gpu_streams_to_tasklets.py | 80 +++ .../gpu_stream_scheduling.py | 249 ++++++++ .../gpu_stream_topology_simplification.py | 273 +++++++++ .../insert_gpu_copy_tasklet.py | 166 ++++++ .../insert_gpu_stream_sync_tasklets.py | 288 +++++++++ .../insert_gpu_streams_to_sdfgs.py | 154 +++++ 10 files changed, 1892 insertions(+) create mode 100644 dace/codegen/targets/gpu_helpers/copy_strategies.py create mode 100644 dace/codegen/targets/gpu_helpers/gpu_utils.py create mode 100644 dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_kernels.py create mode 100644 dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_tasklets.py create mode 100644 dace/transformation/passes/gpu_specialization/gpu_stream_scheduling.py create mode 100644 dace/transformation/passes/gpu_specialization/gpu_stream_topology_simplification.py create mode 100644 dace/transformation/passes/gpu_specialization/insert_gpu_copy_tasklet.py create mode 100644 dace/transformation/passes/gpu_specialization/insert_gpu_stream_sync_tasklets.py create mode 100644 dace/transformation/passes/gpu_specialization/insert_gpu_streams_to_sdfgs.py diff --git a/dace/codegen/targets/gpu_helpers/copy_strategies.py b/dace/codegen/targets/gpu_helpers/copy_strategies.py new file mode 100644 index 0000000000..1b11f5bb2b --- /dev/null +++ b/dace/codegen/targets/gpu_helpers/copy_strategies.py @@ -0,0 +1,554 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Tuple, Union + +from dace import SDFG, SDFGState, data, dtypes, subsets +from dace import memlet as mm +from dace import symbolic +from dace.codegen import common +from dace.codegen.targets import cpp +from dace.codegen.targets.cpp import sym2cpp +from dace.codegen.targets.gpu_helpers.gpu_utils import generate_sync_debug_call +from dace.config import Config +from dace.dtypes import StorageType +from dace.frontend import operations +from dace.sdfg import nodes, scope_contains_scope +from dace.sdfg.graph import MultiConnectorEdge +from dace.transformation import helpers + + +class CopyContext: + """ + Encapsulates inputs required for copy operations and exposes helper + methods to derive additional information. This keeps copy strategies + lightweight by letting them focus only on the relevant logic. 
+ """ + + def __init__(self, sdfg: SDFG, state: SDFGState, src_node: nodes.Node, dst_node: nodes.Node, + edge: MultiConnectorEdge[mm.Memlet], gpustream_assignments: Dict[nodes.Node, Union[int, str]]): + + # Store the basic context as attributes + self.sdfg = sdfg + self.state = state + self.src_node = src_node + self.dst_node = dst_node + self.edge = edge + self.gpustream_assignments = gpustream_assignments + + memlet = edge.data + + self.copy_shape = memlet.subset.size_exact() + if isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode): + copy_shape, src_strides, dst_strides, src_expr, dst_expr = self.get_accessnode_to_accessnode_copy_info() + else: + copy_shape = memlet.subset.size_exact() + src_strides = dst_strides = src_expr = dst_expr = None + + self.copy_shape = copy_shape + self.src_strides = src_strides + self.dst_strides = dst_strides + self.src_expr = src_expr + self.dst_expr = dst_expr + + def get_storage_type(self, node: nodes.Node): + """ + Return the storage type associated with a given SDFG node. + + Tasklets are assumed to use register storage, while AccessNodes + return the storage type from their data descriptor. Raises + NotImplementedError for unsupported node types. + """ + if isinstance(node, nodes.Tasklet): + storage_type = StorageType.Register + + elif isinstance(node, nodes.AccessNode): + storage_type = node.desc(self.sdfg).storage + + else: + raise NotImplementedError(f"Unsupported node type {type(node)} for storage type retrieval; " + "expected AccessNode or Tasklet. Please extend this method accordingly.") + + return storage_type + + def get_assigned_gpustream(self) -> str: + """ + Return the GPU stream expression assigned to both source and destination nodes. + + Ensures that both nodes have a matching stream ID, then constructs the + variable name from the configured prefix and stream ID. Raises ValueError + if assignments are missing or inconsistent. + + Example: + If the configured prefix is 'gpu_stream' and the assigned stream ID is 0, + this method returns 'gpu_stream0'. + """ + src_stream = self.gpustream_assignments.get(self.src_node) + dst_stream = self.gpustream_assignments.get(self.dst_node) + + # 1. Catch unsupported cases + if src_stream is None or dst_stream is None: + raise ValueError("GPU stream assignment missing for source or destination node.") + + if src_stream != dst_stream: + raise ValueError(f"Mismatch in assigned GPU streams: src_node has '{src_stream}', " + f"dst_node has '{dst_stream}'. They must be the same.") + + # 2. Generate GPU stream expression + gpustream = src_stream + gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[1] + gpustream_expr = f"{gpustream_var_name_prefix}{gpustream}" + + return gpustream_expr + + def get_memory_location(self) -> Tuple[str, str]: + """ + Determine whether the source and destination nodes reside in device or host memory. + + Uses the storage type of each node to classify it as either 'Device' + (GPU global memory) or 'Host' (all other storage types). + Used for GPU related copies outside the kernel (e.g. to construct + cudaMemcpyHostToDevice for example). + + Returns + ------- + Tuple[str, str] + (src_location, dst_location) where each is either 'Device' or 'Host'. 
+ """ + src_storage = self.get_storage_type(self.src_node) + dst_storage = self.get_storage_type(self.dst_node) + src_location = 'Device' if src_storage == dtypes.StorageType.GPU_Global else 'Host' + dst_location = 'Device' if dst_storage == dtypes.StorageType.GPU_Global else 'Host' + + return src_location, dst_location + + def get_ctype(self) -> Any: + """ + Determine the C data type (ctype) of the source or destination node. + + The ctype is resolved from the data descriptor of the first node + (source or destination) that is an AccessNode (assumed to be the same + if both are AccessNodes). + + Returns + ------- + Any + The C type string (e.g., "float*", "int32") associated with the node. + + Raises + ------ + NotImplementedError + If neither the source nor the destination node is an AccessNode. + """ + sdfg = self.sdfg + src_node, dst_node = self.src_node, self.dst_node + + if isinstance(src_node, nodes.AccessNode): + return src_node.desc(sdfg).ctype + + if isinstance(dst_node, nodes.AccessNode): + return dst_node.desc(sdfg).ctype + + raise NotImplementedError( + f"Cannot determine ctype: neither src nor dst node is an AccessNode. " + f"Got src_node type: {type(src_node).__name__}, dst_node type: {type(dst_node).__name__}. " + "Please extend this case or fix the issue.") + + def get_accessnode_to_accessnode_copy_info(self): + """ + Compute copy shape, absolute strides, and pointer expressions for a copy + between two AccessNodes. Tries to mimic + cpp.memlet_copy_to_absolute_strides without requiring a dispatcher. + + Returns + ------- + (copy_shape, src_strides, dst_strides, src_expr, dst_expr) + + Raises + ------ + TypeError + If either endpoint is not an AccessNode. + NotImplementedError + If a descriptor is not Scalar or Array. + """ + + # ---------------------------- helpers ---------------------------- + def _collapse_strides(strides, subset): + """Remove size-1 dims; keep tile strides; default to [1] if none remain.""" + n = len(subset) + collapsed = [st for st, sz in zip(strides, subset.size()) if sz != 1] + collapsed.extend(strides[n:]) # include tiles + if len(collapsed) == 0: + return [1] + return collapsed + + def _ptr_name(desc, name): + if desc.transient and desc.lifetime in (dtypes.AllocationLifetime.Persistent, + dtypes.AllocationLifetime.External): + return f'__state->__{sdfg.cfg_id}_{name}' + return name + + def _expr_for(desc, name, subset): + ptr = _ptr_name(desc, name) + + if isinstance(desc, data.Scalar): + # GPU scalar special-case + if desc.storage in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN: + parent = state.sdfg.parent_nsdfg_node + if parent is not None and name in parent.in_connectors: + return f"&{ptr}" + return ptr + # CPU (or other) scalars + return f"&{ptr}" + + if isinstance(desc, data.Array): + offset = cpp.cpp_offset_expr(desc, subset) + return f"{ptr} + {offset}" if offset != "0" else ptr + + raise NotImplementedError( + f"Expected {name} to be either data.Scalar or data.Array, but got {type(desc).__name__}.") + + # ---------------------------- Get copy info ---------------------------- + # Get needed information + src_node, dst_node = self.src_node, self.dst_node + sdfg, edge, state = self.sdfg, self.edge, self.state + memlet, copy_shape = self.edge.data, self.copy_shape + + # Guard - only applicable if src and dst are AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + raise TypeError( + f"get_accessnode_to_accessnode_copy_info requires both source and destination " + f"to be 
AccessNode instances, but got {type(src_node).__name__} and {type(dst_node).__name__}.") + + # Get node descriptors + src_nodedesc = src_node.desc(sdfg) + dst_nodedesc = dst_node.desc(sdfg) + + # Resolve subsets (fallback to full range) + src_subset = memlet.get_src_subset(edge, state) + dst_subset = memlet.get_dst_subset(edge, state) + + if src_subset is None: + src_subset = subsets.Range.from_array(src_nodedesc) + + if dst_subset is None: + dst_subset = subsets.Range.from_array(dst_nodedesc) + + # Get strides + src_strides = src_subset.absolute_strides(src_nodedesc.strides) + dst_strides = dst_subset.absolute_strides(dst_nodedesc.strides) + + # Try to convert to a degenerate/strided ND copy first + result = cpp.ndcopy_to_strided_copy( + copy_shape, + src_nodedesc.shape, + src_strides, + dst_nodedesc.shape, + dst_strides, + memlet.subset, + src_subset, + dst_subset, + ) + + if result is not None: + copy_shape, src_strides, dst_strides = result + else: + src_strides = _collapse_strides(src_strides, src_subset) + dst_strides = _collapse_strides(dst_strides, dst_subset) + copy_shape = [s for s in copy_shape if s != 1] or [1] + + # Extend copy shape to the largest among the data dimensions, + # and extend other array with the appropriate strides + if len(dst_strides) != len(copy_shape) or len(src_strides) != len(copy_shape): + if memlet.data == src_node.data: + copy_shape, dst_strides = cpp.reshape_strides(src_subset, src_strides, dst_strides, copy_shape) + elif memlet.data == dst_node.data: + copy_shape, src_strides = cpp.reshape_strides(dst_subset, dst_strides, src_strides, copy_shape) + + # Build final expressions + src_expr = _expr_for(src_nodedesc, src_node.data, src_subset) + dst_expr = _expr_for(dst_nodedesc, dst_node.data, dst_subset) + + return copy_shape, src_strides, dst_strides, src_expr, dst_expr + + +class CopyStrategy(ABC): + """Abstract base class for memory copy strategies.""" + + @abstractmethod + def applicable(self, copy_context: CopyContext) -> bool: + """ + Return True if this strategy can handle the given memory copy. + """ + raise NotImplementedError('Abstract class') + + @abstractmethod + def generate_copy(self, copy_context: CopyContext) -> str: + """ + Generates and returns the copy code for the supported pattern. + """ + raise NotImplementedError('Abstract class') + + +class OutOfKernelCopyStrategy(CopyStrategy): + """ + Copy strategy for memory transfers that occur outside of kernel execution. + + This pattern often occurs when generating host-to-device copies for kernel inputs + (since kernels cannot access host memory directly), and device-to-host copies + to retrieve results for further processing. + """ + + def applicable(self, copy_context: CopyContext) -> bool: + """ + Determines whether the data movement is a host<->device memory copy. + + This function returns True if: + - We are not currently generating kernel code + - The copy occurs between two AccessNodes + - The data descriptors of source and destination are not views. + - The storage types of either src or dst is CPU_Pinned or GPU_Device + - We do not have a CPU-to-CPU copy + """ + # Retrieve needed information + state = copy_context.state + src_node, dst_node = copy_context.src_node, copy_context.dst_node + + # 1. 
Ensure copy is not occuring within a kernel + scope_dict = state.scope_dict() + deeper_node = dst_node if scope_contains_scope(scope_dict, src_node, dst_node) else src_node + + parent_map_tuple = helpers.get_parent_map(state, deeper_node) + while parent_map_tuple is not None: + parent_map, parent_state = parent_map_tuple + if parent_map.map.schedule in dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN: + return False + else: + parent_map_tuple = helpers.get_parent_map(parent_state, parent_map) + + # 2. Check whether copy is between two AccessNodes + if not (isinstance(src_node, nodes.AccessNode) and isinstance(dst_node, nodes.AccessNode)): + return False + + # 3. The data descriptors of source and destination are not views + if isinstance(src_node.desc(state), data.View) or isinstance(dst_node.desc(state), data.View): + return False + + # 4. Check that one StorageType of either src or dst is CPU_Pinned or GPU_Device + src_storage = copy_context.get_storage_type(src_node) + dst_storage = copy_context.get_storage_type(dst_node) + if not (src_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned) + or dst_storage in (StorageType.GPU_Global, StorageType.CPU_Pinned)): + return False + + # 5. Check that this is not a CPU to CPU copy + cpu_storage_types = [StorageType.CPU_Heap, StorageType.CPU_ThreadLocal, StorageType.CPU_Pinned] + if src_storage in cpu_storage_types and dst_storage in cpu_storage_types: + return False + + return True + + def generate_copy(self, copy_context: CopyContext) -> str: + """Execute host-device copy with CUDA memory operations""" + + # Guard + memlet = copy_context.edge.data + if memlet.wcr is not None: + src_location, dst_location = copy_context.get_memory_location() + raise NotImplementedError(f'Accumulate {src_location} to {dst_location} not implemented') + + # Based on the copy dimension, call appropiate helper function + num_dims = len(copy_context.copy_shape) + if num_dims == 1: + copy_call = self._generate_1d_copy(copy_context) + + elif num_dims == 2: + copy_call = self._generate_2d_copy(copy_context) + + else: + # sanity check + assert num_dims > 2, f"Expected copy shape with more than 2 dimensions, but got {num_dims}." + copy_call = self._generate_nd_copy(copy_context) + + return copy_call + + def _generate_1d_copy(self, copy_context: CopyContext) -> str: + """ + Generates a 1D memory copy between host and device using the GPU backend. + + Uses {backend}MemcpyAsync for contiguous memory. For strided memory, + {backend}Memcpy2DAsync is leveraged to efficiently handle the stride along one dimension. 
+ """ + # ----------- Retrieve relevant copy parameters -------------- + backend: str = common.get_gpu_backend() + + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + + src_location, dst_location = copy_context.get_memory_location() + is_contiguous_copy = (src_strides[-1] == 1) and (dst_strides[-1] == 1) + ctype = copy_context.get_ctype() + gpustream = copy_context.get_assigned_gpustream() + + # ----------------- Generate backend call -------------------- + + if is_contiguous_copy: + # Memory is linear: can use {backend}MemcpyAsync + copysize = ' * '.join(sym2cpp(copy_shape)) + copysize += f' * sizeof({ctype})' + kind = f'{backend}Memcpy{src_location}To{dst_location}' + call = f'DACE_GPU_CHECK({backend}MemcpyAsync({dst_expr}, {src_expr}, {copysize}, {kind}, {gpustream}));\n' + + else: + # Memory is strided: use {backend}Memcpy2DAsync with dpitch/spitch + # This allows copying a strided 1D region + dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})' + width = f'sizeof({ctype})' + height = sym2cpp(copy_shape[0]) + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + # Potentially snychronization required if syncdebug is set to true in configurations + call = call + generate_sync_debug_call() + return call + + def _generate_2d_copy(self, copy_context: CopyContext) -> None: + """ + Generates a 2D memory copy using {backend}Memcpy2DAsync. + + Three main cases are handled: + - Copy between row-major stored arrays with contiguous rows. + - Copy between column-major stored arrays with contiguous columns. + - A special case where a 2D copy can still be represented. + + Raises: + NotImplementedError: Raised if the source and destination strides do not match any of the handled patterns. + Such cases indicate an unsupported 2D copy and should be examined separately. + They can be implemented if valid, or a more descriptive error should be raised if the path should not occur. + + Note: + {backend}Memcpy2DAsync supports strided copies along only one dimension (row or column), + but not both simultaneously. + """ + + # ----------- Extract relevant copy parameters -------------- + backend: str = common.get_gpu_backend() + + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + src_location, dst_location = copy_context.get_memory_location() + ctype = copy_context.get_ctype() + gpustream = copy_context.get_assigned_gpustream() + + # ----------------- Generate backend call if supported -------------------- + + # Case: Row-major layout, rows are not strided. + if (src_strides[1] == 1) and (dst_strides[1] == 1): + dpitch = f'{sym2cpp(dst_strides[0])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[0])} * sizeof({ctype})' + width = f'{sym2cpp(copy_shape[1])} * sizeof({ctype})' + height = f'{sym2cpp(copy_shape[0])}' + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + # Case: Column-major layout, no columns are strided. 
+ elif (src_strides[0] == 1) and (dst_strides[0] == 1): + dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})' + width = f'{sym2cpp(copy_shape[0])} * sizeof({ctype})' + height = f'{sym2cpp(copy_shape[1])}' + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + # Special case + elif (src_strides[0] / src_strides[1] == copy_shape[1] and dst_strides[0] / dst_strides[1] == copy_shape[1]): + # Consider as an example this copy: A[0:I, 0:J, K] -> B[0:I, 0:J] with + # copy shape [I, J], src_strides[J*K, K], dst_strides[J, 1]. This can be represented with a + # {backend}Memcpy2DAsync call! + + dpitch = f'{sym2cpp(dst_strides[1])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[1])} * sizeof({ctype})' + width = f'sizeof({ctype})' + height = sym2cpp(copy_shape[0] * copy_shape[1]) + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + call = f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst_expr}, {dpitch}, {src_expr}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + else: + raise NotImplementedError( + f"Unsupported 2D memory copy: shape={copy_shape}, src_strides={src_strides}, dst_strides={dst_strides}." + "Please implement this case if it is valid, or raise a more descriptive error if this path should not be taken." + ) + + # Potentially snychronization required if syncdebug is set to true in configurations + call = call + generate_sync_debug_call() + return call + + def _generate_nd_copy(self, copy_context: CopyContext) -> None: + """ + Generates GPU code for copying N-dimensional arrays using 2D memory copies. + + Uses {backend}Memcpy2DAsync for the last two dimensions, with nested loops + for any outer dimensions. Expects the copy to be contiguous and between + row-major storage locations. 
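+
+        Illustrative sketch for a contiguous, row-major 3D copy of shape [I, J, K] on
+        the CUDA backend (identifiers such as ``gpu_stream0`` are assumptions):
+
+            for (int __copyidx0 = 0; __copyidx0 < I; ++__copyidx0) {
+            DACE_GPU_CHECK(cudaMemcpy2DAsync(dst + (__copyidx0 * (J*K)), K * sizeof(double),
+                                             src + (__copyidx0 * (J*K)), K * sizeof(double),
+                                             K * sizeof(double), J,
+                                             cudaMemcpyDeviceToDevice, gpu_stream0));
+            }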
+ """ + # ----------- Extract relevant copy parameters -------------- + backend: str = common.get_gpu_backend() + + # Due to applicable(), src and dst node must be AccessNodes + copy_shape, src_strides, dst_strides, src_expr, dst_expr = copy_context.get_accessnode_to_accessnode_copy_info() + + src_location, dst_location = copy_context.get_memory_location() + ctype = copy_context.get_ctype() + gpustream = copy_context.get_assigned_gpustream() + num_dims = len(copy_shape) + + # ----------- Guard for unsupported Pattern -------------- + if not (src_strides[-1] == 1) and (dst_strides[-1] == 1): + src_node, dst_node = copy_context.src_node, copy_context.dst_node + src_storage = copy_context.get_storage_type(src_node) + dst_storage = copy_context.get_storage_type(dst_node) + raise NotImplementedError( + "N-dimensional GPU memory copies, that are strided or contain column-major arrays, are currently not supported.\n" + f" Source node: {src_node} (storage: {src_storage})\n" + f" Destination node: {copy_context.dst_node} (storage: {dst_storage})\n" + f" Source strides: {src_strides}\n" + f" Destination strides: {dst_strides}\n" + f" copy shape: {copy_shape}\n") + + # ----------------- Generate and write backend call(s) -------------------- + + call = "" + # Write for-loop headers + for dim in range(num_dims - 2): + call += f"for (int __copyidx{dim} = 0; __copyidx{dim} < {copy_shape[dim]}; ++__copyidx{dim}) {{\n" + + # Write Memcopy2DAsync + offset_src = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(src_strides[:-2])) + offset_dst = ' + '.join(f'(__copyidx{d} * ({sym2cpp(s)}))' for d, s in enumerate(dst_strides[:-2])) + + src = f'{src_expr} + {offset_src}' + dst = f'{dst_expr} + {offset_dst}' + + dpitch = f'{sym2cpp(dst_strides[-2])} * sizeof({ctype})' + spitch = f'{sym2cpp(src_strides[-2])} * sizeof({ctype})' + width = f'{sym2cpp(copy_shape[-1])} * sizeof({ctype})' + height = sym2cpp(copy_shape[-2]) + kind = f'{backend}Memcpy{src_location}To{dst_location}' + + # Generate call and write it + call += f'DACE_GPU_CHECK({backend}Memcpy2DAsync({dst}, {dpitch}, {src}, {spitch}, {width}, {height}, {kind}, {gpustream}));\n' + + # Potentially snychronization required if syncdebug is set to true in configurations + call += generate_sync_debug_call() + + # Write for-loop footers + for dim in range(num_dims - 2): + call += "\n}" + + # Return the code + return call + diff --git a/dace/codegen/targets/gpu_helpers/gpu_utils.py b/dace/codegen/targets/gpu_helpers/gpu_utils.py new file mode 100644 index 0000000000..e4c4c1fc38 --- /dev/null +++ b/dace/codegen/targets/gpu_helpers/gpu_utils.py @@ -0,0 +1,27 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from dace import Config +from dace.codegen import common + + +def generate_sync_debug_call() -> str: + """ + Generate backend sync and error-check calls as a string if + synchronous debugging is enabled. + + Parameters + ---------- + backend : str + Backend API prefix (e.g., 'cuda'). + + Returns + ------- + str + The generated debug call code, or an empty string if debugging is disabled. 
+ """ + backend: str = common.get_gpu_backend() + sync_call: str = "" + if Config.get_bool('compiler', 'cuda', 'syncdebug'): + sync_call = (f"DACE_GPU_CHECK({backend}GetLastError());\n" + f"DACE_GPU_CHECK({backend}DeviceSynchronize());\n") + + return sync_call diff --git a/dace/transformation/helpers.py b/dace/transformation/helpers.py index 13842fd162..f5607f952b 100644 --- a/dace/transformation/helpers.py +++ b/dace/transformation/helpers.py @@ -1550,6 +1550,37 @@ def get_parent_map(state: SDFGState, node: Optional[nodes.Node] = None) -> Optio cursdfg = cursdfg.parent_sdfg return None +def is_within_schedule_types(state: SDFGState, node: nodes.Node, schedules: Set[dtypes.ScheduleType]) -> bool: + """ + Checks if the given node is enclosed within a Map whose schedule type + matches any in the `schedules` set. + + Parameters + ---------- + state : SDFGState + The State where the node resides + node : nodes.Node + The node to check. + schedules : set[dtypes.ScheduleType] + A set of schedule types to match (e.g., {dtypes.ScheduleType.GPU_Device}). + + Returns + ---------- + bool + True if the node is enclosed by a Map with a schedule type in `schedules`, False otherwise. + """ + current = node + + while current is not None: + if isinstance(current, nodes.MapEntry): + if current.map.schedule in schedules: + return True + + parent = get_parent_map(state, current) + if parent is None: + return False + current, state = parent + def redirect_edge(state: SDFGState, edge: graph.MultiConnectorEdge[Memlet], diff --git a/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_kernels.py b/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_kernels.py new file mode 100644 index 0000000000..851f18e108 --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_kernels.py @@ -0,0 +1,70 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, Set, Type, Union + +import dace +from dace import dtypes, properties, SDFG +from dace.codegen import common +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs + + +@properties.make_properties +@transformation.explicit_cf_compatible +class ConnectGPUStreamsToKernels(ppl.Pass): + """ + This Pass attaches GPU streams to kernels (i.e., dtypes.ScheduleType.GPU_Device scheduled maps). + + Adds GPU stream AccessNodes and connects them to kernel entry and exit nodes, + indicating which GPU stream each kernel is assigned to. These assignments are e.g. + used when launching the kernels. 
+ """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + # Retrieve the GPU stream array name and the prefix for individual stream variables + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + + # Retrieve GPU stream assignments for nodes + stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + + # Link kernels to their assigned GPU streams + for sub_sdfg in sdfg.all_sdfgs_recursive(): + + for state in sub_sdfg.states(): + for node in state.nodes(): + + # Not a kernel entry - continue + if not (isinstance(node, nodes.MapEntry) and node.map.schedule == dtypes.ScheduleType.GPU_Device): + continue + + # Stream connector name and the used GPU Stream for the kernel + assigned_gpustream = stream_assignments[node] + gpu_stream_var_name = f"{stream_var_name_prefix}{assigned_gpustream}" + accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]" + + # Assign the GPU stream to the kernel entry + kernel_entry = node + kernel_entry.add_in_connector(gpu_stream_var_name, dtypes.gpuStream_t) + stream_array_in = state.add_access(stream_array_name) + state.add_edge(stream_array_in, None, kernel_entry, gpu_stream_var_name, + dace.Memlet(accessed_gpu_stream)) + + # Assign the GPU stream to the kernel exit + kernel_exit = state.exit_node(kernel_entry) + kernel_exit.add_out_connector(gpu_stream_var_name, dtypes.gpuStream_t) + stream_array_out = state.add_access(stream_array_name) + state.add_edge(kernel_exit, gpu_stream_var_name, stream_array_out, None, + dace.Memlet(accessed_gpu_stream)) + + return {} \ No newline at end of file diff --git a/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_tasklets.py b/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_tasklets.py new file mode 100644 index 0000000000..9877f2d563 --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_tasklets.py @@ -0,0 +1,80 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, Set, Type, Union + +import dace +from dace import dtypes, properties, SDFG +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels + +# Placeholder for the GPU stream variable used in tasklet code +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class ConnectGPUStreamsToTasklets(ppl.Pass): + """ + This pass ensures that tasklets which require access to their assigned GPU stream + are provided with it explicitly. + + Such tasklets typically originate from expanded LibraryNodes targeting GPUs. + These nodes may reference the special placeholder variable `__dace_current_stream`, + which is expected to be defined during unparsing in `cpp.py`. 
+ + To avoid relying on this "hidden" mechanism, the pass rewrites tasklets to use + the GPU stream AccessNode directly. + + Note that this pass is similar to `ConnectGPUStreamsToKernels`. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + # Retrieve the GPU stream's array name + stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] + + # Retrieve GPU stream assignments for nodes + stream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + + # Find all tasklets which use the GPU stream variable (STREAM_PLACEHOLDER) in the code + # and provide them the needed GPU stream explicitly + for sub_sdfg in sdfg.all_sdfgs_recursive(): + + for state in sub_sdfg.states(): + for node in state.nodes(): + + # Not a tasklet - continue + if not isinstance(node, nodes.Tasklet): + continue + + # Tasklet does not need use its assigned GPU stream - continue + if not STREAM_PLACEHOLDER in node.code.as_string: + continue + + # Stream connector name and the used GPU Stream for the kernel + assigned_gpustream = stream_assignments[node] + gpu_stream_conn = STREAM_PLACEHOLDER + accessed_gpu_stream = f"{stream_array_name}[{assigned_gpustream}]" + + # Provide the GPU stream explicitly to the tasklet + stream_array_in = state.add_access(stream_array_name) + stream_array_out = state.add_access(stream_array_name) + + node.add_in_connector(gpu_stream_conn, dtypes.gpuStream_t) + node.add_out_connector(gpu_stream_conn, dtypes.gpuStream_t, force=True) + + state.add_edge(stream_array_in, None, node, gpu_stream_conn, dace.Memlet(accessed_gpu_stream)) + state.add_edge(node, gpu_stream_conn, stream_array_out, None, dace.Memlet(accessed_gpu_stream)) + + return {} \ No newline at end of file diff --git a/dace/transformation/passes/gpu_specialization/gpu_stream_scheduling.py b/dace/transformation/passes/gpu_specialization/gpu_stream_scheduling.py new file mode 100644 index 0000000000..0151d790b8 --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/gpu_stream_scheduling.py @@ -0,0 +1,249 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Dict, List, Set, Type, Union + +import dace +from dace import SDFG, SDFGState, properties +from dace.config import Config +from dace.sdfg import nodes +from dace.sdfg.graph import Graph, NodeT +from dace.transformation import pass_pipeline as ppl, transformation + +# Placeholder for the GPU stream variable used in tasklet code +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class NaiveGPUStreamScheduler(ppl.Pass): + """ + Assigns GPU streams to nodes and stores the assignments in a dictionary. + This can be useful for enabling asynchronous and parallel GPU computation using GPU streams. + + Strategy Overview: + ------------------ + - GPU stream assignment is based on weakly connected components (WCCs) within each state. + - Nodes in the same WCC are assigned to the same stream. + - For top-level states (not within nested SDFGs), each new WCC starts on a new stream (starting from 0). 
+ - In nested SDFGs: + * Stream assignment is inherited from the parent component, + * All internal components share the parent's stream. + - GPU stream IDs wrap around according to the `max_concurrent_streams` configuration. + + Example: + -------- + A state with the following independent chains: + K1 → K2 + K3 → K4 → K5 + K6 + + would be scheduled as: + K1, K2 → stream 0 + K3, K4, K5 → stream 1 + K6 → stream 2 + + (assuming no limit on the number of concurrent streams) + + Note: + ----- + These refer to **backend GPU streams** (e.g., CUDA or HIP), not DaCe symbolic streams. + """ + + def __init__(self): + # Maximum number of concurrent streams allowed (from config). + # Cached locally for frequent reuse. + self._max_concurrent_streams = int(Config.get('compiler', 'cuda', 'max_concurrent_streams')) + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Nothing + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, _) -> Dict[nodes.Node, int]: + """ + Assigns GPU streams to nodes within the given SDFG. + + Parameters + ---------- + sdfg : SDFG + The top-level SDFG to process. + pipeline_results : Dict + Unused. + + Returns + ------- + Dict[nodes.Node, int] + A dictionary mapping each node to its assigned GPU stream. + """ + stream_assignments: Dict[nodes.Node, int] = dict() + for state in sdfg.states(): + self._assign_gpu_streams_in_state(sdfg, False, state, stream_assignments, 0) + + return stream_assignments + + def _assign_gpu_streams_in_state(self, sdfg: SDFG, in_nested_sdfg: bool, state: SDFGState, + stream_assignments: Dict[nodes.Node, int], gpu_stream: int) -> None: + """ + Assigns GPU streams to nodes in a single state. + + If inside a nested SDFG, components inherit the parent's stream. + Otherwise, each connected component gets a different stream. + Nested SDFGs are processed recursively. + + Parameters + ---------- + sdfg : SDFG + The SDFG containing the state. + in_nested_sdfg : bool + True if the state is in a nested SDFG. + state : SDFGState + The state to process. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to assigned GPU streams (updated in-place). + gpu_stream : int + The current GPU stream ID. + + Returns + ------- + None + """ + components = self._get_weakly_connected_nodes(state) + + for component in components: + + if not self._requires_gpu_stream(state, component): + continue + + nodes_assigned_before = len(stream_assignments) + + for node in component: + stream_assignments[node] = gpu_stream + if isinstance(node, nodes.NestedSDFG): + for nested_state in node.sdfg.states(): + self._assign_gpu_streams_in_state(node.sdfg, True, nested_state, stream_assignments, gpu_stream) + + # Move to the next stream if we have assigned streams to any node in this component + # (careful: if nested, states are in same component) + if not in_nested_sdfg and len(stream_assignments) > nodes_assigned_before: + gpu_stream = self._next_stream(gpu_stream) + + def _get_weakly_connected_nodes(self, graph: Graph) -> List[Set[NodeT]]: + """ + Returns all weakly connected components in the given directed graph. + + A weakly connected component is a maximal group of nodes such that each pair + of nodes is connected by a path when ignoring edge directions. + + Parameters + ---------- + graph: Graph + A directed graph instance. 
+ + Returns + ------- + List[Set[Node_T]] + + A list containing sets of nodes, with each set corresponding to a weakly + connected component. + """ + visited: Set[NodeT] = set() + components: List[Set[NodeT]] = [] + + for node in graph.nodes(): + if node in visited: + continue + + # Start a new weakly connected component + component: Set[NodeT] = set() + stack = [node] + + while stack: + current = stack.pop() + if current in visited: + continue + + visited.add(current) + component.add(current) + + for neighbor in graph.neighbors(current): + if neighbor not in visited: + stack.append(neighbor) + + components.append(component) + + return components + + def _next_stream(self, gpu_stream: int) -> int: + """ + Compute the next CUDA stream index according to the concurrency configuration. + + Behavior depends on the configured max_concurrent_streams value: + - If 0: unlimited streams allowed, so increment the stream index by one. + - If -1: default setting, always return stream 0 (no concurrency). + - Otherwise: cycle through stream indices from 0 up to max_concurrent_streams - 1. + + Parameters + ---------- + gpu_stream : int + The current CUDA stream index. + + Returns + ------- + int + The next CUDA stream index based on the concurrency policy. + """ + if self._max_concurrent_streams == 0: + return gpu_stream + 1 + elif self._max_concurrent_streams == -1: + return 0 + else: + return (gpu_stream + 1) % self._max_concurrent_streams + + def _requires_gpu_stream(self, state: SDFGState, component: Set[NodeT]) -> bool: + """ + Check whether a connected component in an SDFG state should be assigned + a GPU stream. + + A component requires a GPU stream if it contains at least one of: + - An AccessNode with GPU global memory storage, + - A MapEntry scheduled on a GPU device, + - A Tasklet whose code includes the stream placeholder. + + Parameters + ---------- + state : SDFGState + The state containing the component. + component : Set[NodeT] + The set of nodes that form the connected component. + + Returns + ------- + bool + True if the component requires a GPU stream, False otherwise. + """ + + def gpu_relevant(node, parent) -> bool: + if (isinstance(node, nodes.AccessNode) and node.desc(parent).storage == dace.dtypes.StorageType.GPU_Global): + return True + + elif (isinstance(node, nodes.MapEntry) and node.map.schedule == dace.dtypes.ScheduleType.GPU_Device): + return True + + elif (isinstance(node, nodes.Tasklet) and STREAM_PLACEHOLDER in node.code.as_string): + return True + + return False + + for node in component: + if isinstance(node, nodes.NestedSDFG): + if any(gpu_relevant(node, parent) for node, parent in node.sdfg.all_nodes_recursive()): + return True + + else: + if gpu_relevant(node, state): + return True + + return False \ No newline at end of file diff --git a/dace/transformation/passes/gpu_specialization/gpu_stream_topology_simplification.py b/dace/transformation/passes/gpu_specialization/gpu_stream_topology_simplification.py new file mode 100644 index 0000000000..7af22aa6c6 --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/gpu_stream_topology_simplification.py @@ -0,0 +1,273 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
+from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_tasklets import ConnectGPUStreamsToTasklets +from dace.transformation.passes.gpu_specialization.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets +from dace.transformation.passes.gpu_specialization.insert_gpu_copy_tasklet import InsertGPUCopyTasklets + +@properties.make_properties +@transformation.explicit_cf_compatible +class GPUStreamTopologySimplification(ppl.Pass): + """ + Simplifies an SDFG after GPU stream nodes have been added. + + This pass is optional; the SDFG works without it, but it cleans up + the topology by merging adjacent or redundant GPU stream AccessNodes. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + depending_passes = { + NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels, ConnectGPUStreamsToTasklets, + InsertGPUStreamSyncTasklets, InsertGPUCopyTasklets + } + + return depending_passes + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Simplify the SDFG topology by merging adjacent GPU stream nodes. + """ + self._merge_close_gpustream_nodes(sdfg) + + self._merge_gpustreams_special_case(sdfg) + return {} + + def _merge_close_gpustream_nodes(self, sdfg: SDFG) -> None: + """ + Merge "close" GPU stream AccessNodes in the SDFG. + + This function looks for a predecessor GPU stream AccessNode that can be merged + with any successor GPU stream AccessNodes of its grand-predecessors. + + Example: + + Consider two GPU copy tasklets connected via distinct GPU stream AccessNodes: + the corresponding subgraph looks like this: + + -> Sink GPU Source GPU -> + ¦ ¦ + Tasklet ------> Data AccessNode -----> Tasklet + + This function would merge the sink and source node to simplify the SDFG. + """ + for sub_sdfg in sdfg.all_sdfgs_recursive(): + for state in sub_sdfg.states(): + for node in state.nodes(): + + # Skip AccessNodes + if isinstance(node, nodes.AccessNode): + continue + + # Find GPU stream AccessNode predecessors with no incoming edges + # (i.e. 
source GPU stream AccessNodes) + node_predecessors = state.predecessors(node) + preceeding_gpustream_sources = [ + pre for pre in node_predecessors if isinstance(pre, nodes.AccessNode) + and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0 + ] + + # Skip if there are no preceding GPU stream sources + if len(preceeding_gpustream_sources) == 0: + continue + + # If multiple GPU stream sources exist, merge them; otherwise, use the single source + if len(preceeding_gpustream_sources) > 1: + combined_stream_node = preceeding_gpustream_sources.pop() + for preceeding_gpu_stream in preceeding_gpustream_sources: + # Note: there are no ingoing edges + for out_edge in state.out_edges(preceeding_gpu_stream): + _, src_conn, dst, dst_conn, data = out_edge + state.add_edge(combined_stream_node, src_conn, dst, dst_conn, data) + state.remove_edge(out_edge) + state.remove_node(preceeding_gpu_stream) + + else: + combined_stream_node = preceeding_gpustream_sources.pop() + + # Merge grand-predecessors' successors sink GPU streams with predecessor source GPU stream + node_grand_predecessors = [ + grand_pred for pred in node_predecessors for grand_pred in state.predecessors(pred) + ] + node_gp_successors_streams = [ + succ_of_gp for gp in node_grand_predecessors for succ_of_gp in state.successors(gp) + if isinstance(succ_of_gp, nodes.AccessNode) + and succ_of_gp.desc(state).dtype == dtypes.gpuStream_t and state.out_degree(succ_of_gp) == 0 + ] + + # remove duplicates + node_gp_successors_streams = list(set(node_gp_successors_streams)) + + for gp_succ_stream in node_gp_successors_streams: + for edge in state.in_edges(gp_succ_stream): + src, src_conn, _, dst_conn, data = edge + state.add_edge(src, src_conn, combined_stream_node, dst_conn, data) + state.remove_edge(edge) + # Note: the grand-predecessor's successor GPU stream is a sink node and has no + # outgoing edges + state.remove_node(gp_succ_stream) + + def _merge_gpustreams_special_case(self, sdfg: SDFG) -> None: + """ + Special-case simplification of GPU stream AccessNodes. + + This pass detects the following pattern: + - A GPU stream AccessNode `X` has a predecessor and a successor (i.e. at least one of both). + - Between the predecessor and successor lie one or more tasklets. + - These tasklets use their own distinct GPU stream AccessNodes (not `X`), + which are connected only to the tasklet itself. + + To simplify the topology, redundant streams are merged: + - A single unified input GPU stream connects to the predecessor and replaces (merges) + the per-tasklet input streams. + - A single unified output GPU stream connects to the successor and replaces (merges) + the per-tasklet output streams. + + + The simplification is easier to understand visually than in words. + Inspect the intermediate SDFGs produced by the minimal example below + to see the effect of the stream merging. 
+ + Example + ------- + @dace.program + def example(A: dace.uint32[128], B: dace.uint32[128], + C: dace.uint32[128], D: dace.uint32[128]): + for i in dace.map[0:128:1]: + B[i] = A[i] + for i in dace.map[0:128:1]: + D[i] = C[i] + + sdfg = example.to_sdfg() + sdfg.apply_gpu_transformations() + """ + # Get the name of the GPU stream arry + gpustream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] + + #------------------------- Preprocess: Gather Information ---------------------------- + + # For each GPU Stream AccessNode having a predecessor and a successor: + # Determine with which Tasklet Source and which Tasklet sink nodes lie between its predecessor + # and its successor + merge_source_gpustream: Dict[Tuple[nodes.AccessNode, SDFGState], List[nodes.AccessNode]] = dict() + merge_sink_gpustream: Dict[Tuple[nodes.AccessNode, SDFGState], List[nodes.AccessNode]] = dict() + + for node, state in sdfg.all_nodes_recursive(): + + # Skip non-tasklets + if not isinstance(node, nodes.Tasklet): + continue + + # The tasklets of interest should have exactly one preceeding source GPU node and one following sink GPU node + # If not, we skip + node_predecessors = state.predecessors(node) + node_successors = state.successors(node) + downstream_gpustream_sinks = [ + succ for succ in node_successors if isinstance(succ, nodes.AccessNode) + and succ.desc(state).dtype == dtypes.gpuStream_t and state.out_degree(succ) == 0 + ] + upstream_gpustream_sources = [ + pre for pre in node_predecessors if isinstance(pre, nodes.AccessNode) + and pre.desc(state).dtype == dtypes.gpuStream_t and state.in_degree(pre) == 0 + ] + + # Skip not considered case + if not (len(upstream_gpustream_sources) == len(downstream_gpustream_sinks) + and len(upstream_gpustream_sources) == 1): + continue + + # Look for potential predecessor of a "passthrough" GPU Stream AccessNode + # which would also be the grand-predeccessor of the current node (=tasklet) + candidate_predecessor = [] + for pred in node_predecessors: + for grand_pred in state.predecessors(pred): + + # Current nodes grand pred is a candidate of a predecessor of a "passthrough" GPU Stream AccessNode + candidate = grand_pred + + # A PassThrough GPU stream node can only have MapExits and Tasklets as candidate predecessors + if not (isinstance(candidate, nodes.MapExit) and candidate.map.schedule + == dtypes.ScheduleType.GPU_Device or isinstance(candidate, nodes.Tasklet)): + continue + + has_passthrough_gpustream = any( + (isinstance(succ, nodes.AccessNode) and succ.desc(state).dtype == dtypes.gpuStream_t) and ( + state.in_degree(succ) > 0 and state.out_degree(succ) > 0) + for succ in state.successors(candidate)) + + if has_passthrough_gpustream: + candidate_predecessor.append(candidate) + + # Not "close" passthrough GPU node exists if no candidate predecessor exists + if len(candidate_predecessor) == 0: + continue + + # Niche case, more than one "close" passthrough GPU node exists: Out of scope + # Ignore this case (note: This Pass only makes the Graph visually nicer, so skipping has + # no effect on correctness) + if len(candidate_predecessor) > 1: + continue + + # Get the Kernel Exits GPU stream + candidate_predecessor = candidate_predecessor[0] + passthrough_gpu_node = [ + succ for succ in state.successors(candidate_predecessor) + if isinstance(succ, nodes.AccessNode) and succ.desc(state).dtype == dtypes.gpuStream_t + ][0] + + # Collect and store the GPU stream merging information + pre_gpustream: nodes.AccessNode = upstream_gpustream_sources[0] # 
Note: Len is 1 + succ_gpustream: nodes.AccessNode = downstream_gpustream_sinks[0] # Note: Len is 1 + if (passthrough_gpu_node, state) in merge_source_gpustream: + merge_source_gpustream[(passthrough_gpu_node, state)].append(pre_gpustream) + merge_sink_gpustream[(passthrough_gpu_node, state)].append(succ_gpustream) + else: + merge_source_gpustream[(passthrough_gpu_node, state)] = [pre_gpustream] + merge_sink_gpustream[(passthrough_gpu_node, state)] = [succ_gpustream] + + #------------------------- Merge the GPU Stream AccessNodes ---------------------------- + for passthrough_gpu_node, state in merge_sink_gpustream.keys(): + + # Add new AccessNodes which merge the other loose streams + unified_in_stream = state.add_access(gpustream_array_name) + unified_out_stream = state.add_access(gpustream_array_name) + + for in_edge in state.in_edges(passthrough_gpu_node): + src, src_conn, _, dst_conn, memlet = in_edge + state.add_edge(src, src_conn, unified_in_stream, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(in_edge) + + for out_edge in state.out_edges(passthrough_gpu_node): + _, src_conn, dst, dst_conn, memlet = out_edge + state.add_edge(unified_out_stream, src_conn, dst, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(out_edge) + + for source_stream in merge_source_gpustream[passthrough_gpu_node, state]: + for out_edge in state.out_edges(source_stream): + _, src_conn, dst, dst_conn, memlet = out_edge + state.add_edge(unified_in_stream, src_conn, dst, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(out_edge) + state.remove_node(source_stream) + + for sink_stream in merge_sink_gpustream[passthrough_gpu_node, state]: + for in_edge in state.in_edges(sink_stream): + src, src_conn, _, dst_conn, memlet = in_edge + state.add_edge(src, src_conn, unified_out_stream, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(in_edge) + state.remove_node(sink_stream) + + state.remove_node(passthrough_gpu_node) \ No newline at end of file diff --git a/dace/transformation/passes/gpu_specialization/insert_gpu_copy_tasklet.py b/dace/transformation/passes/gpu_specialization/insert_gpu_copy_tasklet.py new file mode 100644 index 0000000000..cea8fc1f43 --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/insert_gpu_copy_tasklet.py @@ -0,0 +1,166 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. 
+from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import SDFG, SDFGState, dtypes, properties +from dace import memlet as mm +from dace.codegen.targets.gpu_helpers.copy_strategies import CopyContext, OutOfKernelCopyStrategy +from dace.config import Config +from dace.sdfg import nodes, scope_contains_scope +from dace.sdfg.graph import MultiConnectorEdge +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_tasklets import ConnectGPUStreamsToTasklets +from dace.transformation.passes.gpu_specialization.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUCopyTasklets(ppl.Pass): + """ + This pass inserts explicit copy tasklets for data transfers that need to be handled + by the GPU and occur outside a kernel (for example, copying data from host memory + to the GPU before executing a kernel). + + It identifies such copy locations and inserts the corresponding tasklets. For each + memlet path describing a copy, the first edge is duplicated: one edge goes from the original + source to the tasklet, and the other from the tasklet to the original destination, while + the original edge is removed. + + This is experimental and could later serve as inspiration for making all copies explicit. + Considerations for future work include allowing tasklets to access array addresses + from connectors and describing in memlets how data will be moved, since currently + tasklets only support value inputs. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + depending_passes = { + NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels, ConnectGPUStreamsToTasklets, + InsertGPUStreamSyncTasklets + } + return depending_passes + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Tasklets | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict: + """ + Inserts out-of-kernel GPU copy tasklets into the SDFG based on GPU stream scheduling. + Out-of-kernel copies are copies which are handled by the GPU and occur out of a kernel + function. + + Parameters + ---------- + sdfg : SDFG + The SDFG to transform by adding out-of-kernel GPU copy tasklets. + pipeline_results : Dict[str, Any] + Results from previous transformation passes, including GPU stream assignments. + + Returns + ------- + dict + Currently returns an empty dictionary. 
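+
+        Sketch of the rewiring for a single host-to-device copy assigned to stream 0
+        (node and connector names follow the default configuration and are illustrative):
+
+            before:  A (CPU_Heap) --memlet--> gpu_A (GPU_Global)
+            after:   A --memlet--> gpu_copy tasklet (cudaMemcpyAsync(...)) --memlet--> gpu_A
+                     gpu_streams --gpu_stream0--> gpu_copy tasklet --gpu_stream0--> gpu_streams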
+ """ + # Prepare GPU stream + gpustream_assignments: Dict[nodes.Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + gpustream_array_name, gpustream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + + # Initialize the strategy for copies that occur outside of kernel execution + out_of_kernel_copy = OutOfKernelCopyStrategy() + + # Get all data copies to process the out of kernel copies + copy_worklist = self.find_all_data_copies(sdfg) + + for copy_sdfg, state, src_node, dst_node, edge in copy_worklist: + + copy_context = CopyContext(copy_sdfg, state, src_node, dst_node, edge, gpustream_assignments) + + # Only insert copy tasklets for GPU related copies occuring out of the + # kernel (i.e. a GPU_device scheduled map) + if not out_of_kernel_copy.applicable(copy_context): + continue + + # Generatae the copy call + code = out_of_kernel_copy.generate_copy(copy_context) + + # Prepare GPU ustream connectors and the stream to be accessed from the + # GPU stream array + gpustream_id = gpustream_assignments[dst_node] + gpustream_var_name = f"{gpustream_var_name_prefix}{gpustream_id}" + accessed_gpustream = f"{gpustream_array_name}[{gpustream_id}]" + + # Create the tasklet and add GPU stream related connectors + tasklet = state.add_tasklet("gpu_copy", {}, {}, code, language=dtypes.Language.CPP) + tasklet.add_in_connector(gpustream_var_name, dtypes.gpuStream_t, True) + tasklet.add_out_connector(gpustream_var_name, dtypes.gpuStream_t, True) + + # Add incoming and outgoing GPU stream accessNodes to the tasklet + in_gpustream = state.add_access(gpustream_array_name) + out_gpustream = state.add_access(gpustream_array_name) + state.add_edge(in_gpustream, None, tasklet, gpustream_var_name, dace.Memlet(accessed_gpustream)) + state.add_edge(tasklet, gpustream_var_name, out_gpustream, None, dace.Memlet(accessed_gpustream)) + + # Put the tasklet in between the edge + dst_node_pred, dst_node_conn, _, dst_conn, memlet = edge + state.add_edge(dst_node_pred, dst_node_conn, tasklet, None, copy.deepcopy(memlet)) + state.add_edge(tasklet, None, dst_node, dst_conn, copy.deepcopy(memlet)) + state.remove_edge(edge) + + return {} + + def find_all_data_copies( + self, sdfg: SDFG) -> List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]]: + """ + Finds and returns all data copies in the SDFG as tuples containing the SDFG, state, source node, + destination node, and the first memlet edge of in the memlet path between source and destination node. + + Parameters + ---------- + sdfg : SDFG + The SDFG to analyze for potential data copies. 
+ + Returns + ------- + List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]] + A list of tuples representing the data copy, each containing: + - The SDFG containing the copy + - The state in which the copy occurs + - The source node of the copy + - The destination node of the copy + - The first memlet edge representing the data movement + """ + copy_worklist: List[Tuple[SDFG, SDFGState, nodes.Node, nodes.Node, MultiConnectorEdge[mm.Memlet]]] = [] + visited_edges: Set[MultiConnectorEdge[mm.Memlet]] = set() + + for sub_sdfg in sdfg.all_sdfgs_recursive(): + for state in sub_sdfg.states(): + for edge in state.edges(): + + # Skip edges that were already processed + if edge in visited_edges: + continue + + # Get the memlet path and mark all edges in the path as visited + memlet_path = state.memlet_path(edge) + visited_edges.update(set(memlet_path)) + + # Get source and destination noces + first_edge = memlet_path[0] + last_edge = memlet_path[-1] + src_node = first_edge.src + dst_node = last_edge.dst + + # Skip empty memlets + if first_edge.data.subset is None: + continue + + # Add copy to the worklist + copy_worklist.append((sub_sdfg, state, src_node, dst_node, first_edge)) + + return copy_worklist \ No newline at end of file diff --git a/dace/transformation/passes/gpu_specialization/insert_gpu_stream_sync_tasklets.py b/dace/transformation/passes/gpu_specialization/insert_gpu_stream_sync_tasklets.py new file mode 100644 index 0000000000..62f3484a08 --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/insert_gpu_stream_sync_tasklets.py @@ -0,0 +1,288 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, List, Set, Tuple, Type, Union +import copy + +import dace +from dace import dtypes, properties, SDFG, SDFGState +from dace.codegen import common +from dace.config import Config +from dace.sdfg import nodes +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.helpers import is_within_schedule_types +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_tasklets import ConnectGPUStreamsToTasklets + + +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamSyncTasklets(ppl.Pass): + """ + Inserts GPU stream synchronization tasklets in an SDFG where needed. + + This pass uses a heuristic approach to find locations matching specific patterns + that require synchronization. Additional locations can be added easily if new + cases are discovered. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels, ConnectGPUStreamsToTasklets} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.Tasklets | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Inserts GPU stream synchronization tasklets at required locations + after certain nodes and at the end of a state, for GPU streams used in the state. 
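+
+        The inserted tasklets contain plain backend calls, e.g. for streams 0 and 1 on
+        the CUDA backend (stream variable names follow the default configuration):
+
+            DACE_GPU_CHECK(cudaStreamSynchronize(gpu_stream0));
+            DACE_GPU_CHECK(cudaStreamSynchronize(gpu_stream1));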
+ """ + stream_assignments: Dict[nodes.Node, int] = pipeline_results['NaiveGPUStreamScheduler'] + + # Get sync locations + sync_state, sync_node = self._identify_sync_locations(sdfg, stream_assignments) + + # Synchronize after a node when required + self._insert_gpu_stream_sync_after_node(sdfg, sync_node, stream_assignments) + + # Synchronize all used streams at the end of a state + self._insert_gpu_stream_sync_at_state_end(sdfg, sync_state, stream_assignments) + return {} + + def _identify_sync_locations( + self, sdfg: SDFG, + stream_assignments: Dict[nodes.Node, int]) -> Tuple[Dict[SDFGState, Set[int]], Dict[nodes.Node, SDFGState]]: + """ + Heuristically identifies GPU stream synchronization points in an SDFG. + + Parameters + ---------- + sdfg : SDFG + The SDFG to analyze. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream ids. + + Returns + ------- + Tuple[Dict[SDFGState, Set[int]], Dict[nodes.Node, SDFGState]] + - **sync_state**: Maps each state to the set of stream IDs that should be + synchronized at the end of the state. + - **sync_node**: The keys of this dictionary are nodes after which synchronization + is needed, and their corresponding value is the state they belong to. + """ + + # ------------------ Helper predicates ----------------------------- + + def is_gpu_global_accessnode(node, state): + return isinstance(node, nodes.AccessNode) and node.desc( + state.parent).storage == dtypes.StorageType.GPU_Global + + def is_nongpu_accessnode(node, state): + return isinstance(node, nodes.AccessNode) and node.desc( + state.parent).storage not in dtypes.GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN + + def is_kernel_exit(node): + return isinstance(node, nodes.ExitNode) and node.schedule == dtypes.ScheduleType.GPU_Device + + def is_sink_node(node, state): + return state.out_degree(node) == 0 + + def edge_within_kernel(state, src, dst): + gpu_schedules = dtypes.GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN + src_in_kernel = is_within_schedule_types(state, src, gpu_schedules) + dst_in_kernel = is_within_schedule_types(state, dst, gpu_schedules) + return src_in_kernel and dst_in_kernel + + def is_tasklet_with_stream_use(src): + return isinstance(src, nodes.Tasklet) and STREAM_PLACEHOLDER in src.code.as_string + + # ------------------ Sync detection logic ----------------------------- + + sync_state: Dict[SDFGState, Set[int]] = {} + sync_node: Dict[nodes.Node, SDFGState] = {} + + for edge, state in sdfg.all_edges_recursive(): + src, dst = edge.src, edge.dst + + # Ensure state is initialized in sync_state + if state not in sync_state: + sync_state[state] = set() + + # --- Heuristics for when to sync --- + if (is_gpu_global_accessnode(src, state) and is_nongpu_accessnode(dst, state) and is_sink_node(dst, state) + and not edge_within_kernel(state, src, dst)): + sync_state[state].add(stream_assignments[dst]) + + elif (is_gpu_global_accessnode(src, state) and is_nongpu_accessnode(dst, state) and not is_sink_node(dst, state) + and not edge_within_kernel(state, src, dst)): + sync_node[dst] = state + sync_state[state].add(stream_assignments[dst]) + + elif (is_nongpu_accessnode(src, state) and is_gpu_global_accessnode(dst, state) + and not edge_within_kernel(state, src, dst)): + sync_state[state].add(stream_assignments[dst]) + + elif (is_kernel_exit(src) and is_gpu_global_accessnode(dst, state) and not is_sink_node(dst, state)): + sync_state[state].add(stream_assignments[src]) + sync_state[state].add(stream_assignments[src]) + + elif (is_kernel_exit(src) and 
is_gpu_global_accessnode(dst, state) and is_sink_node(dst, state)): + sync_state[state].add(stream_assignments[dst]) + + elif is_tasklet_with_stream_use(src): + sync_state[state].add(stream_assignments[src]) + + else: + continue + + # Check that state is indeed a SDFGState when added to the dictionary, to be on the safe side + if not isinstance(state, SDFGState): + raise NotImplementedError(f"Unexpected parent type '{type(state).__name__}' for edge '{edge}'. " + "Expected 'SDFGState'. Please handle this case explicitly.") + + # Remove states with no syncs + sync_state = {state: streams for state, streams in sync_state.items() if len(streams) > 0} + + return sync_state, sync_node + + def _insert_gpu_stream_sync_at_state_end(self, sdfg: SDFG, sync_state: Dict[SDFGState, Set[int]], + stream_assignments: Dict[nodes.Node, int]) -> None: + """ + Inserts GPU stream synchronization tasklets at the end of SDFG states. + + For each state that requires synchronization, this method: + + 1. Generates a tasklet that synchronizes all assigned GPU streams using + the appropriate backend (e.g., CUDA). + 2. Ensures all other operations in the state complete before synchronization + by connecting all sink nodes to the tasklet. + 3. Guarantees that only a single GPU stream AccessNode connects to the sync + tasklet, creating one if needed. + + Parameters + ---------- + sdfg : SDFG + The top level SDFG. + sync_state : Dict[SDFGState, Set[int] + Mapping of states to sets of stream IDs that require synchronization at the end of the state. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream IDs. + """ + # Prepare GPU stream info and backend + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + backend: str = common.get_gpu_backend() + + for state, streams in sync_state.items(): + + #----------------- Generate GPU stream synchronization Tasklet ----------------- + + # Build synchronization calls for all streams used in this state + sync_code_lines = [] + for stream in streams: + gpu_stream_var_name = f"{stream_var_name_prefix}{stream}" + sync_call = f"DACE_GPU_CHECK({backend}StreamSynchronize({gpu_stream_var_name}));" + sync_code_lines.append(sync_call) + sync_code = "\n".join(sync_code_lines) + + # Create the tasklet + tasklet = state.add_tasklet(name=f"gpu_stream_{stream}_synchronization", + inputs=set(), + outputs=set(), + code=sync_code, + language=dtypes.Language.CPP) + + # ----------------- Connect sink nodes to the synchronization tasklet ----------------- + + # 1. Seperate GPU stream sink nodes and other sink nodes + stream_sink_nodes: List[nodes.AccessNode] = [] + non_stream_sink_nodes: List[nodes.Node] = [] + for sink_node in state.sink_nodes(): + if isinstance(sink_node, nodes.AccessNode) and sink_node.desc(state).dtype == dtypes.gpuStream_t: + stream_sink_nodes.append(sink_node) + + elif sink_node != tasklet: + non_stream_sink_nodes.append(sink_node) + + # 2. Connect non-stream sink nodes to the sync tasklet + for sink_node in non_stream_sink_nodes: + state.add_edge(sink_node, None, tasklet, None, dace.Memlet()) + + # 3. 
Connect a single GPU stream sink node (create or merge if needed) + if len(stream_sink_nodes) == 0: + combined_stream_node = state.add_access(stream_array_name) + + else: + combined_stream_node = stream_sink_nodes.pop() + for stream_node in stream_sink_nodes: + for edge in state.in_edges(stream_node): + state.add_edge(edge.src, edge.src_conn, combined_stream_node, edge.dst_conn, edge.data) + state.remove_edge(edge) + state.remove_node(stream_node) + + # Connect back to output stream node + output_stream_node = state.add_access(combined_stream_node.data) + for stream in streams: + accessed_gpu_stream = f"{stream_array_name}[{stream}]" + conn = f"{stream_var_name_prefix}{stream}" # Note: Same as "gpu_stream_var_name" from tasklet + + tasklet.add_in_connector(conn, dtypes.gpuStream_t) + tasklet.add_out_connector(conn, dtypes.gpuStream_t, force=True) + state.add_edge(combined_stream_node, None, tasklet, conn, dace.Memlet(accessed_gpu_stream)) + state.add_edge(tasklet, conn, output_stream_node, None, dace.Memlet(accessed_gpu_stream)) + + def _insert_gpu_stream_sync_after_node(self, sdfg: SDFG, sync_node: Dict[nodes.Node, SDFGState], + stream_assignments: Dict[nodes.Node, int]) -> None: + """ + Insert a GPU stream synchronization tasklet immediately after specified nodes. + + Parameters + ---------- + sdfg : SDFG + The top level SDFG. + sync_node : Dict[nodes.Node, SDFGState] + Mapping of nodes to their parent state. After after the node a GPU stream synchronization should occur. + stream_assignments : Dict[nodes.Node, int] + Mapping of nodes to their assigned GPU stream IDs. + """ + # Prepare GPU stream info and backend + stream_array_name, stream_var_name_prefix = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',') + backend: str = common.get_gpu_backend() + + for node, state in sync_node.items(): + + #----------------- Generate GPU stream synchronization Tasklet ----------------- + + # Get assigned GPU stream + stream = stream_assignments.get(node, "nullptr") + if stream == "nullptr": + raise NotImplementedError("Using the default 'nullptr' gpu stream is not supported yet.") + + # Create the tasklet + stream_var_name = f"{stream_var_name_prefix}{stream}" + sync_call = f"DACE_GPU_CHECK({backend}StreamSynchronize({stream_var_name}));\n" + tasklet = state.add_tasklet( name=f"gpu_stream_{stream}_synchronization", + inputs=set(), outputs=set(), + code=sync_call, language=dtypes.Language.CPP) + + + #----------------- Place tasklet between node and successors, link GPU streams ---------------- + + # 1. Put the tasklet between the node and its successors + for succ in state.successors(node): + state.add_edge(tasklet, None, succ, None, dace.Memlet()) + state.add_edge(node, None, tasklet, None, dace.Memlet()) + + # 2. 
Connect tasklet to GPU stream AccessNodes + in_stream = state.add_access(stream_array_name) + out_stream = state.add_access(stream_array_name) + accessed_stream = f"{stream_array_name}[{stream}]" + state.add_edge(in_stream, None, tasklet, stream_var_name, dace.Memlet(accessed_stream)) + state.add_edge(tasklet, stream_var_name, out_stream, None, dace.Memlet(accessed_stream)) + tasklet.add_in_connector(stream_var_name, dtypes.gpuStream_t, force=True) + tasklet.add_out_connector(stream_var_name, dtypes.gpuStream_t, force=True) \ No newline at end of file diff --git a/dace/transformation/passes/gpu_specialization/insert_gpu_streams_to_sdfgs.py b/dace/transformation/passes/gpu_specialization/insert_gpu_streams_to_sdfgs.py new file mode 100644 index 0000000000..1896ec382c --- /dev/null +++ b/dace/transformation/passes/gpu_specialization/insert_gpu_streams_to_sdfgs.py @@ -0,0 +1,154 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +from typing import Any, Dict, Set, Type, Union + +import dace +from dace import SDFG, dtypes, properties +from dace.config import Config +from dace.sdfg import is_devicelevel_gpu +from dace.sdfg.nodes import AccessNode, MapEntry, MapExit, Node, Tasklet +from dace.transformation import pass_pipeline as ppl, transformation +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler + +STREAM_PLACEHOLDER = "__dace_current_stream" + + +@properties.make_properties +@transformation.explicit_cf_compatible +class InsertGPUStreamsToSDFGs(ppl.Pass): + """ + Inserts a GPU stream array into the top-level SDFG and propagates it to all + nested SDFGs that require it, including intermediate SDFGs along the hierarchy. + + This pass guarantees that every relevant SDFG has the array defined, avoiding + duplication and allowing subsequent passes in the GPU stream pipeline to rely + on its presence without redefining it. + """ + + def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: + return {NaiveGPUStreamScheduler} + + def modifies(self) -> ppl.Modifies: + return ppl.Modifies.AccessNodes | ppl.Modifies.Memlets + + def should_reapply(self, modified: ppl.Modifies) -> bool: + return False + + def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): + """ + Ensure that a GPU stream array is available in all SDFGs that require it. + + The pass creates the array once at the top-level SDFG and propagates it + down the hierarchy by inserting matching arrays in child SDFGs and wiring + them through nested SDFG connectors. This way, all SDFGs share a consistent + reference to the same GPU stream array. 
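+
+        At the top level this amounts to roughly the following call, assuming the
+        default array name ``gpu_streams`` and N assigned streams:
+
+            sdfg.add_transient('gpu_streams', (N,), dtype=dace.dtypes.gpuStream_t,
+                               storage=dace.dtypes.StorageType.Register)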
+ """ + + # Extract stream array name and number of streams to allocate + stream_array_name = Config.get('compiler', 'cuda', 'gpu_stream_name').split(',')[0] + stream_assignments: Dict[Node, Union[int, str]] = pipeline_results['NaiveGPUStreamScheduler'] + num_assigned_streams = max(stream_assignments.values(), default=0) + 1 + + # Add the GPU stream array at the top level + sdfg.add_transient(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register) + + # Ensure GPU stream array is defined where required + for child_sdfg in self.find_child_sdfgs_requiring_gpu_stream(sdfg): + + # Skip if this child already has the array (inserted higher up in the hierarchy) + if stream_array_name in child_sdfg.arrays: + continue + + # Add the array to the child SDFG + inner_sdfg = child_sdfg + inner_sdfg.add_array(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register) + + # Walk up the hierarchy until the array is found, inserting it into each parent + outer_sdfg = inner_sdfg.parent_sdfg + while stream_array_name not in outer_sdfg.arrays: + + # Insert array in parent SDFG + outer_sdfg.add_array(stream_array_name, (num_assigned_streams, ), + dtype=dace.dtypes.gpuStream_t, + storage=dace.dtypes.StorageType.Register) + + # Connect parent SDFG array to nested SDFG node + inner_nsdfg_node = inner_sdfg.parent_nsdfg_node + inner_parent_state = inner_sdfg.parent + inner_nsdfg_node.add_in_connector(stream_array_name, dtypes.gpuStream_t) + inp_gpu_stream: AccessNode = inner_parent_state.add_access(stream_array_name) + inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, + dace.Memlet(stream_array_name)) + + # Continue climbing up the hierarchy + inner_sdfg = outer_sdfg + outer_sdfg = outer_sdfg.parent_sdfg + + # Ensure final connection from the first parent that had the array down to this SDFG + inner_nsdfg_node = inner_sdfg.parent_nsdfg_node + inner_parent_state = inner_sdfg.parent + inner_nsdfg_node.add_in_connector(stream_array_name, dtypes.gpuStream_t) + inp_gpu_stream: AccessNode = inner_parent_state.add_access(stream_array_name) + inner_parent_state.add_edge(inp_gpu_stream, None, inner_nsdfg_node, stream_array_name, + dace.Memlet(f"{stream_array_name}[0:{num_assigned_streams}]")) + + outer_sdfg = inner_sdfg.parent_sdfg + + return {} + + def find_child_sdfgs_requiring_gpu_stream(self, sdfg) -> Set[SDFG]: + """ + Identify all child SDFGs that require a GPU stream array in their + array descriptor store. A child SDFG requires a GPU stream if: + + - It launches GPU kernels (MapEntry/MapExit with GPU_Device schedule). + - It contains special Tasklets (e.g., from library node expansion) that + use the GPU stream they are assigned to in the code. + - It accesses GPU global memory outside device-level GPU scopes, which + implies memory copies or kernel data feeds. + + Parameters + ---------- + sdfg : SDFG + The root SDFG to inspect. + + Returns + ------- + Set[SDFG] + The set of child SDFGs that need a GPU stream array in their array descriptor + store. 
+ """ + requiring_gpu_stream = set() + for child_sdfg in sdfg.all_sdfgs_recursive(): + + # Skip the root SDFG itself + if child_sdfg is sdfg: + continue + + for state in child_sdfg.states(): + for node in state.nodes(): + + # Case 1: Kernel launch nodes + if isinstance(node, (MapEntry, MapExit)) and node.map.schedule == dtypes.ScheduleType.GPU_Device: + requiring_gpu_stream.add(child_sdfg) + break + + # Case 2: Tasklets that use GPU stream in their code + if isinstance(node, Tasklet) and STREAM_PLACEHOLDER in node.code.as_string: + requiring_gpu_stream.add(child_sdfg) + break + + # Case 3: Accessing GPU global memory outside device-level scopes + if (isinstance(node, AccessNode) and node.desc(state).storage == dtypes.StorageType.GPU_Global + and not is_devicelevel_gpu(state.sdfg, state, node)): + requiring_gpu_stream.add(child_sdfg) + break + + # Stop scanning this SDFG once a reason is found + if child_sdfg in requiring_gpu_stream: + break + + return requiring_gpu_stream \ No newline at end of file From eb10eacf55ba0e1539b5d21775228e826feb8976 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Fri, 19 Dec 2025 14:54:22 +0100 Subject: [PATCH 93/94] added tests and adhzstments --- .../targets/gpu_helpers/copy_strategies.py | 1 - dace/config_schema.yml | 11 ++ dace/dtypes.py | 14 ++ dace/sdfg/state.py | 7 + dace/transformation/helpers.py | 1 + .../connect_gpu_streams_to_kernels.py | 2 +- .../connect_gpu_streams_to_tasklets.py | 2 +- .../gpu_stream_scheduling.py | 2 +- .../gpu_stream_topology_simplification.py | 3 +- .../insert_gpu_copy_tasklet.py | 2 +- .../insert_gpu_stream_sync_tasklets.py | 26 ++-- .../insert_gpu_streams_to_sdfgs.py | 2 +- .../gpu_specialization/gpu_stream_test.py | 134 ++++++++++++++++++ 13 files changed, 188 insertions(+), 19 deletions(-) create mode 100644 tests/passes/gpu_specialization/gpu_stream_test.py diff --git a/dace/codegen/targets/gpu_helpers/copy_strategies.py b/dace/codegen/targets/gpu_helpers/copy_strategies.py index 1b11f5bb2b..27a5b2c53b 100644 --- a/dace/codegen/targets/gpu_helpers/copy_strategies.py +++ b/dace/codegen/targets/gpu_helpers/copy_strategies.py @@ -551,4 +551,3 @@ def _generate_nd_copy(self, copy_context: CopyContext) -> None: # Return the code return call - diff --git a/dace/config_schema.yml b/dace/config_schema.yml index 812e24329e..ab77a7d7ff 100644 --- a/dace/config_schema.yml +++ b/dace/config_schema.yml @@ -458,6 +458,17 @@ required: will raise an exception if such a Memlet is encountered. This allows the user to have full control over all Maps in the SDFG. + # New configs, needed for new CUDACodeGen + gpu_stream_name: + type: str + title: Name for the GPU stream object + description: > + GPU streams allow GPU operations, such as kernel execution or memory transfers, to run asynchronously + and in parallel. This field specifies the naming convention for the hpu stream array and its connectors + in the SDFG. For example: 'gpu_streams,gpu_stream' means 'gpu_streams' is the array containing the + stream objects, and 'gpu_stream0' (prefix derived from the second name + stream id) is used as a + connector for gpu_streams[0]. 
+ default: gpu_streams,gpu_stream ############################################# # General FPGA flags diff --git a/dace/dtypes.py b/dace/dtypes.py index faadc84a50..ef049af343 100644 --- a/dace/dtypes.py +++ b/dace/dtypes.py @@ -87,6 +87,18 @@ class ScheduleType(aenum.AutoNumberEnum): ScheduleType.GPU_Persistent, ] +# A subset of GPU schedule types for the new GPU backend +GPU_SCHEDULES_EXPERIMENTAL_CUDACODEGEN = [ + ScheduleType.GPU_Device, + ScheduleType.GPU_ThreadBlock, +] + +# A subset of on-GPU storage types for the new GPU backend +GPU_MEMORY_STORAGES_EXPERIMENTAL_CUDACODEGEN = [ + StorageType.GPU_Global, + StorageType.GPU_Shared, +] + # A subset of CPU schedule types CPU_SCHEDULES = [ ScheduleType.CPU_Multicore, @@ -1266,6 +1278,7 @@ def isconstant(var): complex128 = typeclass(numpy.complex128) string = stringtype() MPI_Request = opaque('MPI_Request') +gpuStream_t = opaque('gpuStream_t') @undefined_safe_enum @@ -1286,6 +1299,7 @@ class Typeclasses(aenum.AutoNumberEnum): float64 = float64 complex64 = complex64 complex128 = complex128 + gpuStream_t = gpuStream_t _bool = bool diff --git a/dace/sdfg/state.py b/dace/sdfg/state.py index d558053d3d..93b925b2c5 100644 --- a/dace/sdfg/state.py +++ b/dace/sdfg/state.py @@ -405,6 +405,13 @@ def memlet_path(self, edge: MultiConnectorEdge[mm.Memlet]) -> List[MultiConnecto if (edge.src_conn is None and edge.dst_conn is None and edge.data.is_empty()): return result + # For the gpu stream (i.e. cudastream, hipstream) management we can have dynamic out connectors, e.g. + # (GPU_Device-scheduled) MapExit: stream -> None: AccessNode, where AccessNode accesses a Stream array + # Memlets are used but its not about seing how data flows + if (isinstance(edge.src, nd.MapExit) and edge.src.map.schedule == dtypes.ScheduleType.GPU_Device + and isinstance(edge.dst, nd.AccessNode) and edge.dst.desc(state).dtype == dtypes.gpuStream_t): + return result + # Prepend incoming edges until reaching the source node curedge = edge visited = set() diff --git a/dace/transformation/helpers.py b/dace/transformation/helpers.py index f5607f952b..7756cbf1fe 100644 --- a/dace/transformation/helpers.py +++ b/dace/transformation/helpers.py @@ -1550,6 +1550,7 @@ def get_parent_map(state: SDFGState, node: Optional[nodes.Node] = None) -> Optio cursdfg = cursdfg.parent_sdfg return None + def is_within_schedule_types(state: SDFGState, node: nodes.Node, schedules: Set[dtypes.ScheduleType]) -> bool: """ Checks if the given node is enclosed within a Map whose schedule type diff --git a/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_kernels.py b/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_kernels.py index 851f18e108..225dba00e4 100644 --- a/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_kernels.py +++ b/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_kernels.py @@ -67,4 +67,4 @@ def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): state.add_edge(kernel_exit, gpu_stream_var_name, stream_array_out, None, dace.Memlet(accessed_gpu_stream)) - return {} \ No newline at end of file + return {} diff --git a/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_tasklets.py b/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_tasklets.py index 9877f2d563..58d9ff70ff 100644 --- a/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_tasklets.py +++ b/dace/transformation/passes/gpu_specialization/connect_gpu_streams_to_tasklets.py @@ -77,4 +77,4 @@ 
def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]): state.add_edge(stream_array_in, None, node, gpu_stream_conn, dace.Memlet(accessed_gpu_stream)) state.add_edge(node, gpu_stream_conn, stream_array_out, None, dace.Memlet(accessed_gpu_stream)) - return {} \ No newline at end of file + return {} diff --git a/dace/transformation/passes/gpu_specialization/gpu_stream_scheduling.py b/dace/transformation/passes/gpu_specialization/gpu_stream_scheduling.py index 0151d790b8..0ad3c2e7c0 100644 --- a/dace/transformation/passes/gpu_specialization/gpu_stream_scheduling.py +++ b/dace/transformation/passes/gpu_specialization/gpu_stream_scheduling.py @@ -246,4 +246,4 @@ def gpu_relevant(node, parent) -> bool: if gpu_relevant(node, state): return True - return False \ No newline at end of file + return False diff --git a/dace/transformation/passes/gpu_specialization/gpu_stream_topology_simplification.py b/dace/transformation/passes/gpu_specialization/gpu_stream_topology_simplification.py index 7af22aa6c6..7e1a62b29c 100644 --- a/dace/transformation/passes/gpu_specialization/gpu_stream_topology_simplification.py +++ b/dace/transformation/passes/gpu_specialization/gpu_stream_topology_simplification.py @@ -14,6 +14,7 @@ from dace.transformation.passes.gpu_specialization.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets from dace.transformation.passes.gpu_specialization.insert_gpu_copy_tasklet import InsertGPUCopyTasklets + @properties.make_properties @transformation.explicit_cf_compatible class GPUStreamTopologySimplification(ppl.Pass): @@ -270,4 +271,4 @@ def example(A: dace.uint32[128], B: dace.uint32[128], state.remove_edge(in_edge) state.remove_node(sink_stream) - state.remove_node(passthrough_gpu_node) \ No newline at end of file + state.remove_node(passthrough_gpu_node) diff --git a/dace/transformation/passes/gpu_specialization/insert_gpu_copy_tasklet.py b/dace/transformation/passes/gpu_specialization/insert_gpu_copy_tasklet.py index cea8fc1f43..162aa6143f 100644 --- a/dace/transformation/passes/gpu_specialization/insert_gpu_copy_tasklet.py +++ b/dace/transformation/passes/gpu_specialization/insert_gpu_copy_tasklet.py @@ -163,4 +163,4 @@ def find_all_data_copies( # Add copy to the worklist copy_worklist.append((sub_sdfg, state, src_node, dst_node, first_edge)) - return copy_worklist \ No newline at end of file + return copy_worklist diff --git a/dace/transformation/passes/gpu_specialization/insert_gpu_stream_sync_tasklets.py b/dace/transformation/passes/gpu_specialization/insert_gpu_stream_sync_tasklets.py index 62f3484a08..2d2c1137de 100644 --- a/dace/transformation/passes/gpu_specialization/insert_gpu_stream_sync_tasklets.py +++ b/dace/transformation/passes/gpu_specialization/insert_gpu_stream_sync_tasklets.py @@ -14,7 +14,6 @@ from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_tasklets import ConnectGPUStreamsToTasklets - STREAM_PLACEHOLDER = "__dace_current_stream" @@ -30,7 +29,9 @@ class InsertGPUStreamSyncTasklets(ppl.Pass): """ def depends_on(self) -> Set[Union[Type[ppl.Pass], ppl.Pass]]: - return {NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels, ConnectGPUStreamsToTasklets} + return { + NaiveGPUStreamScheduler, InsertGPUStreamsToSDFGs, ConnectGPUStreamsToKernels, ConnectGPUStreamsToTasklets + } def modifies(self) -> ppl.Modifies: return ppl.Modifies.Tasklets | ppl.Modifies.Memlets @@ -119,8 +120,8 
@@ def is_tasklet_with_stream_use(src): and not edge_within_kernel(state, src, dst)): sync_state[state].add(stream_assignments[dst]) - elif (is_gpu_global_accessnode(src, state) and is_nongpu_accessnode(dst, state) and not is_sink_node(dst, state) - and not edge_within_kernel(state, src, dst)): + elif (is_gpu_global_accessnode(src, state) and is_nongpu_accessnode(dst, state) + and not is_sink_node(dst, state) and not edge_within_kernel(state, src, dst)): sync_node[dst] = state sync_state[state].add(stream_assignments[dst]) @@ -236,7 +237,7 @@ def _insert_gpu_stream_sync_at_state_end(self, sdfg: SDFG, sync_state: Dict[SDFG state.add_edge(combined_stream_node, None, tasklet, conn, dace.Memlet(accessed_gpu_stream)) state.add_edge(tasklet, conn, output_stream_node, None, dace.Memlet(accessed_gpu_stream)) - def _insert_gpu_stream_sync_after_node(self, sdfg: SDFG, sync_node: Dict[nodes.Node, SDFGState], + def _insert_gpu_stream_sync_after_node(self, sdfg: SDFG, sync_node: Dict[nodes.Node, SDFGState], stream_assignments: Dict[nodes.Node, int]) -> None: """ Insert a GPU stream synchronization tasklet immediately after specified nodes. @@ -258,7 +259,7 @@ def _insert_gpu_stream_sync_after_node(self, sdfg: SDFG, sync_node: Dict[nodes.N #----------------- Generate GPU stream synchronization Tasklet ----------------- - # Get assigned GPU stream + # Get assigned GPU stream stream = stream_assignments.get(node, "nullptr") if stream == "nullptr": raise NotImplementedError("Using the default 'nullptr' gpu stream is not supported yet.") @@ -266,10 +267,11 @@ def _insert_gpu_stream_sync_after_node(self, sdfg: SDFG, sync_node: Dict[nodes.N # Create the tasklet stream_var_name = f"{stream_var_name_prefix}{stream}" sync_call = f"DACE_GPU_CHECK({backend}StreamSynchronize({stream_var_name}));\n" - tasklet = state.add_tasklet( name=f"gpu_stream_{stream}_synchronization", - inputs=set(), outputs=set(), - code=sync_call, language=dtypes.Language.CPP) - + tasklet = state.add_tasklet(name=f"gpu_stream_{stream}_synchronization", + inputs=set(), + outputs=set(), + code=sync_call, + language=dtypes.Language.CPP) #----------------- Place tasklet between node and successors, link GPU streams ---------------- @@ -277,7 +279,7 @@ def _insert_gpu_stream_sync_after_node(self, sdfg: SDFG, sync_node: Dict[nodes.N for succ in state.successors(node): state.add_edge(tasklet, None, succ, None, dace.Memlet()) state.add_edge(node, None, tasklet, None, dace.Memlet()) - + # 2. 
Connect tasklet to GPU stream AccessNodes in_stream = state.add_access(stream_array_name) out_stream = state.add_access(stream_array_name) @@ -285,4 +287,4 @@ def _insert_gpu_stream_sync_after_node(self, sdfg: SDFG, sync_node: Dict[nodes.N state.add_edge(in_stream, None, tasklet, stream_var_name, dace.Memlet(accessed_stream)) state.add_edge(tasklet, stream_var_name, out_stream, None, dace.Memlet(accessed_stream)) tasklet.add_in_connector(stream_var_name, dtypes.gpuStream_t, force=True) - tasklet.add_out_connector(stream_var_name, dtypes.gpuStream_t, force=True) \ No newline at end of file + tasklet.add_out_connector(stream_var_name, dtypes.gpuStream_t, force=True) diff --git a/dace/transformation/passes/gpu_specialization/insert_gpu_streams_to_sdfgs.py b/dace/transformation/passes/gpu_specialization/insert_gpu_streams_to_sdfgs.py index 1896ec382c..f45caa5dd0 100644 --- a/dace/transformation/passes/gpu_specialization/insert_gpu_streams_to_sdfgs.py +++ b/dace/transformation/passes/gpu_specialization/insert_gpu_streams_to_sdfgs.py @@ -151,4 +151,4 @@ def find_child_sdfgs_requiring_gpu_stream(self, sdfg) -> Set[SDFG]: if child_sdfg in requiring_gpu_stream: break - return requiring_gpu_stream \ No newline at end of file + return requiring_gpu_stream diff --git a/tests/passes/gpu_specialization/gpu_stream_test.py b/tests/passes/gpu_specialization/gpu_stream_test.py new file mode 100644 index 0000000000..3bdd1f2aab --- /dev/null +++ b/tests/passes/gpu_specialization/gpu_stream_test.py @@ -0,0 +1,134 @@ +# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved. +import pytest + +import dace +from dace.codegen import common +from dace.transformation.pass_pipeline import Pipeline +from dace.transformation.passes.gpu_specialization.gpu_stream_scheduling import NaiveGPUStreamScheduler +from dace.transformation.passes.gpu_specialization.insert_gpu_streams_to_sdfgs import InsertGPUStreamsToSDFGs +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_kernels import ConnectGPUStreamsToKernels +from dace.transformation.passes.gpu_specialization.connect_gpu_streams_to_tasklets import ConnectGPUStreamsToTasklets +from dace.transformation.passes.gpu_specialization.insert_gpu_stream_sync_tasklets import InsertGPUStreamSyncTasklets +from dace.transformation.passes.gpu_specialization.insert_gpu_copy_tasklet import InsertGPUCopyTasklets +from dace.transformation.passes.gpu_specialization.gpu_stream_topology_simplification import GPUStreamTopologySimplification + +gpu_stream_pipeline = Pipeline([ + NaiveGPUStreamScheduler(), + InsertGPUStreamsToSDFGs(), + ConnectGPUStreamsToKernels(), + ConnectGPUStreamsToTasklets(), + InsertGPUStreamSyncTasklets(), + InsertGPUCopyTasklets(), + GPUStreamTopologySimplification(), +]) + +backend = common.get_gpu_backend() + +@pytest.mark.gpu +def test_basic(): + """ + A simple memory copy program. + + Since the SDFG has a single connected component, exactly one GPU stream is used + and must be synchronized at the end of the state. For each synchronized stream, + the pipeline introduces a memlet from the synchronization tasklet to a GPU stream + AccessNode. Therefore, it is sufficient to verify there is only one sink node with one ingoing + edge, verify its dtype, and check for the presence of a preceeding synchronization tasklet. 
+ """ + @dace.program + def simple_copy( + A: dace.uint32[128] @ dace.dtypes.StorageType.GPU_Global, + B: dace.uint32[128] @ dace.dtypes.StorageType.GPU_Global + ): + for i in dace.map[0:128:1] @ dace.dtypes.ScheduleType.GPU_Device: + B[i] = A[i] + + sdfg = simple_copy.to_sdfg() + gpu_stream_pipeline.apply_pass(sdfg, {}) + + state = sdfg.states()[0] + sink_nodes = state.sink_nodes() + node = sink_nodes[0] + assert ( + len(sink_nodes) == 1 + and len(state.in_edges(node)) == 1 + and isinstance(node, dace.nodes.AccessNode) + and node.desc(state).dtype == dace.dtypes.gpuStream_t + ),( + "Only one sink node with should exist, which is a GPU stream AccessNode and it should have one ingoing edge." + ) + + assert ( + isinstance(pre, dace.nodes.Tasklet) + and f"{backend}StreamSynchronize(" in pre.code.as_string + for pre in state.predecessors(node) + ), ( + "At then end of each state any used stream must be synchronized." + ) + +@pytest.mark.gpu +def test_extended(): + """ + A program that performs two independent memory copies. + + The input arrays reside in host memory, and `gpu_transformations()` is applied to + the program. As a result, the data is first copied to GPU global memory, after + which the two copies are executed on the GPU. Since these copies form two + independent connected components in the resulting SDFG, the naive GPU stream + scheduler assigns them to different GPU streams. + + This test verifies that exactly two GPU streams are used, that both streams are + synchronized at the end of the state, and that the corresponding asynchronous + memory copy tasklets are correctly associated with their assigned streams. + """ + @dace.program + def independent_copies(A : dace.uint32[128], B: dace.uint32[128], C : dace.uint32[128], D: dace.uint32[128]): + for i in dace.map[0:128:1]: + B[i] = A[i] + for i in dace.map[0:128:1]: + D[i] = C[i] + + sdfg = independent_copies.to_sdfg() + + # Transform such that program can run on GPU and apply GPU stream pipeline + sdfg.apply_gpu_transformations() + gpu_stream_pipeline.apply_pass(sdfg, {}) + + # Test 1: Two GPU streams were used since we use the Naive Stream scheduler + state = sdfg.states()[0] + sink_nodes = state.sink_nodes() + node = sink_nodes[0] + assert ( + len(sink_nodes) == 1 + and len(state.in_edges(node)) == 2 + and isinstance(node, dace.nodes.AccessNode) + and node.desc(state).dtype == dace.dtypes.gpuStream_t + ),( + "Only one sink node with should exist, which is a GPU stream AccessNode and it " + "should have two ingoing edges as original graph consisted of two connected components." + ) + + # Test 2: We synchronize at the end of the state + assert ( + isinstance(pre, dace.nodes.Tasklet) + and f"{backend}StreamSynchronize(" in pre.code.as_string + for pre in state.predecessors(node) + ), ( + "At then end of each state any used stream must be synchronized." + ) + + # Test 3: Check that we have memory copy tasklets (as we perform two "Main Memory -> GPU GLobal" + # memory copies and two "GPU Global -> Main Memory" memory copies by applying the gpu tranformation) + # and that they use the name of the in connector of the GPU stream in the copy call + memcopy_tasklets = [n for n in state.nodes() if isinstance(n, dace.nodes.Tasklet) and f"{backend}MemcpyAsync(" in n.code.as_string] + for tasklet in memcopy_tasklets: + assert len(tasklet.in_connectors) == 1, ( + "Memcpy tasklets must have exactly one input connector " + "corresponding to the GPU stream." 
+ ) + + in_connector = next(iter(tasklet.in_connectors)) + + assert in_connector in tasklet.code.as_string, ( + "Memcpy tasklets must reference their GPU stream input connector in the memcpy call." + ) From 69fbb731e2ece60d731f73734ccc242000fec2c8 Mon Sep 17 00:00:00 2001 From: aydogdub Date: Fri, 19 Dec 2025 14:59:25 +0100 Subject: [PATCH 94/94] run pre-commit --- .../gpu_specialization/gpu_stream_test.py | 74 +++++++------------ 1 file changed, 28 insertions(+), 46 deletions(-) diff --git a/tests/passes/gpu_specialization/gpu_stream_test.py b/tests/passes/gpu_specialization/gpu_stream_test.py index 3bdd1f2aab..07d1facdf9 100644 --- a/tests/passes/gpu_specialization/gpu_stream_test.py +++ b/tests/passes/gpu_specialization/gpu_stream_test.py @@ -24,6 +24,7 @@ backend = common.get_gpu_backend() + @pytest.mark.gpu def test_basic(): """ @@ -31,15 +32,14 @@ def test_basic(): Since the SDFG has a single connected component, exactly one GPU stream is used and must be synchronized at the end of the state. For each synchronized stream, - the pipeline introduces a memlet from the synchronization tasklet to a GPU stream - AccessNode. Therefore, it is sufficient to verify there is only one sink node with one ingoing + the pipeline introduces a memlet from the synchronization tasklet to a GPU stream + AccessNode. Therefore, it is sufficient to verify there is only one sink node with one ingoing edge, verify its dtype, and check for the presence of a preceeding synchronization tasklet. """ + @dace.program - def simple_copy( - A: dace.uint32[128] @ dace.dtypes.StorageType.GPU_Global, - B: dace.uint32[128] @ dace.dtypes.StorageType.GPU_Global - ): + def simple_copy(A: dace.uint32[128] @ dace.dtypes.StorageType.GPU_Global, + B: dace.uint32[128] @ dace.dtypes.StorageType.GPU_Global): for i in dace.map[0:128:1] @ dace.dtypes.ScheduleType.GPU_Device: B[i] = A[i] @@ -50,21 +50,13 @@ def simple_copy( sink_nodes = state.sink_nodes() node = sink_nodes[0] assert ( - len(sink_nodes) == 1 - and len(state.in_edges(node)) == 1 - and isinstance(node, dace.nodes.AccessNode) + len(sink_nodes) == 1 and len(state.in_edges(node)) == 1 and isinstance(node, dace.nodes.AccessNode) and node.desc(state).dtype == dace.dtypes.gpuStream_t - ),( - "Only one sink node with should exist, which is a GPU stream AccessNode and it should have one ingoing edge." - ) - - assert ( - isinstance(pre, dace.nodes.Tasklet) - and f"{backend}StreamSynchronize(" in pre.code.as_string - for pre in state.predecessors(node) - ), ( - "At then end of each state any used stream must be synchronized." - ) + ), ("Only one sink node with should exist, which is a GPU stream AccessNode and it should have one ingoing edge.") + + assert (isinstance(pre, dace.nodes.Tasklet) and f"{backend}StreamSynchronize(" in pre.code.as_string + for pre in state.predecessors(node)), ("At then end of each state any used stream must be synchronized.") + @pytest.mark.gpu def test_extended(): @@ -81,8 +73,9 @@ def test_extended(): synchronized at the end of the state, and that the corresponding asynchronous memory copy tasklets are correctly associated with their assigned streams. 
""" + @dace.program - def independent_copies(A : dace.uint32[128], B: dace.uint32[128], C : dace.uint32[128], D: dace.uint32[128]): + def independent_copies(A: dace.uint32[128], B: dace.uint32[128], C: dace.uint32[128], D: dace.uint32[128]): for i in dace.map[0:128:1]: B[i] = A[i] for i in dace.map[0:128:1]: @@ -98,37 +91,26 @@ def independent_copies(A : dace.uint32[128], B: dace.uint32[128], C : dace.uint3 state = sdfg.states()[0] sink_nodes = state.sink_nodes() node = sink_nodes[0] - assert ( - len(sink_nodes) == 1 - and len(state.in_edges(node)) == 2 - and isinstance(node, dace.nodes.AccessNode) - and node.desc(state).dtype == dace.dtypes.gpuStream_t - ),( - "Only one sink node with should exist, which is a GPU stream AccessNode and it " - "should have two ingoing edges as original graph consisted of two connected components." - ) + assert (len(sink_nodes) == 1 and len(state.in_edges(node)) == 2 and isinstance(node, dace.nodes.AccessNode) + and node.desc(state).dtype == dace.dtypes.gpuStream_t), ( + "Only one sink node with should exist, which is a GPU stream AccessNode and it " + "should have two ingoing edges as original graph consisted of two connected components.") # Test 2: We synchronize at the end of the state - assert ( - isinstance(pre, dace.nodes.Tasklet) - and f"{backend}StreamSynchronize(" in pre.code.as_string - for pre in state.predecessors(node) - ), ( - "At then end of each state any used stream must be synchronized." - ) - - # Test 3: Check that we have memory copy tasklets (as we perform two "Main Memory -> GPU GLobal" + assert (isinstance(pre, dace.nodes.Tasklet) and f"{backend}StreamSynchronize(" in pre.code.as_string + for pre in state.predecessors(node)), ("At then end of each state any used stream must be synchronized.") + + # Test 3: Check that we have memory copy tasklets (as we perform two "Main Memory -> GPU GLobal" # memory copies and two "GPU Global -> Main Memory" memory copies by applying the gpu tranformation) # and that they use the name of the in connector of the GPU stream in the copy call - memcopy_tasklets = [n for n in state.nodes() if isinstance(n, dace.nodes.Tasklet) and f"{backend}MemcpyAsync(" in n.code.as_string] + memcopy_tasklets = [ + n for n in state.nodes() if isinstance(n, dace.nodes.Tasklet) and f"{backend}MemcpyAsync(" in n.code.as_string + ] for tasklet in memcopy_tasklets: - assert len(tasklet.in_connectors) == 1, ( - "Memcpy tasklets must have exactly one input connector " - "corresponding to the GPU stream." - ) + assert len(tasklet.in_connectors) == 1, ("Memcpy tasklets must have exactly one input connector " + "corresponding to the GPU stream.") in_connector = next(iter(tasklet.in_connectors)) assert in_connector in tasklet.code.as_string, ( - "Memcpy tasklets must reference their GPU stream input connector in the memcpy call." - ) + "Memcpy tasklets must reference their GPU stream input connector in the memcpy call.")