From 610634f0d7dff2e2282ca194251cd88ab5614cfd Mon Sep 17 00:00:00 2001 From: Robert Chisholm Date: Wed, 15 Nov 2023 15:55:50 +0000 Subject: [PATCH] WIP: Having issues on Windows, will try Linux --- cmake/dependencies/Jitify.cmake | 2 +- cmake/modules/FindJitify.cmake | 2 +- include/flamegpu/detail/JitifyCache.h | 18 ++-- .../runtime/detail/curve/curve_rtc.cuh | 10 +- .../flamegpu/simulation/detail/CUDAAgent.h | 6 +- src/flamegpu/detail/JitifyCache.cu | 93 ++++++++++++++----- .../runtime/detail/curve/curve_rtc.cpp | 22 +++-- src/flamegpu/simulation/CUDASimulation.cu | 26 +++--- src/flamegpu/simulation/detail/CUDAAgent.cu | 4 +- 9 files changed, 110 insertions(+), 73 deletions(-) diff --git a/cmake/dependencies/Jitify.cmake b/cmake/dependencies/Jitify.cmake index d5aa509c2..309b4e6db 100644 --- a/cmake/dependencies/Jitify.cmake +++ b/cmake/dependencies/Jitify.cmake @@ -10,7 +10,7 @@ cmake_policy(SET CMP0079 NEW) FetchContent_Declare( jitify GIT_REPOSITORY https://github.com/NVIDIA/jitify.git - GIT_TAG cd6b56bf0c63fcce74a59cd021bf63e5c2a32c73 + GIT_TAG jitify2 #-preprocessing-overhaul SOURCE_DIR ${FETCHCONTENT_BASE_DIR}/jitify-src/jitify GIT_PROGRESS ON # UPDATE_DISCONNECTED ON diff --git a/cmake/modules/FindJitify.cmake b/cmake/modules/FindJitify.cmake index 2f3cc3952..84ca8e0d9 100644 --- a/cmake/modules/FindJitify.cmake +++ b/cmake/modules/FindJitify.cmake @@ -20,7 +20,7 @@ include(FindPackageHandleStandardArgs) # Find the main Jitify header find_path(Jitify_INCLUDE_DIRS NAMES - jitify/jitify.hpp + jitify/jitify2.hpp ) # if found, get the version number. 
diff --git a/include/flamegpu/detail/JitifyCache.h b/include/flamegpu/detail/JitifyCache.h index 12bcb4ddc..3ada8d5b2 100644 --- a/include/flamegpu/detail/JitifyCache.h +++ b/include/flamegpu/detail/JitifyCache.h @@ -7,15 +7,9 @@ #include #include -#ifdef _MSC_VER -#pragma warning(push, 2) -#include "jitify/jitify.hpp" -#pragma warning(pop) -#else -#include "jitify/jitify.hpp" -#endif - -using jitify::experimental::KernelInstantiation; +namespace jitify2 { +class KernelData; +} // namespace jitify2 namespace flamegpu { namespace detail { @@ -36,7 +30,7 @@ class JitifyCache { // dynamic header concatenated to kernel // We check this is an exact match before loading from cache std::string long_reference; - std::string serialised_kernelinst; + std::string serialised_kernel; }; public: @@ -50,7 +44,7 @@ class JitifyCache { * @param dynamic_header Dynamic header source generated by curve rtc * @return A jitify RTC kernel instance of the provided kernel sources */ - std::unique_ptr loadKernel( + std::unique_ptr loadKernel( const std::string &func_name, const std::vector &template_args, const std::string &kernel_src, @@ -97,7 +91,7 @@ class JitifyCache { * @param dynamic_header Dynamic header source generated by curve rtc * @return A jitify RTC kernel instance of the provided kernel sources */ - static std::unique_ptr compileKernel( + static std::unique_ptr compileKernel( const std::string &func_name, const std::vector &template_args, const std::string &kernel_src, diff --git a/include/flamegpu/runtime/detail/curve/curve_rtc.cuh b/include/flamegpu/runtime/detail/curve/curve_rtc.cuh index 557bb753e..2ef1162df 100644 --- a/include/flamegpu/runtime/detail/curve/curve_rtc.cuh +++ b/include/flamegpu/runtime/detail/curve/curve_rtc.cuh @@ -9,11 +9,9 @@ #include #include -namespace jitify { -namespace experimental { -class KernelInstantiation; -} // namespace experimental -} // namespace jitify +namespace jitify2 { +class KernelData; +} // namespace jitify2 namespace flamegpu { 
namespace detail { namespace curve { @@ -214,7 +212,7 @@ class CurveRTCHost { * @param stream The CUDA stream used for the cuda memcpy * @note This is async, the stream is non synchronised */ - void updateDevice_async(const jitify::experimental::KernelInstantiation& instance, cudaStream_t stream); + void updateDevice_async(const jitify2::KernelData& instance, cudaStream_t stream); protected: /** diff --git a/include/flamegpu/simulation/detail/CUDAAgent.h b/include/flamegpu/simulation/detail/CUDAAgent.h index 828ab0ebb..bb53b9dfa 100644 --- a/include/flamegpu/simulation/detail/CUDAAgent.h +++ b/include/flamegpu/simulation/detail/CUDAAgent.h @@ -39,12 +39,12 @@ class CUDAAgent : public AgentInterface { /** * map of agent function name to RTC function instance */ - typedef std::map> CUDARTCFuncMap; + typedef std::map> CUDARTCFuncMap; typedef std::map> CUDARTCHeaderMap; /** * Element type of CUDARTCFuncMap */ - typedef std::pair> CUDARTCFuncMapPair; + typedef std::pair> CUDARTCFuncMapPair; /** * Normal constructor * @param description Agent description of the agent @@ -224,7 +224,7 @@ class CUDAAgent : public AgentInterface { * Will throw an exception::InvalidAgentFunc excpetion if the function name does not have a valid instantiation * @param function_name the name of the RTC agent function or the agent function name suffixed with condition (if it is a function condition) */ - const jitify::experimental::KernelInstantiation& getRTCInstantiation(const std::string &function_name) const; + const jitify2::KernelData& getRTCInstantiation(const std::string &function_name) const; detail::curve::CurveRTCHost &getRTCHeader(const std::string &function_name) const; /** * Returns the host interface for managing the curve instance for the named agent function diff --git a/src/flamegpu/detail/JitifyCache.cu b/src/flamegpu/detail/JitifyCache.cu index 554589dd0..6b390f596 100644 --- a/src/flamegpu/detail/JitifyCache.cu +++ b/src/flamegpu/detail/JitifyCache.cu @@ -7,13 +7,13 @@ 
#include #include +#include "jitify/jitify2.hpp" + #include "flamegpu/version.h" #include "flamegpu/exception/FLAMEGPUException.h" #include "flamegpu/detail/compute_capability.cuh" #include "flamegpu/util/nvtx.h" -using jitify::detail::hash_combine; -using jitify::detail::hash_larson64; namespace flamegpu { namespace detail { @@ -307,7 +307,7 @@ bool confirmFLAMEGPUHeaderVersion(const std::string &flamegpuIncludeDir, const s } // namespace std::mutex JitifyCache::instance_mutex; -std::unique_ptr JitifyCache::compileKernel(const std::string &func_name, const std::vector &template_args, const std::string &kernel_src, const std::string &dynamic_header) { +std::unique_ptr JitifyCache::compileKernel(const std::string &func_name, const std::vector &template_args, const std::string &kernel_src, const std::string &dynamic_header) { flamegpu::util::nvtx::Range range{"JitifyCache::compileKernel"}; // find and validate the cuda include directory via CUDA_PATH or CUDA_HOME. static const std::string cuda_include_dir = getCUDAIncludeDir(); @@ -319,7 +319,7 @@ std::unique_ptr JitifyCache::compileKernel(const std::strin // vector of compiler options for jitify std::vector options; - std::vector headers; + std::unordered_map headers; // fpgu include directory options.push_back(std::string("-I" + std::string(flamegpu_include_dir))); @@ -402,23 +402,55 @@ std::unique_ptr JitifyCache::compileKernel(const std::strin options.push_back(include_cuda_h); // get the dynamically generated header from curve rtc - headers.push_back(dynamic_header); + headers.emplace("dynamic/curve_rtc_dynamic.h", dynamic_header); // cassert header (to remove remaining warnings) TODO: Ask Jitify to implement safe version of this - std::string cassert_h = "cassert\n"; - headers.push_back(cassert_h); + //std::string cassert_h = "cassert\n"; + //headers.push_back(cassert_h); // Add static list of known headers (this greatly improves compilation speed) - getKnownHeaders(headers); + //getKnownHeaders(headers); // 
jitify to create program (with compilation settings) - try { - auto program = jitify::experimental::Program(kernel_src, headers, options); - assert(template_args.size() == 1 || template_args.size() == 3); // Add this assertion incase template args change - auto kernel = program.kernel(template_args.size() > 1 ? "flamegpu::agent_function_wrapper" : "flamegpu::agent_function_condition_wrapper"); - return std::make_unique(kernel, template_args); - } catch (std::runtime_error const&) { - // jitify does not have a method for getting compile logs so rely on JITIFY_PRINT_LOG defined in cmake + const std::string program_name = func_name + "_program"; // Does this name actually matter? + jitify2::PreprocessedProgram program = jitify2::Program(program_name, kernel_src, headers)->preprocess(options); + if (!program.ok()) { + const jitify2::ErrorMsg& compile_error = program.error(); + fprintf(stderr, "Failed to load program for agent function (condition) '%s', log:\n%s", + func_name.c_str(), compile_error.c_str()); + THROW exception::InvalidAgentFunc("Error loading agent function (or function condition) ('%s'): function had compilation errors (see std::cout), " + "in JitifyCache::buildProgram().", + func_name.c_str()); + } + // Build the name of the template configuration to be instantiated + std::stringstream name_expression; + if (template_args.size() == 1) { + name_expression << "flamegpu::agent_function_condition_wrapper<"; + name_expression << template_args[0]; + name_expression << ">"; + } else if (template_args.size() == 3) { + name_expression << "flamegpu::agent_function_wrapper<"; + name_expression << template_args[0] << "," << template_args[1] << "," << template_args[2]; + name_expression << ">"; + } else { + THROW exception::UnknownInternalError("Unexpected AgentFunction template arg count!"); + } + auto loaded_program = program->load({ name_expression.str() }); + if (!loaded_program.ok()) { + const jitify2::ErrorMsg &compile_error = loaded_program.error(); + 
fprintf(stderr, "Failed to load program for agent function (condition) '%s', log:\n%s", + func_name.c_str(), compile_error.c_str()); + THROW exception::InvalidAgentFunc("Error loading agent function (or function condition) ('%s'): function had compilation errors (see std::cout), " + "in JitifyCache::buildProgram().", + func_name.c_str()); + } + auto loaded_kernel = loaded_program->get_kernel(""); + if (loaded_kernel.ok()) { + return std::make_unique(loaded_kernel.value()); + } else { + const jitify2::ErrorMsg &compile_error = loaded_kernel.error(); + fprintf(stderr, "Failed to compile and link agent function (condition) '%s', log:\n%s", + func_name.c_str(), compile_error.c_str()); THROW exception::InvalidAgentFunc("Error compiling runtime agent function (or function condition) ('%s'): function had compilation errors (see std::cout), " "in JitifyCache::buildProgram().", func_name.c_str()); @@ -497,7 +529,7 @@ void JitifyCache::getKnownHeaders(std::vector& headers) { headers.push_back("type_traits"); } -std::unique_ptr JitifyCache::loadKernel(const std::string &func_name, const std::vector &template_args, const std::string &kernel_src, const std::string &dynamic_header) { +std::unique_ptr JitifyCache::loadKernel(const std::string &func_name, const std::vector &template_args, const std::string &kernel_src, const std::string &dynamic_header) { flamegpu::util::nvtx::Range range{"JitifyCache::loadKernel"}; std::lock_guard lock(cache_mutex); // Detect current compute capability= @@ -527,14 +559,19 @@ std::unique_ptr JitifyCache::loadKernel(const std::string & "XORWOW_" + #endif // Use jitify hash methods for consistent hashing between OSs - std::to_string(hash_combine(hash_larson64(kernel_src.c_str()), hash_larson64(dynamic_header.c_str()))); + jitify2::detail::sha256(kernel_src + dynamic_header);/* // Does a copy with the right reference exist in memory? 
if (use_memory_cache) { const auto it = cache.find(short_reference); if (it != cache.end()) { // Check long reference if (it->second.long_reference == long_reference) { - return std::make_unique(KernelInstantiation::deserialize(it->second.serialised_kernelinst)); + // Deserialize and return program + jitify2::Kernel prog = jitify2::Kernel::deserialize(it->second.serialised_kernel); + if (prog.ok()) { + return std::make_unique(prog.value()); + } + // Fail silently and try to build code } } } @@ -551,24 +588,29 @@ std::unique_ptr JitifyCache::loadKernel(const std::string & // Add it to cache for later loads cache.emplace(short_reference, CachedProgram{long_reference, serialised_kernelinst}); // Deserialize and return program - return std::make_unique(KernelInstantiation::deserialize(serialised_kernelinst)); + jitify2::Kernel prog = jitify2::Kernel::deserialize(serialised_kernelinst); + if (prog.ok()) { + return std::make_unique(prog.value()); + } + // Fail silently and try to build code } } - } + }*/ // Kernel has not yet been cached { // Build kernel - auto kernelinst = compileKernel(func_name, template_args, kernel_src, dynamic_header); + std::unique_ptr kernel = compileKernel(func_name, template_args, kernel_src, dynamic_header); +/* // Add it to cache for later loads - const std::string serialised_kernelinst = use_memory_cache || use_disk_cache ? kernelinst->serialize() : ""; + const std::string serialised_kernel = use_memory_cache || use_disk_cache ? 
kernel->serialize() : ""; if (use_memory_cache) { - cache.emplace(short_reference, CachedProgram{long_reference, serialised_kernelinst}); + cache.emplace(short_reference, CachedProgram{long_reference, serialised_kernel }); } // Save it to disk if (use_disk_cache) { std::ofstream ofs(cache_file, std::ofstream::out | std::ofstream::binary | std::ofstream::trunc); if (ofs) { - ofs << serialised_kernelinst; + ofs << serialised_kernel; ofs.close(); } ofs = std::ofstream(reference_file, std::ofstream::out | std::ofstream::binary | std::ofstream::trunc); @@ -577,7 +619,8 @@ std::unique_ptr JitifyCache::loadKernel(const std::string & ofs.close(); } } - return kernelinst; +*/ + return kernel; } } void JitifyCache::useMemoryCache(bool yesno) { diff --git a/src/flamegpu/runtime/detail/curve/curve_rtc.cpp b/src/flamegpu/runtime/detail/curve/curve_rtc.cpp index f9cf4dc38..7adeb36e8 100644 --- a/src/flamegpu/runtime/detail/curve/curve_rtc.cpp +++ b/src/flamegpu/runtime/detail/curve/curve_rtc.cpp @@ -6,14 +6,15 @@ #include "flamegpu/simulation/detail/EnvironmentManager.cuh" #include "flamegpu/detail/cuda.cuh" -// jitify include for demangle -#ifdef _MSC_VER -#pragma warning(push, 2) -#include "jitify/jitify.hpp" -#pragma warning(pop) -#else -#include "jitify/jitify.hpp" -#endif +#include "jitify/jitify2.hpp" +//// jitify include for demangle +//#ifdef _MSC_VER +//#pragma warning(push, 2) +//#include "jitify/jitify.hpp" +//#pragma warning(pop) +//#else +//#include "jitify/jitify.hpp" +//#endif namespace flamegpu { namespace detail { @@ -1052,10 +1053,11 @@ void CurveRTCHost::updateEnvCache(const void *env_ptr, const size_t bufferLen) { bufferLen, agent_data_offset); } } -void CurveRTCHost::updateDevice_async(const jitify::experimental::KernelInstantiation& instance, cudaStream_t stream) { +void CurveRTCHost::updateDevice_async(const jitify2::KernelData& instance, cudaStream_t stream) { // The namespace is required here, but not in other uses of getVariableSymbolName. 
std::string cache_var_name = std::string("flamegpu::detail::curve::") + getVariableSymbolName(); - CUdeviceptr d_var_ptr = instance.get_global_ptr(cache_var_name.c_str()); + CUdeviceptr d_var_ptr; + instance.program().get_global_ptr(cache_var_name.c_str(), &d_var_ptr); gpuErrchkDriverAPI(cuMemcpyHtoDAsync(d_var_ptr, h_data_buffer, data_buffer_size, stream)); } diff --git a/src/flamegpu/simulation/CUDASimulation.cu b/src/flamegpu/simulation/CUDASimulation.cu index 202d2ec59..feacd63d4 100644 --- a/src/flamegpu/simulation/CUDASimulation.cu +++ b/src/flamegpu/simulation/CUDASimulation.cu @@ -6,6 +6,8 @@ #include #include +#include "jitify/jitify2.hpp" + #include "flamegpu/detail/curand.cuh" #include "flamegpu/model/AgentFunctionData.cuh" #include "flamegpu/model/LayerData.h" @@ -736,24 +738,22 @@ void CUDASimulation::stepLayer(const std::shared_ptr& layer, const un } else { // RTC function std::string func_condition_identifier = func_name + "_condition"; // get instantiation - const jitify::experimental::KernelInstantiation& instance = cuda_agent.getRTCInstantiation(func_condition_identifier); + const jitify2::KernelData& instance = cuda_agent.getRTCInstantiation(func_condition_identifier); // calculate the grid block size for main agent function - CUfunction cu_func = (CUfunction)instance; + CUfunction cu_func = instance.function(); cuOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, cu_func, 0, 0, state_list_size); //! 
Round up according to CUDAAgent state list size gridSize = (state_list_size + blockSize - 1) / blockSize; // launch the kernel - CUresult a = instance.configure(gridSize, blockSize, 0, this->getStream(streamIdx)).launch({ + jitify2::ErrorMsg a = instance.configure(gridSize, blockSize, 0, this->getStream(streamIdx))->launch({ #if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS reinterpret_cast(&error_buffer), #endif const_cast(reinterpret_cast(&state_list_size)), reinterpret_cast(&t_rng), reinterpret_cast(&scanFlag_agentDeath) }); - if (a != CUresult::CUDA_SUCCESS) { - const char* err_str = nullptr; - cuGetErrorString(a, &err_str); - THROW exception::InvalidAgentFunc("There was a problem launching the runtime agent function condition '%s': %s", func_des->rtc_func_condition_name.c_str(), err_str); + if (!a.empty()) { + THROW exception::InvalidAgentFunc("There was a problem launching the runtime agent function condition '%s': %s", func_des->rtc_func_condition_name.c_str(), a.c_str()); } gpuErrchkLaunch(); } @@ -966,14 +966,14 @@ void CUDASimulation::stepLayer(const std::shared_ptr& layer, const un gpuErrchkLaunch(); } else { // assume this is a runtime specified agent function // get instantiation - const jitify::experimental::KernelInstantiation& instance = cuda_agent.getRTCInstantiation(func_name); + const jitify2::KernelData& instance = cuda_agent.getRTCInstantiation(func_name); // calculate the grid block size for main agent function - CUfunction cu_func = (CUfunction)instance; + CUfunction cu_func = (CUfunction)instance.function(); cuOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, cu_func, 0, 0, state_list_size); //! 
Round up according to CUDAAgent state list size gridSize = (state_list_size + blockSize - 1) / blockSize; // launch the kernel - CUresult a = instance.configure(gridSize, blockSize, 0, this->getStream(streamIdx)).launch({ + jitify2::ErrorMsg a = instance.configure(gridSize, blockSize, 0, this->getStream(streamIdx))->launch({ #if !defined(FLAMEGPU_SEATBELTS) || FLAMEGPU_SEATBELTS reinterpret_cast(&error_buffer), #endif @@ -985,10 +985,8 @@ void CUDASimulation::stepLayer(const std::shared_ptr& layer, const un reinterpret_cast(&scanFlag_agentDeath), reinterpret_cast(&scanFlag_messageOutput), reinterpret_cast(&scanFlag_agentOutput)}); - if (a != CUresult::CUDA_SUCCESS) { - const char* err_str = nullptr; - cuGetErrorString(a, &err_str); - THROW exception::InvalidAgentFunc("There was a problem launching the runtime agent function '%s': %s", func_name.c_str(), err_str); + if (!a.empty()) { + THROW exception::InvalidAgentFunc("There was a problem launching the runtime agent function '%s': %s", func_name.c_str(), a.c_str()); } gpuErrchkLaunch(); } diff --git a/src/flamegpu/simulation/detail/CUDAAgent.cu b/src/flamegpu/simulation/detail/CUDAAgent.cu index a7f228d6d..66a00cc8f 100644 --- a/src/flamegpu/simulation/detail/CUDAAgent.cu +++ b/src/flamegpu/simulation/detail/CUDAAgent.cu @@ -26,6 +26,8 @@ #pragma warning(pop) #endif // _MSC_VER +#include "jitify/jitify2.hpp" + #include "flamegpu/version.h" #include "flamegpu/simulation/detail/CUDAFatAgent.h" #include "flamegpu/simulation/detail/CUDAAgentStateList.h" @@ -610,7 +612,7 @@ void CUDAAgent::addInstantitateFunction(const AgentFunctionData& func, const std curve_map.insert(std::unordered_map>::value_type(key_name, std::move(curve))); } -const jitify::experimental::KernelInstantiation& CUDAAgent::getRTCInstantiation(const std::string &function_name) const { +const jitify2::KernelData& CUDAAgent::getRTCInstantiation(const std::string &function_name) const { CUDARTCFuncMap::const_iterator mm = 
rtc_func_map.find(function_name); if (mm == rtc_func_map.end()) { THROW exception::InvalidAgentFunc("Function name '%s' is not a runtime compiled agent function in agent '%s', "