diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d53176133b..56f72c49e9 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,16 +2,21 @@ # Level Zero adapter source/adapters/level_zero @oneapi-src/unified-runtime-level-zero-write +test/adapters/level_zero @oneapi-src/unified-runtime-level-zero-write # CUDA and HIP adapters source/adapters/cuda @oneapi-src/unified-runtime-cuda-write +test/adapters/cuda @oneapi-src/unified-runtime-cuda-write source/adapters/hip @oneapi-src/unified-runtime-hip-write +test/adapters/hip @oneapi-src/unified-runtime-hip-write # OpenCL adapter source/adapters/opencl @oneapi-src/unified-runtime-opencl-write +test/adapters/opencl @oneapi-src/unified-runtime-opencl-write # Native CPU adapter source/adapters/native_cpu @oneapi-src/unified-runtime-native-cpu-write +test/adapters/native_cpu @oneapi-src/unified-runtime-native-cpu-write # Command-buffer experimental feature source/adapters/**/command_buffer.* @oneapi-src/unified-runtime-command-buffer-write @@ -20,6 +25,7 @@ scripts/core/exp-command-buffer.yml @oneapi-src/unified-runtime-command-buff test/conformance/exp_command_buffer** @oneapi-src/unified-runtime-command-buffer-write # Bindless Images experimental feature -scripts/core/EXP-BINDLESS-IMAGES.rst @oneapi-src/unified-runtime-bindless-images-write -scripts/core/exp-bindless-images.yml @oneapi-src/unified-runtime-bindless-images-write -source/adapters/**/image.* @oneapi-src/unified-runtime-bindless-images-write +source/adapters/**/image.* @oneapi-src/unified-runtime-bindless-images-write +scripts/core/EXP-BINDLESS-IMAGES.rst @oneapi-src/unified-runtime-bindless-images-write +scripts/core/exp-bindless-images.yml @oneapi-src/unified-runtime-bindless-images-write +test/conformance/exp_bindless_images** @oneapi-src/unified-runtime-bindless-images-write diff --git a/CMakeLists.txt b/CMakeLists.txt index 4fcd74e729..a8f5f2ad96 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,6 +48,7 @@ option(UR_BUILD_EXAMPLE_CODEGEN "Build the codegen example." OFF) option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace for linux" OFF) option(UR_ENABLE_ASSERTIONS "Enable assertions for all build types" OFF) set(UR_DPCXX "" CACHE FILEPATH "Path of the DPC++ compiler executable") +set(UR_DPCXX_BUILD_FLAGS "" CACHE STRING "Build flags to pass to DPC++ when compiling device programs") set(UR_SYCL_LIBRARY_DIR "" CACHE PATH "Path of the SYCL runtime library directory") set(UR_CONFORMANCE_TARGET_TRIPLES "" CACHE STRING diff --git a/README.md b/README.md index cb43c380b9..b1c6420420 100644 --- a/README.md +++ b/README.md @@ -140,6 +140,7 @@ List of options provided by CMake: | UR_HIP_PLATFORM | Build HIP adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD | | UR_ENABLE_COMGR | Enable comgr lib usage | AMD/NVIDIA | AMD | | UR_DPCXX | Path of the DPC++ compiler executable to build CTS device binaries | File path | `""` | +| UR_DPCXX_BUILD_FLAGS | Build flags to pass to DPC++ when compiling device programs | Space-separated options list | `""` | | UR_SYCL_LIBRARY_DIR | Path of the SYCL runtime library directory to build CTS device binaries | Directory path | `""` | | UR_HIP_ROCM_DIR | Path of the default ROCm HIP installation | Directory path | `/opt/rocm` | | UR_HIP_INCLUDE_DIR | Path of the ROCm HIP include directory | Directory path | `${UR_HIP_ROCM_DIR}/include` | diff --git a/include/ur_api.h b/include/ur_api.h index 8680dbeffb..93dd26cd50 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -1434,201 +1434,213 @@ urDeviceGetSelected( /////////////////////////////////////////////////////////////////////////////// /// @brief Supported device info typedef enum ur_device_info_t { - UR_DEVICE_INFO_TYPE = 0, ///< [::ur_device_type_t] type of the device - UR_DEVICE_INFO_VENDOR_ID = 1, ///< [uint32_t] vendor Id of the device - UR_DEVICE_INFO_DEVICE_ID = 2, ///< [uint32_t] Id of the device - UR_DEVICE_INFO_MAX_COMPUTE_UNITS = 3, ///< [uint32_t] the number of compute units - UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS = 4, ///< [uint32_t] max work item dimensions - UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES = 5, ///< [size_t[]] return an array of max work item sizes - UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE = 6, ///< [size_t] max work group size - UR_DEVICE_INFO_SINGLE_FP_CONFIG = 7, ///< [::ur_device_fp_capability_flags_t] single precision floating point - ///< capability - UR_DEVICE_INFO_HALF_FP_CONFIG = 8, ///< [::ur_device_fp_capability_flags_t] half precision floating point - ///< capability - UR_DEVICE_INFO_DOUBLE_FP_CONFIG = 9, ///< [::ur_device_fp_capability_flags_t] double precision floating point - ///< capability - UR_DEVICE_INFO_QUEUE_PROPERTIES = 10, ///< [::ur_queue_flags_t] command queue properties supported by the device - UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR = 11, ///< [uint32_t] preferred vector width for char - UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT = 12, ///< [uint32_t] preferred vector width for short - UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT = 13, ///< [uint32_t] preferred vector width for int - UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG = 14, ///< [uint32_t] preferred vector width for long - UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT = 15, ///< [uint32_t] preferred vector width for float - UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE = 16, ///< [uint32_t] preferred vector width for double - UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF = 17, ///< [uint32_t] preferred vector width for half float - UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR = 18, ///< [uint32_t] native vector width for char - UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT = 19, ///< [uint32_t] native vector width for short - UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT = 20, ///< [uint32_t] native vector width for int - UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG = 21, ///< [uint32_t] native vector width for long - UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT = 22, ///< [uint32_t] native vector width for float - UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE = 23, ///< [uint32_t] native vector width for double - UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF = 24, ///< [uint32_t] native vector width for half float - UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY = 25, ///< [uint32_t] max clock frequency in MHz - UR_DEVICE_INFO_MEMORY_CLOCK_RATE = 26, ///< [uint32_t] memory clock frequency in MHz - UR_DEVICE_INFO_ADDRESS_BITS = 27, ///< [uint32_t] address bits - UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE = 28, ///< [uint64_t] max memory allocation size - UR_DEVICE_INFO_IMAGE_SUPPORTED = 29, ///< [::ur_bool_t] images are supported - UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS = 30, ///< [uint32_t] max number of image objects arguments of a kernel declared - ///< with the read_only qualifier - UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS = 31, ///< [uint32_t] max number of image objects arguments of a kernel declared - ///< with the write_only qualifier - UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS = 32, ///< [uint32_t] max number of image objects arguments of a kernel declared - ///< with the read_write qualifier - UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH = 33, ///< [size_t] max width of Image2D object - UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT = 34, ///< [size_t] max height of Image2D object - UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH = 35, ///< [size_t] max width of Image3D object - UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT = 36, ///< [size_t] max height of Image3D object - UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH = 37, ///< [size_t] max depth of Image3D object - UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE = 38, ///< [size_t] max image buffer size - UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE = 39, ///< [size_t] max image array size - UR_DEVICE_INFO_MAX_SAMPLERS = 40, ///< [uint32_t] max number of samplers that can be used in a kernel - UR_DEVICE_INFO_MAX_PARAMETER_SIZE = 41, ///< [size_t] max size in bytes of all arguments passed to a kernel - UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN = 42, ///< [uint32_t] memory base address alignment - UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE = 43, ///< [::ur_device_mem_cache_type_t] global memory cache type - UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE = 44, ///< [uint32_t] global memory cache line size in bytes - UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE = 45, ///< [uint64_t] size of global memory cache in bytes - UR_DEVICE_INFO_GLOBAL_MEM_SIZE = 46, ///< [uint64_t] size of global memory in bytes - UR_DEVICE_INFO_GLOBAL_MEM_FREE = 47, ///< [uint64_t] size of global memory which is free in bytes - UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE = 48, ///< [uint64_t] max constant buffer size in bytes - UR_DEVICE_INFO_MAX_CONSTANT_ARGS = 49, ///< [uint32_t] max number of __const declared arguments in a kernel - UR_DEVICE_INFO_LOCAL_MEM_TYPE = 50, ///< [::ur_device_local_mem_type_t] local memory type - UR_DEVICE_INFO_LOCAL_MEM_SIZE = 51, ///< [uint64_t] local memory size in bytes - UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT = 52, ///< [::ur_bool_t] support error correction to global and local memory - UR_DEVICE_INFO_HOST_UNIFIED_MEMORY = 53, ///< [::ur_bool_t] unified host device memory - UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION = 54, ///< [size_t] profiling timer resolution in nanoseconds - UR_DEVICE_INFO_ENDIAN_LITTLE = 55, ///< [::ur_bool_t] little endian byte order - UR_DEVICE_INFO_AVAILABLE = 56, ///< [::ur_bool_t] device is available - UR_DEVICE_INFO_COMPILER_AVAILABLE = 57, ///< [::ur_bool_t] device compiler is available - UR_DEVICE_INFO_LINKER_AVAILABLE = 58, ///< [::ur_bool_t] device linker is available - UR_DEVICE_INFO_EXECUTION_CAPABILITIES = 59, ///< [::ur_device_exec_capability_flags_t] device kernel execution - ///< capability bit-field - UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES = 60, ///< [::ur_queue_flags_t] device command queue property bit-field - UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES = 61, ///< [::ur_queue_flags_t] host queue property bit-field - UR_DEVICE_INFO_BUILT_IN_KERNELS = 62, ///< [char[]] a semi-colon separated list of built-in kernels - UR_DEVICE_INFO_PLATFORM = 63, ///< [::ur_platform_handle_t] the platform associated with the device - UR_DEVICE_INFO_REFERENCE_COUNT = 64, ///< [uint32_t] Reference count of the device object. - ///< The reference count returned should be considered immediately stale. - ///< It is unsuitable for general use in applications. This feature is - ///< provided for identifying memory leaks. - UR_DEVICE_INFO_IL_VERSION = 65, ///< [char[]] IL version - UR_DEVICE_INFO_NAME = 66, ///< [char[]] Device name - UR_DEVICE_INFO_VENDOR = 67, ///< [char[]] Device vendor - UR_DEVICE_INFO_DRIVER_VERSION = 68, ///< [char[]] Driver version - UR_DEVICE_INFO_PROFILE = 69, ///< [char[]] Device profile - UR_DEVICE_INFO_VERSION = 70, ///< [char[]] Device version - UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION = 71, ///< [char[]] Version of backend runtime - UR_DEVICE_INFO_EXTENSIONS = 72, ///< [char[]] Return a space separated list of extension names - UR_DEVICE_INFO_PRINTF_BUFFER_SIZE = 73, ///< [size_t] Maximum size in bytes of internal printf buffer - UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC = 74, ///< [::ur_bool_t] prefer user synchronization when sharing object with - ///< other API - UR_DEVICE_INFO_PARENT_DEVICE = 75, ///< [::ur_device_handle_t] return parent device handle - UR_DEVICE_INFO_SUPPORTED_PARTITIONS = 76, ///< [::ur_device_partition_t[]] Returns an array of partition types - ///< supported by the device - UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES = 77, ///< [uint32_t] maximum number of sub-devices when the device is - ///< partitioned - UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN = 78, ///< [::ur_device_affinity_domain_flags_t] Returns a bit-field of the - ///< supported affinity domains for partitioning. - ///< If the device does not support any affinity domains, then 0 will be returned. - UR_DEVICE_INFO_PARTITION_TYPE = 79, ///< [::ur_device_partition_property_t[]] return an array of - ///< ::ur_device_partition_property_t for properties specified in - ///< ::urDevicePartition - UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS = 80, ///< [uint32_t] max number of sub groups - UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS = 81, ///< [::ur_bool_t] support sub group independent forward progress - UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL = 82, ///< [uint32_t[]] return an array of sub group sizes supported on Intel - ///< device - UR_DEVICE_INFO_USM_HOST_SUPPORT = 83, ///< [::ur_device_usm_access_capability_flags_t] support USM host memory - ///< access - UR_DEVICE_INFO_USM_DEVICE_SUPPORT = 84, ///< [::ur_device_usm_access_capability_flags_t] support USM device memory - ///< access - UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT = 85, ///< [::ur_device_usm_access_capability_flags_t] support USM single device - ///< shared memory access - UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT = 86, ///< [::ur_device_usm_access_capability_flags_t] support USM cross device - ///< shared memory access - UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT = 87, ///< [::ur_device_usm_access_capability_flags_t] support USM system wide - ///< shared memory access - UR_DEVICE_INFO_UUID = 88, ///< [uint8_t[]] return device UUID - UR_DEVICE_INFO_PCI_ADDRESS = 89, ///< [char[]] return device PCI address - UR_DEVICE_INFO_GPU_EU_COUNT = 90, ///< [uint32_t] return Intel GPU EU count - UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH = 91, ///< [uint32_t] return Intel GPU EU SIMD width - UR_DEVICE_INFO_GPU_EU_SLICES = 92, ///< [uint32_t] return Intel GPU number of slices - UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 93, ///< [uint32_t] return Intel GPU EU count per subslice - UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 94, ///< [uint32_t] return Intel GPU number of subslices per slice - UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 95, ///< [uint32_t] return Intel GPU number of threads per EU - UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH = 96, ///< [uint32_t] return max memory bandwidth in Mb/s - UR_DEVICE_INFO_IMAGE_SRGB = 97, ///< [::ur_bool_t] device supports sRGB images - UR_DEVICE_INFO_BUILD_ON_SUBDEVICE = 98, ///< [::ur_bool_t] Return true if sub-device should do its own program - ///< build - UR_DEVICE_INFO_ATOMIC_64 = 99, ///< [::ur_bool_t] support 64 bit atomics - UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 100, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic - ///< memory order capabilities - UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 101, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic - ///< memory scope capabilities - UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES = 102, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic - ///< memory fence order capabilities - UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES = 103, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic - ///< memory fence scope capabilities - UR_DEVICE_INFO_BFLOAT16 = 104, ///< [::ur_bool_t] support for bfloat16 - UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES = 105, ///< [uint32_t] Returns 1 if the device doesn't have a notion of a - ///< queue index. Otherwise, returns the number of queue indices that are - ///< available for this device. - UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS = 106, ///< [::ur_bool_t] support the ::urKernelSetSpecializationConstants entry - ///< point - UR_DEVICE_INFO_MEMORY_BUS_WIDTH = 107, ///< [uint32_t] return the width in bits of the memory bus interface of the - ///< device. - UR_DEVICE_INFO_MAX_WORK_GROUPS_3D = 108, ///< [size_t[3]] return max 3D work groups - UR_DEVICE_INFO_ASYNC_BARRIER = 109, ///< [::ur_bool_t] return true if Async Barrier is supported - UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT = 110, ///< [::ur_bool_t] return true if specifying memory channels is supported - UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED = 111, ///< [::ur_bool_t] Return true if the device supports enqueueing commands - ///< to read and write pipes from the host. - UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP = 112, ///< [uint32_t] The maximum number of registers available per block. - UR_DEVICE_INFO_IP_VERSION = 113, ///< [uint32_t] The device IP version. The meaning of the device IP version - ///< is implementation-defined, but newer devices should have a higher - ///< version than older devices. - UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT = 114, ///< [::ur_bool_t] return true if the device supports virtual memory. - UR_DEVICE_INFO_ESIMD_SUPPORT = 115, ///< [::ur_bool_t] return true if the device supports ESIMD. - UR_DEVICE_INFO_COMPONENT_DEVICES = 116, ///< [::ur_device_handle_t[]] The set of component devices contained by - ///< this composite device. - UR_DEVICE_INFO_COMPOSITE_DEVICE = 117, ///< [::ur_device_handle_t] The composite device containing this component - ///< device. - UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP = 0x1000, ///< [::ur_bool_t] Returns true if the device supports the use of - ///< command-buffers. - UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP = 0x1001, ///< [::ur_bool_t] Returns true if the device supports updating the kernel - ///< commands in a command-buffer. - UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP = 0x2000, ///< [::ur_bool_t] returns true if the device supports the creation of - ///< bindless images - UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP = 0x2001, ///< [::ur_bool_t] returns true if the device supports the creation of - ///< bindless images backed by shared USM - UR_DEVICE_INFO_BINDLESS_IMAGES_1D_USM_SUPPORT_EXP = 0x2002, ///< [::ur_bool_t] returns true if the device supports the creation of 1D - ///< bindless images backed by USM - UR_DEVICE_INFO_BINDLESS_IMAGES_2D_USM_SUPPORT_EXP = 0x2003, ///< [::ur_bool_t] returns true if the device supports the creation of 2D - ///< bindless images backed by USM - UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP = 0x2004, ///< [uint32_t] returns the required alignment of the pitch between two - ///< rows of an image in bytes - UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP = 0x2005, ///< [size_t] returns the maximum linear width allowed for images allocated - ///< using USM - UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP = 0x2006, ///< [size_t] returns the maximum linear height allowed for images - ///< allocated using USM - UR_DEVICE_INFO_MAX_IMAGE_LINEAR_PITCH_EXP = 0x2007, ///< [size_t] returns the maximum linear pitch allowed for images allocated - ///< using USM - UR_DEVICE_INFO_MIPMAP_SUPPORT_EXP = 0x2008, ///< [::ur_bool_t] returns true if the device supports allocating mipmap - ///< resources - UR_DEVICE_INFO_MIPMAP_ANISOTROPY_SUPPORT_EXP = 0x2009, ///< [::ur_bool_t] returns true if the device supports sampling mipmap - ///< images with anisotropic filtering - UR_DEVICE_INFO_MIPMAP_MAX_ANISOTROPY_EXP = 0x200A, ///< [uint32_t] returns the maximum anisotropic ratio supported by the - ///< device - UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP = 0x200B, ///< [::ur_bool_t] returns true if the device supports using images created - ///< from individual mipmap levels - UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP = 0x200C, ///< [::ur_bool_t] returns true if the device supports importing external - ///< memory resources - UR_DEVICE_INFO_INTEROP_MEMORY_EXPORT_SUPPORT_EXP = 0x200D, ///< [::ur_bool_t] returns true if the device supports exporting internal - ///< memory resources - UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP = 0x200E, ///< [::ur_bool_t] returns true if the device supports importing external - ///< semaphore resources - UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP = 0x200F, ///< [::ur_bool_t] returns true if the device supports exporting internal - ///< event resources - UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP = 0x2010, ///< [::ur_bool_t] returns true if the device supports allocating and - ///< accessing cubemap resources - UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP = 0x2011, ///< [::ur_bool_t] returns true if the device supports sampling cubemapped - ///< images across face boundaries + UR_DEVICE_INFO_TYPE = 0, ///< [::ur_device_type_t] type of the device + UR_DEVICE_INFO_VENDOR_ID = 1, ///< [uint32_t] vendor Id of the device + UR_DEVICE_INFO_DEVICE_ID = 2, ///< [uint32_t] Id of the device + UR_DEVICE_INFO_MAX_COMPUTE_UNITS = 3, ///< [uint32_t] the number of compute units + UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS = 4, ///< [uint32_t] max work item dimensions + UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES = 5, ///< [size_t[]] return an array of max work item sizes + UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE = 6, ///< [size_t] max work group size + UR_DEVICE_INFO_SINGLE_FP_CONFIG = 7, ///< [::ur_device_fp_capability_flags_t] single precision floating point + ///< capability + UR_DEVICE_INFO_HALF_FP_CONFIG = 8, ///< [::ur_device_fp_capability_flags_t] half precision floating point + ///< capability + UR_DEVICE_INFO_DOUBLE_FP_CONFIG = 9, ///< [::ur_device_fp_capability_flags_t] double precision floating point + ///< capability + UR_DEVICE_INFO_QUEUE_PROPERTIES = 10, ///< [::ur_queue_flags_t] command queue properties supported by the device + UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR = 11, ///< [uint32_t] preferred vector width for char + UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT = 12, ///< [uint32_t] preferred vector width for short + UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT = 13, ///< [uint32_t] preferred vector width for int + UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG = 14, ///< [uint32_t] preferred vector width for long + UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT = 15, ///< [uint32_t] preferred vector width for float + UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE = 16, ///< [uint32_t] preferred vector width for double + UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF = 17, ///< [uint32_t] preferred vector width for half float + UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR = 18, ///< [uint32_t] native vector width for char + UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT = 19, ///< [uint32_t] native vector width for short + UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT = 20, ///< [uint32_t] native vector width for int + UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG = 21, ///< [uint32_t] native vector width for long + UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT = 22, ///< [uint32_t] native vector width for float + UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE = 23, ///< [uint32_t] native vector width for double + UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF = 24, ///< [uint32_t] native vector width for half float + UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY = 25, ///< [uint32_t] max clock frequency in MHz + UR_DEVICE_INFO_MEMORY_CLOCK_RATE = 26, ///< [uint32_t] memory clock frequency in MHz + UR_DEVICE_INFO_ADDRESS_BITS = 27, ///< [uint32_t] address bits + UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE = 28, ///< [uint64_t] max memory allocation size + UR_DEVICE_INFO_IMAGE_SUPPORTED = 29, ///< [::ur_bool_t] images are supported + UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS = 30, ///< [uint32_t] max number of image objects arguments of a kernel declared + ///< with the read_only qualifier + UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS = 31, ///< [uint32_t] max number of image objects arguments of a kernel declared + ///< with the write_only qualifier + UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS = 32, ///< [uint32_t] max number of image objects arguments of a kernel declared + ///< with the read_write qualifier + UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH = 33, ///< [size_t] max width of Image2D object + UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT = 34, ///< [size_t] max height of Image2D object + UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH = 35, ///< [size_t] max width of Image3D object + UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT = 36, ///< [size_t] max height of Image3D object + UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH = 37, ///< [size_t] max depth of Image3D object + UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE = 38, ///< [size_t] max image buffer size + UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE = 39, ///< [size_t] max image array size + UR_DEVICE_INFO_MAX_SAMPLERS = 40, ///< [uint32_t] max number of samplers that can be used in a kernel + UR_DEVICE_INFO_MAX_PARAMETER_SIZE = 41, ///< [size_t] max size in bytes of all arguments passed to a kernel + UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN = 42, ///< [uint32_t] memory base address alignment + UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE = 43, ///< [::ur_device_mem_cache_type_t] global memory cache type + UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE = 44, ///< [uint32_t] global memory cache line size in bytes + UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE = 45, ///< [uint64_t] size of global memory cache in bytes + UR_DEVICE_INFO_GLOBAL_MEM_SIZE = 46, ///< [uint64_t] size of global memory in bytes + UR_DEVICE_INFO_GLOBAL_MEM_FREE = 47, ///< [uint64_t] size of global memory which is free in bytes + UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE = 48, ///< [uint64_t] max constant buffer size in bytes + UR_DEVICE_INFO_MAX_CONSTANT_ARGS = 49, ///< [uint32_t] max number of __const declared arguments in a kernel + UR_DEVICE_INFO_LOCAL_MEM_TYPE = 50, ///< [::ur_device_local_mem_type_t] local memory type + UR_DEVICE_INFO_LOCAL_MEM_SIZE = 51, ///< [uint64_t] local memory size in bytes + UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT = 52, ///< [::ur_bool_t] support error correction to global and local memory + UR_DEVICE_INFO_HOST_UNIFIED_MEMORY = 53, ///< [::ur_bool_t] unified host device memory + UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION = 54, ///< [size_t] profiling timer resolution in nanoseconds + UR_DEVICE_INFO_ENDIAN_LITTLE = 55, ///< [::ur_bool_t] little endian byte order + UR_DEVICE_INFO_AVAILABLE = 56, ///< [::ur_bool_t] device is available + UR_DEVICE_INFO_COMPILER_AVAILABLE = 57, ///< [::ur_bool_t] device compiler is available + UR_DEVICE_INFO_LINKER_AVAILABLE = 58, ///< [::ur_bool_t] device linker is available + UR_DEVICE_INFO_EXECUTION_CAPABILITIES = 59, ///< [::ur_device_exec_capability_flags_t] device kernel execution + ///< capability bit-field + UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES = 60, ///< [::ur_queue_flags_t] device command queue property bit-field + UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES = 61, ///< [::ur_queue_flags_t] host queue property bit-field + UR_DEVICE_INFO_BUILT_IN_KERNELS = 62, ///< [char[]] a semi-colon separated list of built-in kernels + UR_DEVICE_INFO_PLATFORM = 63, ///< [::ur_platform_handle_t] the platform associated with the device + UR_DEVICE_INFO_REFERENCE_COUNT = 64, ///< [uint32_t] Reference count of the device object. + ///< The reference count returned should be considered immediately stale. + ///< It is unsuitable for general use in applications. This feature is + ///< provided for identifying memory leaks. + UR_DEVICE_INFO_IL_VERSION = 65, ///< [char[]] IL version + UR_DEVICE_INFO_NAME = 66, ///< [char[]] Device name + UR_DEVICE_INFO_VENDOR = 67, ///< [char[]] Device vendor + UR_DEVICE_INFO_DRIVER_VERSION = 68, ///< [char[]] Driver version + UR_DEVICE_INFO_PROFILE = 69, ///< [char[]] Device profile + UR_DEVICE_INFO_VERSION = 70, ///< [char[]] Device version + UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION = 71, ///< [char[]] Version of backend runtime + UR_DEVICE_INFO_EXTENSIONS = 72, ///< [char[]] Return a space separated list of extension names + UR_DEVICE_INFO_PRINTF_BUFFER_SIZE = 73, ///< [size_t] Maximum size in bytes of internal printf buffer + UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC = 74, ///< [::ur_bool_t] prefer user synchronization when sharing object with + ///< other API + UR_DEVICE_INFO_PARENT_DEVICE = 75, ///< [::ur_device_handle_t] return parent device handle + UR_DEVICE_INFO_SUPPORTED_PARTITIONS = 76, ///< [::ur_device_partition_t[]] Returns an array of partition types + ///< supported by the device + UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES = 77, ///< [uint32_t] maximum number of sub-devices when the device is + ///< partitioned + UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN = 78, ///< [::ur_device_affinity_domain_flags_t] Returns a bit-field of the + ///< supported affinity domains for partitioning. + ///< If the device does not support any affinity domains, then 0 will be returned. + UR_DEVICE_INFO_PARTITION_TYPE = 79, ///< [::ur_device_partition_property_t[]] return an array of + ///< ::ur_device_partition_property_t for properties specified in + ///< ::urDevicePartition + UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS = 80, ///< [uint32_t] max number of sub groups + UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS = 81, ///< [::ur_bool_t] support sub group independent forward progress + UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL = 82, ///< [uint32_t[]] return an array of sub group sizes supported on Intel + ///< device + UR_DEVICE_INFO_USM_HOST_SUPPORT = 83, ///< [::ur_device_usm_access_capability_flags_t] support USM host memory + ///< access + UR_DEVICE_INFO_USM_DEVICE_SUPPORT = 84, ///< [::ur_device_usm_access_capability_flags_t] support USM device memory + ///< access + UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT = 85, ///< [::ur_device_usm_access_capability_flags_t] support USM single device + ///< shared memory access + UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT = 86, ///< [::ur_device_usm_access_capability_flags_t] support USM cross device + ///< shared memory access + UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT = 87, ///< [::ur_device_usm_access_capability_flags_t] support USM system wide + ///< shared memory access + UR_DEVICE_INFO_UUID = 88, ///< [uint8_t[]] return device UUID + UR_DEVICE_INFO_PCI_ADDRESS = 89, ///< [char[]] return device PCI address + UR_DEVICE_INFO_GPU_EU_COUNT = 90, ///< [uint32_t] return Intel GPU EU count + UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH = 91, ///< [uint32_t] return Intel GPU EU SIMD width + UR_DEVICE_INFO_GPU_EU_SLICES = 92, ///< [uint32_t] return Intel GPU number of slices + UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 93, ///< [uint32_t] return Intel GPU EU count per subslice + UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 94, ///< [uint32_t] return Intel GPU number of subslices per slice + UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 95, ///< [uint32_t] return Intel GPU number of threads per EU + UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH = 96, ///< [uint32_t] return max memory bandwidth in Mb/s + UR_DEVICE_INFO_IMAGE_SRGB = 97, ///< [::ur_bool_t] device supports sRGB images + UR_DEVICE_INFO_BUILD_ON_SUBDEVICE = 98, ///< [::ur_bool_t] Return true if sub-device should do its own program + ///< build + UR_DEVICE_INFO_ATOMIC_64 = 99, ///< [::ur_bool_t] support 64 bit atomics + UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 100, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic + ///< memory order capabilities + UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 101, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic + ///< memory scope capabilities + UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES = 102, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic + ///< memory fence order capabilities + UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES = 103, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic + ///< memory fence scope capabilities + UR_DEVICE_INFO_BFLOAT16 = 104, ///< [::ur_bool_t] support for bfloat16 + UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES = 105, ///< [uint32_t] Returns 1 if the device doesn't have a notion of a + ///< queue index. Otherwise, returns the number of queue indices that are + ///< available for this device. + UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS = 106, ///< [::ur_bool_t] support the ::urKernelSetSpecializationConstants entry + ///< point + UR_DEVICE_INFO_MEMORY_BUS_WIDTH = 107, ///< [uint32_t] return the width in bits of the memory bus interface of the + ///< device. + UR_DEVICE_INFO_MAX_WORK_GROUPS_3D = 108, ///< [size_t[3]] return max 3D work groups + UR_DEVICE_INFO_ASYNC_BARRIER = 109, ///< [::ur_bool_t] return true if Async Barrier is supported + UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT = 110, ///< [::ur_bool_t] return true if specifying memory channels is supported + UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED = 111, ///< [::ur_bool_t] Return true if the device supports enqueueing commands + ///< to read and write pipes from the host. + UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP = 112, ///< [uint32_t] The maximum number of registers available per block. + UR_DEVICE_INFO_IP_VERSION = 113, ///< [uint32_t] The device IP version. The meaning of the device IP version + ///< is implementation-defined, but newer devices should have a higher + ///< version than older devices. + UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT = 114, ///< [::ur_bool_t] return true if the device supports virtual memory. + UR_DEVICE_INFO_ESIMD_SUPPORT = 115, ///< [::ur_bool_t] return true if the device supports ESIMD. + UR_DEVICE_INFO_COMPONENT_DEVICES = 116, ///< [::ur_device_handle_t[]] The set of component devices contained by + ///< this composite device. + UR_DEVICE_INFO_COMPOSITE_DEVICE = 117, ///< [::ur_device_handle_t] The composite device containing this component + ///< device. + UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP = 0x1000, ///< [::ur_bool_t] Returns true if the device supports the use of + ///< command-buffers. + UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP = 0x1001, ///< [::ur_bool_t] Returns true if the device supports updating the kernel + ///< commands in a command-buffer. + UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP = 0x2000, ///< [::ur_bool_t] returns true if the device supports the creation of + ///< bindless images + UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP = 0x2001, ///< [::ur_bool_t] returns true if the device supports the creation of + ///< bindless images backed by shared USM + UR_DEVICE_INFO_BINDLESS_IMAGES_1D_USM_SUPPORT_EXP = 0x2002, ///< [::ur_bool_t] returns true if the device supports the creation of 1D + ///< bindless images backed by USM + UR_DEVICE_INFO_BINDLESS_IMAGES_2D_USM_SUPPORT_EXP = 0x2003, ///< [::ur_bool_t] returns true if the device supports the creation of 2D + ///< bindless images backed by USM + UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP = 0x2004, ///< [uint32_t] returns the required alignment of the pitch between two + ///< rows of an image in bytes + UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP = 0x2005, ///< [size_t] returns the maximum linear width allowed for images allocated + ///< using USM + UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP = 0x2006, ///< [size_t] returns the maximum linear height allowed for images + ///< allocated using USM + UR_DEVICE_INFO_MAX_IMAGE_LINEAR_PITCH_EXP = 0x2007, ///< [size_t] returns the maximum linear pitch allowed for images allocated + ///< using USM + UR_DEVICE_INFO_MIPMAP_SUPPORT_EXP = 0x2008, ///< [::ur_bool_t] returns true if the device supports allocating mipmap + ///< resources + UR_DEVICE_INFO_MIPMAP_ANISOTROPY_SUPPORT_EXP = 0x2009, ///< [::ur_bool_t] returns true if the device supports sampling mipmap + ///< images with anisotropic filtering + UR_DEVICE_INFO_MIPMAP_MAX_ANISOTROPY_EXP = 0x200A, ///< [uint32_t] returns the maximum anisotropic ratio supported by the + ///< device + UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP = 0x200B, ///< [::ur_bool_t] returns true if the device supports using images created + ///< from individual mipmap levels + UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP = 0x200C, ///< [::ur_bool_t] returns true if the device supports importing external + ///< memory resources + UR_DEVICE_INFO_INTEROP_MEMORY_EXPORT_SUPPORT_EXP = 0x200D, ///< [::ur_bool_t] returns true if the device supports exporting internal + ///< memory resources + UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP = 0x200E, ///< [::ur_bool_t] returns true if the device supports importing external + ///< semaphore resources + UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP = 0x200F, ///< [::ur_bool_t] returns true if the device supports exporting internal + ///< event resources + UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP = 0x2010, ///< [::ur_bool_t] returns true if the device supports allocating and + ///< accessing cubemap resources + UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP = 0x2011, ///< [::ur_bool_t] returns true if the device supports sampling cubemapped + ///< images across face boundaries + UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP = 0x2012, ///< [::ur_bool_t] returns true if the device is capable of fetching USM + ///< backed 1D sampled image data. + UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP = 0x2013, ///< [::ur_bool_t] returns true if the device is capable of fetching + ///< non-USM backed 1D sampled image data. + UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP = 0x2014, ///< [::ur_bool_t] returns true if the device is capable of fetching USM + ///< backed 2D sampled image data. + UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP = 0x2015, ///< [::ur_bool_t] returns true if the device is capable of fetching + ///< non-USM backed 2D sampled image data. + UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP = 0x2016, ///< [::ur_bool_t] returns true if the device is capable of fetching USM + ///< backed 3D sampled image data. + UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP = 0x2017, ///< [::ur_bool_t] returns true if the device is capable of fetching + ///< non-USM backed 3D sampled image data. /// @cond UR_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1654,7 +1666,7 @@ typedef enum ur_device_info_t { /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP < propName` +/// + `::UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE diff --git a/include/ur_print.hpp b/include/ur_print.hpp index a5074c5da1..3d48ae9a35 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -2553,6 +2553,24 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) { case UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP: os << "UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP"; break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP: + os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP"; + break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP: + os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP"; + break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP: + os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP"; + break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP: + os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP"; + break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP: + os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP"; + break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP: + os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP"; + break; default: os << "unknown enumerator"; break; @@ -4190,6 +4208,78 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info os << ")"; } break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; diff --git a/scripts/core/EXP-BINDLESS-IMAGES.rst b/scripts/core/EXP-BINDLESS-IMAGES.rst index af90c1ea0f..ee54c69291 100644 --- a/scripts/core/EXP-BINDLESS-IMAGES.rst +++ b/scripts/core/EXP-BINDLESS-IMAGES.rst @@ -91,6 +91,12 @@ Enums * ${X}_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP * ${X}_DEVICE_INFO_CUBEMAP_SUPPORT_EXP * ${X}_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP + * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP + * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP + * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP + * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP + * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP + * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP * ${x}_command_t * ${X}_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP @@ -198,6 +204,8 @@ Changelog +------------------------------------------------------------------------+ | 10.0 | Added cubemap image type, sampling properties, and device | | | queries. | ++------------------------------------------------------------------------+ +| 11.0 | Added device queries for sampled image fetch capabilities. | +----------+-------------------------------------------------------------+ Contributors diff --git a/scripts/core/exp-bindless-images.yml b/scripts/core/exp-bindless-images.yml index a6f17b1a74..42c9701433 100644 --- a/scripts/core/exp-bindless-images.yml +++ b/scripts/core/exp-bindless-images.yml @@ -92,6 +92,24 @@ etors: - name: CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP value: "0x2011" desc: "[$x_bool_t] returns true if the device supports sampling cubemapped images across face boundaries" + - name: BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP + value: "0x2012" + desc: "[$x_bool_t] returns true if the device is capable of fetching USM backed 1D sampled image data." + - name: BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP + value: "0x2013" + desc: "[$x_bool_t] returns true if the device is capable of fetching non-USM backed 1D sampled image data." + - name: BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP + value: "0x2014" + desc: "[$x_bool_t] returns true if the device is capable of fetching USM backed 2D sampled image data." + - name: BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP + value: "0x2015" + desc: "[$x_bool_t] returns true if the device is capable of fetching non-USM backed 2D sampled image data." + - name: BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP + value: "0x2016" + desc: "[$x_bool_t] returns true if the device is capable of fetching USM backed 3D sampled image data." + - name: BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP + value: "0x2017" + desc: "[$x_bool_t] returns true if the device is capable of fetching non-USM backed 3D sampled image data." --- #-------------------------------------------------------------------------- type: enum extend: true diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp index da11f3f1bf..cca00c0b85 100644 --- a/source/adapters/cuda/device.cpp +++ b/source/adapters/cuda/device.cpp @@ -926,6 +926,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // CUDA supports cubemap seamless filtering. return ReturnValue(true); } + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP: { + // CUDA does support fetching 1D USM sampled image data. + return ReturnValue(true); + } + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP: { + // CUDA does not support fetching 1D non-USM sampled image data. + return ReturnValue(false); + } + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP: { + // CUDA does support fetching 2D USM sampled image data. + return ReturnValue(true); + } + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP: { + // CUDA does support fetching 2D non-USM sampled image data. + return ReturnValue(true); + } + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP: { + // CUDA does not support 3D USM sampled textures + return ReturnValue(false); + } + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP: { + // CUDA does support fetching 3D non-USM sampled image data. + return ReturnValue(true); + } case UR_DEVICE_INFO_DEVICE_ID: { int Value = 0; UR_CHECK_ERROR(cuDeviceGetAttribute( diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index a06163c2a9..67415a0de0 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -1048,8 +1048,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( // Create command-list to execute before `CommandListPtr` and will signal // when `EventWaitList` dependencies are complete. ur_command_list_ptr_t WaitCommandList{}; - UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, - false, false)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, WaitCommandList, false, NumEventsInWaitList, EventWaitList, + false)); ZE2UR_CALL(zeCommandListAppendBarrier, (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent, @@ -1086,7 +1087,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( // Create a command-list to signal RetEvent on completion ur_command_list_ptr_t SignalCommandList{}; UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList, - false, false)); + false, NumEventsInWaitList, + EventWaitList, false)); // Reset the wait-event for the UR command-buffer that is signaled when its // submission dependencies have been satisfied. ZE2UR_CALL(zeCommandListAppendEventReset, diff --git a/source/adapters/level_zero/common.hpp b/source/adapters/level_zero/common.hpp index 93bf407567..e16d767b71 100644 --- a/source/adapters/level_zero/common.hpp +++ b/source/adapters/level_zero/common.hpp @@ -241,6 +241,19 @@ static const uint32_t UrL0QueueSyncNonBlocking = [] { return L0QueueSyncLockingModeValue; }(); +// Controls whether the L0 Adapter creates signal events for commands on +// integrated gpu devices. +static const uint32_t UrL0OutOfOrderIntegratedSignalEvent = [] { + const char *UrL0OutOfOrderIntegratedSignalEventEnv = + std::getenv("UR_L0_OOQ_INTEGRATED_SIGNAL_EVENT"); + uint32_t UrL0OutOfOrderIntegratedSignalEventValue = 1; + if (UrL0OutOfOrderIntegratedSignalEventEnv) { + UrL0OutOfOrderIntegratedSignalEventValue = + std::atoi(UrL0OutOfOrderIntegratedSignalEventEnv); + } + return UrL0OutOfOrderIntegratedSignalEventValue; +}(); + // This class encapsulates actions taken along with a call to Level Zero API. class ZeCall { private: diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index 4880c14c4b..19696142f5 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -645,7 +645,8 @@ static const size_t CmdListsCleanupThreshold = [] { // Retrieve an available command list to be used in a PI call. ur_result_t ur_context_handle_t_::getAvailableCommandList( ur_queue_handle_t Queue, ur_command_list_ptr_t &CommandList, - bool UseCopyEngine, bool AllowBatching, + bool UseCopyEngine, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, bool AllowBatching, ze_command_queue_handle_t *ForcedCmdQueue) { // Immediate commandlists have been pre-allocated and are always available. if (Queue->UsingImmCmdLists) { @@ -677,9 +678,17 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList( // for this queue. if (Queue->hasOpenCommandList(UseCopyEngine)) { if (AllowBatching) { - CommandList = CommandBatch.OpenCommandList; - UR_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList)); - return UR_RESULT_SUCCESS; + bool batchingAllowed = true; + if (!UrL0OutOfOrderIntegratedSignalEvent && + Queue->Device->isIntegrated()) { + batchingAllowed = eventCanBeBatched(Queue, UseCopyEngine, + NumEventsInWaitList, EventWaitList); + } + if (batchingAllowed) { + CommandList = CommandBatch.OpenCommandList; + UR_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList)); + return UR_RESULT_SUCCESS; + } } // If this command isn't allowed to be batched or doesn't match the forced // command queue, then we need to go ahead and execute what is already in diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp index ff173aa984..4184411de7 100644 --- a/source/adapters/level_zero/context.hpp +++ b/source/adapters/level_zero/context.hpp @@ -292,11 +292,11 @@ struct ur_context_handle_t_ : _ur_object { // When using immediate commandlists, retrieves an immediate command list // for executing on this device. Immediate commandlists are created only // once for each SYCL Queue and after that they are reused. - ur_result_t - getAvailableCommandList(ur_queue_handle_t Queue, - ur_command_list_ptr_t &CommandList, - bool UseCopyEngine, bool AllowBatching = false, - ze_command_queue_handle_t *ForcedCmdQueue = nullptr); + ur_result_t getAvailableCommandList( + ur_queue_handle_t Queue, ur_command_list_ptr_t &CommandList, + bool UseCopyEngine, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, bool AllowBatching = false, + ze_command_queue_handle_t *ForcedCmdQueue = nullptr); // Checks if Device is covered by this context. // For that the Device or its root devices need to be in the context. diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp index 484890670b..3cdfcbce7e 100644 --- a/source/adapters/level_zero/device.hpp +++ b/source/adapters/level_zero/device.hpp @@ -176,6 +176,10 @@ struct ur_device_handle_t_ : _ur_object { (ZeDeviceProperties->deviceId & 0xff0) == 0xb60; } + bool isIntegrated() { + return (ZeDeviceProperties->flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED); + } + // Does this device represent a single compute slice? bool isCCS() const { return QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index cd61a8aa23..e472c2490c 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -84,8 +84,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; - UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList)); ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent; @@ -256,7 +256,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( // Get an arbitrary command-list in the queue. ur_command_list_ptr_t CmdList; UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CmdList, false /*UseCopyEngine=*/, OkToBatch)); + Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList, + EventWaitList, OkToBatch)); // Insert the barrier into the command-list and execute. UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal)); @@ -311,7 +312,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( if (ZeQueue) { ur_command_list_ptr_t CmdList; UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CmdList, UseCopyEngine, OkToBatch, &ZeQueue)); + Queue, CmdList, UseCopyEngine, NumEventsInWaitList, + EventWaitList, OkToBatch, &ZeQueue)); CmdLists.push_back(CmdList); } } @@ -324,7 +326,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( // Get any available command list. ur_command_list_ptr_t CmdList; UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CmdList, false /*UseCopyEngine=*/, OkToBatch)); + Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList, + EventWaitList, OkToBatch)); CmdLists.push_back(CmdList); } @@ -598,6 +601,7 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( this->Mutex); if (!HostVisibleEvent) { + this->IsCreatingHostProxyEvent = true; if (UrQueue->ZeEventsScope != OnDemandHostVisibleProxy) die("getOrCreateHostVisibleEvent: missing host-visible event"); @@ -612,7 +616,7 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( ur_command_list_ptr_t CommandList{}; UR_CALL(UrQueue->Context->getAvailableCommandList( - UrQueue, CommandList, false /* UseCopyEngine */, OkToBatch)) + UrQueue, CommandList, false /* UseCopyEngine */, 0, nullptr, OkToBatch)) // Create a "proxy" host-visible event. UR_CALL(createEventAndAssociateQueue( @@ -620,12 +624,18 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( /* IsInternal */ false, /* IsMultiDevice */ false, /* HostVisible */ true)); - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (CommandList->first, 1, &ZeEvent)); + if (this->IsInnerBatchedEvent) { + ZE2UR_CALL(zeCommandListAppendBarrier, + (CommandList->first, ZeEvent, 0, nullptr)); + } else { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (CommandList->first, 1, &ZeEvent)); + } ZE2UR_CALL(zeCommandListAppendSignalEvent, (CommandList->first, HostVisibleEvent->ZeEvent)); UR_CALL(UrQueue->executeCommandList(CommandList, false, OkToBatch)) + this->IsCreatingHostProxyEvent = false; } ZeHostVisibleEvent = HostVisibleEvent->ZeEvent; @@ -682,7 +692,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait( ze_event_handle_t ZeEvent = HostVisibleEvent->ZeEvent; logger::debug("ZeEvent = {}", ur_cast(ZeEvent)); - ZE2UR_CALL(zeHostSynchronize, (ZeEvent)); + // If this event was an inner batched event, then sync with + // the Queue instead of waiting on the event. + if (HostVisibleEvent->IsInnerBatchedEvent && Event->ZeBatchedQueue) { + ZE2UR_CALL(zeHostSynchronize, (Event->ZeBatchedQueue)); + } else { + ZE2UR_CALL(zeHostSynchronize, (ZeEvent)); + } Event->Completed = true; } } @@ -938,7 +954,12 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked, std::list EventsToBeReleased; ur_queue_handle_t AssociatedQueue = nullptr; { - std::scoped_lock EventLock(Event->Mutex); + // If the Event is already locked, then continue with the cleanup, otherwise + // block on locking the event. + std::unique_lock EventLock(Event->Mutex, std::try_to_lock); + if (!EventLock.owns_lock() && !Event->IsCreatingHostProxyEvent) { + EventLock.lock(); + } if (SetEventCompleted) Event->Completed = true; // Exit early of event was already cleanedup. @@ -1324,8 +1345,8 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( // Get a command list prior to acquiring an event lock. // This prevents a potential deadlock with recursive // event locks. - UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - false, true)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, false, 0, nullptr, true)); } std::shared_lock Lock(EventList[I]->Mutex); diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index 9fa4663ab4..08b4be7969 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -198,6 +198,15 @@ struct ur_event_handle_t_ : _ur_object { // performance bool IsMultiDevice = {false}; + // Indicates inner batched event which was not used as a signal event. + bool IsInnerBatchedEvent = {false}; + + // Queue where the batched command was executed. + ze_command_queue_handle_t ZeBatchedQueue = {nullptr}; + + // Indicates within creation of proxy event. + bool IsCreatingHostProxyEvent = {false}; + // Besides each PI object keeping a total reference count in // _ur_object::RefCount we keep special track of the event *external* // references. This way we are able to tell when the event is not referenced diff --git a/source/adapters/level_zero/image.cpp b/source/adapters/level_zero/image.cpp index 3b767f9127..d9cb19c398 100644 --- a/source/adapters/level_zero/image.cpp +++ b/source/adapters/level_zero/image.cpp @@ -790,8 +790,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; - UR_CALL(hQueue->Context->getAvailableCommandList(hQueue, CommandList, - UseCopyEngine, OkToBatch)); + UR_CALL(hQueue->Context->getAvailableCommandList( + hQueue, CommandList, UseCopyEngine, numEventsInWaitList, phEventWaitList, + OkToBatch)); ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent; @@ -800,7 +801,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( UR_CALL(createEventAndAssociateQueue(hQueue, Event, UR_COMMAND_MEM_IMAGE_COPY, CommandList, IsInternal, /*IsMultiDevice*/ false)); - ZeEvent = (*Event)->ZeEvent; + UR_CALL(setSignalEvent(hQueue, UseCopyEngine, &ZeEvent, Event, + numEventsInWaitList, phEventWaitList, + CommandList->second.ZeQueue)); (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index d96351dd5d..65feaae511 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -201,7 +201,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, true /* AllowBatching */)); + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList, + true /* AllowBatching */)); ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent{}; @@ -210,7 +211,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH, CommandList, IsInternal, false)); - ZeEvent = (*Event)->ZeEvent; + UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, + NumEventsInWaitList, EventWaitList, + CommandList->second.ZeQueue)); (*Event)->WaitList = TmpWaitList; // Save the kernel in the event, so that when the event is signalled diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp index 4757a0563d..77cb6abb38 100644 --- a/source/adapters/level_zero/memory.cpp +++ b/source/adapters/level_zero/memory.cpp @@ -62,8 +62,9 @@ ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; - UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine, OkToBatch)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList, + OkToBatch)); ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent; @@ -71,7 +72,9 @@ ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, IsInternal, false)); - ZeEvent = (*Event)->ZeEvent; + UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, + NumEventsInWaitList, EventWaitList, + CommandList->second.ZeQueue)); (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; @@ -112,8 +115,9 @@ ur_result_t enqueueMemCopyRectHelper( // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; - UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine, OkToBatch)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList, + OkToBatch)); ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent; @@ -121,8 +125,9 @@ ur_result_t enqueueMemCopyRectHelper( ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, IsInternal, false)); - - ZeEvent = (*Event)->ZeEvent; + UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, + NumEventsInWaitList, EventWaitList, + CommandList->second.ZeQueue)); (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; @@ -219,8 +224,9 @@ static ur_result_t enqueueMemFillHelper(ur_command_t CommandType, ur_command_list_ptr_t CommandList{}; // We want to batch these commands to avoid extra submissions (costly) bool OkToBatch = true; - UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine, OkToBatch)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList, + OkToBatch)); ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent; @@ -228,8 +234,9 @@ static ur_result_t enqueueMemFillHelper(ur_command_t CommandType, ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, IsInternal, false)); - - ZeEvent = (*Event)->ZeEvent; + UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, + NumEventsInWaitList, EventWaitList, + CommandList->second.ZeQueue)); (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; @@ -334,8 +341,9 @@ static ur_result_t enqueueMemImageCommandHelper( // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; - UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine, OkToBatch)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList, + OkToBatch)); ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent; @@ -343,7 +351,9 @@ static ur_result_t enqueueMemImageCommandHelper( ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, IsInternal, false)); - ZeEvent = (*Event)->ZeEvent; + UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, + NumEventsInWaitList, EventWaitList, + CommandList->second.ZeQueue)); (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; @@ -991,8 +1001,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( } else { // For discrete devices we need a command list ur_command_list_ptr_t CommandList{}; - UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList)); // Add the event to the command list. CommandList->second.append(reinterpret_cast(*Event)); @@ -1004,6 +1014,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( char *ZeHandleSrc; UR_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device)); + UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, + NumEventsInWaitList, EventWaitList, + CommandList->second.ZeQueue)); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, (ZeCommandList, *RetMap, ZeHandleSrc + Offset, Size, ZeEvent, WaitList.Length, WaitList.ZeEventList)); @@ -1114,7 +1128,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( ur_command_list_ptr_t CommandList{}; UR_CALL(Queue->Context->getAvailableCommandList( - reinterpret_cast(Queue), CommandList, UseCopyEngine)); + reinterpret_cast(Queue), CommandList, UseCopyEngine, + NumEventsInWaitList, EventWaitList)); CommandList->second.append(reinterpret_cast(*Event)); (*Event)->RefCount.increment(); @@ -1131,6 +1146,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, Queue->Device)); + UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, + NumEventsInWaitList, EventWaitList, + CommandList->second.ZeQueue)); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, (ZeCommandList, ZeHandleDst + MapInfo.Offset, MappedPtr, MapInfo.Size, ZeEvent, (*Event)->WaitList.Length, @@ -1241,8 +1260,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( ur_command_list_ptr_t CommandList{}; // TODO: Change UseCopyEngine argument to 'true' once L0 backend // support is added - UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList)); // TODO: do we need to create a unique command type for this? ze_event_handle_t ZeEvent = nullptr; @@ -1298,7 +1317,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMAdvise( // TODO: Additional analysis is required to check if this operation will // run faster on copy engines. UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)); + UseCopyEngine, 0, nullptr)); // TODO: do we need to create a unique command type for this? ze_event_handle_t ZeEvent = nullptr; diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index 65ab3892eb..ab2277d8b7 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1337,6 +1337,7 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList, // in the command list is not empty, otherwise we are going to just create // and remove proxy event right away and dereference deleted object // afterwards. + bool AppendBarrierNeeded = true; if (ZeEventsScope == LastCommandInBatchHostVisible && !CommandList->second.EventList.empty()) { // If there are only internal events in the command list then we don't @@ -1405,6 +1406,7 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList, ZE2UR_CALL(zeCommandListAppendSignalEvent, (CommandList->first, HostVisibleEvent->ZeEvent)); } else { + AppendBarrierNeeded = false; ZE2UR_CALL( zeCommandListAppendBarrier, (CommandList->first, HostVisibleEvent->ZeEvent, 0, nullptr)); @@ -1417,6 +1419,27 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList, // If we don't have host visible proxy then signal event if needed. this->signalEventFromCmdListIfLastEventDiscarded(CommandList); } + // Append Signalling of the inner events at the end of the batch if this is + // an integrated gpu and out of order signal events are not allowed. + if (!UrL0OutOfOrderIntegratedSignalEvent && this->Device->isIntegrated()) { + for (auto &Event : CommandList->second.EventList) { + // If the events scope does not apply a barrier already above, then we + // need to apply a barrier to wait on all the previous commands without + // signal events to complete before we can signal the batched events as + // completed. This functionality is only used if this command list is + // out of order and there are events created that were not used as + // signal events. + if (Event->IsInnerBatchedEvent) { + if (AppendBarrierNeeded) { + ZE2UR_CALL(zeCommandListAppendBarrier, + (CommandList->first, nullptr, 0, nullptr)); + AppendBarrierNeeded = false; + } + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CommandList->first, Event->ZeEvent)); + } + } + } // Close the command list and have it ready for dispatch. ZE2UR_CALL(zeCommandListClose, (CommandList->first)); @@ -1734,6 +1757,58 @@ ur_event_handle_t ur_queue_handle_t_::getEventFromQueueCache(bool IsMultiDevice, return RetEvent; } +// This helper function checks to see if an event for a command can be included +// at the end of a command list batch. This will only be true if the event does +// not have dependencies or the dependencies are not for events which exist in +// this batch. +bool eventCanBeBatched(ur_queue_handle_t Queue, bool UseCopyEngine, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList) { + auto &CommandBatch = + UseCopyEngine ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch; + // First see if there is an command-list open for batching commands + // for this queue. + if (Queue->hasOpenCommandList(UseCopyEngine)) { + // If this command should be batched, but the command has a dependency on a + // command in the current batch, then the command needs to have an event + // to track its completion so this event cannot be batched to the end of the + // command list. + if (NumEventsInWaitList > 0) { + for (auto &Event : CommandBatch.OpenCommandList->second.EventList) { + for (uint32_t i = 0; i < NumEventsInWaitList; i++) { + if (Event == EventWaitList[i]) { + return false; + } + } + } + } + } + return true; +} + +// This helper function checks to see if a signal event at the end of a command +// should be set. If the Queue is out of order and the command has no +// dependencies, then this command can be enqueued without a signal event set in +// a command list batch. The signal event will be appended at the end of the +// batch to be signalled at the end of the command list. +ur_result_t setSignalEvent(ur_queue_handle_t Queue, bool UseCopyEngine, + ze_event_handle_t *ZeEvent, ur_event_handle_t *Event, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, + ze_command_queue_handle_t ZeQueue) { + if (!UrL0OutOfOrderIntegratedSignalEvent && Queue->Device->isIntegrated() && + eventCanBeBatched(Queue, UseCopyEngine, NumEventsInWaitList, + EventWaitList) && + !Queue->isInOrderQueue() && !Queue->UsingImmCmdLists) { + ZeEvent = nullptr; + (*Event)->IsInnerBatchedEvent = true; + (*Event)->ZeBatchedQueue = ZeQueue; + } else { + (*ZeEvent) = (*Event)->ZeEvent; + } + return UR_RESULT_SUCCESS; +} + // This helper function creates a ur_event_handle_t and associate a // ur_queue_handle_t. Note that the caller of this function must have acquired // lock on the Queue that is passed in. diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp index 5cb061be5f..ed832148ac 100644 --- a/source/adapters/level_zero/queue.hpp +++ b/source/adapters/level_zero/queue.hpp @@ -691,6 +691,25 @@ ur_result_t createEventAndAssociateQueue( ur_command_list_ptr_t CommandList, bool IsInternal, bool IsMultiDevice, std::optional HostVisible = std::nullopt); +// This helper function checks to see if an event for a command can be included +// at the end of a command list batch. This will only be true if the event does +// not have dependencies or the dependencies are not for events which exist in +// this batch. +bool eventCanBeBatched(ur_queue_handle_t Queue, bool UseCopyEngine, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList); + +// This helper function checks to see if a signal event at the end of a command +// should be set. If the Queue is out of order and the command has no +// dependencies, then this command can be enqueued without a signal event set in +// a command list batch. The signal event will be appended at the end of the +// batch to be signalled at the end of the command list. +ur_result_t setSignalEvent(ur_queue_handle_t Queue, bool UseCopyEngine, + ze_event_handle_t *ZeEvent, ur_event_handle_t *Event, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, + ze_command_queue_handle_t ZeQueue); + // Helper function to perform the necessary cleanup of the events from reset cmd // list. ur_result_t CleanupEventListFromResetCmdList( diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 1e14552b4e..7939ca21b9 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -496,7 +496,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP < propName) { + if (UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP < propName) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 66f4835c56..dba668e61b 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -842,7 +842,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP < propName` +/// + `::UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE diff --git a/source/ur_api.cpp b/source/ur_api.cpp index e6410ee99b..7f4746fcb7 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -736,7 +736,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP < propName` +/// + `::UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE diff --git a/test/conformance/device_code/CMakeLists.txt b/test/conformance/device_code/CMakeLists.txt index ee16b0eb43..26358d49f6 100644 --- a/test/conformance/device_code/CMakeLists.txt +++ b/test/conformance/device_code/CMakeLists.txt @@ -52,6 +52,11 @@ macro(add_device_binary SOURCE_FILE) set(EXTRA_ENV DYLD_FALLBACK_LIBRARY_PATH=${UR_SYCL_LIBRARY_DIR}) endif() endif() + + # Convert build flags to a regular CMake list, splitting by unquoted white + # space as necessary. + separate_arguments(DPCXX_BUILD_FLAGS_LIST NATIVE_COMMAND "${UR_DPCXX_BUILD_FLAGS}") + foreach(TRIPLE ${TARGET_TRIPLES}) set(EXE_PATH "${DEVICE_BINARY_DIR}/${KERNEL_NAME}_${TRIPLE}") if(${TRIPLE} MATCHES "amd") @@ -79,7 +84,7 @@ macro(add_device_binary SOURCE_FILE) add_custom_command(OUTPUT ${EXE_PATH} COMMAND ${UR_DPCXX} -fsycl -fsycl-targets=${TRIPLE} -fsycl-device-code-split=off ${AMD_TARGET_BACKEND} ${AMD_OFFLOAD_ARCH} ${AMD_NOGPULIB} - ${SOURCE_FILE} -o ${EXE_PATH} + ${DPCXX_BUILD_FLAGS_LIST} ${SOURCE_FILE} -o ${EXE_PATH} COMMAND ${CMAKE_COMMAND} -E env ${EXTRA_ENV} SYCL_DUMP_IMAGES=true ${EXE_PATH} || exit 0 diff --git a/test/conformance/device_code/bar.cpp b/test/conformance/device_code/bar.cpp index fecac40c75..58f2696bf8 100644 --- a/test/conformance/device_code/bar.cpp +++ b/test/conformance/device_code/bar.cpp @@ -3,14 +3,14 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { - cl::sycl::queue deviceQueue; - cl::sycl::range<1> numOfItems{1}; + sycl::queue deviceQueue; + sycl::range<1> numOfItems{1}; - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto kern = [=](cl::sycl::id<1>) {}; + deviceQueue.submit([&](sycl::handler &cgh) { + auto kern = [=](sycl::id<1>) {}; cgh.parallel_for(numOfItems, kern); }); diff --git a/test/conformance/device_code/fill.cpp b/test/conformance/device_code/fill.cpp index 443373edf2..fabcbcf8ec 100644 --- a/test/conformance/device_code/fill.cpp +++ b/test/conformance/device_code/fill.cpp @@ -3,19 +3,18 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { size_t array_size = 16; std::vector A(array_size, 1); uint32_t val = 42; - cl::sycl::queue sycl_queue; - auto A_buff = - cl::sycl::buffer(A.data(), cl::sycl::range<1>(array_size)); - sycl_queue.submit([&](cl::sycl::handler &cgh) { - auto A_acc = A_buff.get_access(cgh); - cgh.parallel_for(cl::sycl::range<1>{array_size}, - [A_acc, val](cl::sycl::item<1> itemId) { + sycl::queue sycl_queue; + auto A_buff = sycl::buffer(A.data(), sycl::range<1>(array_size)); + sycl_queue.submit([&](sycl::handler &cgh) { + auto A_acc = A_buff.get_access(cgh); + cgh.parallel_for(sycl::range<1>{array_size}, + [A_acc, val](sycl::item<1> itemId) { auto id = itemId.get_id(0); A_acc[id] = val; }); diff --git a/test/conformance/device_code/fill_2d.cpp b/test/conformance/device_code/fill_2d.cpp index d4dd6c704f..5fed417ed8 100644 --- a/test/conformance/device_code/fill_2d.cpp +++ b/test/conformance/device_code/fill_2d.cpp @@ -3,24 +3,24 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { size_t nd_range_x = 8; size_t nd_range_y = 8; - auto nd_range = cl::sycl::range<2>(nd_range_x, nd_range_y); + auto nd_range = sycl::range<2>(nd_range_x, nd_range_y); std::vector A(nd_range_x * nd_range_y, 1); uint32_t val = 42; - cl::sycl::queue sycl_queue; + sycl::queue sycl_queue; - auto work_range = cl::sycl::nd_range<2>(nd_range, cl::sycl::range<2>(1, 1)); - auto A_buff = cl::sycl::buffer( - A.data(), cl::sycl::range<1>(nd_range_x * nd_range_y)); - sycl_queue.submit([&](cl::sycl::handler &cgh) { - auto A_acc = A_buff.get_access(cgh); + auto work_range = sycl::nd_range<2>(nd_range, sycl::range<2>(1, 1)); + auto A_buff = sycl::buffer( + A.data(), sycl::range<1>(nd_range_x * nd_range_y)); + sycl_queue.submit([&](sycl::handler &cgh) { + auto A_acc = A_buff.get_access(cgh); cgh.parallel_for( - work_range, [A_acc, val](cl::sycl::nd_item<2> item_id) { + work_range, [A_acc, val](sycl::nd_item<2> item_id) { auto id = item_id.get_global_linear_id(); A_acc[id] = val; }); diff --git a/test/conformance/device_code/fill_3d.cpp b/test/conformance/device_code/fill_3d.cpp index a1f172ba6a..fd835222a2 100644 --- a/test/conformance/device_code/fill_3d.cpp +++ b/test/conformance/device_code/fill_3d.cpp @@ -3,26 +3,25 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { size_t nd_range_x = 4; size_t nd_range_y = 4; size_t nd_range_z = 4; - auto nd_range = cl::sycl::range<3>(nd_range_x, nd_range_y, nd_range_z); + auto nd_range = sycl::range<3>(nd_range_x, nd_range_y, nd_range_z); std::vector A(nd_range_x * nd_range_y * nd_range_y, 1); uint32_t val = 42; - cl::sycl::queue sycl_queue; + sycl::queue sycl_queue; - auto work_range = - cl::sycl::nd_range<3>(nd_range, cl::sycl::range<3>(1, 1, 1)); - auto A_buff = cl::sycl::buffer( - A.data(), cl::sycl::range<1>(nd_range_x * nd_range_y)); - sycl_queue.submit([&](cl::sycl::handler &cgh) { - auto A_acc = A_buff.get_access(cgh); + auto work_range = sycl::nd_range<3>(nd_range, sycl::range<3>(1, 1, 1)); + auto A_buff = sycl::buffer( + A.data(), sycl::range<1>(nd_range_x * nd_range_y)); + sycl_queue.submit([&](sycl::handler &cgh) { + auto A_acc = A_buff.get_access(cgh); cgh.parallel_for( - work_range, [A_acc, val](cl::sycl::nd_item<3> item_id) { + work_range, [A_acc, val](sycl::nd_item<3> item_id) { auto id = item_id.get_global_linear_id(); A_acc[id] = val; }); diff --git a/test/conformance/device_code/fill_usm.cpp b/test/conformance/device_code/fill_usm.cpp index 92cd255399..d57309cb59 100644 --- a/test/conformance/device_code/fill_usm.cpp +++ b/test/conformance/device_code/fill_usm.cpp @@ -3,17 +3,17 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { size_t array_size = 16; std::vector A(array_size, 1); uint32_t val = 42; - cl::sycl::queue sycl_queue; - uint32_t *data = cl::sycl::malloc_shared(array_size, sycl_queue); - sycl_queue.submit([&](cl::sycl::handler &cgh) { - cgh.parallel_for(cl::sycl::range<1>{array_size}, - [data, val](cl::sycl::item<1> itemId) { + sycl::queue sycl_queue; + uint32_t *data = sycl::malloc_shared(array_size, sycl_queue); + sycl_queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::range<1>{array_size}, + [data, val](sycl::item<1> itemId) { auto id = itemId.get_id(0); data[id] = val; }); diff --git a/test/conformance/device_code/foo.cpp b/test/conformance/device_code/foo.cpp index dc108b9606..20ad92be4b 100644 --- a/test/conformance/device_code/foo.cpp +++ b/test/conformance/device_code/foo.cpp @@ -3,14 +3,14 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { - cl::sycl::queue deviceQueue; - cl::sycl::range<1> numOfItems{1}; + sycl::queue deviceQueue; + sycl::range<1> numOfItems{1}; - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto kern = [=](cl::sycl::id<1>) {}; + deviceQueue.submit([&](sycl::handler &cgh) { + auto kern = [=](sycl::id<1>) {}; cgh.parallel_for(numOfItems, kern); }); diff --git a/test/conformance/device_code/image_copy.cpp b/test/conformance/device_code/image_copy.cpp index a64b601213..d04398c2fe 100644 --- a/test/conformance/device_code/image_copy.cpp +++ b/test/conformance/device_code/image_copy.cpp @@ -3,44 +3,40 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { - cl::sycl::queue sycl_queue; + sycl::queue sycl_queue; const int height = 8; const int width = 8; - auto image_range = cl::sycl::range<2>(height, width); + auto image_range = sycl::range<2>(height, width); const int channels = 4; std::vector in_data(height * width * channels, 0.5f); std::vector out_data(height * width * channels, 0); - cl::sycl::image<2> image_in( - in_data.data(), cl::sycl::image_channel_order::rgba, - cl::sycl::image_channel_type::fp32, image_range); - cl::sycl::image<2> image_out( - out_data.data(), cl::sycl::image_channel_order::rgba, - cl::sycl::image_channel_type::fp32, image_range); + sycl::image<2> image_in(in_data.data(), sycl::image_channel_order::rgba, + sycl::image_channel_type::fp32, image_range); + sycl::image<2> image_out(out_data.data(), sycl::image_channel_order::rgba, + sycl::image_channel_type::fp32, image_range); - auto work_range = - cl::sycl::nd_range<2>(image_range, cl::sycl::range<2>(1, 1)); - sycl_queue.submit([&](cl::sycl::handler &cgh) { - cl::sycl::accessor + auto work_range = sycl::nd_range<2>(image_range, sycl::range<2>(1, 1)); + sycl_queue.submit([&](sycl::handler &cgh) { + sycl::accessor in_acc(image_in, cgh); - cl::sycl::accessor + sycl::accessor out_acc(image_out, cgh); - cl::sycl::sampler smpl( - cl::sycl::coordinate_normalization_mode::unnormalized, - cl::sycl::addressing_mode::clamp, - cl::sycl::filtering_mode::nearest); + sycl::sampler smpl(sycl::coordinate_normalization_mode::unnormalized, + sycl::addressing_mode::clamp, + sycl::filtering_mode::nearest); cgh.parallel_for( - work_range, [=](cl::sycl::nd_item<2> item_id) { - auto coords = cl::sycl::int2(item_id.get_global_id(0), - item_id.get_global_id(1)); + work_range, [=](sycl::nd_item<2> item_id) { + auto coords = sycl::int2(item_id.get_global_id(0), + item_id.get_global_id(1)); out_acc.write(coords, in_acc.read(coords, smpl)); }); }); diff --git a/test/conformance/device_code/indexers_usm.cpp b/test/conformance/device_code/indexers_usm.cpp index 76b0751730..e055fa47cc 100644 --- a/test/conformance/device_code/indexers_usm.cpp +++ b/test/conformance/device_code/indexers_usm.cpp @@ -3,25 +3,24 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { - const cl::sycl::range<3> global_range(8, 8, 8); - const cl::sycl::range<3> local_range(2, 2, 2); - const cl::sycl::id<3> global_offset(4, 4, 4); - const cl::sycl::nd_range<3> nd_range(global_range, local_range, - global_offset); + const sycl::range<3> global_range(8, 8, 8); + const sycl::range<3> local_range(2, 2, 2); + const sycl::id<3> global_offset(4, 4, 4); + const sycl::nd_range<3> nd_range(global_range, local_range, global_offset); - cl::sycl::queue sycl_queue; + sycl::queue sycl_queue; const size_t elements_per_work_item = 6; - int *ptr = cl::sycl::malloc_shared(global_range[0] * global_range[1] * - global_range[2] * - elements_per_work_item, - sycl_queue); + int *ptr = + sycl::malloc_shared(global_range[0] * global_range[1] * + global_range[2] * elements_per_work_item, + sycl_queue); - sycl_queue.submit([&](cl::sycl::handler &cgh) { + sycl_queue.submit([&](sycl::handler &cgh) { cgh.parallel_for( - nd_range, [ptr](cl::sycl::nd_item<3> index) { + nd_range, [ptr](sycl::nd_item<3> index) { int *wi_ptr = ptr + index.get_global_linear_id() * elements_per_work_item; diff --git a/test/conformance/device_code/mean.cpp b/test/conformance/device_code/mean.cpp index 61623e0914..6d5a571374 100644 --- a/test/conformance/device_code/mean.cpp +++ b/test/conformance/device_code/mean.cpp @@ -3,34 +3,34 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { const int array_size = 16; const int wg_size = 4; std::vector in(array_size * wg_size, 1); std::vector out(array_size, 0); - cl::sycl::queue sycl_queue; - auto in_buff = cl::sycl::buffer( - in.data(), cl::sycl::range<1>(array_size * wg_size)); + sycl::queue sycl_queue; + auto in_buff = + sycl::buffer(in.data(), sycl::range<1>(array_size * wg_size)); auto out_buff = - cl::sycl::buffer(out.data(), cl::sycl::range<1>(array_size)); - sycl_queue.submit([&](cl::sycl::handler &cgh) { + sycl::buffer(out.data(), sycl::range<1>(array_size)); + sycl_queue.submit([&](sycl::handler &cgh) { sycl::local_accessor local_mem(wg_size, cgh); - auto in_acc = in_buff.get_access(cgh); - auto out_acc = out_buff.get_access(cgh); + auto in_acc = in_buff.get_access(cgh); + auto out_acc = out_buff.get_access(cgh); - cl::sycl::range<1> num_groups{array_size}; - cl::sycl::range<1> group_size{wg_size}; + sycl::range<1> num_groups{array_size}; + sycl::range<1> group_size{wg_size}; cgh.parallel_for_work_group( - num_groups, group_size, [=](cl::sycl::group<1> group) { + num_groups, group_size, [=](sycl::group<1> group) { auto group_id = group.get_group_id(); group.parallel_for_work_item([&](sycl::h_item<1> item) { auto local_id = item.get_local_id(0); auto in_index = (group_id * wg_size) + local_id; local_mem[local_id] = in_acc[in_index]; }); - cl::sycl::group_barrier(group); + sycl::group_barrier(group); uint32_t total = 0; for (int i = 0; i < wg_size; i++) { total += local_mem[i]; diff --git a/test/conformance/device_code/saxpy.cpp b/test/conformance/device_code/saxpy.cpp index 593e8e2435..ac113884b9 100644 --- a/test/conformance/device_code/saxpy.cpp +++ b/test/conformance/device_code/saxpy.cpp @@ -3,7 +3,7 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { size_t array_size = 16; @@ -11,20 +11,17 @@ int main() { std::vector Y(array_size, 2); std::vector Z(array_size, 0); uint32_t A = 42; - auto x_buff = - cl::sycl::buffer(X.data(), cl::sycl::range<1>(array_size)); - auto y_buff = - cl::sycl::buffer(Y.data(), cl::sycl::range<1>(array_size)); - auto z_buff = - cl::sycl::buffer(Z.data(), cl::sycl::range<1>(array_size)); + auto x_buff = sycl::buffer(X.data(), sycl::range<1>(array_size)); + auto y_buff = sycl::buffer(Y.data(), sycl::range<1>(array_size)); + auto z_buff = sycl::buffer(Z.data(), sycl::range<1>(array_size)); - cl::sycl::queue sycl_queue; - sycl_queue.submit([&](cl::sycl::handler &cgh) { - auto x_acc = x_buff.get_access(cgh); - auto y_acc = y_buff.get_access(cgh); - auto z_acc = z_buff.get_access(cgh); - cgh.parallel_for(cl::sycl::range<1>{array_size}, - [=](cl::sycl::item<1> itemId) { + sycl::queue sycl_queue; + sycl_queue.submit([&](sycl::handler &cgh) { + auto x_acc = x_buff.get_access(cgh); + auto y_acc = y_buff.get_access(cgh); + auto z_acc = z_buff.get_access(cgh); + cgh.parallel_for(sycl::range<1>{array_size}, + [=](sycl::item<1> itemId) { auto i = itemId.get_id(0); z_acc[i] = A * x_acc[i] + y_acc[i]; }); diff --git a/test/conformance/device_code/saxpy_usm.cpp b/test/conformance/device_code/saxpy_usm.cpp index 8772a7e25d..774686ab21 100644 --- a/test/conformance/device_code/saxpy_usm.cpp +++ b/test/conformance/device_code/saxpy_usm.cpp @@ -3,20 +3,20 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { size_t array_size = 16; - cl::sycl::queue sycl_queue; - uint32_t *X = cl::sycl::malloc_shared(array_size, sycl_queue); - uint32_t *Y = cl::sycl::malloc_shared(array_size, sycl_queue); - uint32_t *Z = cl::sycl::malloc_shared(array_size, sycl_queue); + sycl::queue sycl_queue; + uint32_t *X = sycl::malloc_shared(array_size, sycl_queue); + uint32_t *Y = sycl::malloc_shared(array_size, sycl_queue); + uint32_t *Z = sycl::malloc_shared(array_size, sycl_queue); uint32_t A = 42; - sycl_queue.submit([&](cl::sycl::handler &cgh) { - cgh.parallel_for(cl::sycl::range<1>{array_size}, - [=](cl::sycl::item<1> itemId) { + sycl_queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::range<1>{array_size}, + [=](sycl::item<1> itemId) { auto i = itemId.get_id(0); Z[i] = A * X[i] + Y[i]; }); diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp index 111726f6cc..752a3a839f 100644 --- a/tools/urinfo/urinfo.hpp +++ b/tools/urinfo/urinfo.hpp @@ -383,5 +383,23 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, std::cout << prefix; printDeviceInfo( hDevice, UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP); + std::cout << prefix; + printDeviceInfo( + hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP); + std::cout << prefix; + printDeviceInfo( + hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP); + std::cout << prefix; + printDeviceInfo( + hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP); + std::cout << prefix; + printDeviceInfo( + hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP); + std::cout << prefix; + printDeviceInfo( + hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP); + std::cout << prefix; + printDeviceInfo( + hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP); } } // namespace urinfo