diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml
index a61b0dc2f..67922d147 100644
--- a/.github/workflows/ci-linux.yml
+++ b/.github/workflows/ci-linux.yml
@@ -158,6 +158,16 @@ jobs:
           source .venv/bin/activate
           pip install -r tests/requirements.txt

+      - name: Query device info
+        run: |
+          source .venv/bin/activate
+          echo "aie-metadata"
+          python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --aie-metadata
+          echo "aie-version"
+          python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --aie-version
+          echo "XRT_LITE_N_CORE_ROWS=$(python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --num-rows)" >> $GITHUB_ENV
+          echo "XRT_LITE_N_CORE_COLS=$(python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --num-cols)" >> $GITHUB_ENV
+
       - name : E2E comparison of AIE to llvm-cpu
         run: |
           source .venv/bin/activate
@@ -166,7 +176,9 @@ jobs:
             $PWD/iree-install \
             $PWD/llvm-aie \
             --vitis-dir /opt/Xilinx/Vitis/2024.2 \
-            --reset-npu-between-runs -v
+            --reset-npu-between-runs -v \
+            --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS \
+            --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS

       - name: E2E correctness matmul test
         run: |
@@ -193,5 +205,5 @@ jobs:
         run: |
           DEVICE_TEST_DIR="$PWD/iree-install/device_tests"
           for t in $(ls $DEVICE_TEST_DIR); do
-            $DEVICE_TEST_DIR/$t
+            $DEVICE_TEST_DIR/$t --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS
           done
diff --git a/build_tools/ci/amdxdna_driver_utils/amdxdna_accel.py b/build_tools/ci/amdxdna_driver_utils/amdxdna_accel.py
new file mode 100644
index 000000000..edcd5f260
--- /dev/null
+++ b/build_tools/ci/amdxdna_driver_utils/amdxdna_accel.py
@@ -0,0 +1,789 @@
+# generated using clang2py amdxdna_accel.h -o amdxdna_accel.py -k cdefstum
+import ctypes
+
+
+class AsDictMixin:
+    @classmethod
+    def as_dict(cls, self):
+        result = {}
+        if not isinstance(self, AsDictMixin):
+            # not a structure, assume it's already a python object
+            return self
+        if not hasattr(cls, "_fields_"):
+            return result
+        for field_tuple in cls._fields_:  # noqa
+            field = field_tuple[0]
+            if field.startswith("PADDING_"):
+                continue
+            value = getattr(self, field)
+            type_ = type(value)
+            if hasattr(value, "_length_") and hasattr(value, "_type_"):
+                # array
+                if not hasattr(type_, "as_dict"):
+                    value = [v for v in value]
+                else:
+                    type_ = type_._type_
+                    value = [type_.as_dict(v) for v in value]
+            elif hasattr(value, "contents") and hasattr(value, "_type_"):
+                # pointer
+                try:
+                    if not hasattr(type_, "as_dict"):
+                        value = value.contents
+                    else:
+                        type_ = type_._type_
+                        value = type_.as_dict(value.contents)
+                except ValueError:
+                    # nullptr
+                    value = None
+            elif isinstance(value, AsDictMixin):
+                # other structure
+                value = type_.as_dict(value)
+            result[field] = value
+        return result
+
+
+class Structure(ctypes.Structure, AsDictMixin):
+    def __init__(self, *args, **kwds):
+        # We don't want to use positional arguments to fill PADDING_* fields
+
+        args = dict(zip(self.__class__._field_names_(), args))
+        args.update(kwds)
+        super(Structure, self).__init__(**args)
+
+    @classmethod
+    def _field_names_(cls):
+        if hasattr(cls, "_fields_"):
+            return (f[0] for f in cls._fields_ if not f[0].startswith("PADDING"))
+        else:
+            return ()
+
+    @classmethod
+    def get_type(cls, field):
+        for f in cls._fields_:
+            if f[0] == field:
+                return f[1]
+        return None
+
+    @classmethod
+    def bind(cls, bound_fields):
+        fields = {}
+        for name, type_ in cls._fields_:
+            if hasattr(type_, "restype"):
+                if name in bound_fields:
+                    if bound_fields[name] is None:
+                        fields[name] = type_()
+                    else:
+                        # use a closure to capture the callback from the loop scope
+                        fields[name] = type_(
+                            (lambda callback: lambda *args: callback(*args))(
+                                bound_fields[name]
+                            )
+                        )
+                    del bound_fields[name]
+                else:
+                    # default callback implementation (does nothing)
+                    try:
+                        default_ = type_(0).restype().value
+                    except TypeError:
+                        default_ = None
+                    fields[name] = type_(
+                        (lambda default_: lambda *args: default_)(default_)
+                    )
+            else:
+                # not a callback function, use default initialization
+                if name in bound_fields:
+                    fields[name] = bound_fields[name]
+                    del bound_fields[name]
+                else:
+                    fields[name] = type_()
+        if len(bound_fields) != 0:
+            raise ValueError(
+                "Cannot bind the following unknown callback(s) {}.{}".format(
+                    cls.__name__, bound_fields.keys()
+                )
+            )
+        return cls(**fields)
+
+
+class Union(ctypes.Union, AsDictMixin):
+    pass
+
+
+AMDXDNA_ACCEL_H_ = True  # macro
+AMDXDNA_DRIVER_MAJOR = 1  # macro
+AMDXDNA_DRIVER_MINOR = 0  # macro
+AMDXDNA_INVALID_CMD_HANDLE = ~0  # macro
+AMDXDNA_INVALID_ADDR = ~0  # macro
+AMDXDNA_INVALID_CTX_HANDLE = 0  # macro
+AMDXDNA_INVALID_BO_HANDLE = 0  # macro
+AMDXDNA_INVALID_FENCE_HANDLE = 0  # macro
+SYNC_DIRECT_TO_DEVICE = 0  # macro
+SYNC_DIRECT_FROM_DEVICE = 1  # macro
+
+# values for enumeration 'amdxdna_drm_ioctl_id'
+amdxdna_drm_ioctl_id__enumvalues = {
+    0: "DRM_AMDXDNA_CREATE_HWCTX",
+    1: "DRM_AMDXDNA_DESTROY_HWCTX",
+    2: "DRM_AMDXDNA_CONFIG_HWCTX",
+    3: "DRM_AMDXDNA_CREATE_BO",
+    4: "DRM_AMDXDNA_GET_BO_INFO",
+    5: "DRM_AMDXDNA_SYNC_BO",
+    6: "DRM_AMDXDNA_EXEC_CMD",
+    7: "DRM_AMDXDNA_WAIT_CMD",
+    8: "DRM_AMDXDNA_GET_INFO",
+    9: "DRM_AMDXDNA_SET_STATE",
+    10: "DRM_AMDXDNA_SUBMIT_WAIT",
+    11: "DRM_AMDXDNA_SUBMIT_SIGNAL",
+    12: "DRM_AMDXDNA_NUM_IOCTLS",
+}
+DRM_AMDXDNA_CREATE_HWCTX = 0
+DRM_AMDXDNA_DESTROY_HWCTX = 1
+DRM_AMDXDNA_CONFIG_HWCTX = 2
+DRM_AMDXDNA_CREATE_BO = 3
+DRM_AMDXDNA_GET_BO_INFO = 4
+DRM_AMDXDNA_SYNC_BO = 5
+DRM_AMDXDNA_EXEC_CMD = 6
+DRM_AMDXDNA_WAIT_CMD = 7
+DRM_AMDXDNA_GET_INFO = 8
+DRM_AMDXDNA_SET_STATE = 9
+DRM_AMDXDNA_SUBMIT_WAIT = 10
+DRM_AMDXDNA_SUBMIT_SIGNAL = 11
+DRM_AMDXDNA_NUM_IOCTLS = 12
+amdxdna_drm_ioctl_id = ctypes.c_uint32  # enum
+
+# values for enumeration 'amdxdna_device_type'
+amdxdna_device_type__enumvalues = {
+    -1: "AMDXDNA_DEV_TYPE_UNKNOWN",
+    0: "AMDXDNA_DEV_TYPE_KMQ",
+    1: "AMDXDNA_DEV_TYPE_UMQ",
+}
+AMDXDNA_DEV_TYPE_UNKNOWN = -1
+AMDXDNA_DEV_TYPE_KMQ = 0
+AMDXDNA_DEV_TYPE_UMQ = 1
+amdxdna_device_type = ctypes.c_int32  # enum
+
+
+class struct_amdxdna_qos_info(Structure):
+    pass
+
+
+struct_amdxdna_qos_info._pack_ = 1  # source:False
+struct_amdxdna_qos_info._fields_ = [
+    ("gops", ctypes.c_uint32),
+    ("fps", ctypes.c_uint32),
+    ("dma_bandwidth", ctypes.c_uint32),
+    ("latency", ctypes.c_uint32),
+    ("frame_exec_time", ctypes.c_uint32),
+    ("priority", ctypes.c_uint32),
+]
+
+
+class struct_amdxdna_drm_create_hwctx(Structure):
+    pass
+
+
+struct_amdxdna_drm_create_hwctx._pack_ = 1  # source:False
+struct_amdxdna_drm_create_hwctx._fields_ = [
+    ("ext", ctypes.c_uint64),
+    ("ext_flags", ctypes.c_uint64),
+    ("qos_p", ctypes.c_uint64),
+    ("umq_bo", ctypes.c_uint32),
+    ("log_buf_bo", ctypes.c_uint32),
+    ("max_opc", ctypes.c_uint32),
+    ("num_tiles", ctypes.c_uint32),
+    ("mem_size", ctypes.c_uint32),
+    ("umq_doorbell", ctypes.c_uint32),
+    ("handle", ctypes.c_uint32),
+    ("PADDING_0", ctypes.c_ubyte * 4),
+]
+
+
+# DRM_IOCTL_AMDXDNA_CREATE_HWCTX = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX , struct_amdxdna_drm_create_hwctx ) # macro
+class struct_amdxdna_drm_destroy_hwctx(Structure):
+    pass
+
+
+struct_amdxdna_drm_destroy_hwctx._pack_ = 1  # source:False
+struct_amdxdna_drm_destroy_hwctx._fields_ = [
+    ("handle", ctypes.c_uint32),
+    ("pad", ctypes.c_uint32),
+]
+
+
+# DRM_IOCTL_AMDXDNA_DESTROY_HWCTX = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX , struct_amdxdna_drm_destroy_hwctx ) # macro
+class struct_amdxdna_cu_config(Structure):
+    pass
+
+
+struct_amdxdna_cu_config._pack_ = 1  # source:False
+struct_amdxdna_cu_config._fields_ = [
+    ("cu_bo", ctypes.c_uint32),
+    ("cu_func", ctypes.c_ubyte),
+    ("pad", ctypes.c_ubyte * 3),
+]
+
+
+def struct_amdxdna_hwctx_param_config_cu(num_cus, cu_configs):
+    assert len(cu_configs) == num_cus
+
+    class struct_amdxdna_hwctx_param_config_cu(Structure):
+        pass
+
+    struct_amdxdna_hwctx_param_config_cu._pack_ = 1  # source:False
+    struct_amdxdna_hwctx_param_config_cu._fields_ = [
+        ("num_cus", ctypes.c_uint16),
+        ("pad", ctypes.c_uint16 * 3),
+        ("cu_configs", struct_amdxdna_cu_config * num_cus),
+    ]
+    struc = struct_amdxdna_hwctx_param_config_cu()
+    struc.num_cus = num_cus
+    struc.cu_configs = (struct_amdxdna_cu_config * num_cus)(*cu_configs)
+    return struc
+
+
+# values for enumeration 'amdxdna_drm_config_hwctx_param'
+amdxdna_drm_config_hwctx_param__enumvalues = {
+    0: "DRM_AMDXDNA_HWCTX_CONFIG_CU",
+    1: "DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF",
+    2: "DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF",
+    3: "DRM_AMDXDNA_HWCTX_CONFIG_NUM",
+}
+DRM_AMDXDNA_HWCTX_CONFIG_CU = 0
+DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF = 1
+DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF = 2
+DRM_AMDXDNA_HWCTX_CONFIG_NUM = 3
+amdxdna_drm_config_hwctx_param = ctypes.c_uint32  # enum
+
+
+class struct_amdxdna_drm_config_hwctx(Structure):
+    pass
+
+
+struct_amdxdna_drm_config_hwctx._pack_ = 1  # source:False
+struct_amdxdna_drm_config_hwctx._fields_ = [
+    ("handle", ctypes.c_uint32),
+    ("param_type", ctypes.c_uint32),
+    ("param_val", ctypes.c_uint64),
+    ("param_val_size", ctypes.c_uint32),
+    ("pad", ctypes.c_uint32),
+]
+
+# DRM_IOCTL_AMDXDNA_CONFIG_HWCTX = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX , struct_amdxdna_drm_config_hwctx ) # macro
+
+# values for enumeration 'amdxdna_bo_type'
+amdxdna_bo_type__enumvalues = {
+    0: "AMDXDNA_BO_INVALID",
+    1: "AMDXDNA_BO_SHMEM",
+    2: "AMDXDNA_BO_DEV_HEAP",
+    3: "AMDXDNA_BO_DEV",
+    4: "AMDXDNA_BO_CMD",
+    5: "AMDXDNA_BO_DMA",
+}
+AMDXDNA_BO_INVALID = 0
+AMDXDNA_BO_SHMEM = 1
+AMDXDNA_BO_DEV_HEAP = 2
+AMDXDNA_BO_DEV = 3
+AMDXDNA_BO_CMD = 4
+AMDXDNA_BO_DMA = 5
+amdxdna_bo_type = ctypes.c_uint32  # enum
+
+
+class struct_amdxdna_drm_create_bo(Structure):
+    pass
+
+
+struct_amdxdna_drm_create_bo._pack_ = 1  # source:False
+struct_amdxdna_drm_create_bo._fields_ = [
+    ("flags", ctypes.c_uint64),
+    ("type", ctypes.c_uint32),
+    ("_pad", ctypes.c_uint32),
+    ("vaddr", ctypes.c_uint64),
+    ("size", ctypes.c_uint64),
+    ("handle", ctypes.c_uint32),
+    ("PADDING_0", ctypes.c_ubyte * 4),
+]
+
+
+# DRM_IOCTL_AMDXDNA_CREATE_BO = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO , struct_amdxdna_drm_create_bo ) # macro
+class struct_amdxdna_drm_get_bo_info(Structure):
+    pass
+
+
+struct_amdxdna_drm_get_bo_info._pack_ = 1  # source:False
+struct_amdxdna_drm_get_bo_info._fields_ = [
+    ("ext", ctypes.c_uint64),
+    ("ext_flags", ctypes.c_uint64),
+    ("handle", ctypes.c_uint32),
+    ("_pad", ctypes.c_uint32),
+    ("map_offset", ctypes.c_uint64),
+    ("vaddr", ctypes.c_uint64),
+    ("xdna_addr", ctypes.c_uint64),
+]
+
+
+# DRM_IOCTL_AMDXDNA_GET_BO_INFO = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO , struct_amdxdna_drm_get_bo_info ) # macro
+class struct_amdxdna_drm_sync_bo(Structure):
+    pass
+
+
+struct_amdxdna_drm_sync_bo._pack_ = 1  # source:False
+struct_amdxdna_drm_sync_bo._fields_ = [
+    ("handle", ctypes.c_uint32),
+    ("direction", ctypes.c_uint32),
+    ("offset", ctypes.c_uint64),
+    ("size", ctypes.c_uint64),
+]
+
+# DRM_IOCTL_AMDXDNA_SYNC_BO = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO , struct_amdxdna_drm_sync_bo ) # macro
+
+# values for enumeration 'amdxdna_cmd_type'
+amdxdna_cmd_type__enumvalues = {
+    0: "AMDXDNA_CMD_SUBMIT_EXEC_BUF",
+    1: "AMDXDNA_CMD_SUBMIT_DEPENDENCY",
+    2: "AMDXDNA_CMD_SUBMIT_SIGNAL",
+}
+AMDXDNA_CMD_SUBMIT_EXEC_BUF = 0
+AMDXDNA_CMD_SUBMIT_DEPENDENCY = 1
+AMDXDNA_CMD_SUBMIT_SIGNAL = 2
+amdxdna_cmd_type = ctypes.c_uint32  # enum
+
+
+class struct_amdxdna_drm_exec_cmd(Structure):
+    pass
+
+
+struct_amdxdna_drm_exec_cmd._pack_ = 1  # source:False
+struct_amdxdna_drm_exec_cmd._fields_ = [
+    ("ext", ctypes.c_uint64),
+    ("ext_flags", ctypes.c_uint64),
+    ("hwctx", ctypes.c_uint32),
+    ("type", ctypes.c_uint32),
+    ("cmd_handles", ctypes.c_uint64),
+    ("args", ctypes.c_uint64),
+    ("cmd_count", ctypes.c_uint32),
+    ("arg_count", ctypes.c_uint32),
+    ("seq", ctypes.c_uint64),
+]
+
+
+# DRM_IOCTL_AMDXDNA_EXEC_CMD = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD , struct_amdxdna_drm_exec_cmd ) # macro
+class struct_amdxdna_drm_wait_cmd(Structure):
+    pass
+
+
+struct_amdxdna_drm_wait_cmd._pack_ = 1  # source:False
+struct_amdxdna_drm_wait_cmd._fields_ = [
+    ("hwctx", ctypes.c_uint32),
+    ("timeout", ctypes.c_uint32),
+    ("seq", ctypes.c_uint64),
+]
+
+
+# DRM_IOCTL_AMDXDNA_WAIT_CMD = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD , struct_amdxdna_drm_wait_cmd ) # macro
+class struct_amdxdna_drm_query_aie_status(Structure):
+    pass
+
+
+struct_amdxdna_drm_query_aie_status._pack_ = 1  # source:False
+struct_amdxdna_drm_query_aie_status._fields_ = [
+    ("buffer", ctypes.c_uint64),
+    ("buffer_size", ctypes.c_uint32),
+    ("cols_filled", ctypes.c_uint32),
+]
+
+
+class struct_amdxdna_drm_query_aie_version(Structure):
+    pass
+
+
+struct_amdxdna_drm_query_aie_version._pack_ = 1  # source:False
+struct_amdxdna_drm_query_aie_version._fields_ = [
+    ("major", ctypes.c_uint32),
+    ("minor", ctypes.c_uint32),
+]
+
+
+class struct_amdxdna_drm_query_aie_tile_metadata(Structure):
+    pass
+
+
+struct_amdxdna_drm_query_aie_tile_metadata._pack_ = 1  # source:False
+struct_amdxdna_drm_query_aie_tile_metadata._fields_ = [
+    ("row_count", ctypes.c_uint16),
+    ("row_start", ctypes.c_uint16),
+    ("dma_channel_count", ctypes.c_uint16),
+    ("lock_count", ctypes.c_uint16),
+    ("event_reg_count", ctypes.c_uint16),
+    ("pad", ctypes.c_uint16 * 3),
+]
+
+
+class struct_amdxdna_drm_query_aie_metadata(Structure):
+    pass
+
+
+struct_amdxdna_drm_query_aie_metadata._pack_ = 1  # source:False
+struct_amdxdna_drm_query_aie_metadata._fields_ = [
+    ("col_size", ctypes.c_uint32),
+    ("cols", ctypes.c_uint16),
+    ("rows", ctypes.c_uint16),
+    ("version", struct_amdxdna_drm_query_aie_version),
+    ("core", struct_amdxdna_drm_query_aie_tile_metadata),
+    ("mem", struct_amdxdna_drm_query_aie_tile_metadata),
+    ("shim", struct_amdxdna_drm_query_aie_tile_metadata),
+]
+
+
+class struct_amdxdna_drm_query_clock(Structure):
+    pass
+
+
+struct_amdxdna_drm_query_clock._pack_ = 1  # source:False
+struct_amdxdna_drm_query_clock._fields_ = [
+    ("name", ctypes.c_ubyte * 16),
+    ("freq_mhz", ctypes.c_uint32),
+    ("pad", ctypes.c_uint32),
+]
+
+
+class struct_amdxdna_drm_query_clock_metadata(Structure):
+    _pack_ = 1  # source:False
+    _fields_ = [
+        ("mp_npu_clock", struct_amdxdna_drm_query_clock),
+        ("h_clock", struct_amdxdna_drm_query_clock),
+    ]
+
+
+# values for enumeration 'amdxdna_sensor_type'
+amdxdna_sensor_type__enumvalues = {
+    0: "AMDXDNA_SENSOR_TYPE_POWER",
+}
+AMDXDNA_SENSOR_TYPE_POWER = 0
+amdxdna_sensor_type = ctypes.c_uint32  # enum
+
+
+class struct_amdxdna_drm_query_sensor(Structure):
+    pass
+
+
+struct_amdxdna_drm_query_sensor._pack_ = 1  # source:False
+struct_amdxdna_drm_query_sensor._fields_ = [
+    ("label", ctypes.c_ubyte * 64),
+    ("input", ctypes.c_uint32),
+    ("max", ctypes.c_uint32),
+    ("average", ctypes.c_uint32),
+    ("highest", ctypes.c_uint32),
+    ("status", ctypes.c_ubyte * 64),
+    ("units", ctypes.c_ubyte * 16),
+    ("unitm", ctypes.c_byte),
+    ("type", ctypes.c_ubyte),
+    ("pad", ctypes.c_ubyte * 6),
+]
+
+
+class struct_amdxdna_drm_query_hwctx(Structure):
+    pass
+
+
+struct_amdxdna_drm_query_hwctx._pack_ = 1  # source:False
+struct_amdxdna_drm_query_hwctx._fields_ = [
+    ("context_id", ctypes.c_uint32),
+    ("start_col", ctypes.c_uint32),
+    ("num_col", ctypes.c_uint32),
+    ("pad", ctypes.c_uint32),
+    ("pid", ctypes.c_int64),
+    ("command_submissions", ctypes.c_uint64),
+    ("command_completions", ctypes.c_uint64),
+    ("migrations", ctypes.c_uint64),
+    ("preemptions", ctypes.c_uint64),
+    ("errors", ctypes.c_uint64),
+]
+
+
+class struct_amdxdna_drm_aie_mem(Structure):
+    pass
+
+
+struct_amdxdna_drm_aie_mem._pack_ = 1  # source:False
+struct_amdxdna_drm_aie_mem._fields_ = [
+    ("col", ctypes.c_uint32),
+    ("row", ctypes.c_uint32),
+    ("addr", ctypes.c_uint32),
+    ("size", ctypes.c_uint32),
+    ("buf_p", ctypes.c_uint64),
+]
+
+
+class struct_amdxdna_drm_aie_reg(Structure):
+    pass
+
+
+struct_amdxdna_drm_aie_reg._pack_ = 1  # source:False
+struct_amdxdna_drm_aie_reg._fields_ = [
+    ("col", ctypes.c_uint32),
+    ("row", ctypes.c_uint32),
+    ("addr", ctypes.c_uint32),
+    ("val", ctypes.c_uint32),
+]
+
+# values for enumeration 'amdxdna_power_mode_type'
+amdxdna_power_mode_type__enumvalues = {
+    0: "POWER_MODE_DEFAULT",
+    1: "POWER_MODE_LOW",
+    2: "POWER_MODE_MEDIUM",
+    3: "POWER_MODE_HIGH",
+}
+POWER_MODE_DEFAULT = 0
+POWER_MODE_LOW = 1
+POWER_MODE_MEDIUM = 2
+POWER_MODE_HIGH = 3
+amdxdna_power_mode_type = ctypes.c_uint32  # enum
+
+
+class struct_amdxdna_drm_get_power_mode(Structure):
+    pass
+
+
+struct_amdxdna_drm_get_power_mode._pack_ = 1  # source:False
+struct_amdxdna_drm_get_power_mode._fields_ = [
+    ("power_mode", ctypes.c_ubyte),
+    ("pad", ctypes.c_ubyte * 7),
+]
+
+
+class struct_amdxdna_drm_query_firmware_version(Structure):
+    pass
+
+
+struct_amdxdna_drm_query_firmware_version._pack_ = 1  # source:False
+struct_amdxdna_drm_query_firmware_version._fields_ = [
+    ("major", ctypes.c_uint32),
+    ("minor", ctypes.c_uint32),
+    ("patch", ctypes.c_uint32),
+    ("build", ctypes.c_uint32),
+]
+
+# values for enumeration 'amdxdna_drm_get_param'
+amdxdna_drm_get_param__enumvalues = {
+    0: "DRM_AMDXDNA_QUERY_AIE_STATUS",
+    1: "DRM_AMDXDNA_QUERY_AIE_METADATA",
+    2: "DRM_AMDXDNA_QUERY_AIE_VERSION",
+    3: "DRM_AMDXDNA_QUERY_CLOCK_METADATA",
+    4: "DRM_AMDXDNA_QUERY_SENSORS",
+    5: "DRM_AMDXDNA_QUERY_HW_CONTEXTS",
+    6: "DRM_AMDXDNA_READ_AIE_MEM",
+    7: "DRM_AMDXDNA_READ_AIE_REG",
+    8: "DRM_AMDXDNA_QUERY_FIRMWARE_VERSION",
+    9: "DRM_AMDXDNA_GET_POWER_MODE",
+    10: "DRM_AMDXDNA_NUM_GET_PARAM",
+}
+DRM_AMDXDNA_QUERY_AIE_STATUS = 0
+DRM_AMDXDNA_QUERY_AIE_METADATA = 1
+DRM_AMDXDNA_QUERY_AIE_VERSION = 2
+DRM_AMDXDNA_QUERY_CLOCK_METADATA = 3
+DRM_AMDXDNA_QUERY_SENSORS = 4
+DRM_AMDXDNA_QUERY_HW_CONTEXTS = 5
+DRM_AMDXDNA_READ_AIE_MEM = 6
+DRM_AMDXDNA_READ_AIE_REG = 7
+DRM_AMDXDNA_QUERY_FIRMWARE_VERSION = 8
+DRM_AMDXDNA_GET_POWER_MODE = 9
+DRM_AMDXDNA_NUM_GET_PARAM = 10
+amdxdna_drm_get_param = ctypes.c_uint32  # enum
+
+
+class struct_amdxdna_drm_get_info(Structure):
+    pass
+
+
+struct_amdxdna_drm_get_info._pack_ = 1  # source:False
+struct_amdxdna_drm_get_info._fields_ = [
+    ("param", ctypes.c_uint32),
+    ("buffer_size", ctypes.c_uint32),
+    ("buffer", ctypes.c_uint64),
+]
+
+
+# DRM_IOCTL_AMDXDNA_GET_INFO = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_GET_INFO , struct_amdxdna_drm_get_info ) # macro
+class struct_amdxdna_drm_set_power_mode(Structure):
+    pass
+
+
+struct_amdxdna_drm_set_power_mode._pack_ = 1  # source:False
+struct_amdxdna_drm_set_power_mode._fields_ = [
+    ("power_mode", ctypes.c_ubyte),
+    ("pad", ctypes.c_ubyte * 7),
+]
+
+# values for enumeration 'amdxdna_drm_set_param'
+amdxdna_drm_set_param__enumvalues = {
+    0: "DRM_AMDXDNA_SET_POWER_MODE",
+    1: "DRM_AMDXDNA_WRITE_AIE_MEM",
+    2: "DRM_AMDXDNA_WRITE_AIE_REG",
+    3: "DRM_AMDXDNA_NUM_SET_PARAM",
+}
+DRM_AMDXDNA_SET_POWER_MODE = 0
+DRM_AMDXDNA_WRITE_AIE_MEM = 1
+DRM_AMDXDNA_WRITE_AIE_REG = 2
+DRM_AMDXDNA_NUM_SET_PARAM = 3
+amdxdna_drm_set_param = ctypes.c_uint32  # enum
+
+
+class struct_amdxdna_drm_set_state(Structure):
+    pass
+
+
+struct_amdxdna_drm_set_state._pack_ = 1  # source:False
+struct_amdxdna_drm_set_state._fields_ = [
+    ("param", ctypes.c_uint32),
+    ("buffer_size", ctypes.c_uint32),
+    ("buffer", ctypes.c_uint64),
+]
+
+
+# DRM_IOCTL_AMDXDNA_SET_STATE = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE , struct_amdxdna_drm_set_state ) # macro
+class struct_amdxdna_drm_syncobjs(Structure):
+    pass
+
+
+struct_amdxdna_drm_syncobjs._pack_ = 1  # source:False
+struct_amdxdna_drm_syncobjs._fields_ = [
+    ("handles", ctypes.c_uint64),
+    ("points", ctypes.c_uint64),
+    ("count", ctypes.c_uint32),
+    ("pad", ctypes.c_uint32),
+]
+
+
+def struct_amdxdna_cmd_chain(command_count):
+    class struct_amdxdna_cmd_chain(Structure):
+        pass
+
+    struct_amdxdna_cmd_chain._pack_ = 1  # source:False
+    struct_amdxdna_cmd_chain._fields_ = [
+        ("command_count", ctypes.c_uint32),
+        ("submit_index", ctypes.c_uint32),
+        ("error_index", ctypes.c_uint32),
+        ("reserved", ctypes.c_uint32 * 3),
+        ("data", ctypes.c_uint64 * command_count),
+    ]
+    return struct_amdxdna_cmd_chain
+
+
+def struct_amdxdna_cmd(count):
+    class struct_amdxdna_cmd(Structure):
+        pass
+
+    struct_amdxdna_cmd._pack_ = 1  # source:False
+    struct_amdxdna_cmd._fields_ = [
+        ("state", ctypes.c_uint32, 4),
+        ("unused", ctypes.c_uint32, 6),
+        ("extra_cu_masks", ctypes.c_uint32, 2),
+        ("count", ctypes.c_uint32, 11),
+        ("opcode", ctypes.c_uint32, 5),
+        ("reserved", ctypes.c_uint32, 4),
+        ("data", ctypes.c_uint32 * count),
+    ]
+    return struct_amdxdna_cmd
+
+
+# DRM_IOCTL_AMDXDNA_SUBMIT_WAIT = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_WAIT , struct_amdxdna_drm_syncobjs ) # macro
+# DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_SIGNAL , struct_amdxdna_drm_syncobjs ) # macro
+__all__ = [
+    "AMDXDNA_ACCEL_H_",
+    "AMDXDNA_BO_CMD",
+    "AMDXDNA_BO_DEV",
+    "AMDXDNA_BO_DEV_HEAP",
+    "AMDXDNA_BO_DMA",
+    "AMDXDNA_BO_INVALID",
+    "AMDXDNA_BO_SHMEM",
+    "AMDXDNA_CMD_SUBMIT_DEPENDENCY",
+    "AMDXDNA_CMD_SUBMIT_EXEC_BUF",
+    "AMDXDNA_CMD_SUBMIT_SIGNAL",
+    "AMDXDNA_DEV_TYPE_KMQ",
+    "AMDXDNA_DEV_TYPE_UMQ",
+    "AMDXDNA_DEV_TYPE_UNKNOWN",
+    "AMDXDNA_DRIVER_MAJOR",
+    "AMDXDNA_DRIVER_MINOR",
+    "AMDXDNA_INVALID_ADDR",
+    "AMDXDNA_INVALID_BO_HANDLE",
+    "AMDXDNA_INVALID_CMD_HANDLE",
+    "AMDXDNA_INVALID_CTX_HANDLE",
+    "AMDXDNA_INVALID_FENCE_HANDLE",
+    "AMDXDNA_SENSOR_TYPE_POWER",
+    "DRM_AMDXDNA_CONFIG_HWCTX",
+    "DRM_AMDXDNA_CREATE_BO",
+    "DRM_AMDXDNA_CREATE_HWCTX",
+    "DRM_AMDXDNA_DESTROY_HWCTX",
+    "DRM_AMDXDNA_EXEC_CMD",
+    "DRM_AMDXDNA_GET_BO_INFO",
+    "DRM_AMDXDNA_GET_INFO",
+    "DRM_AMDXDNA_GET_POWER_MODE",
+    "DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF",
+    "DRM_AMDXDNA_HWCTX_CONFIG_CU",
+    "DRM_AMDXDNA_HWCTX_CONFIG_NUM",
+    "DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF",
+    "DRM_AMDXDNA_NUM_GET_PARAM",
+    "DRM_AMDXDNA_NUM_IOCTLS",
+    "DRM_AMDXDNA_NUM_SET_PARAM",
+    "DRM_AMDXDNA_QUERY_AIE_METADATA",
+    "DRM_AMDXDNA_QUERY_AIE_STATUS",
+    "DRM_AMDXDNA_QUERY_AIE_VERSION",
+    "DRM_AMDXDNA_QUERY_CLOCK_METADATA",
+    "DRM_AMDXDNA_QUERY_FIRMWARE_VERSION",
+    "DRM_AMDXDNA_QUERY_HW_CONTEXTS",
+    "DRM_AMDXDNA_QUERY_SENSORS",
+    "DRM_AMDXDNA_READ_AIE_MEM",
+    "DRM_AMDXDNA_READ_AIE_REG",
+    "DRM_AMDXDNA_SET_POWER_MODE",
+    "DRM_AMDXDNA_SET_STATE",
+    "DRM_AMDXDNA_SUBMIT_SIGNAL",
+    "DRM_AMDXDNA_SUBMIT_WAIT",
+    "DRM_AMDXDNA_SYNC_BO",
+    "DRM_AMDXDNA_WAIT_CMD",
+    "DRM_AMDXDNA_WRITE_AIE_MEM",
+    "DRM_AMDXDNA_WRITE_AIE_REG",
+    "POWER_MODE_DEFAULT",
+    "POWER_MODE_HIGH",
+    "POWER_MODE_LOW",
+    "POWER_MODE_MEDIUM",
+    "SYNC_DIRECT_FROM_DEVICE",
+    "SYNC_DIRECT_TO_DEVICE",
+    "amdxdna_bo_type",
+    "amdxdna_cmd_type",
+    "amdxdna_device_type",
+    "amdxdna_drm_config_hwctx_param",
+    "amdxdna_drm_get_param",
+    "amdxdna_drm_ioctl_id",
+    "amdxdna_drm_set_param",
+    "amdxdna_power_mode_type",
+    "amdxdna_sensor_type",
+    "struct_amdxdna_cu_config",
+    "struct_amdxdna_drm_aie_mem",
+    "struct_amdxdna_drm_aie_reg",
+    "struct_amdxdna_drm_config_hwctx",
+    "struct_amdxdna_drm_create_bo",
+    "struct_amdxdna_drm_create_hwctx",
+    "struct_amdxdna_drm_destroy_hwctx",
+    "struct_amdxdna_drm_exec_cmd",
+    "struct_amdxdna_drm_get_bo_info",
+    "struct_amdxdna_drm_get_info",
+    "struct_amdxdna_drm_get_power_mode",
+    "struct_amdxdna_drm_query_aie_metadata",
+    "struct_amdxdna_drm_query_aie_status",
+    "struct_amdxdna_drm_query_aie_tile_metadata",
+    "struct_amdxdna_drm_query_aie_version",
+    "struct_amdxdna_drm_query_clock",
+    "struct_amdxdna_drm_query_clock_metadata",
+    "struct_amdxdna_drm_query_firmware_version",
+    "struct_amdxdna_drm_query_hwctx",
+    "struct_amdxdna_drm_query_sensor",
+    "struct_amdxdna_drm_set_power_mode",
+    "struct_amdxdna_drm_set_state",
+    "struct_amdxdna_drm_sync_bo",
+    "struct_amdxdna_drm_syncobjs",
+    "struct_amdxdna_drm_wait_cmd",
+    "struct_amdxdna_hwctx_param_config_cu",
+    "struct_amdxdna_qos_info",
+    "struct_amdxdna_cmd_chain",
+    "struct_amdxdna_cmd",
+]
diff --git a/build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py b/build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py
new file mode 100644
index 000000000..38aafabc0
--- /dev/null
+++ b/build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py
@@ -0,0 +1,217 @@
+import argparse
+import array
+import ctypes
+import ctypes.util
+import fcntl
+import pathlib
+import re
+import struct
+from argparse import Namespace
+from pprint import pformat
+
+import amdxdna_accel
+from amdxdna_accel import (
+    struct_amdxdna_drm_query_aie_version,
+    struct_amdxdna_drm_get_info,
+    struct_amdxdna_drm_query_aie_metadata,
+    DRM_AMDXDNA_QUERY_AIE_VERSION,
+    DRM_AMDXDNA_QUERY_AIE_METADATA,
+)
+
+_IOC_NRBITS = 8
+_IOC_TYPEBITS = 8
+_IOC_SIZEBITS = 14
+_IOC_DIRBITS = 2
+
+_IOC_NRMASK = (1 << _IOC_NRBITS) - 1
+_IOC_TYPEMASK = (1 << _IOC_TYPEBITS) - 1
+_IOC_SIZEMASK = (1 << _IOC_SIZEBITS) - 1
+_IOC_DIRMASK = (1 << _IOC_DIRBITS) - 1
+
+_IOC_NRSHIFT = 0
+_IOC_TYPESHIFT = _IOC_NRSHIFT + _IOC_NRBITS
+_IOC_SIZESHIFT = _IOC_TYPESHIFT + _IOC_TYPEBITS
+_IOC_DIRSHIFT = _IOC_SIZESHIFT + _IOC_SIZEBITS
+
+IOC_NONE = 0
+IOC_WRITE = 1
+IOC_READ = 2
+
+
+def _IOC(dir, type, nr, size):
+    assert dir <= _IOC_DIRMASK, dir
+    assert type <= _IOC_TYPEMASK, type
+    assert nr <= _IOC_NRMASK, nr
+    assert size <= _IOC_SIZEMASK, size
+    return (
+        (dir << _IOC_DIRSHIFT)
+        | (type << _IOC_TYPESHIFT)
+        | (nr << _IOC_NRSHIFT)
+        | (size << _IOC_SIZESHIFT)
+    )
+
+
+def _IOC_TYPECHECK(t):
+    if isinstance(t, (memoryview, bytearray)):
+        size = len(t)
+    elif isinstance(t, struct.Struct):
+        size = t.size
+    elif isinstance(t, array.array):
+        size = t.itemsize * len(t)
+    else:
+        size = ctypes.sizeof(t)
+    assert size <= _IOC_SIZEMASK, size
+    return size
+
+
+def _IOWR(type, nr, size):
+    return _IOC(IOC_READ | IOC_WRITE, type, nr, _IOC_TYPECHECK(size))
+
+
+def get_struct(argp, stype):
+    return ctypes.cast(ctypes.c_void_p(argp), ctypes.POINTER(stype)).contents
+
+
+def get_void_ptr_to_struct(s):
+    ptr = ctypes.pointer(s)
+    return ctypes.cast(ptr, ctypes.c_void_p)
+
+
+def format_struct(s):
+    return pformat(s.as_dict(s))
+
+
+#
+DRM_IOCTL_BASE = ord("d")
+DRM_COMMAND_BASE = 0x40
+
+
+def DRM_IOWR(nr, type):
+    return _IOWR(DRM_IOCTL_BASE, nr, type)
+
+
+def ioctls_from_header():
+    hdr = (
+        (pathlib.Path(__file__).parent / "amdxdna_accel.py")
+        .read_text()
+        .replace("\\\n", "")
+    )
+    pattern = r"DRM_IOCTL_AMDXDNA_([A-Z0-9_]+) = DRM_IOWR \( DRM_COMMAND_BASE \+ DRM_AMDXDNA_([A-Z0-9_]+) , struct_amdxdna_drm_([a-z0-9_]+) \)"
+    matches = re.findall(pattern, hdr, re.MULTILINE)
+    ioctls = Namespace()
+    for name, offset, sname in matches:
+        assert name == offset
+        offset = f"DRM_AMDXDNA_{name}"
+        assert hasattr(amdxdna_accel, offset)
+        offset = getattr(amdxdna_accel, offset)
+        struc = getattr(amdxdna_accel, "struct_amdxdna_drm_" + sname)
+        setattr(
+            ioctls,
+            f"DRM_IOCTL_AMDXDNA_{name}",
+            DRM_IOWR(DRM_COMMAND_BASE + offset, struc),
+        )
+
+    return ioctls
+
+
+ioctls = ioctls_from_header()
+
+
+def get_aie_version(drv_fd):
+    version = struct_amdxdna_drm_query_aie_version()
+    info_params = struct_amdxdna_drm_get_info(
+        DRM_AMDXDNA_QUERY_AIE_VERSION,
+        ctypes.sizeof(struct_amdxdna_drm_query_aie_version),
+        get_void_ptr_to_struct(version).value,
+    )
+
+    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_GET_INFO, info_params)
+
+    return version.major, version.minor
+
+
+def get_aie_metadata(drv_fd):
+    metadata = struct_amdxdna_drm_query_aie_metadata()
+    info_params = struct_amdxdna_drm_get_info(
+        DRM_AMDXDNA_QUERY_AIE_METADATA,
+        ctypes.sizeof(struct_amdxdna_drm_query_aie_metadata),
+        get_void_ptr_to_struct(metadata).value,
+    )
+
+    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_GET_INFO, info_params)
+
+    return format_struct(metadata)
+
+
+def get_core_n_rows(drv_fd):
+    metadata = struct_amdxdna_drm_query_aie_metadata()
+    info_params = struct_amdxdna_drm_get_info(
+        DRM_AMDXDNA_QUERY_AIE_METADATA,
+        ctypes.sizeof(struct_amdxdna_drm_query_aie_metadata),
+        get_void_ptr_to_struct(metadata).value,
+    )
+
+    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_GET_INFO, info_params)
+    return metadata.core.row_count
+
+
+def find_npu_device():
+    drvpath = pathlib.Path("/sys/bus/pci/drivers/amdxdna")
+    for file in drvpath.iterdir():
+        if file.is_symlink():
+            actual_path = (drvpath / file.readlink()).resolve()
+            if str(actual_path).startswith("/sys/devices/pci"):
+                return actual_path
+    raise RuntimeError("npu device not found")
+
+
+def read_vbnv(npu_device_path):
+    with open(npu_device_path / "vbnv") as f:
+        vbnv = f.read()
+    assert vbnv.startswith("RyzenAI-")
+    return vbnv.split("-")[-1].strip()
+
+
+def get_core_n_cols(drv_fd, npu_device):
+    metadata = struct_amdxdna_drm_query_aie_metadata()
+    info_params = struct_amdxdna_drm_get_info(
+        DRM_AMDXDNA_QUERY_AIE_METADATA,
+        ctypes.sizeof(struct_amdxdna_drm_query_aie_metadata),
+        get_void_ptr_to_struct(metadata).value,
+    )
+
+    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_GET_INFO, info_params)
+    if npu_device == "npu1":
+        # phoenix
+        return metadata.cols - 1
+    elif npu_device == "npu4":
+        # strix
+        return metadata.cols
+
+    raise NotImplementedError(f"unrecognized {npu_device=}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--npu-device", action="store_true")
+    parser.add_argument("--num-rows", action="store_true")
+    parser.add_argument("--num-cols", action="store_true")
+    parser.add_argument("--aie-metadata", action="store_true")
+    parser.add_argument("--aie-version", action="store_true")
+    args = parser.parse_args()
+
+    drv_path = "/dev/accel/accel0"
+    drv_fd = open(drv_path, "r+")
+    npu_device_path = find_npu_device()
+    npu_device = read_vbnv(npu_device_path)
+
+    if args.npu_device:
+        print(npu_device)
+    if args.num_rows:
+        print(get_core_n_rows(drv_fd))
+    if args.num_cols:
+        print(get_core_n_cols(drv_fd, npu_device))
+    if args.aie_metadata:
+        print(get_aie_metadata(drv_fd))
+    if args.aie_version:
+        print(get_aie_version(drv_fd))
diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
index 8ddd22ddc..87622dee3 100755
--- a/build_tools/ci/cpu_comparison/run.py
+++ b/build_tools/ci/cpu_comparison/run.py
@@ -197,6 +197,11 @@ def generate_aie_output(config, aie_vmfb, input_args, function_name, name, outpu
     ]
     if function_name:
         run_args += [f"--function={function_name}"]
+    if config.xrt_lite_n_core_rows is not None:
+        run_args += [f"--xrt_lite_n_core_rows={config.xrt_lite_n_core_rows}"]
+    if config.xrt_lite_n_core_cols is not None:
+        run_args += [f"--xrt_lite_n_core_cols={config.xrt_lite_n_core_cols}"]
+
     if config.reset_npu_between_runs:
         shell_out(config.reset_npu_script, verbose=config.verbose)

@@ -269,6 +274,8 @@ def __init__(
         do_not_run_aie,
         additional_aie_compilation_flags,
         device_hal,
+        xrt_lite_n_core_rows,
+        xrt_lite_n_core_cols,
     ):
         self.output_dir = output_dir
         self.iree_install_dir = iree_install_dir
@@ -286,6 +293,8 @@ def __init__(
         self.do_not_run_aie = do_not_run_aie
        self.additional_aie_compilation_flags = additional_aie_compilation_flags
         self.device_hal = device_hal
+        self.xrt_lite_n_core_rows = xrt_lite_n_core_rows
+        self.xrt_lite_n_core_cols = xrt_lite_n_core_cols

         # Try get the xrt and (linux) kernel versions.
        self.linux_kernel = "undetermined"
@@ -849,7 +858,9 @@ def all_tests(
     do_not_run_aie,
     test_set,
     additional_aie_compilation_flags,
-    device_hal
+    device_hal,
+    xrt_lite_n_core_rows,
+    xrt_lite_n_core_cols,
 ):
     """
     There are a few ways to add tests to this script:
@@ -891,7 +902,9 @@ def all_tests(
         reset_npu_between_runs,
         do_not_run_aie,
         additional_aie_compilation_flags,
-        device_hal
+        device_hal,
+        xrt_lite_n_core_rows,
+        xrt_lite_n_core_cols,
     )
     if verbose:
         print(config)
@@ -946,6 +959,8 @@ def all_tests(
    parser.add_argument("peano_install_dir", type=abs_path)
    parser.add_argument("--xrt-dir", type=abs_path)
    parser.add_argument("--vitis-dir", type=abs_path)
+    parser.add_argument("--xrt_lite_n_core_rows", type=int)
+    parser.add_argument("--xrt_lite_n_core_cols", type=int)

    # TODO(newling) make bool options boolean, not integer (tried but had issues)
    parser.add_argument(
@@ -1052,5 +1067,7 @@ def all_tests(
        args.do_not_run_aie,
        test_set_list,
        args.additional_aie_compilation_flags,
-        args.device_hal
+        args.device_hal,
+        args.xrt_lite_n_core_rows,
+        args.xrt_lite_n_core_cols,
    )
diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh
index c1c5a6d56..6a6146487 100755
--- a/build_tools/ci/run_matmul_test.sh
+++ b/build_tools/ci/run_matmul_test.sh
@@ -464,6 +464,13 @@ function run_matmul_test() {
       --device=${DEVICE_HAL} \
       --max_elements_to_check=${max_elements_to_check}"

+  if [ -n "$XRT_LITE_N_CORE_ROWS" ]; then
+    COMMAND="${COMMAND} --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS"
+  fi
+  if [ -n "$XRT_LITE_N_CORE_COLS" ]; then
+    COMMAND="${COMMAND} --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS"
+  fi
+
   total_num_runs=$(( num_repeat_runs * num_corruption_repeat_runs))
   echo "**** Running '${name}' matmul test ${total_num_runs} times (command ${COMMAND}) ****"
   for i in $(seq 1 $num_repeat_runs); do
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h
index 62b2a9fae..c969388ba 100644
--- a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h
@@ -10,13 +10,16 @@
 #include "iree/base/api.h"
 #include "iree/hal/api.h"

-struct iree_hal_xrt_lite_device_options {};
+struct iree_hal_xrt_lite_device_params {
+  int32_t n_core_rows;
+  int32_t n_core_cols;
+};

 IREE_API_EXPORT void iree_hal_xrt_lite_device_options_initialize(
-    struct iree_hal_xrt_lite_device_options* out_params);
+    struct iree_hal_xrt_lite_device_params* out_params);

 struct iree_hal_xrt_lite_driver_options {
-  struct iree_hal_xrt_lite_device_options default_device_options;
+  struct iree_hal_xrt_lite_device_params device_params;
 };

 IREE_API_EXPORT void iree_hal_xrt_lite_driver_options_initialize(
@@ -30,11 +33,12 @@ IREE_API_EXPORT void iree_hal_xrt_lite_driver_options_initialize(
 IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create(
     iree_string_view_t identifier,
     const struct iree_hal_xrt_lite_driver_options* options,
+    const struct iree_hal_xrt_lite_device_params* device_params,
     iree_allocator_t host_allocator, iree_hal_driver_t** out_driver);

 IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_device_create(
     iree_string_view_t identifier,
-    const struct iree_hal_xrt_lite_device_options* options,
+    const struct iree_hal_xrt_lite_device_params* params,
     iree_allocator_t host_allocator, iree_hal_device_t** out_device);

 #endif  // IREE_AMD_AIE_DRIVER_XRT_LITE_API_H_
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc
index 6239b10e4..c9d3bb64e 100644
--- a/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc
@@ -8,6 +8,7 @@
 #include "iree-amd-aie/driver/xrt-lite/allocator.h"
 #include "iree-amd-aie/driver/xrt-lite/api.h"
+#include "iree-amd-aie/driver/xrt-lite/device.h"
 #include "iree-amd-aie/driver/xrt-lite/direct_command_buffer.h"
 #include "iree-amd-aie/driver/xrt-lite/nop_executable_cache.h"
 #include "iree-amd-aie/driver/xrt-lite/nop_semaphore.h"
@@ -21,38 +22,25 @@ namespace {
 extern const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable;
 }

-struct iree_hal_xrt_lite_device {
-  iree_hal_resource_t resource;
-  iree_allocator_t host_allocator;
-  // TODO(max): not used because "device allocations" are performed through
-  // device
-  iree_hal_allocator_t* device_allocator;
-  // block pool used for command buffer allocations, uses a larger block size
-  // since command buffers can contain inlined data
-  iree_arena_block_pool_t block_pool;
-  shim_xdna::device* shim_device;
-  // should come last; see the definition of total_size below in
-  // iree_hal_xrt_lite_device_create
-  iree_string_view_t identifier;
-
-  iree_hal_xrt_lite_device(const iree_hal_xrt_lite_device_options* options,
-                           iree_allocator_t host_allocator) {
-    IREE_ASSERT_ARGUMENT(options);
-    IREE_TRACE_ZONE_BEGIN(z0);
-
-    iree_hal_resource_initialize(&iree_hal_xrt_lite_device_vtable, &resource);
-    this->host_allocator = host_allocator;
-    shim_device = new shim_xdna::device;
-
-    iree_status_t status = iree_hal_xrt_lite_allocator_create(
-        host_allocator, shim_device, &device_allocator);
-    IREE_ASSERT(iree_status_is_ok(status));
-    iree_arena_block_pool_initialize(ARENA_BLOCK_SIZE, host_allocator,
-                                     &block_pool);
+iree_hal_xrt_lite_device::iree_hal_xrt_lite_device(
+    const iree_hal_xrt_lite_device_params* options,
+    iree_allocator_t host_allocator) {
+  IREE_ASSERT_ARGUMENT(options);
+  IREE_TRACE_ZONE_BEGIN(z0);

-    IREE_TRACE_ZONE_END(z0);
-  }
-};
+  iree_hal_resource_initialize(&iree_hal_xrt_lite_device_vtable, &resource);
+  this->host_allocator = host_allocator;
+  shim_device =
+      new shim_xdna::device(options->n_core_rows, options->n_core_cols);
+
+  iree_status_t status = iree_hal_xrt_lite_allocator_create(
+      host_allocator, shim_device, &device_allocator);
+  IREE_ASSERT(iree_status_is_ok(status));
+  iree_arena_block_pool_initialize(ARENA_BLOCK_SIZE, host_allocator,
+                                   &block_pool);
+
+  IREE_TRACE_ZONE_END(z0);
+}

 static iree_status_t iree_hal_xrt_lite_device_create_executable_cache(
     iree_hal_device_t* base_device, iree_string_view_t identifier,
@@ -123,8 +111,7 @@ static iree_status_t iree_hal_xrt_lite_device_queue_execute(
       IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED;
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
       z0, iree_hal_xrt_lite_direct_command_buffer_create(
-              device->shim_device, device->device_allocator, mode,
-              IREE_HAL_COMMAND_CATEGORY_ANY,
+              device, mode, IREE_HAL_COMMAND_CATEGORY_ANY,
              /*binding_capacity=*/0, &device->block_pool,
              device->host_allocator, &xrt_command_buffer));
   IREE_RETURN_AND_END_ZONE_IF_ERROR(
@@ -244,7 +231,7 @@ static iree_hal_allocator_t* iree_hal_xrt_lite_device_device_allocator(
 }

 void iree_hal_xrt_lite_device_options_initialize(
-    iree_hal_xrt_lite_device_options* out_options) {
+    iree_hal_xrt_lite_device_params* out_options) {
   IREE_TRACE_ZONE_BEGIN(z0);

   memset(out_options, 0, sizeof(*out_options));
@@ -254,7 +241,7 @@ void iree_hal_xrt_lite_device_options_initialize(

 iree_status_t iree_hal_xrt_lite_device_create(
     iree_string_view_t identifier,
-    const iree_hal_xrt_lite_device_options* options,
+    const iree_hal_xrt_lite_device_params* options,
     iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
   IREE_ASSERT_ARGUMENT(options);
   IREE_ASSERT_ARGUMENT(out_device);
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.h b/runtime/src/iree-amd-aie/driver/xrt-lite/device.h
new file mode 100644
index 000000000..ad3141e88
--- /dev/null
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.h
@@ -0,0 +1,33 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_XRT_LITE_DEVICE_H_
+#define IREE_AMD_AIE_DRIVER_XRT_LITE_XRT_LITE_DEVICE_H_
+
+#include "iree-amd-aie/driver/xrt-lite/api.h"
+#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h"
+#include "iree/base/internal/arena.h"
+#include "iree/hal/api.h"
+
+struct iree_hal_xrt_lite_device {
+  iree_hal_resource_t resource;
+  iree_allocator_t host_allocator;
+  // TODO(max): not used because "device allocations" are performed through
+  // device
+  iree_hal_allocator_t* device_allocator;
+  // block pool used for command buffer allocations, uses a larger block size
+  // since command buffers can contain inlined data
+  iree_arena_block_pool_t block_pool;
+  shim_xdna::device* shim_device;
+  // should come last; see the definition of total_size below in
+  // iree_hal_xrt_lite_device_create
+  iree_string_view_t identifier;
+
+  iree_hal_xrt_lite_device(const iree_hal_xrt_lite_device_params* options,
+                           iree_allocator_t host_allocator);
+};
+
+#endif  // IREE_AMD_AIE_DRIVER_XRT_LITE_XRT_LITE_DEVICE_H_
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc
index f5a79dc10..5861ebd8b 100644
--- a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc
@@ -22,7 +22,7 @@ struct iree_hal_xrt_lite_direct_command_buffer {
   // Staging arena used for host->device transfers.
   iree_arena_allocator_t arena;
-  shim_xdna::device* shim_device;
+  iree_hal_xrt_lite_device* device;
 };

 namespace {
@@ -31,13 +31,12 @@ extern const iree_hal_command_buffer_vtable_t
 }  // namespace

 iree_status_t iree_hal_xrt_lite_direct_command_buffer_create(
-    shim_xdna::device* shim_device, iree_hal_allocator_t* device_allocator,
-    iree_hal_command_buffer_mode_t mode,
+    iree_hal_xrt_lite_device* device, iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
     iree_host_size_t binding_capacity, iree_arena_block_pool_t* block_pool,
     iree_allocator_t host_allocator,
     iree_hal_command_buffer_t** out_command_buffer) {
-  IREE_ASSERT_ARGUMENT(device_allocator);
+  IREE_ASSERT_ARGUMENT(device);
   IREE_ASSERT_ARGUMENT(out_command_buffer);
   *out_command_buffer = nullptr;
   if (binding_capacity > 0) {
@@ -57,12 +56,12 @@ iree_status_t iree_hal_xrt_lite_direct_command_buffer_create(
           mode, binding_capacity),
       reinterpret_cast<void**>(&command_buffer)));
   iree_hal_command_buffer_initialize(
-      device_allocator, mode, command_categories, IREE_HAL_QUEUE_AFFINITY_ANY,
-      binding_capacity,
+      device->device_allocator, mode, command_categories,
+      IREE_HAL_QUEUE_AFFINITY_ANY, binding_capacity,
       reinterpret_cast<uint8_t*>(command_buffer) + sizeof(*command_buffer),
       &iree_hal_xrt_lite_direct_command_buffer_vtable, &command_buffer->base);
   command_buffer->host_allocator = host_allocator;
-  command_buffer->shim_device = shim_device;
+  command_buffer->device = device;
   iree_arena_initialize(block_pool, &command_buffer->arena);
   iree_status_t status =
       iree_hal_resource_set_allocate(block_pool, &command_buffer->resource_set);
@@ -164,15 +163,17 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch(
           &executable));

   size_t ctrl_code_size = kernel_params.asm_inst.size() * sizeof(uint32_t);
-  auto bo_ctrl_code = command_buffer->shim_device->alloc_bo(
+  auto bo_ctrl_code = command_buffer->device->shim_device->alloc_bo(
       ctrl_code_size, XCL_BO_FLAGS_CACHEABLE);
   uint32_t* instr_buffer = static_cast<uint32_t*>(bo_ctrl_code->map());
   memcpy(instr_buffer, kernel_params.asm_inst.data(), ctrl_code_size);
   bo_ctrl_code->sync(shim_xdna::direction::host2device);

-  shim_xdna::kernel ebuf(command_buffer->shim_device->get_pdev(), ERT_START_CU);
-  shim_xdna::hw_ctx context = command_buffer->shim_device->create_hw_context(
-      kernel_params.pdi, kernel_params.kernel_name);
+  shim_xdna::kernel ebuf(command_buffer->device->shim_device->get_pdev(),
+                         ERT_START_CU);
+  shim_xdna::hw_ctx context =
+      command_buffer->device->shim_device->create_hw_context(
+          kernel_params.pdi, kernel_params.kernel_name);
   shim_xdna::cuidx_t cu_idx =
       context.open_cu_context(kernel_params.kernel_name);

diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h
index 1612c9509..da797f20f 100644
--- a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h
@@ -7,6 +7,7 @@
 #ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_XRT_LITE_COMMAND_BUFFER_H_
 #define IREE_AMD_AIE_DRIVER_XRT_LITE_XRT_LITE_COMMAND_BUFFER_H_

+#include "iree-amd-aie/driver/xrt-lite/device.h"
 #include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h"
 #include "iree/base/internal/arena.h"
 #include "iree/hal/api.h"
@@ -14,8 +15,7 @@
 // `out_command_buffer` must be released by the caller (see
 // iree_hal_command_buffer_release).
 iree_status_t iree_hal_xrt_lite_direct_command_buffer_create(
-    shim_xdna::device* shim_device, iree_hal_allocator_t* device_allocator,
-    iree_hal_command_buffer_mode_t mode,
+    iree_hal_xrt_lite_device* device, iree_hal_command_buffer_mode_t mode,
     iree_hal_command_category_t command_categories,
     iree_host_size_t binding_capacity, iree_arena_block_pool_t* block_pool,
     iree_allocator_t host_allocator,
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc
index eda721bb1..87a7b9c1f 100644
--- a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc
@@ -26,8 +26,7 @@ void iree_hal_xrt_lite_driver_options_initialize(
   IREE_TRACE_ZONE_BEGIN(z0);

   memset(out_options, 0, sizeof(*out_options));
-  iree_hal_xrt_lite_device_options_initialize(
-      &out_options->default_device_options);
+  iree_hal_xrt_lite_device_options_initialize(&out_options->device_params);

   IREE_TRACE_ZONE_END(z0);
 }
@@ -35,6 +34,7 @@ void iree_hal_xrt_lite_driver_options_initialize(
 IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create(
     iree_string_view_t identifier,
     const iree_hal_xrt_lite_driver_options* options,
+    const iree_hal_xrt_lite_device_params* device_params,
     iree_allocator_t host_allocator, iree_hal_driver_t** out_driver) {
   IREE_ASSERT_ARGUMENT(options);
   IREE_ASSERT_ARGUMENT(out_driver);
@@ -53,6 +53,7 @@ IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create(
       identifier, &driver->identifier,
       reinterpret_cast<char*>(driver) + total_size - identifier.size);
   memcpy(&driver->options, options, sizeof(*options));
+  memcpy(&driver->options.device_params, device_params, sizeof(*device_params));
   *out_driver = reinterpret_cast<iree_hal_driver_t*>(driver);

   IREE_TRACE_ZONE_END(z0);
@@ -99,8 +100,7 @@ static iree_status_t iree_hal_xrt_lite_driver_create_device_by_id(
   iree_hal_xrt_lite_driver* driver = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(
       base_driver, iree_hal_xrt_lite_driver_vtable, iree_hal_xrt_lite_driver);
-  iree_hal_xrt_lite_device_options options =
-      driver->options.default_device_options;
+  iree_hal_xrt_lite_device_params options = driver->options.device_params;

   IREE_TRACE_ZONE_END(z0);
   return iree_hal_xrt_lite_device_create(driver->identifier, &options,
@@ -116,8 +116,7 @@ static iree_status_t iree_hal_xrt_lite_driver_create_device_by_path(
   iree_hal_xrt_lite_driver* driver = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(
       base_driver, iree_hal_xrt_lite_driver_vtable, iree_hal_xrt_lite_driver);
-  iree_hal_xrt_lite_device_options options =
-      driver->options.default_device_options;
+  iree_hal_xrt_lite_device_params options = driver->options.device_params;

   IREE_TRACE_ZONE_END(z0);
   return iree_hal_xrt_lite_device_create(driver->identifier, &options,
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c
index 928305857..45617cbe3 100644
--- a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c
@@ -8,38 +8,128 @@
 #include "iree-amd-aie/driver/xrt-lite/api.h"
 #include "iree/base/api.h"
+#include "iree/base/internal/flags.h"
+
+IREE_FLAG(int32_t, xrt_lite_n_core_rows, 4,
+          "Number of core rows to use on NPU.");
+IREE_FLAG(int32_t, xrt_lite_n_core_cols, 4,
+          "Number of core cols to use on NPU.");
+
+static const iree_string_view_t key_xrt_lite_n_core_rows =
+    iree_string_view_literal("xrt_lite_n_core_rows");
+static const iree_string_view_t key_xrt_lite_n_core_cols =
+    iree_string_view_literal("xrt_lite_n_core_cols");

 static iree_status_t iree_hal_xrt_lite_driver_factory_enumerate(
     void* self, iree_host_size_t* out_driver_info_count,
     const iree_hal_driver_info_t** out_driver_infos) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
   static const iree_hal_driver_info_t default_driver_info = {
       .driver_name = IREE_SVL("xrt-lite"),
       .full_name = IREE_SVL("XRT-LITE driver (for AIE)"),
   };
   *out_driver_info_count = 1;
   *out_driver_infos = &default_driver_info;
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_xrt_lite_driver_parse_flags(
+    iree_string_pair_builder_t* builder) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_string_pair_builder_add_int32(builder, key_xrt_lite_n_core_rows,
+                                             FLAG_xrt_lite_n_core_rows));
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_string_pair_builder_add_int32(builder, key_xrt_lite_n_core_cols,
+                                             FLAG_xrt_lite_n_core_cols));
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_xrt_lite_driver_populate_options(
+    iree_allocator_t host_allocator,
+    struct iree_hal_xrt_lite_driver_options* driver_options,
+    struct iree_hal_xrt_lite_device_params* device_params,
+    iree_host_size_t pairs_size, iree_string_pair_t* pairs) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  for (iree_host_size_t i = 0; i < pairs_size; ++i) {
+    iree_string_view_t key = pairs[i].key;
+    iree_string_view_t value = pairs[i].value;
+    int32_t ivalue;
+
+    if (iree_string_view_equal(key, key_xrt_lite_n_core_rows)) {
+      if (!iree_string_view_atoi_int32(value, &ivalue)) {
+        IREE_TRACE_ZONE_END(z0);
+        return iree_make_status(
+            IREE_STATUS_FAILED_PRECONDITION,
+            "Option 'xrt_lite_n_core_rows' expected to be an int. Got: '%.*s'",
+            (int)value.size, value.data);
+      }
+      device_params->n_core_rows = ivalue;
+    } else if (iree_string_view_equal(key, key_xrt_lite_n_core_cols)) {
+      if (!iree_string_view_atoi_int32(value, &ivalue)) {
+        IREE_TRACE_ZONE_END(z0);
+        return iree_make_status(
+            IREE_STATUS_FAILED_PRECONDITION,
+            "Option 'xrt_lite_n_core_cols' expected to be an int. Got: '%.*s'",
+            (int)value.size, value.data);
+      }
+      device_params->n_core_cols = ivalue;
+    } else {
+      IREE_TRACE_ZONE_END(z0);
+      return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
+                              "Unrecognized option: '%.*s'", (int)key.size,
+                              key.data);
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }

 static iree_status_t iree_hal_xrt_lite_driver_factory_try_create(
     void* self, iree_string_view_t driver_name, iree_allocator_t host_allocator,
     iree_hal_driver_t** out_driver) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
   if (!iree_string_view_equal(driver_name, IREE_SV("xrt-lite"))) {
+    IREE_TRACE_ZONE_END(z0);
     return iree_make_status(IREE_STATUS_UNAVAILABLE,
                             "no driver '%.*s' is provided by this factory",
                             (int)driver_name.size, driver_name.data);
   }

-  // TODO(max): populate options from flags. This driver module file is only
-  // used in native tools that have access to the flags library. Programmatic
-  // creation of the driver and devices will bypass this file and pass the
-  // options via this struct or key-value string parameters.
-  struct iree_hal_xrt_lite_driver_options options;
-  iree_hal_xrt_lite_driver_options_initialize(&options);
+  struct iree_hal_xrt_lite_driver_options driver_options;
+  iree_hal_xrt_lite_driver_options_initialize(&driver_options);
+  struct iree_hal_xrt_lite_device_params device_params;
+  iree_hal_xrt_lite_device_options_initialize(&device_params);
+
+  iree_string_pair_builder_t flag_option_builder;
+  iree_string_pair_builder_initialize(host_allocator, &flag_option_builder);
+  iree_status_t status =
+      iree_hal_xrt_lite_driver_parse_flags(&flag_option_builder);
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_xrt_lite_driver_populate_options(
+        host_allocator, &driver_options, &device_params,
+        iree_string_pair_builder_size(&flag_option_builder),
+        iree_string_pair_builder_pairs(&flag_option_builder));
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_xrt_lite_driver_create(
+        driver_name, &driver_options, &device_params, host_allocator,
+        out_driver);
+  }
+  // Release the builder on all paths so early parse/populate failures do not
+  // leak, and end the trace zone exactly once.
+  iree_string_pair_builder_deinitialize(&flag_option_builder);

-  iree_status_t status = iree_hal_xrt_lite_driver_create(
-      driver_name, &options, host_allocator, out_driver);
+  IREE_TRACE_ZONE_END(z0);
   return status;
 }
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp
index ae87fc0b3..6a591e9ec 100644
--- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp
@@ -141,7 +141,10 @@ void *pdev::mmap(void *addr, size_t len, int prot, int flags,
   return ret;
 }

-device::device() { SHIM_DEBUG("Created KMQ device"); }
+device::device(uint32_t n_rows, uint32_t n_cols)
+    : n_rows(n_rows), n_cols(n_cols) {
+  SHIM_DEBUG("Created KMQ device");
+}

 device::~device() { SHIM_DEBUG("Destroying KMQ device"); }

@@ -150,12 +153,12 @@ const pdev &device::get_pdev() const { return m_pdev; }
 hw_ctx device::create_hw_context(const std::vector<uint8_t> &pdi,
                                  const std::string &cu_name,
                                  const std::map<std::string, uint32_t> &qos) {
-  return hw_ctx(*this, pdi, cu_name, qos);
+  return {*this, pdi, cu_name, n_rows, n_cols, qos};
 }

 hw_ctx device::create_hw_context(const std::vector<uint8_t> &pdi,
                                  const std::string &cu_name) {
-  return hw_ctx(*this, pdi, cu_name);
+  return {*this, pdi, cu_name, n_rows, n_cols};
 }

 std::unique_ptr<bo> device::alloc_bo(uint32_t ctx_id, size_t size,
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h
index f483960e1..8ace4e79d 100644
--- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h
@@ -28,11 +28,11 @@ struct pdev {
 struct device {
   enum class access_mode : uint8_t { exclusive = 0, shared = 1 };
-
-  mutable std::mutex m_mutex;
   pdev m_pdev;
+  uint32_t n_rows;
+  uint32_t n_cols;

-  device();
+  device(uint32_t n_rows, uint32_t n_cols);
   ~device();

   std::unique_ptr<bo> import_bo(int ehdl) const;
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp
index 8784c2cc8..20a94efd7 100644
--- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp
@@ -14,8 +14,13 @@ namespace shim_xdna {

 hw_ctx::hw_ctx(device &dev, const std::map<std::string, uint32_t> &qos,
                std::unique_ptr<hw_q> q, const std::vector<uint8_t> &pdi,
-               const std::string &cu_name, size_t functional)
-    : m_device(dev), m_q(std::move(q)), m_doorbell(0), m_log_buf(nullptr) {
+               const std::string &cu_name, uint32_t n_rows, uint32_t n_cols)
+    : m_device(dev),
+      m_q(std::move(q)),
+      m_num_rows(n_rows),
+      m_num_cols(n_cols),
+      m_doorbell(0),
+      m_log_buf(nullptr) {
   SHIM_DEBUG("Creating HW context...");

   for (auto &[key, value] : qos) {
@@ -33,18 +38,21 @@ hw_ctx::hw_ctx(device &dev, const std::map<std::string, uint32_t> &qos,
       m_qos.priority = value;
   }

-  m_cu_info.push_back({.m_name = cu_name, .m_func = functional, .m_pdi = pdi});
+  // TODO(max): multiple pdis?
+  m_cu_info.push_back(
+      {.m_name = cu_name, .m_func = /*functional*/ 0, .m_pdi = pdi});

   if (m_cu_info.empty())
     shim_err(EINVAL, "No valid DPU kernel found in xclbin");

-  m_ops_per_cycle = 2048 /*aie_partition.ops_per_cycle*/;
-  m_num_cols = 4 /*aie_partition.ncol*/;
+  // TODO(max): configure this
+  m_ops_per_cycle = 2048;
 }

 hw_ctx::hw_ctx(device &device, const std::vector<uint8_t> &pdi,
-               const std::string &cu_name,
+               const std::string &cu_name, uint32_t n_rows, uint32_t n_cols,
                const std::map<std::string, uint32_t> &qos)
-    : hw_ctx(device, qos, std::make_unique<hw_q>(device), pdi, cu_name) {
+    : hw_ctx(device, qos, std::make_unique<hw_q>(device), pdi, cu_name, n_rows,
+             n_cols) {
   create_ctx_on_device();

   std::vector<char> cu_conf_param_buf(sizeof(amdxdna_hwctx_param_config_cu) +
                                       m_cu_info.size() *
@@ -118,11 +126,7 @@ void hw_ctx::create_ctx_on_device() {
   arg.qos_p = reinterpret_cast<uintptr_t>(&m_qos);
   arg.umq_bo = m_q->m_queue_boh;
   arg.max_opc = m_ops_per_cycle;
-  // TODO(max)
-  // throw std::runtime_error("TODO(max): core_rows");
-  // arg.num_tiles = m_num_cols *
-  // xrt_core::device_query(&m_device).core_rows;
-  arg.num_tiles = m_num_cols * 4;
+  arg.num_tiles = m_num_rows * m_num_cols;
   arg.log_buf_bo =
       m_log_bo ? m_log_bo->get_drm_bo_handle() : AMDXDNA_INVALID_BO_HANDLE;
   m_device.get_pdev().ioctl(DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &arg);
@@ -133,7 +137,7 @@ void hw_ctx::create_ctx_on_device() {
   m_q->bind_hwctx(this);
 }

-void hw_ctx::delete_ctx_on_device() {
+void hw_ctx::delete_ctx_on_device() const {
   if (m_handle == AMDXDNA_INVALID_CTX_HANDLE) return;

   m_q->unbind_hwctx();
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h
index b989c60ce..7a169e270 100644
--- a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h
@@ -43,6 +43,7 @@ struct hw_ctx {
   std::vector<cu_info> m_cu_info;
   std::unique_ptr<hw_q> m_q;
   uint32_t m_ops_per_cycle;
+  uint32_t m_num_rows;
   uint32_t m_num_cols;
   uint32_t m_doorbell;
   std::unique_ptr<bo> m_log_bo;
@@ -51,9 +52,10 @@ struct hw_ctx {
   hw_ctx(device &dev, const std::map<std::string, uint32_t> &qos,
          std::unique_ptr<hw_q> q, const std::vector<uint8_t> &pdi,
-         const std::string &cu_name, size_t functional = 0);
+         const std::string &cu_name, uint32_t n_rows, uint32_t n_cols);
   hw_ctx(device &dev, const std::vector<uint8_t> &pdi, const std::string &cu_name,
+         uint32_t n_rows, uint32_t n_cols,
          const std::map<std::string, uint32_t> &qos = {});
   ~hw_ctx();

   // no copying
@@ -67,7 +69,7 @@ struct hw_ctx {
   void create_ctx_on_device();
   void init_log_buf();
   void fini_log_buf() const;
-  void delete_ctx_on_device();
+  void delete_ctx_on_device() const;

   hw_q *get_hw_queue() const;
 };
diff --git a/tests/conftest.py b/tests/conftest.py
index 31c12bb7c..10b70107a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -3,6 +3,10 @@
 import numpy as np
 import pytest
+
+# TODO(max): connect this (or something) to xrt_lite_n_core_rows and xrt_lite_n_core_cols
+from iree._runtime_libs._runtime import parse_flags
+
 from iree.compiler import ir
 from iree.compiler._mlir_libs import get_dialect_registry
 from iree.compiler.api import Session, Output, Source, _initializeGlobalCL
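
For reference, the request numbers that `_IOC`/`DRM_IOWR` in `amdxdna_ioctl.py` produce can be sanity-checked by hand. A minimal sketch (the expected value below is computed from the packed struct layout in `amdxdna_accel.py` and the standard Linux _IOC bit layout, not copied from the kernel header, so treat it as an illustration):

    import ctypes
    from amdxdna_accel import struct_amdxdna_drm_get_info

    # struct_amdxdna_drm_get_info is u32 + u32 + u64, packed -> 16 bytes.
    assert ctypes.sizeof(struct_amdxdna_drm_get_info) == 16

    # dir=READ|WRITE=3 (<<30), size=16 (<<16), type=ord('d') (<<8),
    # nr=DRM_COMMAND_BASE (0x40) + DRM_AMDXDNA_GET_INFO (8).
    expected = (3 << 30) | (16 << 16) | (ord("d") << 8) | (0x40 + 8)
    assert expected == 0xC0106448  # DRM_IOCTL_AMDXDNA_GET_INFO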