From d174fd44fe06629dd5a7fce91aa13c9077888162 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Tue, 27 Aug 2024 22:09:27 -0500 Subject: [PATCH] add bare example --- .github/workflows/ci-linux.yml | 3 + experimental/hsa/CMakeLists.txt | 2 + experimental/hsa/aie_hsa_bare_add_one.cc | 523 ++++++++++++++++++++ experimental/hsa/aie_hsa_dispatch_test.cc | 2 +- experimental/hsa/amdxdna_accel.h | 569 ++++++++++++++++++++++ experimental/hsa/hsa_ipu.h | 270 ++++++++++ 6 files changed, 1368 insertions(+), 1 deletion(-) create mode 100644 experimental/hsa/aie_hsa_bare_add_one.cc create mode 100644 experimental/hsa/amdxdna_accel.h create mode 100644 experimental/hsa/hsa_ipu.h diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 383b99d8a..5071219ab 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -219,3 +219,6 @@ jobs: cmake .. -DCMAKE_BUILD_TYPE=Debug -GNinja ninja aie_hsa_dispatch_test ./aie_hsa_dispatch_test $PWD/.. + + ninja aie_hsa_bare_add_one + ./aie_hsa_bare_add_one $PWD/.. 
diff --git a/experimental/hsa/CMakeLists.txt b/experimental/hsa/CMakeLists.txt index 8c299a30a..b1655008e 100644 --- a/experimental/hsa/CMakeLists.txt +++ b/experimental/hsa/CMakeLists.txt @@ -7,3 +7,5 @@ find_package(hsa-runtime64 REQUIRED) add_executable(aie_hsa_dispatch_test aie_hsa_dispatch_test.cc) target_link_libraries(aie_hsa_dispatch_test PUBLIC hsa-runtime64::hsa-runtime64) + +add_executable(aie_hsa_bare_add_one aie_hsa_bare_add_one.cc) diff --git a/experimental/hsa/aie_hsa_bare_add_one.cc b/experimental/hsa/aie_hsa_bare_add_one.cc new file mode 100644 index 000000000..f5843d7e9 --- /dev/null +++ b/experimental/hsa/aie_hsa_bare_add_one.cc @@ -0,0 +1,523 @@ + +/* + +RUN: (add_one_test %S) | FileCheck %s +CHECK: /dev/accel/accel0 open +CHECK: Driver version 1.1 +CHECK: Heap buffer @: 0x7f313c000000 +CHECK: Loading pdi +CHECK: Pdi file size: 3552 +CHECK: Loading dpu inst +CHECK: Loading dpu inst +CHECK: DPU 0 instructions @: 0x7f313c008000 +CHECK: DPU 1 instructions @: 0x7f313c010000 +CHECK: PDI file @: 0x7f313c000000 +CHECK: PDI handle @: 2 +CHECK: Input @: 0x7f313c018000 +CHECK: Output @: 0x7f313c020000 +CHECK: Input @: 0x7f313c028000 +CHECK: Output @: 0x7f313c030000 +CHECK: Size of param_config_cu: 0x8 +CHECK: Synch bo ioctl failed for handle 11 +CHECK: Synch bo ioctl failed for handle 9 +CHECK: Synch bo ioctl failed for handle 10 +CHECK: Checking run 0: +CHECK: Checking run 1: +CHECK: PASS! +CHECK: Closing +CHECK: Done + + */ + +#include +#include +#include +#include +#include + +#include "amdxdna_accel.h" +#include "hsa_ipu.h" + +#define DATA_BUFFER_SIZE (1024 * 4) + +/* + * Interpretation of the beginning of data payload for ERT_CMD_CHAIN in + * amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles. 
+ */ +struct amdxdna_cmd_chain { + uint32_t command_count; + uint32_t submit_index; + uint32_t error_index; + uint32_t reserved[3]; + uint64_t data[] __counted_by(command_count); +}; + +/* Exec buffer command header format */ +struct amdxdna_cmd { + union { + struct { + uint32_t state : 4; + uint32_t unused : 6; + uint32_t extra_cu_masks : 2; + uint32_t count : 11; + uint32_t opcode : 5; + uint32_t reserved : 4; + }; + uint32_t header; + }; + uint32_t data[] __counted_by(count); +}; + +// These packets are variable width but using this as a +// maximum size for now +#define PACKET_SIZE 64 + +int main(int argc, char **argv) { + int drv_fd; + int ret; + const char drv_path[] = "/dev/accel/accel0"; + std::string test_dir(argv[1]); + std::string inst_path = test_dir + "/add_one_insts.txt"; + std::string pdi_path_str = test_dir + "/add_one.pdi"; + const char *dpu_inst_path = inst_path.c_str(); + const char *pdi_path = pdi_path_str.c_str(); // Add one kernel + uint32_t heap_handle; + uint32_t major, minor; + + // open the driver + drv_fd = open(drv_path, O_RDWR); + + if (drv_fd < 0) { + printf("Error %i opening %s\n", drv_fd, drv_path); + return -1; + } + + printf("%s open\n", drv_path); + + // get driver version + if (get_driver_version(drv_fd, &major, &minor) < 0) { + printf("Error getting driver version\n"); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + printf("Driver version %u.%u\n", major, minor); + + ///////////////////////////////////////////////////////////////////////////////// + // Step 0: Allocate the necessary BOs. This includes: + // 1. The operands for the two kernels that will be launched + // 2. A heap which contains: + // a. A PDI for the design that will be run + // b. 
Instruction sequences for both runs + + // reserve some device memory for the heap + if (alloc_heap(drv_fd, 48 * 1024 * 1024, &heap_handle) < 0) { + perror("Error allocating device heap"); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t pdi_vaddr; + uint64_t pdi_sram_vaddr; + uint32_t pdi_handle; + printf("Loading pdi\n"); + ret = load_pdi(drv_fd, &pdi_vaddr, &pdi_sram_vaddr, &pdi_handle, pdi_path); + if (ret < 0) { + printf("Error %i loading pdi\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t dpu_0_vaddr; + uint64_t dpu_0_sram_vaddr; + uint32_t dpu_0_handle; + uint32_t num_dpu_0_insts; + printf("Loading dpu inst\n"); + ret = load_instructions(drv_fd, &dpu_0_vaddr, &dpu_0_sram_vaddr, + &dpu_0_handle, dpu_inst_path, &num_dpu_0_insts); + if (ret < 0) { + printf("Error %i loading dpu instructions\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t dpu_1_vaddr; + uint64_t dpu_1_sram_vaddr; + uint32_t dpu_1_handle; + uint32_t num_dpu_1_insts; + printf("Loading dpu inst\n"); + ret = load_instructions(drv_fd, &dpu_1_vaddr, &dpu_1_sram_vaddr, + &dpu_1_handle, dpu_inst_path, &num_dpu_1_insts); + if (ret < 0) { + printf("Error %i loading dpu instructions\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + printf("DPU 0 instructions @: %p\n", (void *)dpu_0_vaddr); + printf("DPU 1 instructions @: %p\n", (void *)dpu_1_vaddr); + printf("PDI file @: %p\n", (void *)pdi_vaddr); + printf("PDI handle @: %d\n", pdi_handle); + + uint64_t input_0; + uint64_t input_0_sram_vaddr; + uint32_t input_0_handle; + ret = create_dev_bo(drv_fd, &input_0, &input_0_sram_vaddr, &input_0_handle, + DATA_BUFFER_SIZE); + // ret = create_shmem_bo(drv_fd, &input_0, &input_0_sram_vaddr, + // &input_0_handle, DATA_BUFFER_SIZE); + printf("Input @: %p\n", (void *)input_0); + if (ret < 0) { + printf("Error %i creating data 0\n", 
ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t output_0; + uint64_t output_0_sram_vaddr; + uint32_t output_0_handle; + ret = create_dev_bo(drv_fd, &output_0, &output_0_sram_vaddr, &output_0_handle, + DATA_BUFFER_SIZE); + // ret = create_shmem_bo(drv_fd, &output_0, &output_0_sram_vaddr, + // &output_0_handle, DATA_BUFFER_SIZE); + printf("Output @: %p\n", (void *)output_0); + if (ret < 0) { + printf("Error %i creating data 1\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t input_1; + uint64_t input_1_sram_vaddr; + uint32_t input_1_handle; + ret = create_dev_bo(drv_fd, &input_1, &input_1_sram_vaddr, &input_1_handle, + DATA_BUFFER_SIZE); + // ret = create_shmem_bo(drv_fd, &input_1, &input_1_sram_vaddr, + // &input_1_handle, DATA_BUFFER_SIZE); + printf("Input @: %p\n", (void *)input_1); + if (ret < 0) { + printf("Error %i creating data 0\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + uint64_t output_1; + uint64_t output_1_sram_vaddr; + uint32_t output_1_handle; + ret = create_dev_bo(drv_fd, &output_1, &output_1_sram_vaddr, &output_1_handle, + DATA_BUFFER_SIZE); + // ret = create_shmem_bo(drv_fd, &output_1, &output_1_sram_vaddr, + // &output_1_handle, DATA_BUFFER_SIZE); + printf("Output @: %p\n", (void *)output_1); + if (ret < 0) { + printf("Error %i creating data 1\n", ret); + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return -1; + } + + for (int i = 0; i < DATA_BUFFER_SIZE / sizeof(uint32_t); i++) { + *((uint32_t *)input_0 + i) = i; + *((uint32_t *)input_1 + i) = i + 0xFEEDED1E; + *((uint32_t *)output_0 + i) = 0xDEFACE; + *((uint32_t *)output_1 + i) = 0xDEADBEEF; + } + + // Writing the user buffers + sync_bo(drv_fd, input_0_handle); + sync_bo(drv_fd, output_0_handle); + sync_bo(drv_fd, input_1_handle); + sync_bo(drv_fd, output_1_handle); + + // Performing a sync on the queue descriptor, completion signal, queue 
buffer + // and config cu bo. + sync_bo(drv_fd, dpu_0_handle); + sync_bo(drv_fd, dpu_1_handle); + sync_bo(drv_fd, pdi_handle); + sync_bo(drv_fd, input_0_handle); + sync_bo(drv_fd, output_0_handle); + + ///////////////////////////////////////////////////////////////////////////////// + // Step 1: Create a user mode queue + // This is going to be where we create a queue where we: + // 1. Create and configure a hardware context + // 2. Allocate the queue buffer as a user-mode queue + + // Allocating a structure to store QOS information + struct amdxdna_qos_info *qos = + (struct amdxdna_qos_info *)malloc(sizeof(struct amdxdna_qos_info)); + qos->gops = 0; + qos->fps = 0; + qos->dma_bandwidth = 0; + qos->latency = 0; + qos->frame_exec_time = 0; + qos->priority = 0; + + // This is the structure that we pass + struct amdxdna_drm_create_hwctx create_hw_ctx = { + .ext = 0, + .ext_flags = 0, + .qos_p = (uint64_t)qos, + .umq_bo = 0, + .log_buf_bo = 0, + .max_opc = 0x800, // Not sure what this is but this was the value used + .num_tiles = 4, + .mem_size = 0, + .umq_doorbell = 0, + }; + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &create_hw_ctx); + if (ret != 0) { + perror("Failed to create hwctx"); + return -1; + } + + // Creating a structure to configure the CU + struct amdxdna_cu_config cu_config = { + .cu_bo = pdi_handle, + .cu_func = 0, + }; + + // Creating a structure to configure the hardware context + struct amdxdna_hwctx_param_config_cu param_config_cu; + param_config_cu.num_cus = 1; + param_config_cu.cu_configs[0] = cu_config; + + printf("Size of param_config_cu: 0x%lx\n", sizeof(param_config_cu)); + + // Configuring the hardware context with the PDI + struct amdxdna_drm_config_hwctx config_hw_ctx = { + .handle = create_hw_ctx.handle, + .param_type = DRM_AMDXDNA_HWCTX_CONFIG_CU, + .param_val = + (uint64_t)¶m_config_cu, // Pass in the pointer to the param value + .param_val_size = 0x10, // Size of param config CU is 16B + }; + ret = ioctl(drv_fd, 
DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &config_hw_ctx); + if (ret != 0) { + perror("Failed to config hwctx"); + return -1; + } + + ///////////////////////////////////////////////////////////////////////////////// + // Step 2: Configuring the CMD BOs with the different instruction sequences + struct amdxdna_drm_create_bo create_cmd_bo_0 = { + .type = AMDXDNA_BO_CMD, + .size = PACKET_SIZE, + }; + int cmd_bo_ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo_0); + if (cmd_bo_ret != 0) { + perror("Failed to create cmd_0"); + return -1; + } + + struct amdxdna_drm_get_bo_info cmd_bo_0_get_bo_info = { + .handle = create_cmd_bo_0.handle}; + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_0_get_bo_info); + if (ret != 0) { + perror("Failed to get cmd BO 0 info"); + return -2; + } + + // Writing the first packet to the queue + struct amdxdna_cmd *cmd_0 = (struct amdxdna_cmd *)mmap( + 0, PACKET_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, drv_fd, + cmd_bo_0_get_bo_info.map_offset); + cmd_0->state = 1; // ERT_CMD_STATE_NEW; + cmd_0->extra_cu_masks = 0; + cmd_0->count = 0xF; // NOTE: For some reason this needs to be larger + cmd_0->opcode = 0x0; // ERT_START_CU; + cmd_0->data[0] = 0x3; // NOTE: This one seems to be skipped + cmd_0->data[1] = 0x3; // Transaction opcode + cmd_0->data[2] = 0x0; + cmd_0->data[3] = dpu_0_sram_vaddr; + cmd_0->data[4] = 0x0; + cmd_0->data[5] = 0x44; // Size of DPU instruction + cmd_0->data[6] = input_0 & 0xFFFFFFFF; // Input low + cmd_0->data[7] = (input_0 >> 32) & 0xFFFFFFFF; // Input high + cmd_0->data[8] = output_0 & 0xFFFFFFFF; // Output low + cmd_0->data[9] = (output_0 >> 32) & 0xFFFFFFFF; // Output high + + // Writing to the second packet of the queue + struct amdxdna_drm_create_bo create_cmd_bo_1 = { + .type = AMDXDNA_BO_CMD, + .size = PACKET_SIZE, + }; + cmd_bo_ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_bo_1); + if (cmd_bo_ret != 0) { + perror("Failed to create cmd_1"); + return -1; + } + + struct 
amdxdna_drm_get_bo_info cmd_bo_1_get_bo_info = { + .handle = create_cmd_bo_1.handle}; + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_bo_1_get_bo_info); + if (ret != 0) { + perror("Failed to get cmd BO 0 info"); + return -2; + } + + struct amdxdna_cmd *cmd_1 = (struct amdxdna_cmd *)mmap( + 0, PACKET_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, drv_fd, + cmd_bo_1_get_bo_info.map_offset); + cmd_1->state = 1; // ERT_CMD_STATE_NEW; + cmd_1->extra_cu_masks = 0; + cmd_1->count = 10; // Number of commands + cmd_1->opcode = 0x0; // ERT_START_CU; + cmd_1->data[0] = 0x3; // This one seems to be skipped + cmd_1->data[1] = 0x3; // Transaction opcode + cmd_1->data[2] = 0x0; + cmd_1->data[3] = dpu_1_sram_vaddr; + cmd_1->data[4] = 0x0; + cmd_1->data[5] = 0x44; // Size of DPU instruction + cmd_1->data[6] = input_1 & 0xFFFFFFFF; // Input low + cmd_1->data[7] = (input_1 >> 32) & 0xFFFFFFFF; // Input high + cmd_1->data[8] = output_1 & 0xFFFFFFFF; // Output low + cmd_1->data[9] = (output_1 >> 32) & 0xFFFFFFFF; // Output high + + ///////////////////////////////////////////////////////////////////////////////// + // Step 3: Submit commands -- This requires creating a BO_EXEC that contains + // the command chain that points to the instruction sequences just created + + // Allocate a command chain + void *bo_cmd_chain_buf = NULL; + cmd_bo_ret = posix_memalign(&bo_cmd_chain_buf, 4096, 4096); + if (cmd_bo_ret != 0 || bo_cmd_chain_buf == NULL) { + printf("[ERROR] Failed to allocate cmd_bo buffer of size %d\n", 4096); + } + + struct amdxdna_drm_create_bo create_cmd_chain_bo = { + .type = AMDXDNA_BO_CMD, + .size = 4096, + }; + cmd_bo_ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_cmd_chain_bo); + if (cmd_bo_ret != 0) { + perror("Failed to create command chain BO"); + return -1; + } + + struct amdxdna_drm_get_bo_info cmd_chain_bo_get_bo_info = { + .handle = create_cmd_chain_bo.handle}; + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &cmd_chain_bo_get_bo_info); + if (ret != 
0) { + perror("Failed to get cmd BO 0 info"); + return -2; + } + + struct amdxdna_cmd *cmd_chain = + (struct amdxdna_cmd *)mmap(0, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, + drv_fd, cmd_chain_bo_get_bo_info.map_offset); + + // Writing information to the command buffer + struct amdxdna_cmd_chain *cmd_chain_payload = + (struct amdxdna_cmd_chain *)(cmd_chain->data); + cmd_chain->state = 1; // ERT_CMD_STATE_NEW; + cmd_chain->extra_cu_masks = 0; + cmd_chain->count = 0xA; // TODO: Why is this the value? + cmd_chain->opcode = 0x13; // ERT_CMD_CHAIN + cmd_chain_payload->command_count = 2; + cmd_chain_payload->submit_index = 0; + cmd_chain_payload->error_index = 0; + cmd_chain_payload->data[0] = create_cmd_bo_0.handle; + cmd_chain_payload->data[1] = create_cmd_bo_1.handle; + + // Reading the user buffers + sync_bo(drv_fd, create_cmd_chain_bo.handle); + sync_bo(drv_fd, create_cmd_bo_0.handle); + sync_bo(drv_fd, create_cmd_bo_1.handle); + + // Perform a submit cmd + uint32_t bo_args[6] = {dpu_0_handle, dpu_1_handle, input_0_handle, + output_0_handle, input_1_handle, output_1_handle}; + struct amdxdna_drm_exec_cmd exec_cmd_0 = { + .ext = 0, + .ext_flags = 0, + .hwctx = create_hw_ctx.handle, + .type = AMDXDNA_CMD_SUBMIT_EXEC_BUF, + .cmd_handles = create_cmd_chain_bo.handle, + .args = (uint64_t)bo_args, + .cmd_count = 1, + .arg_count = sizeof(bo_args) / sizeof(uint32_t), + }; + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_EXEC_CMD, &exec_cmd_0); + if (ret != 0) { + perror("Failed to submit work"); + return -1; + } + + ///////////////////////////////////////////////////////////////////////////////// + // Step 4: Wait for the output + // Use the wait IOCTL to wait for our submission to complete + struct amdxdna_drm_wait_cmd wait_cmd = { + .hwctx = create_hw_ctx.handle, + .timeout = 50, // 50ms timeout + .seq = exec_cmd_0.seq, + }; + + ret = ioctl(drv_fd, DRM_IOCTL_AMDXDNA_WAIT_CMD, &wait_cmd); + if (ret != 0) { + perror("Failed to wait"); + return -1; + } + + 
///////////////////////////////////////////////////////////////////////////////// + // Step 5: Verify output + + // Reading the user buffers + sync_bo(drv_fd, input_0_handle); + sync_bo(drv_fd, output_0_handle); + sync_bo(drv_fd, input_1_handle); + sync_bo(drv_fd, output_1_handle); + + int errors = 0; + printf("Checking run 0:\n"); + for (int i = 0; i < DATA_BUFFER_SIZE / sizeof(uint32_t); i++) { + uint32_t src = *((uint32_t *)input_0 + i); + uint32_t dst = *((uint32_t *)output_0 + i); + // printf("src: 0x%x\n", src); + // printf("dst: 0x%x\n", dst); + if (src + 1 != dst) { + printf("[ERROR] %d: %d + 1 != %d\n", i, src, dst); + errors++; + } + } + + printf("Checking run 1:\n"); + for (int i = 0; i < DATA_BUFFER_SIZE / sizeof(uint32_t); i++) { + uint32_t src = *((uint32_t *)input_1 + i); + uint32_t dst = *((uint32_t *)output_1 + i); + // printf("src: 0x%x\n", src); + // printf("dst: 0x%x\n", dst); + if (src + 1 != dst) { + printf("[ERROR] %d: %d + 1 != %d\n", i, src, dst); + errors++; + } + } + + if (!errors) { + printf("PASS!\n"); + } else { + printf("FAIL! %d/2048\n", errors); + } + + printf("Closing\n"); + close(drv_fd); + printf("Done\n"); + return 0; +} \ No newline at end of file diff --git a/experimental/hsa/aie_hsa_dispatch_test.cc b/experimental/hsa/aie_hsa_dispatch_test.cc index 799a060df..fca55e4a4 100644 --- a/experimental/hsa/aie_hsa_dispatch_test.cc +++ b/experimental/hsa/aie_hsa_dispatch_test.cc @@ -305,5 +305,5 @@ int main(int argc, char **argv) { r = hsa_shut_down(); assert(r == HSA_STATUS_SUCCESS); - std::cout << "PASS"; + std::cout << "PASS\n"; } diff --git a/experimental/hsa/amdxdna_accel.h b/experimental/hsa/amdxdna_accel.h new file mode 100644 index 000000000..048fd3fe6 --- /dev/null +++ b/experimental/hsa/amdxdna_accel.h @@ -0,0 +1,569 @@ +/* SPDX-License-Identifier: NCSA OR GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2022-2024, Advanced Micro Devices, Inc. 
+ */ + +#ifndef AMDXDNA_ACCEL_H_ +#define AMDXDNA_ACCEL_H_ + +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +#ifndef __counted_by +#define __counted_by(cnt) +#endif + +#define AMDXDNA_DRIVER_MAJOR 1 +#define AMDXDNA_DRIVER_MINOR 0 + +#define AMDXDNA_INVALID_CMD_HANDLE (~0UL) +#define AMDXDNA_INVALID_ADDR (~0UL) +#define AMDXDNA_INVALID_CTX_HANDLE 0 +#define AMDXDNA_INVALID_BO_HANDLE 0 + +/* + * The interface can grow/extend over time. + * On each struct amdxdna_drm_*, to support potential extension, we defined it + * like this. + * + * Example code: + * + * struct amdxdna_drm_example_data { + * .ext = (uintptr_t)&example_data_ext; + * ... + * }; + * + * We don't have an extension now. The extension struct will be defined in the + * future. + */ + +enum amdxdna_drm_ioctl_id { + DRM_AMDXDNA_CREATE_HWCTX, + DRM_AMDXDNA_DESTROY_HWCTX, + DRM_AMDXDNA_CONFIG_HWCTX, + DRM_AMDXDNA_CREATE_BO, + DRM_AMDXDNA_GET_BO_INFO, + DRM_AMDXDNA_SYNC_BO, + DRM_AMDXDNA_EXEC_CMD, + DRM_AMDXDNA_WAIT_CMD, + DRM_AMDXDNA_GET_INFO, + DRM_AMDXDNA_SET_STATE, + DRM_AMDXDNA_NUM_IOCTLS +}; + +enum amdxdna_device_type { + AMDXDNA_DEV_TYPE_UNKNOWN = -1, + AMDXDNA_DEV_TYPE_KMQ, + AMDXDNA_DEV_TYPE_UMQ, +}; + +/** + * struct amdxdna_qos_info - QoS information for driver. + * @gops: Giga operations per second. + * @fps: Frames per second. + * @dma_bandwidth: DMA bandwidth. + * @latency: Frame response latency. + * @frame_exec_time: Frame execution time. + * @priority: Request priority. + * + * User program can provide QoS hints to driver. + */ +struct amdxdna_qos_info { + __u32 gops; + __u32 fps; + __u32 dma_bandwidth; + __u32 latency; + __u32 frame_exec_time; + __u32 priority; +}; + +/** + * struct amdxdna_drm_create_hwctx - Create hardware context. + * @ext: MBZ. + * @ext_flags: MBZ. + * @qos_p: Address of QoS info. + * @umq_bo: BO handle for user mode queue(UMQ). + * @log_buf_bo: BO handle for log buffer. + * @max_opc: Maximum operations per cycle. + * @num_tiles: Number of AIE tiles. 
+ * @mem_size: Size of AIE tile memory. + * @umq_doorbell: Returned offset of doorbell associated with UMQ. + * @handle: Returned hardware context handle. + */ +struct amdxdna_drm_create_hwctx { + __u64 ext; + __u64 ext_flags; + __u64 qos_p; + __u32 umq_bo; + __u32 log_buf_bo; + __u32 max_opc; + __u32 num_tiles; + __u32 mem_size; + __u32 umq_doorbell; + __u32 handle; +}; + +/** + * struct amdxdna_drm_destroy_hwctx - Destroy hardware context. + * @handle: Hardware context handle. + * @pad: MBZ. + */ +struct amdxdna_drm_destroy_hwctx { + __u32 handle; + __u32 pad; +}; + +/** + * struct amdxdna_cu_config - configuration for one CU + * @cu_bo: CU configuration buffer bo handle + * @cu_func: Functional of a CU + * @pad: MBZ + */ +struct amdxdna_cu_config { + __u32 cu_bo; + __u8 cu_func; + __u8 pad[3]; +}; + +/** + * struct amdxdna_hwctx_param_config_cu - configuration for CUs in hardware + * context + * @num_cus: Number of CUs to configure + * @pad: MBZ + * @cu_configs: Array of CU configurations of struct amdxdna_cu_config + */ +struct amdxdna_hwctx_param_config_cu { + __u16 num_cus; + __u16 pad[3]; + struct amdxdna_cu_config cu_configs[] __counted_by(num_cus); +}; + +enum amdxdna_drm_config_hwctx_param { + DRM_AMDXDNA_HWCTX_CONFIG_CU, + DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF, + DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF, + DRM_AMDXDNA_HWCTX_CONFIG_NUM +}; + +/** + * struct amdxdna_drm_config_hwctx - Configure hardware context. + * @handle: hardware context handle. + * @param_type: Value in enum amdxdna_drm_config_hwctx_param. Specifies the + * structure passed in via param_val. + * @param_val: A structure specified by the param_type struct member. + * @param_val_size: Size of the parameter buffer pointed to by the param_val. + * If param_val is not a pointer, driver can ignore this. + * + * Note: if the param_val is a pointer pointing to a buffer, the maximum size + * of the buffer is 4KiB(PAGE_SIZE). 
+ */ +struct amdxdna_drm_config_hwctx { + __u32 handle; + __u32 param_type; + __u64 param_val; + __u32 param_val_size; + __u32 pad; +}; + +/* + * AMDXDNA_BO_SHMEM: DRM GEM SHMEM bo + * AMDXDNA_BO_DEV_HEAP: Shared host memory to device as heap memory + * AMDXDNA_BO_DEV: Allocated from BO_DEV_HEAP + * AMDXDNA_BO_CMD: User and driver accessible bo + * AMDXDNA_BO_DMA: DRM GEM DMA bo + */ +enum amdxdna_bo_type { + AMDXDNA_BO_INVALID = 0, + AMDXDNA_BO_SHMEM, + AMDXDNA_BO_DEV_HEAP, + AMDXDNA_BO_DEV, + AMDXDNA_BO_CMD, + AMDXDNA_BO_DMA, +}; + +/** + * struct amdxdna_drm_create_bo - Create a buffer object. + * @flags: Buffer flags. MBZ. + * @type: Buffer type. + * @vaddr: User VA of buffer if applied. MBZ. + * @size: Size in bytes. + * @handle: Returned DRM buffer object handle. + */ +struct amdxdna_drm_create_bo { + __u64 flags; + __u32 type; + __u32 _pad; + __u64 vaddr; + __u64 size; + __u32 handle; +}; + +/** + * struct amdxdna_drm_get_bo_info - Get buffer object information. + * @ext: MBZ. + * @ext_flags: MBZ. + * @handle: DRM buffer object handle. + * @map_offset: Returned DRM fake offset for mmap(). + * @vaddr: Returned user VA of buffer. 0 in case user needs mmap(). + * @xdna_addr: Returned XDNA device virtual address. + */ +struct amdxdna_drm_get_bo_info { + __u64 ext; + __u64 ext_flags; + __u32 handle; + __u32 _pad; + __u64 map_offset; + __u64 vaddr; + __u64 xdna_addr; +}; + +/** + * struct amdxdna_drm_sync_bo - Sync buffer object. + * @handle: Buffer object handle. + * @direction: Direction of sync, can be from device or to device + * (SYNC_DIRECT_FROM_DEVICE or SYNC_DIRECT_TO_DEVICE). + * @offset: Offset in the buffer to sync. + * @size: Size in bytes. + */ +struct amdxdna_drm_sync_bo { + __u32 handle; +#define SYNC_DIRECT_TO_DEVICE 0U +#define SYNC_DIRECT_FROM_DEVICE 1U + __u32 direction; + __u64 offset; + __u64 size; +}; + +enum amdxdna_cmd_type { + AMDXDNA_CMD_SUBMIT_EXEC_BUF = 0, + AMDXDNA_CMD_SUBMIT_DEPENDENCY, + AMDXDNA_CMD_SUBMIT_SIGNAL, +}; + +/** + * struct amdxdna_drm_exec_cmd - Execute command. 
+ * @ext: MBZ. + * @ext_flags: MBZ. + * @hwctx: Hardware context handle. + * @type: One of command type in enum amdxdna_cmd_type. + * @cmd_handles: Array of command handles or the command handle itself in case + * of just one. + * @args: Array of arguments for all command handles. + * @cmd_count: Number of command handles in the cmd_handles array. + * @arg_count: Number of arguments in the args array. + * @seq: Returned sequence number for this command. + */ +struct amdxdna_drm_exec_cmd { + __u64 ext; + __u64 ext_flags; + __u32 hwctx; + __u32 type; + __u64 cmd_handles; + __u64 args; + __u32 cmd_count; + __u32 arg_count; + __u64 seq; +}; + +/** + * struct amdxdna_drm_wait_cmd - Wait for an execution command. + * + * @hwctx: hardware context handle. + * @timeout: timeout in ms, 0 implies infinite wait. + * @seq: sequence number of the command returned by execute command. + * + * Wait for the command specified by seq to be completed. + * Using AMDXDNA_INVALID_CMD_HANDLE as seq means wait till there is a free slot + * to submit a new command. + */ +struct amdxdna_drm_wait_cmd { + __u32 hwctx; + __u32 timeout; + __u64 seq; +}; + +/** + * struct amdxdna_drm_query_aie_status - Query the status of the AIE hardware + * @buffer: The user space buffer that will return the AIE status + * @buffer_size: The size of the user space buffer + * @cols_filled: A bitmap of AIE columns whose data has been returned in the + * buffer. + */ +struct amdxdna_drm_query_aie_status { + __u64 buffer; /* out */ + __u32 buffer_size; /* in */ + __u32 cols_filled; /* out */ +}; + +/** + * struct amdxdna_drm_query_aie_version - Query the version of the AIE hardware + * @major: The major version number + * @minor: The minor version number + */ +struct amdxdna_drm_query_aie_version { + __u32 major; /* out */ + __u32 minor; /* out */ +}; + +/** + * struct amdxdna_drm_query_aie_tile_metadata - Query the metadata of AIE tile + * (core, mem, shim) + * @row_count: The number of rows. 
+ * @row_start: The starting row number. + * @dma_channel_count: The number of dma channels. + * @lock_count: The number of locks. + * @event_reg_count: The number of events. + * @pad: MBZ. + */ +struct amdxdna_drm_query_aie_tile_metadata { + __u16 row_count; + __u16 row_start; + __u16 dma_channel_count; + __u16 lock_count; + __u16 event_reg_count; + __u16 pad[3]; +}; + +/** + * struct amdxdna_drm_query_aie_metadata - Query the metadata of the AIE + * hardware + * @col_size: The size of a column in bytes. + * @cols: The total number of columns. + * @rows: The total number of rows. + * @version: The version of the AIE hardware. + * @core: The metadata for all core tiles. + * @mem: The metadata for all mem tiles. + * @shim: The metadata for all shim tiles. + */ +struct amdxdna_drm_query_aie_metadata { + __u32 col_size; + __u16 cols; + __u16 rows; + struct amdxdna_drm_query_aie_version version; + struct amdxdna_drm_query_aie_tile_metadata core; + struct amdxdna_drm_query_aie_tile_metadata mem; + struct amdxdna_drm_query_aie_tile_metadata shim; +}; + +/** + * struct amdxdna_drm_query_clock - Metadata for a clock + * @name: The clock name. + * @freq_mhz: The clock frequency. + * @pad: MBZ. + */ +struct amdxdna_drm_query_clock { + __u8 name[16]; + __u32 freq_mhz; + __u32 pad; +}; + +/** + * struct amdxdna_drm_query_clock_metadata - Query metadata for clocks + * @mp_npu_clock: The metadata for MP-NPU clock. + * @h_clock: The metadata for H clock. + */ +struct amdxdna_drm_query_clock_metadata { + struct amdxdna_drm_query_clock mp_npu_clock; + struct amdxdna_drm_query_clock h_clock; +}; + +enum amdxdna_sensor_type { AMDXDNA_SENSOR_TYPE_POWER }; + +/** + * struct amdxdna_drm_query_sensor - The data for single sensor. + * @label: The name for a sensor. + * @input: The current value of the sensor. + * @max: The maximum value possible for the sensor. + * @average: The average value of the sensor. 
+ * @highest: The highest recorded sensor value for this driver load for the + * sensor. + * @status: The sensor status. + * @units: The sensor units. + * @unitm: Translates value member variables into the correct unit via (pow(10, + * unitm) * value) + * @type: The sensor type from enum amdxdna_sensor_type + * @pad: MBZ. + */ +struct amdxdna_drm_query_sensor { + __u8 label[64]; + __u32 input; + __u32 max; + __u32 average; + __u32 highest; + __u8 status[64]; + __u8 units[16]; + __s8 unitm; + __u8 type; + __u8 pad[6]; +}; + +/** + * struct amdxdna_drm_query_hwctx - The data for a single context. + * @context_id: The ID for this context. + * @start_col: The starting column for the partition assigned to this context. + * @num_col: The number of columns in the partition assigned to this context. + * @pid: The Process ID of the process that created this context. + * @command_submissions: The number of commands submitted to this context. + * @command_completions: The number of commands completed by this context. + * @migrations: The number of times this context has been moved to a different + * partition. + * @preemptions: The number of times this context has been preempted by another + * context in the same partition. + * @errors: NOTE(review): this field had no description here; presumably the + * number of errors recorded for this context - confirm against the + * driver. + * @pad: MBZ. + */ +struct amdxdna_drm_query_hwctx { + __u32 context_id; + __u32 start_col; + __u32 num_col; + __u32 pad; + __s64 pid; + __u64 command_submissions; + __u64 command_completions; + __u64 migrations; + __u64 preemptions; + __u64 errors; +}; + +/** + * struct amdxdna_drm_aie_mem - The data for AIE memory read/write + * @col: The AIE column index + * @row: The AIE row index + * @addr: The AIE memory address to read/write + * @size: The size of bytes to read/write + * @buf_p: The buffer to store read/write data + * + * This is used for DRM_AMDXDNA_READ_AIE_MEM and DRM_AMDXDNA_WRITE_AIE_MEM + * parameters. 
+ */ +struct amdxdna_drm_aie_mem { + __u32 col; + __u32 row; + __u32 addr; + __u32 size; + __u64 buf_p; +}; + +/** + * struct amdxdna_drm_aie_reg - The data for AIE register read/write + * @col: The AIE column index + * @row: The AIE row index + * @addr: The AIE register address to read/write + * @val: The value to write or returned value from AIE + * + * This is used for DRM_AMDXDNA_READ_AIE_REG and DRM_AMDXDNA_WRITE_AIE_REG + * parameters. + */ +struct amdxdna_drm_aie_reg { + __u32 col; + __u32 row; + __u32 addr; + __u32 val; +}; + +enum amdxdna_drm_get_param { + DRM_AMDXDNA_QUERY_AIE_STATUS, + DRM_AMDXDNA_QUERY_AIE_METADATA, + DRM_AMDXDNA_QUERY_AIE_VERSION, + DRM_AMDXDNA_QUERY_CLOCK_METADATA, + DRM_AMDXDNA_QUERY_SENSORS, + DRM_AMDXDNA_QUERY_HW_CONTEXTS, + DRM_AMDXDNA_READ_AIE_MEM, + DRM_AMDXDNA_READ_AIE_REG, + DRM_AMDXDNA_NUM_GET_PARAM, +}; + +/** + * struct amdxdna_drm_get_info - Get some information from the AIE hardware. + * @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed + * in the buffer. + * @buffer_size: Size of the input buffer. Size needed/written by the kernel. + * @buffer: A structure specified by the param struct member. + */ +struct amdxdna_drm_get_info { + __u32 param; /* in */ + __u32 buffer_size; /* in/out */ + __u64 buffer; /* in/out */ +}; + +enum amdxdna_power_mode_type { + XRT_POWER_MODE_DEFAULT, /**< Fallback to calculated DPM */ + XRT_POWER_MODE_LOW, /**< Set frequency to lowest DPM */ + XRT_POWER_MODE_MEDIUM, /**< Set frequency to medium DPM */ + XRT_POWER_MODE_HIGH, /**< Set frequency to highest DPM */ +}; + +/** + * struct amdxdna_drm_set_power_mode - Set the power mode of the AIE hardware + * @power_mode: The sensor type from enum amdxdna_power_mode_type + * @pad: MBZ. 
+ */ +struct amdxdna_drm_set_power_mode { + __u8 power_mode; + __u8 pad[7]; +}; + +enum amdxdna_drm_set_param { + DRM_AMDXDNA_SET_POWER_MODE, + DRM_AMDXDNA_WRITE_AIE_MEM, + DRM_AMDXDNA_WRITE_AIE_REG, + DRM_AMDXDNA_NUM_SET_PARAM, +}; + +/** + * struct amdxdna_drm_set_state - Set the state of some component within the AIE + * hardware. + * @param: Value in enum amdxdna_drm_set_param. Specifies the structure passed + * in the buffer. + * @buffer_size: Size of the input buffer. + * @buffer: A structure specified by the param struct member. + */ +struct amdxdna_drm_set_state { + __u32 param; /* in */ + __u32 buffer_size; /* in */ + __u64 buffer; /* in */ +}; + +#define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \ + struct amdxdna_drm_create_hwctx) + +#define DRM_IOCTL_AMDXDNA_DESTROY_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX, \ + struct amdxdna_drm_destroy_hwctx) + +#define DRM_IOCTL_AMDXDNA_CONFIG_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX, \ + struct amdxdna_drm_config_hwctx) + +#define DRM_IOCTL_AMDXDNA_CREATE_BO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO, \ + struct amdxdna_drm_create_bo) + +#define DRM_IOCTL_AMDXDNA_GET_BO_INFO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO, \ + struct amdxdna_drm_get_bo_info) + +#define DRM_IOCTL_AMDXDNA_SYNC_BO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, struct amdxdna_drm_sync_bo) + +#define DRM_IOCTL_AMDXDNA_EXEC_CMD \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, struct amdxdna_drm_exec_cmd) + +#define DRM_IOCTL_AMDXDNA_WAIT_CMD \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, struct amdxdna_drm_wait_cmd) + +#define DRM_IOCTL_AMDXDNA_GET_INFO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_INFO, struct amdxdna_drm_get_info) + +#define DRM_IOCTL_AMDXDNA_SET_STATE \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, \ + struct amdxdna_drm_set_state) + +#if defined(__cplusplus) +} /* extern c end */ 
+#endif + +#endif /* AMDXDNA_ACCEL_H_ */ diff --git a/experimental/hsa/hsa_ipu.h b/experimental/hsa/hsa_ipu.h new file mode 100644 index 000000000..31176a34f --- /dev/null +++ b/experimental/hsa/hsa_ipu.h @@ -0,0 +1,270 @@ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "amdxdna_accel.h" + +// want to mmap the file +#include +#include + +#define MAX_NUM_INSTRUCTIONS 1024 // Maximum number of dpu or pdi instructions. + +// Dummy packet defines + +int map_doorbell(int fd, uint64_t *doorbell) { + // Mmap the mailbox. + int32_t page_size = 4096; + *doorbell = (uint64_t)mmap(NULL, page_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (doorbell != MAP_FAILED) { + printf("Doorbell mapped\n"); + return 0; + } + + printf("[ERROR] doorbell mmap failed: %s\n", strerror(errno)); + return errno; +} + +void ring_doorbell(uint64_t doorbell) { + int32_t curr_tail = *((int32_t *)doorbell); + *((uint32_t *)doorbell) = curr_tail + 0x94; +} + +int get_driver_version(int fd, __u32 *major, __u32 *minor) { + int ret; + struct amdxdna_drm_query_aie_version version; + + struct amdxdna_drm_get_info info_params = { + .param = DRM_AMDXDNA_QUERY_AIE_VERSION, + .buffer_size = sizeof(version), + .buffer = (__u64)&version, + }; + + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_INFO, &info_params); + if (ret == 0) { + *major = version.major; + *minor = version.minor; + } + + return ret; +} + +/* + Allocates a heap on the device by creating a BO of type dev heap +*/ +static int alloc_heap(int fd, __u32 size, __u32 *handle) { + int ret; + void *heap_buf = NULL; + const size_t alignment = 64 * 1024 * 1024; + ret = posix_memalign(&heap_buf, alignment, size); + if (ret != 0 || heap_buf == NULL) { + printf("[ERROR] Failed to allocate heap buffer of size %d\n", size); + } + + void *dev_heap_parent = mmap(0, alignment * 2 - 1, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (dev_heap_parent == MAP_FAILED) { 
+ dev_heap_parent = nullptr; + return -1; + } + + struct amdxdna_drm_create_bo create_bo_params = { + .type = AMDXDNA_BO_DEV_HEAP, + .size = size, + }; + + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo_params); + if (ret == 0 && handle) { + *handle = create_bo_params.handle; + } + + struct amdxdna_drm_get_bo_info get_bo_info = {.handle = + create_bo_params.handle}; + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info); + if (ret != 0) { + perror("Failed to get BO info"); + return -2; + } + + // Need to free the heap buf but still use the address so we can + // ensure alignment + free(heap_buf); + heap_buf = (void *)mmap(heap_buf, size, PROT_READ | PROT_WRITE, MAP_SHARED, + fd, get_bo_info.map_offset); + printf("Heap buffer @: %p\n", heap_buf); + + return ret; +} + +/* + Creates a dev bo which is carved out of the heap bo. +*/ +static int create_dev_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr, + __u32 *handle, __u64 size_in_bytes) { + struct amdxdna_drm_create_bo create_bo = { + .type = AMDXDNA_BO_DEV, + .size = size_in_bytes, + }; + int ret = ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo); + if (ret != 0) { + perror("Failed to create BO"); + return -1; + } + + struct amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo.handle}; + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info); + if (ret != 0) { + perror("Failed to get BO info"); + return -2; + } + + *vaddr = get_bo_info.vaddr; + *sram_vaddr = get_bo_info.xdna_addr; + *handle = create_bo.handle; + return 0; +} + +/* + Creates a shmem bo +*/ +static int create_shmem_bo(int fd, uint64_t *vaddr, uint64_t *sram_vaddr, + __u32 *handle, __u64 size_in_bytes) { + const size_t alignment = 64 * 1024 * 1024; + void *shmem_create = NULL; + int ret = posix_memalign(&shmem_create, alignment, size_in_bytes); + if (ret != 0) { + printf("[ERROR] Failed to allocate shmem bo of size %lld\n", size_in_bytes); + } + + // Touching buffer to map page + *(uint32_t *)shmem_create = 0xDEADBEEF; 
+ + printf("Shmem BO @: %p\n", shmem_create); + + struct amdxdna_drm_create_bo create_bo = {.type = AMDXDNA_BO_SHMEM, + .vaddr = (__u64)shmem_create, + .size = size_in_bytes}; + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_CREATE_BO, &create_bo); + if (ret != 0) { + perror("Failed to create BO"); + return -1; + } + + struct amdxdna_drm_get_bo_info get_bo_info = {.handle = create_bo.handle}; + ret = ioctl(fd, DRM_IOCTL_AMDXDNA_GET_BO_INFO, &get_bo_info); + if (ret != 0) { + perror("Failed to get BO info"); + return -2; + } + + *vaddr = (__u64)shmem_create; + *sram_vaddr = get_bo_info.xdna_addr; + *handle = create_bo.handle; + return 0; +} + +/* + Wrapper around synch bo ioctl. +*/ +static int sync_bo(int fd, __u32 handle) { + struct amdxdna_drm_sync_bo sync_params = { + .handle = handle, + }; + int ret = ioctl(fd, DRM_IOCTL_AMDXDNA_SYNC_BO, &sync_params); + if (ret != 0) { + printf("Synch bo ioctl failed for handle %d\n", handle); + } + return ret; +} + +/* + Create a BO_DEV and populate it with a PDI +*/ + +static int load_pdi(int fd, uint64_t *vaddr, uint64_t *sram_addr, __u32 *handle, + const char *path) { + FILE *file = fopen(path, "r"); + if (file == NULL) { + perror("Failed to open instructions file."); + return -1; + } + + fseek(file, 0L, SEEK_END); + ssize_t file_size = ftell(file); + fseek(file, 0L, SEEK_SET); + + printf("Pdi file size: %ld\n", file_size); + + fclose(file); + + // Mmaping the file + int pdi_fd = open(path, O_RDONLY); + uint64_t *file_data = + (uint64_t *)mmap(0, file_size, PROT_READ, MAP_PRIVATE, pdi_fd, 0); + + // Creating a BO_DEV bo to store the pdi file. 
+ int ret = create_dev_bo(fd, vaddr, sram_addr, handle, file_size); + if (ret != 0) { + perror("Failed to create pdi BO"); + return -1; + } + + // copy the file into Bo dev + uint64_t *bo = (uint64_t *)*vaddr; + memcpy(bo, file_data, file_size); + + close(pdi_fd); + return 0; +} + +/* + Create a BO DEV and populate it with instructions whose virtual address is + passed to the driver via an HSA packet. +*/ +static int load_instructions(int fd, uint64_t *vaddr, uint64_t *sram_addr, + __u32 *handle, const char *path, __u32 *num_inst) { + // read dpu instructions into an array + FILE *file = fopen(path, "r"); + if (file == NULL) { + perror("Failed to open instructions file."); + return -1; + } + + char *line = NULL; + size_t len = 0; + __u32 inst_array[MAX_NUM_INSTRUCTIONS]; + __u32 inst_counter = 0; + while (getline(&line, &len, file) != -1) { + inst_array[inst_counter++] = strtoul(line, NULL, 16); + if (inst_counter >= MAX_NUM_INSTRUCTIONS) { + perror("Instruction array overflowed."); + return -2; + } + } + fclose(file); + + // Creating a BO_DEV bo to store the instruction. + int ret = + create_dev_bo(fd, vaddr, sram_addr, handle, inst_counter * sizeof(__u32)); + if (ret != 0) { + perror("Failed to create dpu BO"); + return -3; + } + + *num_inst = inst_counter; + + memcpy((__u32 *)*vaddr, inst_array, inst_counter * sizeof(__u32)); + return ret; +}