Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

数据拷贝相关程序编译问题 #125

Closed
xxrrnn opened this issue May 27, 2024 · 11 comments
Closed

数据拷贝相关程序编译问题 #125

xxrrnn opened this issue May 27, 2024 · 11 comments
Labels
good first issue Good for newcomers

Comments

@xxrrnn
Copy link

xxrrnn commented May 27, 2024

kernel函数如下:

__kernel void __attribute__((noinline)) lsu(__global int *src)
{
  // long i = get_global_id(0);
  // dst[i] += src[i];
  __local int sharedmem[32];
  for(int i = 0; i < 32; i++){
    sharedmem[i] = src[i];
  }
}

main.cc函数如下:

/*
 *  Simple OpenCL demo program
 *
 *  Copyright (C) 2009  Clifford Wolf <clifford@clifford.at>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 *  gcc -o cldemo -std=gnu99 -Wall -I/usr/include/nvidia-current cldemo.c
 * -lOpenCL
 *
 */

#include <CL/cl.h>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h> 
#include <chrono>
#include <vector>

#define CL_CHECK(_expr)                                                        \
  do {                                                                         \
    cl_int _err = _expr;                                                       \
    if (_err == CL_SUCCESS)                                                    \
      break;                                                                   \
    fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \
    abort();                                                                   \
  } while (0)

#define CL_CHECK_ERR(_expr)                                                    \
  ({                                                                           \
    cl_int _err = CL_INVALID_VALUE;                                            \
    decltype(_expr) _ret = _expr;                                                \
    if (_err != CL_SUCCESS) {                                                  \
      fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
      abort();                                                                 \
    }                                                                          \
    _ret;                                                                      \
  })

void pfn_notify(const char *errinfo, const void *private_info, size_t cb,
                void *user_data) {
  fprintf(stderr, "OpenCL Error (via pfn_notify): %s\n", errinfo);
}

static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || 0 == size)
    return -1;

  FILE* fp = fopen(filename, "r");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return -1;
  }
  fseek(fp , 0 , SEEK_END);
  long fsize = ftell(fp);
  rewind(fp);

  *data = (uint8_t*)malloc(fsize);
  *size = fread(*data, 1, fsize, fp);
  
  fclose(fp);
  
  return 0;
}

static bool almost_equal(float a, float b, int ulp = 4) {
  union fi_t { int i; float f; };
  fi_t fa, fb;
  fa.f = a;
  fb.f = b;
  return std::abs(fa.i - fb.i) <= ulp;
}

uint8_t *kernel_bin = NULL;

///
//  Cleanup any created OpenCL resources
//
void Cleanup(cl_device_id device_id, cl_context context, cl_command_queue commandQueue,
             cl_program program, cl_kernel kernel, cl_mem memObjects[2]) {
  if (kernel_bin) 
    free(kernel_bin);
  
  if (commandQueue != 0)
    clReleaseCommandQueue(commandQueue);

  for (int i = 0; i < 2; i++) {
    if (memObjects[i] != 0)
      clReleaseMemObject(memObjects[i]);
  }

  if (kernel != 0)
    clReleaseKernel(kernel);

  if (program != 0)
    clReleaseProgram(program);

  if (context != 0)
    clReleaseContext(context);

  if (device_id != 0) 
    clReleaseDevice(device_id);
}

int size = 64;

static void show_usage() {
  printf("Usage: [-n size] [-h: help]\n");
}

static void parse_args(int argc, char **argv) {
  int c;
  while ((c = getopt(argc, argv, "n:h?")) != -1) {
    switch (c) {
    case 'n':
      size = atoi(optarg);
      break;
    case 'h':
    case '?': {
      show_usage();
      exit(0);
    } break;
    default:
      show_usage();
      exit(-1);
    }
  }

  printf("Workload size=%d\n", size);
}

int main(int argc, char **argv) {
  // parse command arguments
  parse_args(argc, argv);
  
  cl_platform_id platform_id;
  cl_device_id device_id;
  cl_program program;
  cl_mem input_buffer;
  cl_mem output_buffer;
  size_t kernel_size;
  cl_context context;
  cl_command_queue queue;

  // Getting platform and device information
  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));

  context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, &pfn_notify, NULL, &_err));  
  queue = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &_err));

  cl_kernel kernel = 0;
  cl_mem memObjects[2] = {0, 0};

  printf("Create program from kernel source\n");
// #ifdef HOSTGPU
  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
    return -1;
  program = CL_CHECK_ERR(clCreateProgramWithSource(
    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));  
// #else
//   if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
//     return -1;
//   program = CL_CHECK_ERR(clCreateProgramWithBinary(
//     context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
// #endif

  // Build program
  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));

  size_t nbytes = sizeof(int) * size;

  printf("create input buffer\n");  
  input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));

  printf("create output buffer\n");  
  output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, nbytes, NULL, &_err));

  memObjects[0] = input_buffer;
  memObjects[1] = output_buffer;

  float factor = ((float)rand() / (float)(RAND_MAX)) * 100.0;

  printf("create kernel\n");
  kernel = CL_CHECK_ERR(clCreateKernel(program, "lsu", &_err));

  printf("setting up kernel args\n");
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer));
  // CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer));
  // CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor));

  size_t global_offset[1] = {0};
  size_t global_work_size[1] = {(uint64_t)size};
  size_t local_work_size[1] = {(uint64_t)size};

  printf("initialize buffers\n");
  std::vector<int> ref_vec(size, 0);
  {
    std::vector<int> dst_vec(size, 0);
    std::vector<int> src_vec(size);
    
    for (int i = 0; i < size; i++) {
      src_vec[i] = int(i);
    }

    CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, 0, nbytes, src_vec.data(), 0, NULL, NULL));
    // CL_CHECK(clEnqueueWriteBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, dst_vec.data(), 0, NULL, NULL));

  //   size_t num_groups_x = global_work_size[0] / local_work_size[0];    
  //   for (size_t workgroup_id_x = 0; workgroup_id_x < num_groups_x; ++workgroup_id_x) {
  //     for (size_t local_id_x = 0; local_id_x < local_work_size[0]; ++local_id_x) {
  //       // Calculate global ID for the work-item
  //       int global_id_x = global_offset[0] + local_work_size[0] * workgroup_id_x + local_id_x;
  //       // kernel operation
  //       int i = global_id_x;
  //       ref_vec[i] += src_vec[i] * factor;
  //     }
  //   }
  }

  printf("enqueue kernel\n");
  auto time_start = std::chrono::high_resolution_clock::now();
  CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, global_offset, global_work_size, local_work_size, 0, NULL, NULL));
  CL_CHECK(clFinish(queue));
  auto time_end = std::chrono::high_resolution_clock::now();
  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
  printf("Elapsed time: %lg ms\n", elapsed);

  // printf("Verify result\n");
  // int errors = 0;
  // {
  //   std::vector<float> dst_vec(size);    
  //   CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, dst_vec.data(), 0, NULL, NULL));

  //   for (int i = 0; i < size; ++i) {
  //     if (!almost_equal(dst_vec[i], ref_vec[i])) {
  //       if (errors < 100) 
  //         printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], dst_vec[i]);
  //       ++errors;
  //     }
  //   }
  
  //   if (0 == errors) {
  //     printf("PASSED!\n");
  //   } else {
  //     printf("FAILED! - %d errors\n", errors);    
  //   }
  // }

  Cleanup(device_id, context, queue, program, kernel, memObjects);

  // return errors;
}

makefile如下:

# PROJECT = saxpy

# SRCS = main.cc

# OPTS ?= -n1024

# include ../common.mk

include ../common/make.config
TYPE = GPU
CC = clang++
CC_FLAGS = -g -O2 -std=c++11
CC_FLAGS += -I.

.PHONY: lsu.out
lsu.out: 
	$(CC) $(CC_FLAGS) main.cc -o lsu.out -I$(OPENCL_INC) -L$(OPENCL_LIB) -lOpenCL -Wno-unused-result -Wl,--init=lsu

%.o: %.[ch]
	$(CC) $(CC_FLAGS) $< -c

clean:
	rm -f *.o *~ *.out *.linkinfo

通过mask和执行./lsu.out,生成的dump文件中没有kernel中的指令(以下为dump文件的一部分):

800000ac <.Lpcrel_hi6>:
800000ac: 97 12 00 00  	auipc	t0, 1
800000b0: 93 82 42 f5  	addi	t0, t0, -172
800000b4: 23 a0 62 00  	sw	t1, 0(t0)

800000b8 <lsu>:
800000b8: 67 80 00 00  	ret

kernel.txt
main.txt

@zhoujingya
Copy link
Contributor

include ../common/make.config这里面是啥,你们这边写的user mode driver code,生成的是host(X86)的代码@xxrrnn

@xxrrnn
Copy link
Author

xxrrnn commented May 28, 2024

DEFAULT OCL

OPENCL_DIR = /home/lan/work/ventus/ocl-icd
OPENCL_INC = /home/lan/work/ventus/llvm-project/install/include
OPENCL_LIB = /home/lan/work/ventus/llvm-project/install/lib

@zhoujingya
Copy link
Contributor

DEFAULT OCL

OPENCL_DIR = /home/lan/work/ventus/ocl-icd OPENCL_INC = /home/lan/work/ventus/llvm-project/install/include OPENCL_LIB = /home/lan/work/ventus/llvm-project/install/lib

需要调用kernel的driver(pocl)来编译kernel代码。因为你现在这种方式就是user mode,类似于pocl里面的vecadd例子,编译的是host代码,

@xxrrnn
Copy link
Author

xxrrnn commented May 28, 2024

具体该怎么做呢?还不是很熟悉

@zhoujingya
Copy link
Contributor

具体该怎么做呢?还不是很熟悉

@yangzexia 如果你们在一个地方的话可以当面问一下

@xxrrnn
Copy link
Author

xxrrnn commented May 28, 2024

lan@DESKTOPONFR7BU:~/work/ventus/gpu-rodinia/vortexcase/lsu$ make
clang++ -g -O2 -std=c++11 -I. main.cc -o lsu.out -I/home/lan/work/ventus/llvm-project/install/include -L/home/lan/work/ventus/llvm-project/install/lib -lOpenCL -Wno-unused-result -Wl,--init=lsu
In file included from main.cc:25:
In file included from /home/lan/work/ventus/llvm-project/install/include/CL/cl.h:20:
/home/lan/work/ventus/llvm-project/install/include/CL/cl_version.h:22:9: warning: cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 300 (OpenCL 3.0) [-W#pragma-messages]
#pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 300 (OpenCL 3.0)")
        ^
main.cc:166:24: warning: 'clCreateCommandQueue' is deprecated [-Wdeprecated-declarations]
  queue = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &_err));
                       ^
/home/lan/work/ventus/llvm-project/install/include/CL/cl.h:1923:66: note: 'clCreateCommandQueue' has been explicitly marked deprecated here
                     cl_int *                       errcode_ret) CL_API_SUFFIX__VERSION_1_2_DEPRECATED;
                                                                 ^
/home/lan/work/ventus/llvm-project/install/include/CL/cl_platform.h:108:72: note: expanded from macro 'CL_API_SUFFIX__VERSION_1_2_DEPRECATED'
    #define CL_API_SUFFIX__VERSION_1_2_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED
                                                                       ^
/home/lan/work/ventus/llvm-project/install/include/CL/cl_platform.h:78:51: note: expanded from macro 'CL_API_SUFFIX_DEPRECATED'
  #define CL_API_SUFFIX_DEPRECATED __attribute__((deprecated))
                                                  ^
main.cc:166:24: warning: 'clCreateCommandQueue' is deprecated [-Wdeprecated-declarations]
  queue = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &_err));
                       ^
/home/lan/work/ventus/llvm-project/install/include/CL/cl.h:1923:66: note: 'clCreateCommandQueue' has been explicitly marked deprecated here
                     cl_int *                       errcode_ret) CL_API_SUFFIX__VERSION_1_2_DEPRECATED;
                                                                 ^
/home/lan/work/ventus/llvm-project/install/include/CL/cl_platform.h:108:72: note: expanded from macro 'CL_API_SUFFIX__VERSION_1_2_DEPRECATED'
    #define CL_API_SUFFIX__VERSION_1_2_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED
                                                                       ^
/home/lan/work/ventus/llvm-project/install/include/CL/cl_platform.h:78:51: note: expanded from macro 'CL_API_SUFFIX_DEPRECATED'
  #define CL_API_SUFFIX_DEPRECATED __attribute__((deprecated))
                                                  ^
3 warnings generated.
lan@DESKTOPONFR7BU:~/work/ventus/gpu-rodinia/vortexcase/lsu$ ./lsu.out 
Workload size=64
** Final POCL_DEBUG flags: 100000 
[INFO]: [HW DRIVER] in [FILE] ventus.cpp,[LINE]25,[fn] vt_dev_open: vt_dev_open : hello world from ventus.cpp
spike device initialize: allocating local memory: to allocate at 0x70000000 with 268435456 bytes 
spike device initialize: allocating pc source memory: to allocate at 0x80000000 with 268435456 bytes 
Create program from kernel source
### options: -DPOCL_DEVICE_ADDRESS_BITS=32 -D__USE_CLANG_OPENCL_C_H -xcl -Dinline= -I. -cl-kernel-arg-info  -D__ENDIAN_LITTLE__=1 -DCL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=0 -D__OPENCL_VERSION__=200 -cl-std=CL2.0 -D__OPENCL_C_VERSION__=200 -Dcl_khr_fp64=1 -D__opencl_c_generic_address_space=1 -D__opencl_c_named_address_space_builtins=1 -cl-ext=-all,+cl_khr_fp64,+__opencl_c_generic_address_space,+__opencl_c_named_address_space_builtins -fno-builtin -triple=riscv32 -target-cpu ventus-gpgpu user_options: 
### Triple: riscv32, CPU: ventus-gpgpu
create input buffer
create output buffer
create kernel
setting up kernel args
initialize buffers
Warning: the memory at  [0x90000000, 0x900000FF] has been realigned
to the 4 KiB page size: [0x90000000, 0x90000FFF]
to allocate at 0x90000000 with 4096 bytes 
to copy to 0x90000000 with 256 bytes
enqueue kernel
notice that ventus hasn't support local buffer as argument yet.
[2024-05-28 03:28:56.184955456]POCL: in fn void pocl_ventus_run(void *, _cl_command_node *) at line 629:
  |    VENTUS |  Allocating kernel arg buffer entry:
Warning: the memory at  [0x90001000, 0x90001003] has been realigned
to the 4 KiB page size: [0x90001000, 0x90001FFF]
to allocate at 0x90001000 with 4096 bytes 
to copy to 0x90001000 with 4 bytes
[2024-05-28 03:28:56.140411276613688]POCL: in fn void pocl_ventus_run(void *, _cl_command_node *) at line 683:
  |    VENTUS |  Kernel entry of "lsu" is : "0x800000b8"
[2024-05-28 03:28:56.345683756]POCL: in fn void pocl_ventus_run(void *, _cl_command_node *) at line 714:
  |    VENTUS |  Vmem file has been written to object.vmem
[2024-05-28 03:28:56.345863956]POCL: in fn void pocl_ventus_run(void *, _cl_command_node *) at line 766:
  |    VENTUS |  Preparing private memory of ventus:
to allocate at 0x90002000 with 262144 bytes 
[2024-05-28 03:28:56.345930556]POCL: in fn void pocl_ventus_run(void *, _cl_command_node *) at line 801:
  |    VENTUS |  Allocating metadata space:
Warning: the memory at  [0x90042000, 0x9004203F] has been realigned
to the 4 KiB page size: [0x90042000, 0x90042FFF]
to allocate at 0x90042000 with 4096 bytes 
to copy to 0x90042000 with 64 bytes
[2024-05-28 03:28:56.345999756]POCL: in fn void pocl_ventus_run(void *, _cl_command_node *) at line 811:
  |    VENTUS |  kernel metadata has been written to 0x90042000
arg gpgpu is numw:4,numt:16,numwg:1,kernelx:1,kernely:1,kernelz:1,ldssize:0x1000,pdssize:0x10000000,pdsbase:0x90002000,knlbase:0x90042000,currwgid:0
vaddr mem scope is -m0x70000000:0x90043000
src file is object.riscv, run log is written to object.riscv.log
spike -l --log-commits -p4 --isa rv32gcv_zfh --pc=0x80000000 -m0x70000000:0x90043000 --varch vlen:512,elen:32 --gpgpuarch numw:4,numt:16,numwg:1,kernelx:1,kernely:1,kernelz:1,ldssize:0x1000,pdssize:0x10000000,pdsbase:0x90002000,knlbase:0x90042000,currwgid:0 --log=object.riscv.log object.riscv 
Log file object.riscv.log renamed successfully to lsu_0.log.
Elapsed time: 170 ms

@zhoujingya
Copy link
Contributor

zhoujingya commented May 28, 2024

lan@DESKTOPONFR7BU:~/work/ventus/gpu-rodinia/vortexcase/lsu$ make
clang++ -g -O2 -std=c++11 -I. main.cc -o lsu.out -I/home/lan/work/ventus/llvm-project/install/include -L/home/lan/work/ventus/llvm-project/install/lib -lOpenCL -Wno-unused-result -Wl,--init=lsu
In file included from main.cc:25:
In file included from /home/lan/work/ventus/llvm-project/install/include/CL/cl.h:20:
/home/lan/work/ventus/llvm-project/install/include/CL/cl_version.h:22:9: warning: cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 300 (OpenCL 3.0) [-W#pragma-messages]
#pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 300 (OpenCL 3.0)")
        ^
main.cc:166:24: warning: 'clCreateCommandQueue' is deprecated [-Wdeprecated-declarations]
  queue = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &_err));
                       ^
/home/lan/work/ventus/llvm-project/install/include/CL/cl.h:1923:66: note: 'clCreateCommandQueue' has been explicitly marked deprecated here
                     cl_int *                       errcode_ret) CL_API_SUFFIX__VERSION_1_2_DEPRECATED;
                                                                 ^
/home/lan/work/ventus/llvm-project/install/include/CL/cl_platform.h:108:72: note: expanded from macro 'CL_API_SUFFIX__VERSION_1_2_DEPRECATED'
    #define CL_API_SUFFIX__VERSION_1_2_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED
                                                                       ^
/home/lan/work/ventus/llvm-project/install/include/CL/cl_platform.h:78:51: note: expanded from macro 'CL_API_SUFFIX_DEPRECATED'
  #define CL_API_SUFFIX_DEPRECATED __attribute__((deprecated))
                                                  ^
main.cc:166:24: warning: 'clCreateCommandQueue' is deprecated [-Wdeprecated-declarations]
  queue = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &_err));
                       ^
/home/lan/work/ventus/llvm-project/install/include/CL/cl.h:1923:66: note: 'clCreateCommandQueue' has been explicitly marked deprecated here
                     cl_int *                       errcode_ret) CL_API_SUFFIX__VERSION_1_2_DEPRECATED;
                                                                 ^
/home/lan/work/ventus/llvm-project/install/include/CL/cl_platform.h:108:72: note: expanded from macro 'CL_API_SUFFIX__VERSION_1_2_DEPRECATED'
    #define CL_API_SUFFIX__VERSION_1_2_DEPRECATED CL_API_SUFFIX_COMMON CL_API_SUFFIX_DEPRECATED
                                                                       ^
/home/lan/work/ventus/llvm-project/install/include/CL/cl_platform.h:78:51: note: expanded from macro 'CL_API_SUFFIX_DEPRECATED'
  #define CL_API_SUFFIX_DEPRECATED __attribute__((deprecated))
                                                  ^
3 warnings generated.
lan@DESKTOPONFR7BU:~/work/ventus/gpu-rodinia/vortexcase/lsu$ ./lsu.out 
Workload size=64
** Final POCL_DEBUG flags: 100000 
[INFO]: [HW DRIVER] in [FILE] ventus.cpp,[LINE]25,[fn] vt_dev_open: vt_dev_open : hello world from ventus.cpp
spike device initialize: allocating local memory: to allocate at 0x70000000 with 268435456 bytes 
spike device initialize: allocating pc source memory: to allocate at 0x80000000 with 268435456 bytes 
Create program from kernel source
### options: -DPOCL_DEVICE_ADDRESS_BITS=32 -D__USE_CLANG_OPENCL_C_H -xcl -Dinline= -I. -cl-kernel-arg-info  -D__ENDIAN_LITTLE__=1 -DCL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=0 -D__OPENCL_VERSION__=200 -cl-std=CL2.0 -D__OPENCL_C_VERSION__=200 -Dcl_khr_fp64=1 -D__opencl_c_generic_address_space=1 -D__opencl_c_named_address_space_builtins=1 -cl-ext=-all,+cl_khr_fp64,+__opencl_c_generic_address_space,+__opencl_c_named_address_space_builtins -fno-builtin -triple=riscv32 -target-cpu ventus-gpgpu user_options: 
### Triple: riscv32, CPU: ventus-gpgpu
create input buffer
create output buffer
create kernel
setting up kernel args
initialize buffers
Warning: the memory at  [0x90000000, 0x900000FF] has been realigned
to the 4 KiB page size: [0x90000000, 0x90000FFF]
to allocate at 0x90000000 with 4096 bytes 
to copy to 0x90000000 with 256 bytes
enqueue kernel
notice that ventus hasn't support local buffer as argument yet.
[2024-05-28 03:28:56.184955456]POCL: in fn void pocl_ventus_run(void *, _cl_command_node *) at line 629:
  |    VENTUS |  Allocating kernel arg buffer entry:
Warning: the memory at  [0x90001000, 0x90001003] has been realigned
to the 4 KiB page size: [0x90001000, 0x90001FFF]
to allocate at 0x90001000 with 4096 bytes 
to copy to 0x90001000 with 4 bytes
[2024-05-28 03:28:56.140411276613688]POCL: in fn void pocl_ventus_run(void *, _cl_command_node *) at line 683:
  |    VENTUS |  Kernel entry of "lsu" is : "0x800000b8"
[2024-05-28 03:28:56.345683756]POCL: in fn void pocl_ventus_run(void *, _cl_command_node *) at line 714:
  |    VENTUS |  Vmem file has been written to object.vmem
[2024-05-28 03:28:56.345863956]POCL: in fn void pocl_ventus_run(void *, _cl_command_node *) at line 766:
  |    VENTUS |  Preparing private memory of ventus:
to allocate at 0x90002000 with 262144 bytes 
[2024-05-28 03:28:56.345930556]POCL: in fn void pocl_ventus_run(void *, _cl_command_node *) at line 801:
  |    VENTUS |  Allocating metadata space:
Warning: the memory at  [0x90042000, 0x9004203F] has been realigned
to the 4 KiB page size: [0x90042000, 0x90042FFF]
to allocate at 0x90042000 with 4096 bytes 
to copy to 0x90042000 with 64 bytes
[2024-05-28 03:28:56.345999756]POCL: in fn void pocl_ventus_run(void *, _cl_command_node *) at line 811:
  |    VENTUS |  kernel metadata has been written to 0x90042000
arg gpgpu is numw:4,numt:16,numwg:1,kernelx:1,kernely:1,kernelz:1,ldssize:0x1000,pdssize:0x10000000,pdsbase:0x90002000,knlbase:0x90042000,currwgid:0
vaddr mem scope is -m0x70000000:0x90043000
src file is object.riscv, run log is written to object.riscv.log
spike -l --log-commits -p4 --isa rv32gcv_zfh --pc=0x80000000 -m0x70000000:0x90043000 --varch vlen:512,elen:32 --gpgpuarch numw:4,numt:16,numwg:1,kernelx:1,kernely:1,kernelz:1,ldssize:0x1000,pdssize:0x10000000,pdsbase:0x90002000,knlbase:0x90042000,currwgid:0 --log=object.riscv.log object.riscv 
Log file object.riscv.log renamed successfully to lsu_0.log.
Elapsed time: 170 ms

等我们复现一下

@yangzexia
Copy link
Collaborator

kernel的编译选项:
image

@zhoujingya
Copy link
Contributor

kernel的编译选项: image

看着没啥毛病啊

@zhoujingya
Copy link
Contributor

lsu.log
vecadd.log
感觉像是你们的c++代码写的有问题 @yangzexia @xxrrnn

@zhoujingya zhoujingya added the good first issue Good for newcomers label May 28, 2024
@zhoujingya
Copy link
Contributor

代码被优化掉了,此issue close

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
good first issue Good for newcomers
Projects
None yet
Development

No branches or pull requests

3 participants