Merge pull request #26 from liwuhen/feat-dev
update trt, decode modules
liwuhen authored Jan 21, 2025
2 parents 2e59b72 + 88cf368 commit 42ca24b
Showing 14 changed files with 208 additions and 77 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/linters.yml
@@ -49,6 +49,8 @@ jobs:
-build/include,\
-build/namespaces,\
-build/header_guard,\
-whitespace/comma, \
-whitespace/comments, \
-whitespace/line_length,\
-whitespace/indent_namespace,\
-runtime/string" # Ignore runtime checks on string usage.
34 changes: 19 additions & 15 deletions modules/app_yolo/architecture/common/appconfig.cpp
@@ -29,7 +29,9 @@ int AppConfig::dst_img_w_;
int AppConfig::dst_img_h_;
int AppConfig::dst_img_c_;
int AppConfig::model_acc_;
int AppConfig::branch_num_;
int AppConfig::batchsizes_;
int AppConfig::decode_type_;
int AppConfig::max_objects_;
float AppConfig::obj_threshold_;
float AppConfig::nms_threshold_;
@@ -81,25 +83,27 @@ AppConfig::AppConfig(const std::string& config_filename) : config_filename_(conf
return;
}

src_img_w_ = yaml_node_["preprocessor_config"]["src_img_width"].as<int>();
src_img_h_ = yaml_node_["preprocessor_config"]["src_img_height"].as<int>();
src_img_c_ = yaml_node_["preprocessor_config"]["src_img_channel"].as<int>();
dst_img_w_ = yaml_node_["preprocessor_config"]["dst_img_width"].as<int>();
dst_img_h_ = yaml_node_["preprocessor_config"]["dst_img_height"].as<int>();
dst_img_c_ = yaml_node_["preprocessor_config"]["dst_img_channel"].as<int>();
batchsizes_ = yaml_node_["preprocessor_config"]["batch_size"].as<int>();
src_img_w_ = yaml_node_["preprocessor_config"]["src_img_width"].as<int>();
src_img_h_ = yaml_node_["preprocessor_config"]["src_img_height"].as<int>();
src_img_c_ = yaml_node_["preprocessor_config"]["src_img_channel"].as<int>();
dst_img_w_ = yaml_node_["preprocessor_config"]["dst_img_width"].as<int>();
dst_img_h_ = yaml_node_["preprocessor_config"]["dst_img_height"].as<int>();
dst_img_c_ = yaml_node_["preprocessor_config"]["dst_img_channel"].as<int>();
batchsizes_ = yaml_node_["preprocessor_config"]["batch_size"].as<int>();
branch_num_ = yaml_node_["predict_config"]["branch_num"].as<int>();
predict_dim_ = yaml_node_["predict_config"]["predict_dim"].as<std::vector<int>>();
decode_type_ = yaml_node_["predict_config"]["decode_type"].as<int>();
max_objects_ = yaml_node_["predict_config"]["max_objects"].as<int>();
obj_threshold_ = yaml_node_["predict_config"]["obj_threshold"].as<float>();
nms_threshold_ = yaml_node_["predict_config"]["nms_threshold"].as<float>();
img_path_ = yaml_node_["inference_config"]["offline_test"]["img_path"].as<std::string>();
save_img_ = yaml_node_["inference_config"]["offline_test"]["save_img"].as<std::string>();
trt_path_ = yaml_node_["inference_config"]["engine_path"].as<std::string>();
onnx_path_ = yaml_node_["inference_config"]["onnx_path"].as<std::string>();
model_acc_ = yaml_node_["inference_config"]["model_acc"].as<int>();
predict_path_ = yaml_node_["inference_config"]["predict_path"].as<std::string>();
log_path_ = yaml_node_["common_config"]["log_path"].as<std::string>();
imgs_path_ = yaml_node_["common_config"]["imgs_path"].as<std::string>();
img_path_ = yaml_node_["inference_config"]["offline_test"]["img_path"].as<std::string>();
save_img_ = yaml_node_["inference_config"]["offline_test"]["save_img"].as<std::string>();
trt_path_ = yaml_node_["inference_config"]["engine_path"].as<std::string>();
onnx_path_ = yaml_node_["inference_config"]["onnx_path"].as<std::string>();
model_acc_ = yaml_node_["inference_config"]["model_acc"].as<int>();
predict_path_ = yaml_node_["inference_config"]["predict_path"].as<std::string>();
log_path_ = yaml_node_["common_config"]["log_path"].as<std::string>();
imgs_path_ = yaml_node_["common_config"]["imgs_path"].as<std::string>();

if (trt_path_ == "") {
throw std::invalid_argument("engine_path is empty");
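The constructor above now also reads `branch_num` and `decode_type` from `predict_config`. A minimal sketch of the YAML layout these accessor paths imply — the keys follow the lookups in the code, while all values shown are placeholders rather than the repository's actual configuration:

```yaml
preprocessor_config:
  src_img_width: 1920        # source image size (placeholder values)
  src_img_height: 1080
  src_img_channel: 3
  dst_img_width: 640         # network input size
  dst_img_height: 640
  dst_img_channel: 3
  batch_size: 1

predict_config:
  branch_num: 3              # new key: number of detection branches
  predict_dim: [1, 25200, 85]
  decode_type: 1             # new key: selects the decode path (feature-level vs. input-level)
  max_objects: 1024
  obj_threshold: 0.25
  nms_threshold: 0.45
```

`inference_config` (engine/onnx paths, `model_acc`, `offline_test`) and `common_config` (`log_path`, `imgs_path`) keep their existing keys.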
2 changes: 2 additions & 0 deletions modules/app_yolo/architecture/common/appconfig.h
@@ -69,7 +69,9 @@ class AppConfig {
REG_YAML_VAR(int, dst_img_w_);
REG_YAML_VAR(int, dst_img_h_);
REG_YAML_VAR(int, dst_img_c_);
REG_YAML_VAR(int, branch_num_);
REG_YAML_VAR(int, batchsizes_);
REG_YAML_VAR(int, decode_type_);
REG_YAML_VAR(int, max_objects_);
REG_YAML_VAR(int, model_acc_);
REG_YAML_VAR(float, obj_threshold_);
26 changes: 14 additions & 12 deletions modules/app_yolo/architecture/common/parseconfig.cpp
@@ -38,21 +38,23 @@ void ParseMsgs::ReadYamlParam() {
dst_img_h_ = app_config->get_dst_img_h_();
dst_img_c_ = app_config->get_dst_img_c_();
model_acc_ = app_config->get_model_acc_();
batchsizes_ = app_config->get_batchsizes_();
predict_dim_ = app_config->get_predict_dim_();
max_objects_ = app_config->get_max_objects_();
branch_num_ = app_config->get_branch_num_();
batchsizes_ = app_config->get_batchsizes_();
predict_dim_ = app_config->get_predict_dim_();
decode_type_ = app_config->get_decode_type_();
max_objects_ = app_config->get_max_objects_();
obj_threshold_ = app_config->get_obj_threshold_();
nms_threshold_ = app_config->get_nms_threshold_();
img_path_ = app_config->get_home_path_() + app_config->get_img_path_();
save_img_ = app_config->get_home_path_() + app_config->get_save_img_();
trt_path_ = app_config->get_home_path_() + app_config->get_trt_path_();
onnx_path_ = app_config->get_home_path_() + app_config->get_onnx_path_();
predict_path_ = app_config->get_home_path_() + app_config->get_predict_path_();
log_path_ = app_config->get_home_path_() + app_config->get_log_path_();
imgs_path_ = app_config->get_home_path_() + app_config->get_imgs_path_();
img_path_ = app_config->get_home_path_() + app_config->get_img_path_();
save_img_ = app_config->get_home_path_() + app_config->get_save_img_();
trt_path_ = app_config->get_home_path_() + app_config->get_trt_path_();
onnx_path_ = app_config->get_home_path_() + app_config->get_onnx_path_();
predict_path_ = app_config->get_home_path_() + app_config->get_predict_path_();
log_path_ = app_config->get_home_path_() + app_config->get_log_path_();
imgs_path_ = app_config->get_home_path_() + app_config->get_imgs_path_();

srcimg_size_ = src_img_w_ * src_img_h_ * src_img_c_;
dstimg_size_ = dst_img_w_ * dst_img_h_ * dst_img_c_;
srcimg_size_ = src_img_w_ * src_img_h_ * src_img_c_;
dstimg_size_ = dst_img_w_ * dst_img_h_ * dst_img_c_;
}

} // namespace common
2 changes: 2 additions & 0 deletions modules/app_yolo/architecture/common/parseconfig.h
@@ -69,7 +69,9 @@ class ParseMsgs {
int dst_img_c_; // Target image channel
int dstimg_size_; // Target image size
int model_acc_; // Model quantisation accuracy
int branch_num_; // Model branch number
int batchsizes_; // Batch size
int decode_type_; // Decode type
int max_objects_; // Maximum number of targets
float obj_threshold_; // Target Thresholds
float nms_threshold_; // Nms Target Thresholds
119 changes: 100 additions & 19 deletions modules/app_yolo/architecture/decodeprocessor.cpp
@@ -87,7 +88,8 @@ bool DecodeProcessor::DataResourceRelease() {}
/**
* @description: Inference
*/
bool DecodeProcessor::Inference(float* predict, InfertMsg& infer_msg, std::shared_ptr<InferMsgQue>& bboxQueue) {
bool DecodeProcessor::Inference(float* predict,
InfertMsg& infer_msg, std::shared_ptr<InferMsgQue>& bboxQueue) {
imgshape_["src"] = make_pair(infer_msg.height, infer_msg.width);

vector<Box> box_result;
@@ -108,7 +109,8 @@ bool DecodeProcessor::Inference(float* predict, InfertMsg& infer_msg, std::share
/**
* @description: Visualization
*/
void DecodeProcessor::Visualization(bool real_time, cv::Mat& img, int64_t timestamp, vector<Box>& results) {
void DecodeProcessor::Visualization(bool real_time,
cv::Mat& img, int64_t timestamp, vector<Box>& results) {
for (auto& box : results) {
cv::Scalar color;
tie(color[0], color[1], color[2]) = random_color(box.label);
@@ -134,29 +136,95 @@ void DecodeProcessor::Visualization(bool real_time, cv::Mat& img, int64_t timest
* @description: Bbox mapping to original map scale.
*/
void DecodeProcessor::ScaleBoxes(vector<Box>& box_result) {
float gain = min(imgshape_["dst"].first / static_cast<float>(imgshape_["src"].first), imgshape_["dst"].second / static_cast<float>(imgshape_["src"].second));
float pad[] = {(imgshape_["dst"].second - imgshape_["src"].second * gain) * 0.5, (imgshape_["dst"].first - imgshape_["src"].first * gain) * 0.5};
float gain = min(imgshape_["dst"].first / static_cast<float>(imgshape_["src"].first),\
imgshape_["dst"].second / static_cast<float>(imgshape_["src"].second));
float pad[] = {(imgshape_["dst"].second - imgshape_["src"].second * gain) * 0.5, \
(imgshape_["dst"].first - imgshape_["src"].first * gain) * 0.5};
for (int index = 0; index < box_result.size(); index++) {
box_result[index].left = clamp((box_result[index].left - pad[0]) / gain, 0.0f, static_cast<float>(imgshape_["src"].second));
box_result[index].right = clamp((box_result[index].right - pad[0]) / gain, 0.0f, static_cast<float>(imgshape_["src"].second));
box_result[index].top = clamp((box_result[index].top - pad[1]) / gain, 0.0f, static_cast<float>(imgshape_["src"].first));
box_result[index].bottom = clamp((box_result[index].bottom - pad[1]) / gain, 0.0f, static_cast<float>(imgshape_["src"].first));
box_result[index].left = clamp((box_result[index].left - pad[0]) / gain, 0.0f, \
static_cast<float>(imgshape_["src"].second));
box_result[index].right = clamp((box_result[index].right - pad[0]) / gain, 0.0f, \
static_cast<float>(imgshape_["src"].second));
box_result[index].top = clamp((box_result[index].top - pad[1]) / gain, 0.0f, \
static_cast<float>(imgshape_["src"].first));
box_result[index].bottom = clamp((box_result[index].bottom - pad[1]) / gain, 0.0f, \
static_cast<float>(imgshape_["src"].first));
}
}
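Written out, the letterbox back-mapping implemented by `ScaleBoxes` (where `imgshape_[...]` holds (height, width) pairs) is:

$$ g = \min\!\left(\frac{H_{dst}}{H_{src}},\ \frac{W_{dst}}{W_{src}}\right), \qquad p_x = \frac{W_{dst} - g\,W_{src}}{2}, \qquad p_y = \frac{H_{dst} - g\,H_{src}}{2}, $$

$$ x_{src} = \operatorname{clamp}\!\left(\frac{x_{dst} - p_x}{g},\, 0,\, W_{src}\right), \qquad y_{src} = \operatorname{clamp}\!\left(\frac{y_{dst} - p_y}{g},\, 0,\, H_{src}\right). $$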

/**
* @description: Cpu decode
* @description: Bounding box decoding at feature level
*/
void DecodeProcessor::CpuDecode(float* predict, InfertMsg& infer_msg, vector<Box>& box_result) {
void DecodeProcessor::BboxDecodeFeatureLevel(float* predict,
InfertMsg& infer_msg, vector<Box>& box_result)
{
// for (int j = 0; j < out_node_vec[1]; j++)
// {
// float* lables_node = cpu_output_buffers_[0] + j * 3;
// float* scores_node = cpu_output_buffers_[1] + j * 1;
// float* boxes_node = cpu_output_buffers_[2] + j * 4; // feature-map level

// int label = std::max_element(lables_node, lables_node + 3) - lables_node;
// float prob = lables_node[label];

// float objness = scores_node[0];
// if(objness < confidence_threshold)
// continue;

// float confidence = prob * objness;
// if(confidence < confidence_threshold)
// continue;

// if (j < 7680) {
// grid_x = anchor_points[0][j].first;
// grid_y = anchor_points[0][j].second;
// stride = 8;
// }
// else if (j >= 7680 && j < 9600) {
// grid_x = anchor_points[1][j-7680].first;
// grid_y = anchor_points[1][j-7680].second;
// stride = 16;
// }
// else if (j >= 9600 && j < 10080) {
// grid_x = anchor_points[2][j-9600].first;
// grid_y = anchor_points[2][j-9600].second;
// stride = 32;
// }

// // feature-map level -> input-image level
// float cx = (boxes_node[0] + grid_x) * stride; // input-image level
// float cy = (boxes_node[1] + grid_y) * stride;
// float width = exp(boxes_node[2]) * stride;
// float height = exp(boxes_node[3]) * stride; // anchor free
// float left = cx - width * 0.5; // input-image level
// float top = cy - height * 0.5;
// float right = cx + width * 0.5;
// float bottom = cy + height * 0.5;

// // input-image level -> original-image level
// float image_base_left = d2i[0] * left + d2i[2];
// float image_base_right = d2i[0] * right + d2i[2];
// float image_base_top = d2i[0] * top + d2i[5];
// float image_base_bottom = d2i[0] * bottom + d2i[5];
// bboxes.push_back({image_base_left, image_base_top, image_base_right, image_base_bottom, (float)label, confidence});
// }
}

/**
* @description: Bounding box decoding at input level.
*/
void DecodeProcessor::BboxDecodeInputLevel(float* predict,
InfertMsg& infer_msg, vector<Box>& box_result) {
vector<Box> boxes;
int num_classes = parsemsgs_->predict_dim_[2] - 5;
for (int i = 0; i < parsemsgs_->predict_dim_[1]; ++i) {
float* pitem = predict + i * parsemsgs_->predict_dim_[2];
for (int i = 0; i < parsemsgs_->predict_dim_[1]; ++i)
{
float* pitem = predict + i * parsemsgs_->predict_dim_[2];
float objness = pitem[4];
if (objness < parsemsgs_->obj_threshold_) continue;
float* pclass = pitem + 5;

int label = std::max_element(pclass, pclass + num_classes) - pclass;
int label = std::max_element(pclass, pclass + num_classes) - pclass;
float prob = pclass[label];
float confidence = prob * objness;
if (confidence < parsemsgs_->obj_threshold_) continue;
@@ -165,22 +233,35 @@ void DecodeProcessor::CpuDecode(float* predict, InfertMsg& infer_msg, vector<Box
float cy = pitem[1];
float width = pitem[2];
float height = pitem[3];
float left = cx - width * 0.5;
float left = cx - width * 0.5;
float top = cy - height * 0.5;
float right = cx + width * 0.5;
float right = cx + width * 0.5;
float bottom = cy + height * 0.5;

// Input-image-level predicted boxes ==> mapped back to the original image scale
float image_left = infer_msg.affineMatrix_inv(0, 0) * left + infer_msg.affineMatrix_inv(0, 2);
float image_top = infer_msg.affineMatrix_inv(1, 1) * top + infer_msg.affineMatrix_inv(1, 2);
float image_right = infer_msg.affineMatrix_inv(0, 0) * right + infer_msg.affineMatrix_inv(0, 2);
float image_left = infer_msg.affineMatrix_inv(0, 0) * left + infer_msg.affineMatrix_inv(0, 2);
float image_top = infer_msg.affineMatrix_inv(1, 1) * top + infer_msg.affineMatrix_inv(1, 2);
float image_right = infer_msg.affineMatrix_inv(0, 0) * right + infer_msg.affineMatrix_inv(0, 2);
float image_bottom = infer_msg.affineMatrix_inv(1, 1) * bottom + infer_msg.affineMatrix_inv(1, 2);

boxes.emplace_back(image_left, image_top, image_right, image_bottom, confidence, label);
}

nms_plugin_->Nms(boxes, box_result, parsemsgs_->nms_threshold_);
}
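`BboxDecodeInputLevel` maps boxes from network-input coordinates back to the original image with the stored inverse letterbox affine matrix; writing $M^{-1}$ for `infer_msg.affineMatrix_inv`, the code above applies

$$ x_{img} = M^{-1}_{0,0}\,x + M^{-1}_{0,2}, \qquad y_{img} = M^{-1}_{1,1}\,y + M^{-1}_{1,2} $$

to the left/right and top/bottom coordinates respectively, before handing the boxes to `nms_plugin_->Nms(...)`.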

/**
* @description: Cpu decode.
*/
void DecodeProcessor::CpuDecode(float* predict,
InfertMsg& infer_msg, vector<Box>& box_result) {
if((DecodeType)parsemsgs_->decode_type_ == DecodeType::FEATURE_LEVEL) {
BboxDecodeFeatureLevel(predict, infer_msg, box_result);
} else if ((DecodeType)parsemsgs_->decode_type_ == DecodeType::INPUT_LEVEL) {
BboxDecodeInputLevel(predict, infer_msg, box_result);
} else {
GLOG_ERROR("[CpuDecode]: Decoding method error. ");
}
}

} // namespace appinfer
} // namespace hpc
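The `DecodeType` enum used in `CpuDecode` is defined elsewhere in the repository and is not part of this diff. A minimal sketch of what such a type and the dispatch look like — the enumerator names match the code above, but the underlying integer values and the header location are assumptions:

```cpp
// Hypothetical sketch (not the repository's actual header): decode strategy
// selected through the YAML key predict_config.decode_type.
enum class DecodeType : int {
  FEATURE_LEVEL = 0,  // decode raw per-branch feature-map outputs (assumed value)
  INPUT_LEVEL   = 1   // decode boxes already in network-input coordinates (assumed value)
};

// Usage mirroring CpuDecode's dispatch:
//   if (static_cast<DecodeType>(parsemsgs_->decode_type_) == DecodeType::FEATURE_LEVEL) { ... }
```

With `decode_type` read from the YAML config, switching decoding strategies becomes a configuration change rather than a code change.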
16 changes: 16 additions & 0 deletions modules/app_yolo/architecture/decodeprocessor.h
@@ -138,6 +138,22 @@ class DecodeProcessor : public InferModuleBase {
*/
void ScaleBoxes(vector<Box>& box_result);

/**
* @brief Box decode feature level.
* @param[in] [float*, InfertMsg&, vector<Box>&].
* @return void.
*/
void BboxDecodeFeatureLevel(float* predict,
InfertMsg& infer_msg, vector<Box>& box_result);

/**
* @brief Box decode input level.
* @param[in] [float*, InfertMsg&, vector<Box>&].
* @return void.
*/
void BboxDecodeInputLevel(float* predict,
InfertMsg& infer_msg, vector<Box>& box_result);

/**
* @brief Cpu decode.
* @param[in] [float*, vector<Box>&].
33 changes: 20 additions & 13 deletions modules/app_yolo/architecture/preprocessor.cpp
@@ -84,7 +84,9 @@ bool PreProcessor::DataResourceRelease() {}
/**
* @description: Inference.
*/
bool PreProcessor::Inference(InfertMsg& input_msg, float* dstimg, DeviceMode inferMode, cudaStream_t stream) {
bool PreProcessor::Inference(InfertMsg& input_msg,
float* dstimg, DeviceMode inferMode, cudaStream_t stream)
{
CalAffineMatrix(input_msg);

switch (inferMode) {
@@ -108,14 +110,17 @@ bool PreProcessor::Inference(InfertMsg& input_msg, float* dstimg, DeviceMode inf
/**
* @description: Gpu preprocessor.
*/
bool PreProcessor::GpuPreprocessor(InfertMsg& input_msg, float* dstimg, cudaStream_t stream) {
checkRuntime(cudaMemcpy(input_data_device_, input_msg.image.data, input_msg.img_size * sizeof(uint8_t), cudaMemcpyHostToDevice));
bool PreProcessor::GpuPreprocessor(InfertMsg& input_msg, float* dstimg, cudaStream_t stream)
{
checkRuntime(cudaMemcpy(input_data_device_, input_msg.image.data,\
input_msg.img_size * sizeof(uint8_t), cudaMemcpyHostToDevice));

if (std::string(MODEL_FLAG) == "yolov5") {
warp_affine_bilinear(input_data_device_, parsemsgs_->batchsizes_, input_msg, dstimg, parsemsgs_->dst_img_w_, parsemsgs_->dst_img_h_, 114, nullptr, AppYolo::YOLOV5_MODE);
warp_affine_bilinear(input_data_device_, parsemsgs_->batchsizes_, input_msg, dstimg, \
parsemsgs_->dst_img_w_, parsemsgs_->dst_img_h_, 114, nullptr, AppYolo::YOLOV5_MODE);
} else if (std::string(MODEL_FLAG) == "yolox") {
warp_affine_bilinear(input_data_device_, parsemsgs_->batchsizes_, input_msg, dstimg, parsemsgs_->dst_img_w_, parsemsgs_->dst_img_h_, 114, nullptr, AppYolo::YOLOX_MODE);
} else {
warp_affine_bilinear(input_data_device_, parsemsgs_->batchsizes_, input_msg, dstimg, \
parsemsgs_->dst_img_w_, parsemsgs_->dst_img_h_, 114, nullptr, AppYolo::YOLOX_MODE);
}

return true;
@@ -124,13 +129,15 @@ bool PreProcessor::GpuPreprocessor(InfertMsg& input_msg, float* dstimg, cudaStre
/**
* @description: Cpu preprocessor.
*/
bool PreProcessor::CpuPreprocessor(cv::Mat& srcimg, uint64_t timestamp, float* input_device_gpu, cudaStream_t stream) {
bool PreProcessor::CpuPreprocessor(cv::Mat& srcimg, uint64_t timestamp,
float* input_device_gpu, cudaStream_t stream)
{
checkRuntime(cudaMallocHost(&input_data_host_, sizeof(float) * parsemsgs_->dstimg_size_));

float scale_x = parsemsgs_->dst_img_w_ / static_cast<float>(parsemsgs_->src_img_w_);
float scale_y = parsemsgs_->dst_img_h_ / static_cast<float>(parsemsgs_->src_img_h_);
float scale = std::min(scale_x, scale_y);
float i2d[6], d2i[6];
float scale = std::min(scale_x, scale_y);
float i2d[6];
// Resize the image, aligning the geometric centers of the source and destination images
i2d[0] = scale;
i2d[1] = 0;
@@ -140,12 +147,11 @@ bool PreProcessor::CpuPreprocessor(cv::Mat& srcimg, uint64_t timestamp, float* i
i2d[5] = (-scale * parsemsgs_->src_img_h_ + parsemsgs_->dst_img_h_ + scale - 1) * 0.5;

cv::Mat m2x3_i2d(2, 3, CV_32F, i2d); // image to dst(network), 2x3 matrix
cv::Mat m2x3_d2i(2, 3, CV_32F, d2i); // dst to image, 2x3 matrix
cv::invertAffineTransform(m2x3_i2d, m2x3_d2i); // compute the inverse affine transform

cv::Mat input_image(parsemsgs_->dst_img_h_, parsemsgs_->dst_img_w_, CV_8UC3);
// Apply a translation/scale/rotation transform to the image (invertible)
cv::warpAffine(srcimg, input_image, m2x3_i2d, input_image.size(), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar::all(114));
cv::warpAffine(srcimg, input_image, m2x3_i2d, input_image.size(), \
cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar::all(114));
std::string path = parsemsgs_->save_img_ + "/img_cpu_test_" + std::to_string(timestamp) + ".jpg";
cv::imwrite(path, input_image);

@@ -161,7 +167,8 @@ bool PreProcessor::CpuPreprocessor(cv::Mat& srcimg, uint64_t timestamp, float* i
*phost_b++ = pimage[2] / 255.0f;
}

checkRuntime(cudaMemcpyAsync(input_device_gpu, input_data_host_, sizeof(float) * parsemsgs_->dstimg_size_, cudaMemcpyHostToDevice, stream));
checkRuntime(cudaMemcpyAsync(input_device_gpu, input_data_host_, \
sizeof(float) * parsemsgs_->dstimg_size_, cudaMemcpyHostToDevice, stream));

return true;
}
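The `i2d` values filled in `CpuPreprocessor` form the usual letterbox affine transform. The x-row entries (`i2d[2..4]`) are collapsed in this hunk, so the x translation below is inferred by symmetry with the shown `i2d[5]`: with $s = \min(W_{dst}/W_{src},\ H_{dst}/H_{src})$ (the `scale` above),

$$ \begin{pmatrix} x' \\ y' \end{pmatrix} = \begin{pmatrix} s & 0 & \tfrac{-s\,W_{src} + W_{dst} + s - 1}{2} \\ 0 & s & \tfrac{-s\,H_{src} + H_{dst} + s - 1}{2} \end{pmatrix} \begin{pmatrix} x \\ y \\ 1 \end{pmatrix}, $$

i.e. a uniform scale by $s$ plus a translation that centres the resized image on the destination canvas, with the constant 114 used as the padding value in `cv::warpAffine`.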