Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

winograd convolution (portable and fast in inference) #11

Open
wants to merge 6 commits into
base: intel_scnn
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions include/caffe/layers/winograd_layer_inference.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#ifndef WINOGRAD_LAYER_INFERENCE_HPP_
#define WINOGRAD_LAYER_INFERENCE_HPP_

//winograd_layer for cpu inference

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/base_conv_layer.hpp"

namespace caffe {

/**
 * @brief Convolves the input image with a bank of learned filters,
 *        and (optionally) adds biases, using a Winograd-style tiled
 *        algorithm. This variant targets CPU inference (per the file
 *        comment above), though GPU entry points are also declared.
 *
 * Standard Caffe convolution reduces to matrix multiplication via
 * im2col/col2im. This layer instead operates on fixed-size input tiles
 * (tile_h_in_ x tile_w_in_) mapped to output tiles (tile_h_out_ x
 * tile_w_out_), with transformed filters kept in winograd_weight_.
 * NOTE(review): the actual transform matrices and tile arithmetic live
 * in the .cpp/.cu implementation, which is not visible in this header.
 * Parameter semantics (documented on the constructor) are identical to
 * the standard ConvolutionLayer.
 */
template <typename Dtype>
class WinogradLayerInference : public BaseConvolutionLayer<Dtype> {
 public:
  /**
   * @param param provides ConvolutionParameter convolution_param,
   *    with ConvolutionLayer options:
   *  - num_output. The number of filters.
   *  - kernel_size / kernel_h / kernel_w. The filter dimensions, given by
   *  kernel_size for square filters or kernel_h and kernel_w for rectangular
   *  filters.
   *  - stride / stride_h / stride_w (\b optional, default 1). The filter
   *  stride, given by stride_size for equal dimensions or stride_h and stride_w
   *  for different strides. By default the convolution is dense with stride 1.
   *  - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for
   *  convolution, given by pad for equal dimensions or pad_h and pad_w for
   *  different padding. Input padding is computed implicitly instead of
   *  actually padding.
   *  - dilation (\b optional, default 1). The filter
   *  dilation, given by dilation_size for equal dimensions for different
   *  dilation. By default the convolution has dilation 1.
   *  - group (\b optional, default 1). The number of filter groups. Group
   *  convolution is a method for reducing parameterization by selectively
   *  connecting input and output channels. The input and output channel
   *  dimensions must be divisible by the number of groups. For group
   *  @f$ \geq 1 @f$, the convolutional filters' input and output channels are
   *  separated s.t. each group takes 1 / group of the input channels and makes
   *  1 / group of the output channels. Concretely 4 input channels, 8 output
   *  channels, and 2 groups separate input channels 1-2 and output channels
   *  1-4 into the first group and input channels 3-4 and output channels 5-8
   *  into the second group.
   *  - bias_term (\b optional, default true). Whether to have a bias.
   *  - engine: convolution has CAFFE (matrix multiplication) and CUDNN
   *  (library kernels + stream parallelism) engines.
   */
  explicit WinogradLayerInference(const LayerParameter& param)
      : BaseConvolutionLayer<Dtype>(param) {}

  // NOTE(review): reports "Winograd", not "WinogradInference" -- presumably
  // so existing prototxts keep working; confirm against the layer factory
  // registration for this branch.
  virtual inline const char* type() const { return "Winograd"; }

  // Prepares the weights for the Winograd path; per the comment on the
  // members below, this also initializes the tile-size fields.
  virtual void WeightAlign();
  // Query / perform the reshape of the weight blob into the Winograd
  // layout (exact layout defined in the implementation file).
  bool IsReshapedToWinograd();
  void ReshapeToWinograd();

  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

 protected:
  // CPU-local variant of WeightAlign (see implementation for the split).
  void WeightAlignLocal();

  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  // false: input/output roles are not swapped for buffer sizing in the
  // base class (this is a forward, not deconvolution-style, layer).
  virtual inline bool reverse_dimensions() { return false; }
  virtual void compute_output_shape();

  // used in forward pass: tile-wise gather of the input into column form
  // and scatter of tile outputs back into the output blob.
  void winograd_input_im2col_cpu(const Dtype *data, Dtype *col_buff);
  void winograd_output_col2im_cpu(const Dtype *col_buff, Dtype *data);

  // used in backward pass.
  // NOTE(review): the parameter names of winograd_output_im2col_cpu look
  // swapped (an im2col should read `data` and write `col_buff`, cf. the
  // forward-pass declaration above) -- verify against the definition.
  void winograd_output_im2col_cpu(const Dtype *col_buff, Dtype *data);
  void winograd_input_col2im_cpu(const Dtype *col_buff, Dtype *data);

  // Scratch blobs for transformed activations plus the transformed filter
  // bank; exact roles of temp1_/temp2_ are assigned in the implementation.
  Blob<Dtype> temp1_, temp2_, winograd_weight_;

  // The following variables are initialized in WeightAlign
  int tile_h_in_, tile_w_in_;   /* input tile size */
  int tile_h_out_, tile_w_out_; /* output tile size */
  int ntiles_h_, ntiles_w_;     /* number of tiles */

  /** buffers of raw device pointers (stored in long-typed blobs) to be
      passed to the batched GPU GEMM, cublasSgemmBatched */
  shared_ptr<Blob<long> >
      in_activation_ptrs_, out_activation_ptrs_, weight_ptrs_,
      weight_diff_ptrs_;

  // Track whether the corresponding *_ptrs_ buffers above have been
  // populated -- presumably to avoid rebuilding them on every call;
  // confirm in the implementation.
  bool weight_ptrs_initialized_, weight_diff_ptrs_initialized_;
};

} // namespace caffe

#endif // WINOGRAD_LAYER_INFERENCE_HPP_
Loading