Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

winograd convolution (portable and fast in inference) #11

Open
wants to merge 6 commits into
base: intel_scnn
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions include/caffe/layers/winograd_layer_inference.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#ifndef WINOGRAD_LAYER_INFERENCE_HPP_
#define WINOGRAD_LAYER_INFERENCE_HPP_

//winograd_layer for cpu inference

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/base_conv_layer.hpp"

namespace caffe {

/**
 * @brief Convolves the input image with a bank of learned filters,
 *        and (optionally) adds biases, using a Winograd-style tiled
 *        algorithm. This variant targets CPU inference (per the file
 *        comment above), though GPU entry points are also declared.
 *
 * Standard Caffe convolution reduces to matrix multiplication via
 * im2col/col2im. This layer instead operates on fixed-size input tiles
 * (tile_h_in_ x tile_w_in_) mapped to output tiles (tile_h_out_ x
 * tile_w_out_), with transformed filters kept in winograd_weight_.
 * NOTE(review): the actual transform matrices and tile arithmetic live
 * in the .cpp/.cu implementation, which is not visible in this header.
 * Parameter semantics (documented on the constructor) are identical to
 * the standard ConvolutionLayer.
 */
template <typename Dtype>
class WinogradLayerInference : public BaseConvolutionLayer<Dtype> {
 public:
  /**
   * @param param provides ConvolutionParameter convolution_param,
   *    with ConvolutionLayer options:
   *  - num_output. The number of filters.
   *  - kernel_size / kernel_h / kernel_w. The filter dimensions, given by
   *  kernel_size for square filters or kernel_h and kernel_w for rectangular
   *  filters.
   *  - stride / stride_h / stride_w (\b optional, default 1). The filter
   *  stride, given by stride_size for equal dimensions or stride_h and stride_w
   *  for different strides. By default the convolution is dense with stride 1.
   *  - pad / pad_h / pad_w (\b optional, default 0). The zero-padding for
   *  convolution, given by pad for equal dimensions or pad_h and pad_w for
   *  different padding. Input padding is computed implicitly instead of
   *  actually padding.
   *  - dilation (\b optional, default 1). The filter
   *  dilation, given by dilation_size for equal dimensions for different
   *  dilation. By default the convolution has dilation 1.
   *  - group (\b optional, default 1). The number of filter groups. Group
   *  convolution is a method for reducing parameterization by selectively
   *  connecting input and output channels. The input and output channel
   *  dimensions must be divisible by the number of groups. For group
   *  @f$ \geq 1 @f$, the convolutional filters' input and output channels are
   *  separated s.t. each group takes 1 / group of the input channels and makes
   *  1 / group of the output channels. Concretely 4 input channels, 8 output
   *  channels, and 2 groups separate input channels 1-2 and output channels
   *  1-4 into the first group and input channels 3-4 and output channels 5-8
   *  into the second group.
   *  - bias_term (\b optional, default true). Whether to have a bias.
   *  - engine: convolution has CAFFE (matrix multiplication) and CUDNN
   *  (library kernels + stream parallelism) engines.
   */
  explicit WinogradLayerInference(const LayerParameter& param)
      : BaseConvolutionLayer<Dtype>(param) {}

  // NOTE(review): reports "Winograd", not "WinogradInference" -- presumably
  // so existing prototxts keep working; confirm against the layer factory
  // registration for this branch.
  virtual inline const char* type() const { return "Winograd"; }

  // Prepares the weights for the Winograd path; per the comment on the
  // members below, this also initializes the tile-size fields.
  virtual void WeightAlign();
  // Query / perform the reshape of the weight blob into the Winograd
  // layout (exact layout defined in the implementation file).
  bool IsReshapedToWinograd();
  void ReshapeToWinograd();

  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

 protected:
  // CPU-local variant of WeightAlign (see implementation for the split).
  void WeightAlignLocal();

  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  // false: input/output roles are not swapped for buffer sizing in the
  // base class (this is a forward, not deconvolution-style, layer).
  virtual inline bool reverse_dimensions() { return false; }
  virtual void compute_output_shape();

  // used in forward pass: tile-wise gather of the input into column form
  // and scatter of tile outputs back into the output blob.
  void winograd_input_im2col_cpu(const Dtype *data, Dtype *col_buff);
  void winograd_output_col2im_cpu(const Dtype *col_buff, Dtype *data);

  // used in backward pass.
  // NOTE(review): the parameter names of winograd_output_im2col_cpu look
  // swapped (an im2col should read `data` and write `col_buff`, cf. the
  // forward-pass declaration above) -- verify against the definition.
  void winograd_output_im2col_cpu(const Dtype *col_buff, Dtype *data);
  void winograd_input_col2im_cpu(const Dtype *col_buff, Dtype *data);

  // Scratch blobs for transformed activations plus the transformed filter
  // bank; exact roles of temp1_/temp2_ are assigned in the implementation.
  Blob<Dtype> temp1_, temp2_, winograd_weight_;

  // The following variables are initialized in WeightAlign
  int tile_h_in_, tile_w_in_;   /* input tile size */
  int tile_h_out_, tile_w_out_; /* output tile size */
  int ntiles_h_, ntiles_w_;     /* number of tiles */

  /** buffers of raw device pointers (stored in long-typed blobs) to be
      passed to the batched GPU GEMM, cublasSgemmBatched */
  shared_ptr<Blob<long> >
      in_activation_ptrs_, out_activation_ptrs_, weight_ptrs_,
      weight_diff_ptrs_;

  // Track whether the corresponding *_ptrs_ buffers above have been
  // populated -- presumably to avoid rebuilding them on every call;
  // confirm in the implementation.
  bool weight_ptrs_initialized_, weight_diff_ptrs_initialized_;
};

} // namespace caffe

#endif // WINOGRAD_LAYER_INFERENCE_HPP_
Loading