|
|
@ -137,7 +137,7 @@ size_t get_workspace_size(layer l){ |
|
|
|
|
|
|
|
|
|
|
|
#ifdef GPU |
|
|
|
#ifdef GPU |
|
|
|
#ifdef CUDNN |
|
|
|
#ifdef CUDNN |
|
|
|
void cudnn_convolutional_setup(layer *l) |
|
|
|
void cudnn_convolutional_setup(layer *l, int cudnn_preference) |
|
|
|
{ |
|
|
|
{ |
|
|
|
cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
|
|
|
|
cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
|
|
|
|
cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
|
|
|
|
cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
|
|
|
@ -151,12 +151,21 @@ void cudnn_convolutional_setup(layer *l) |
|
|
|
#else |
|
|
|
#else |
|
|
|
cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION); // cudnn 5.1
|
|
|
|
cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION); // cudnn 5.1
|
|
|
|
#endif |
|
|
|
#endif |
|
|
|
|
|
|
|
int forward_algo = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; |
|
|
|
|
|
|
|
int backward_algo = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST; |
|
|
|
|
|
|
|
int backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST; |
|
|
|
|
|
|
|
if (cudnn_preference == cudnn_smallest) { |
|
|
|
|
|
|
|
forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE; |
|
|
|
|
|
|
|
backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE; |
|
|
|
|
|
|
|
backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
cudnnGetConvolutionForwardAlgorithm(cudnn_handle(), |
|
|
|
cudnnGetConvolutionForwardAlgorithm(cudnn_handle(), |
|
|
|
l->srcTensorDesc, |
|
|
|
l->srcTensorDesc, |
|
|
|
l->weightDesc, |
|
|
|
l->weightDesc, |
|
|
|
l->convDesc, |
|
|
|
l->convDesc, |
|
|
|
l->dstTensorDesc, |
|
|
|
l->dstTensorDesc, |
|
|
|
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, |
|
|
|
forward_algo, |
|
|
|
0, |
|
|
|
0, |
|
|
|
&l->fw_algo); |
|
|
|
&l->fw_algo); |
|
|
|
cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(), |
|
|
|
cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(), |
|
|
@ -164,7 +173,7 @@ void cudnn_convolutional_setup(layer *l) |
|
|
|
l->ddstTensorDesc, |
|
|
|
l->ddstTensorDesc, |
|
|
|
l->convDesc, |
|
|
|
l->convDesc, |
|
|
|
l->dsrcTensorDesc, |
|
|
|
l->dsrcTensorDesc, |
|
|
|
CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, |
|
|
|
backward_algo, |
|
|
|
0, |
|
|
|
0, |
|
|
|
&l->bd_algo); |
|
|
|
&l->bd_algo); |
|
|
|
cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(), |
|
|
|
cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(), |
|
|
@ -172,7 +181,7 @@ void cudnn_convolutional_setup(layer *l) |
|
|
|
l->ddstTensorDesc, |
|
|
|
l->ddstTensorDesc, |
|
|
|
l->convDesc, |
|
|
|
l->convDesc, |
|
|
|
l->dweightDesc, |
|
|
|
l->dweightDesc, |
|
|
|
CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, |
|
|
|
backward_filter, |
|
|
|
0, |
|
|
|
0, |
|
|
|
&l->bf_algo); |
|
|
|
&l->bf_algo); |
|
|
|
} |
|
|
|
} |
|
|
@ -306,7 +315,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int |
|
|
|
cudnnCreateTensorDescriptor(&l.ddstTensorDesc); |
|
|
|
cudnnCreateTensorDescriptor(&l.ddstTensorDesc); |
|
|
|
cudnnCreateFilterDescriptor(&l.dweightDesc); |
|
|
|
cudnnCreateFilterDescriptor(&l.dweightDesc); |
|
|
|
cudnnCreateConvolutionDescriptor(&l.convDesc); |
|
|
|
cudnnCreateConvolutionDescriptor(&l.convDesc); |
|
|
|
cudnn_convolutional_setup(&l); |
|
|
|
cudnn_convolutional_setup(&l, cudnn_fastest); |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
} |
|
|
|
} |
|
|
|
#endif |
|
|
|
#endif |
|
|
@ -396,10 +405,22 @@ void resize_convolutional_layer(convolutional_layer *l, int w, int h) |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
#ifdef CUDNN |
|
|
|
#ifdef CUDNN |
|
|
|
cudnn_convolutional_setup(l); |
|
|
|
cudnn_convolutional_setup(l, cudnn_fastest); |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
l->workspace_size = get_workspace_size(*l); |
|
|
|
l->workspace_size = get_workspace_size(*l); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef CUDNN |
|
|
|
|
|
|
|
// check for excessive memory consumption
|
|
|
|
|
|
|
|
size_t free_byte; |
|
|
|
|
|
|
|
size_t total_byte; |
|
|
|
|
|
|
|
check_error(cudaMemGetInfo(&free_byte, &total_byte)); |
|
|
|
|
|
|
|
if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) { |
|
|
|
|
|
|
|
printf(" used slow CUDNN algo without Workspace! \n"); |
|
|
|
|
|
|
|
cudnn_convolutional_setup(l, cudnn_smallest); |
|
|
|
|
|
|
|
l->workspace_size = get_workspace_size(*l); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
#endif |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
void add_bias(float *output, float *biases, int batch, int n, int size) |
|
|
|
void add_bias(float *output, float *biases, int batch, int n, int size) |
|
|
|