From 033e934ce82826c73d851098baf7ce4b1a27c89a Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Wed, 21 Feb 2018 19:14:01 +0300 Subject: [PATCH] If there is excessive GPU-RAM consumption by CUDNN then do not use Workspace --- src/convolutional_layer.c | 33 +++++++++++++++++++++++++++------ src/convolutional_layer.h | 2 +- src/cuda.h | 1 + src/network.c | 14 ++++++++++++-- 4 files changed, 41 insertions(+), 9 deletions(-) diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index ca834860..801270ab 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -137,7 +137,7 @@ size_t get_workspace_size(layer l){ #ifdef GPU #ifdef CUDNN -void cudnn_convolutional_setup(layer *l) +void cudnn_convolutional_setup(layer *l, int cudnn_preference) { cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); @@ -151,12 +151,21 @@ void cudnn_convolutional_setup(layer *l) #else cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION); // cudnn 5.1 #endif + int forward_algo = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; + int backward_algo = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST; + int backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST; + if (cudnn_preference == cudnn_smallest) { + forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE; + backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE; + backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE; + } + cudnnGetConvolutionForwardAlgorithm(cudnn_handle(), l->srcTensorDesc, l->weightDesc, l->convDesc, l->dstTensorDesc, - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, + forward_algo, 0, &l->fw_algo); cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(), @@ -164,7 +173,7 @@ void cudnn_convolutional_setup(layer *l) l->ddstTensorDesc, l->convDesc, l->dsrcTensorDesc, - 
CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, + backward_algo, 0, &l->bd_algo); cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(), @@ -172,7 +181,7 @@ void cudnn_convolutional_setup(layer *l) l->ddstTensorDesc, l->convDesc, l->dweightDesc, - CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, + backward_filter, 0, &l->bf_algo); } @@ -306,7 +315,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int cudnnCreateTensorDescriptor(&l.ddstTensorDesc); cudnnCreateFilterDescriptor(&l.dweightDesc); cudnnCreateConvolutionDescriptor(&l.convDesc); - cudnn_convolutional_setup(&l); + cudnn_convolutional_setup(&l, cudnn_fastest); #endif } #endif @@ -396,10 +405,22 @@ void resize_convolutional_layer(convolutional_layer *l, int w, int h) } } #ifdef CUDNN - cudnn_convolutional_setup(l); + cudnn_convolutional_setup(l, cudnn_fastest); #endif #endif l->workspace_size = get_workspace_size(*l); + +#ifdef CUDNN + // check for excessive memory consumption + size_t free_byte; + size_t total_byte; + check_error(cudaMemGetInfo(&free_byte, &total_byte)); + if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) { + printf(" used slow CUDNN algo without Workspace! 
\n"); + cudnn_convolutional_setup(l, cudnn_smallest); + l->workspace_size = get_workspace_size(*l); + } +#endif } void add_bias(float *output, float *biases, int batch, int n, int size) diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h index 970aa101..da98dcef 100644 --- a/src/convolutional_layer.h +++ b/src/convolutional_layer.h @@ -20,7 +20,7 @@ void pull_convolutional_layer(convolutional_layer layer); void add_bias_gpu(float *output, float *biases, int batch, int n, int size); void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size); #ifdef CUDNN -void cudnn_convolutional_setup(layer *l); +void cudnn_convolutional_setup(layer *l, int cudnn_preference); #endif #endif diff --git a/src/cuda.h b/src/cuda.h index 31577ff3..0bc05578 100644 --- a/src/cuda.h +++ b/src/cuda.h @@ -35,6 +35,7 @@ cudaStream_t get_cuda_stream(); #ifdef CUDNN cudnnHandle_t cudnn_handle(); +enum {cudnn_fastest, cudnn_smallest}; #endif #endif diff --git a/src/network.c b/src/network.c index f3185a31..10d1ead3 100644 --- a/src/network.c +++ b/src/network.c @@ -316,7 +316,17 @@ void set_batch_network(network *net, int b) net->layers[i].batch = b; #ifdef CUDNN if(net->layers[i].type == CONVOLUTIONAL){ - cudnn_convolutional_setup(net->layers + i); + layer *l = net->layers + i; + cudnn_convolutional_setup(l, cudnn_fastest); + // check for excessive memory consumption + size_t free_byte; + size_t total_byte; + check_error(cudaMemGetInfo(&free_byte, &total_byte)); + if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) { + printf(" used slow CUDNN algo without Workspace! 
\n"); + cudnn_convolutional_setup(l, cudnn_smallest); + l->workspace_size = get_workspace_size(*l); + } } #endif } @@ -378,7 +388,7 @@ int resize_network(network *net, int w, int h) } #ifdef GPU if(gpu_index >= 0){ - printf(" try to allocate workspace, "); + printf(" try to allocate workspace = %zu * sizeof(float), ", (workspace_size - 1) / sizeof(float) + 1); net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1); printf(" CUDA allocate done! \n"); }else {