If there is excessive GPU-RAM consumption by CUDNN, then do not use Workspace

Branch: pull/492/head
Author: AlexeyAB (7 years ago)
Parent: e4ab47dfce
Commit: 033e934ce8
1. src/convolutional_layer.c (33 changed lines)
2. src/convolutional_layer.h (2 changed lines)
3. src/cuda.h (1 changed line)
4. src/network.c (14 changed lines)
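In outline, the patch works as follows: cudnn_convolutional_setup() gains a cudnn_preference argument; cudnn_fastest requests cuDNN's fastest algorithms (which may need a large workspace buffer), while cudnn_smallest requests the *_NO_WORKSPACE algorithms. Callers first set the layer up with cudnn_fastest, query free GPU memory with cudaMemGetInfo(), and fall back to cudnn_smallest when the workspace would not fit or would take at least half of the card. A minimal standalone sketch of that decision rule; the helper name choose_cudnn_preference and the demo numbers are illustrative, not part of the patch:

#include <stdio.h>
#include <stddef.h>

enum { cudnn_fastest, cudnn_smallest };   /* same constants the patch adds to src/cuda.h */

/* Fall back to the no-workspace cuDNN algorithms when the requested workspace
 * does not fit into free GPU memory, or would take at least half of the card. */
static int choose_cudnn_preference(size_t workspace_size, size_t free_byte, size_t total_byte)
{
    if (workspace_size > free_byte || workspace_size >= total_byte / 2)
        return cudnn_smallest;
    return cudnn_fastest;
}

int main(void)
{
    const size_t MB = 1024 * 1024;
    /* Illustrative numbers: 4 GiB card, 1.5 GiB free, 2 GiB workspace requested. */
    size_t total = 4096 * MB, free_mem = 1536 * MB, workspace = 2048 * MB;

    if (choose_cudnn_preference(workspace, free_mem, total) == cudnn_smallest)
        printf(" used slow CUDNN algo without Workspace! \n");
    return 0;
}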

src/convolutional_layer.c
@@ -137,7 +137,7 @@ size_t get_workspace_size(layer l){
#ifdef GPU
#ifdef CUDNN
-void cudnn_convolutional_setup(layer *l)
+void cudnn_convolutional_setup(layer *l, int cudnn_preference)
{
cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
@@ -151,12 +151,21 @@ void cudnn_convolutional_setup(layer *l)
#else
cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION); // cudnn 5.1
#endif
+int forward_algo = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+int backward_algo = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+int backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+if (cudnn_preference == cudnn_smallest) {
+forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
+backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+}
cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
l->srcTensorDesc,
l->weightDesc,
l->convDesc,
l->dstTensorDesc,
-CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+forward_algo,
0,
&l->fw_algo);
cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
@@ -164,7 +173,7 @@ void cudnn_convolutional_setup(layer *l)
l->ddstTensorDesc,
l->convDesc,
l->dsrcTensorDesc,
-CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
+backward_algo,
0,
&l->bd_algo);
cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
@@ -172,7 +181,7 @@ void cudnn_convolutional_setup(layer *l)
l->ddstTensorDesc,
l->convDesc,
l->dweightDesc,
-CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
+backward_filter,
0,
&l->bf_algo);
}
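With the *_NO_WORKSPACE preferences, cuDNN returns algorithms whose workspace requirement is zero, so the workspace that get_workspace_size() later queries shrinks accordingly. A hedged sketch of how one could inspect the forward workspace implied by a given preference, assuming cuDNN 7.x (these Get*Algorithm calls were removed in cuDNN 8) and descriptors already configured as above; the helper name fwd_workspace_bytes is illustrative and error checking is omitted:

#include <cudnn.h>

/* Return the forward workspace, in bytes, that a given preference would imply. */
static size_t fwd_workspace_bytes(cudnnHandle_t handle,
                                  cudnnTensorDescriptor_t src, cudnnFilterDescriptor_t weights,
                                  cudnnConvolutionDescriptor_t conv, cudnnTensorDescriptor_t dst,
                                  cudnnConvolutionFwdPreference_t preference)
{
    cudnnConvolutionFwdAlgo_t algo;
    cudnnGetConvolutionForwardAlgorithm(handle, src, weights, conv, dst,
                                        preference, 0, &algo);
    size_t bytes = 0;
    cudnnGetConvolutionForwardWorkspaceSize(handle, src, weights, conv, dst,
                                            algo, &bytes);
    return bytes;   /* 0 when preference is CUDNN_CONVOLUTION_FWD_NO_WORKSPACE */
}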
@@ -306,7 +315,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
cudnnCreateFilterDescriptor(&l.dweightDesc);
cudnnCreateConvolutionDescriptor(&l.convDesc);
-cudnn_convolutional_setup(&l);
+cudnn_convolutional_setup(&l, cudnn_fastest);
#endif
}
#endif
@@ -396,10 +405,22 @@ void resize_convolutional_layer(convolutional_layer *l, int w, int h)
}
}
#ifdef CUDNN
-cudnn_convolutional_setup(l);
+cudnn_convolutional_setup(l, cudnn_fastest);
#endif
#endif
l->workspace_size = get_workspace_size(*l);
+#ifdef CUDNN
+// check for excessive memory consumption
+size_t free_byte;
+size_t total_byte;
+check_error(cudaMemGetInfo(&free_byte, &total_byte));
+if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
+printf(" used slow CUDNN algo without Workspace! \n");
+cudnn_convolutional_setup(l, cudnn_smallest);
+l->workspace_size = get_workspace_size(*l);
+}
+#endif
}
void add_bias(float *output, float *biases, int batch, int n, int size)
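cudaMemGetInfo() is the CUDA runtime call that reports the current free and total device memory in bytes, which is what the check above compares the workspace against. A tiny standalone program (illustrative, not part of the patch) to print those numbers for the active GPU:

#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    size_t free_byte = 0, total_byte = 0;
    cudaError_t status = cudaMemGetInfo(&free_byte, &total_byte);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaMemGetInfo failed: %s\n", cudaGetErrorString(status));
        return 1;
    }
    printf("free: %zu MiB, total: %zu MiB\n",
           free_byte / (1024 * 1024), total_byte / (1024 * 1024));
    return 0;
}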

src/convolutional_layer.h
@@ -20,7 +20,7 @@ void pull_convolutional_layer(convolutional_layer layer);
void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size);
#ifdef CUDNN
-void cudnn_convolutional_setup(layer *l);
+void cudnn_convolutional_setup(layer *l, int cudnn_preference);
#endif
#endif

src/cuda.h
@@ -35,6 +35,7 @@ cudaStream_t get_cuda_stream();
#ifdef CUDNN
cudnnHandle_t cudnn_handle();
+enum {cudnn_fastest, cudnn_smallest};
#endif
#endif

src/network.c
@@ -316,7 +316,17 @@ void set_batch_network(network *net, int b)
net->layers[i].batch = b;
#ifdef CUDNN
if(net->layers[i].type == CONVOLUTIONAL){
-cudnn_convolutional_setup(net->layers + i);
+layer *l = net->layers + i;
+cudnn_convolutional_setup(l, cudnn_fastest);
+// check for excessive memory consumption
+size_t free_byte;
+size_t total_byte;
+check_error(cudaMemGetInfo(&free_byte, &total_byte));
+if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
+printf(" used slow CUDNN algo without Workspace! \n");
+cudnn_convolutional_setup(l, cudnn_smallest);
+l->workspace_size = get_workspace_size(*l);
+}
}
#endif
}
@@ -378,7 +388,7 @@ int resize_network(network *net, int w, int h)
}
#ifdef GPU
if(gpu_index >= 0){
printf(" try to allocate workspace, ");
printf(" try to allocate workspace = %zu * sizeof(float), ", (workspace_size - 1) / sizeof(float) + 1);
net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1);
printf(" CUDA allocate done! \n");
}else {
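The new printf reports the workspace as a number of float elements rather than a bare message; the expression (workspace_size - 1) / sizeof(float) + 1 is a ceiling division from bytes to floats, matching what cuda_make_array() is asked to allocate on the next line. A small sketch of that rounding (the helper name bytes_to_floats is illustrative; darknet inlines the expression):

#include <stdio.h>
#include <stddef.h>

/* Round a byte count up to a whole number of floats (assumes bytes > 0). */
static size_t bytes_to_floats(size_t bytes)
{
    return (bytes - 1) / sizeof(float) + 1;
}

int main(void)
{
    printf("%zu\n", bytes_to_floats(10));   /* 3 floats = 12 bytes, enough for 10 */
    printf("%zu\n", bytes_to_floats(16));   /* exactly 4 floats */
    return 0;
}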
