If there is excessive GPU-RAM consumption by CUDNN, then do not use Workspace

Branch: pull/492/head
Author: AlexeyAB (7 years ago)
Parent: e4ab47dfce
Commit: 033e934ce8
1. src/convolutional_layer.c (33 changed lines)
2. src/convolutional_layer.h (2 changed lines)
3. src/cuda.h (1 changed line)
4. src/network.c (14 changed lines)
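In outline, the patch works as follows: cudnn_convolutional_setup() gains a cudnn_preference argument; cudnn_fastest requests cuDNN's fastest algorithms (which may need a large workspace buffer), while cudnn_smallest requests the *_NO_WORKSPACE algorithms. Callers first set the layer up with cudnn_fastest, query free GPU memory with cudaMemGetInfo(), and fall back to cudnn_smallest when the workspace would not fit or would take at least half of the card. A minimal standalone sketch of that decision rule; the helper name choose_cudnn_preference and the demo numbers are illustrative, not part of the patch:

#include <stdio.h>
#include <stddef.h>

enum { cudnn_fastest, cudnn_smallest };   /* same constants the patch adds to src/cuda.h */

/* Fall back to the no-workspace cuDNN algorithms when the requested workspace
 * does not fit into free GPU memory, or would take at least half of the card. */
static int choose_cudnn_preference(size_t workspace_size, size_t free_byte, size_t total_byte)
{
    if (workspace_size > free_byte || workspace_size >= total_byte / 2)
        return cudnn_smallest;
    return cudnn_fastest;
}

int main(void)
{
    const size_t MB = 1024 * 1024;
    /* Illustrative numbers: 4 GiB card, 1.5 GiB free, 2 GiB workspace requested. */
    size_t total = 4096 * MB, free_mem = 1536 * MB, workspace = 2048 * MB;

    if (choose_cudnn_preference(workspace, free_mem, total) == cudnn_smallest)
        printf(" used slow CUDNN algo without Workspace! \n");
    return 0;
}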

src/convolutional_layer.c
@@ -137,7 +137,7 @@ size_t get_workspace_size(layer l){
#ifdef GPU
#ifdef CUDNN
-void cudnn_convolutional_setup(layer *l)
+void cudnn_convolutional_setup(layer *l, int cudnn_preference)
{
cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
@@ -151,12 +151,21 @@ void cudnn_convolutional_setup(layer *l)
#else
cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION); // cudnn 5.1
#endif
+int forward_algo = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+int backward_algo = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+int backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+if (cudnn_preference == cudnn_smallest) {
+forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
+backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+}
cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
l->srcTensorDesc,
l->weightDesc,
l->convDesc,
l->dstTensorDesc,
-CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+forward_algo,
0,
&l->fw_algo);
cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
@@ -164,7 +173,7 @@ void cudnn_convolutional_setup(layer *l)
l->ddstTensorDesc,
l->convDesc,
l->dsrcTensorDesc,
-CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
+backward_algo,
0,
&l->bd_algo);
cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
@@ -172,7 +181,7 @@ void cudnn_convolutional_setup(layer *l)
l->ddstTensorDesc,
l->convDesc,
l->dweightDesc,
-CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
+backward_filter,
0,
&l->bf_algo);
}
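With the *_NO_WORKSPACE preferences, cuDNN returns algorithms whose workspace requirement is zero, so the workspace that get_workspace_size() later queries shrinks accordingly. A hedged sketch of how one could inspect the forward workspace implied by a given preference, assuming cuDNN 7.x (these Get*Algorithm calls were removed in cuDNN 8) and descriptors already configured as above; the helper name fwd_workspace_bytes is illustrative and error checking is omitted:

#include <cudnn.h>

/* Return the forward workspace, in bytes, that a given preference would imply. */
static size_t fwd_workspace_bytes(cudnnHandle_t handle,
                                  cudnnTensorDescriptor_t src, cudnnFilterDescriptor_t weights,
                                  cudnnConvolutionDescriptor_t conv, cudnnTensorDescriptor_t dst,
                                  cudnnConvolutionFwdPreference_t preference)
{
    cudnnConvolutionFwdAlgo_t algo;
    cudnnGetConvolutionForwardAlgorithm(handle, src, weights, conv, dst,
                                        preference, 0, &algo);
    size_t bytes = 0;
    cudnnGetConvolutionForwardWorkspaceSize(handle, src, weights, conv, dst,
                                            algo, &bytes);
    return bytes;   /* 0 when preference is CUDNN_CONVOLUTION_FWD_NO_WORKSPACE */
}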
@@ -306,7 +315,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
cudnnCreateFilterDescriptor(&l.dweightDesc);
cudnnCreateConvolutionDescriptor(&l.convDesc);
-cudnn_convolutional_setup(&l);
+cudnn_convolutional_setup(&l, cudnn_fastest);
#endif
}
#endif
@@ -396,10 +405,22 @@ void resize_convolutional_layer(convolutional_layer *l, int w, int h)
}
}
#ifdef CUDNN
-cudnn_convolutional_setup(l);
+cudnn_convolutional_setup(l, cudnn_fastest);
#endif
#endif
l->workspace_size = get_workspace_size(*l);
+#ifdef CUDNN
+// check for excessive memory consumption
+size_t free_byte;
+size_t total_byte;
+check_error(cudaMemGetInfo(&free_byte, &total_byte));
+if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
+printf(" used slow CUDNN algo without Workspace! \n");
+cudnn_convolutional_setup(l, cudnn_smallest);
+l->workspace_size = get_workspace_size(*l);
+}
+#endif
}
void add_bias(float *output, float *biases, int batch, int n, int size)
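cudaMemGetInfo() is the CUDA runtime call that reports the current free and total device memory in bytes, which is what the check above compares the workspace against. A tiny standalone program (illustrative, not part of the patch) to print those numbers for the active GPU:

#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    size_t free_byte = 0, total_byte = 0;
    cudaError_t status = cudaMemGetInfo(&free_byte, &total_byte);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaMemGetInfo failed: %s\n", cudaGetErrorString(status));
        return 1;
    }
    printf("free: %zu MiB, total: %zu MiB\n",
           free_byte / (1024 * 1024), total_byte / (1024 * 1024));
    return 0;
}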

src/convolutional_layer.h
@@ -20,7 +20,7 @@ void pull_convolutional_layer(convolutional_layer layer);
void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size);
#ifdef CUDNN
-void cudnn_convolutional_setup(layer *l);
+void cudnn_convolutional_setup(layer *l, int cudnn_preference);
#endif
#endif

src/cuda.h
@@ -35,6 +35,7 @@ cudaStream_t get_cuda_stream();
#ifdef CUDNN
cudnnHandle_t cudnn_handle();
+enum {cudnn_fastest, cudnn_smallest};
#endif
#endif

src/network.c
@@ -316,7 +316,17 @@ void set_batch_network(network *net, int b)
net->layers[i].batch = b;
#ifdef CUDNN
if(net->layers[i].type == CONVOLUTIONAL){
-cudnn_convolutional_setup(net->layers + i);
+layer *l = net->layers + i;
+cudnn_convolutional_setup(l, cudnn_fastest);
+// check for excessive memory consumption
+size_t free_byte;
+size_t total_byte;
+check_error(cudaMemGetInfo(&free_byte, &total_byte));
+if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
+printf(" used slow CUDNN algo without Workspace! \n");
+cudnn_convolutional_setup(l, cudnn_smallest);
+l->workspace_size = get_workspace_size(*l);
+}
}
#endif
}
@@ -378,7 +388,7 @@ int resize_network(network *net, int w, int h)
}
#ifdef GPU
if(gpu_index >= 0){
printf(" try to allocate workspace, ");
printf(" try to allocate workspace = %zu * sizeof(float), ", (workspace_size - 1) / sizeof(float) + 1);
net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1);
printf(" CUDA allocate done! \n");
}else {
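The new printf reports the workspace as a number of float elements rather than a bare message; the expression (workspace_size - 1) / sizeof(float) + 1 is a ceiling division from bytes to floats, matching what cuda_make_array() is asked to allocate on the next line. A small sketch of that rounding (the helper name bytes_to_floats is illustrative; darknet inlines the expression):

#include <stdio.h>
#include <stddef.h>

/* Round a byte count up to a whole number of floats (assumes bytes > 0). */
static size_t bytes_to_floats(size_t bytes)
{
    return (bytes - 1) / sizeof(float) + 1;
}

int main(void)
{
    printf("%zu\n", bytes_to_floats(10));   /* 3 floats = 12 bytes, enough for 10 */
    printf("%zu\n", bytes_to_floats(16));   /* exactly 4 floats */
    return 0;
}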
