From cb998db9496fecd82ce7a1192b21c3cffaa95d48 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Tue, 11 Dec 2018 21:16:18 +0300 Subject: [PATCH] Some fix for CUDNN_HALF --- src/convolutional_kernels.cu | 6 +++-- src/convolutional_layer.c | 47 ++++++++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index 12c88c4d..af685631 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -295,7 +295,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) //#ifdef CUDNN_HALF //if (state.use_mixed_precision) { int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); - if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 2*state.net.burn_in)) + if (state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in)) + //if(state.index != 0) { //printf("\n CUDNN_HALF!!! state.index = %d \n", state.index); @@ -475,7 +476,8 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state //#ifdef CUDNN_HALF int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); - if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 2*state.net.burn_in)) + if (state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in)) + //if (state.index != 0) { const size_t input16_size = l.batch*l.c*l.w*l.h; diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 9fc33c0c..b05b3d00 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -136,6 +136,43 @@ size_t get_workspace_size(layer l){ return (size_t)l.out_h*l.out_w*l.size*l.size*l.c*sizeof(float); } +size_t get_workspace_size16(layer l) { +#if defined(CUDNN) && defined(CUDNN_HALF) + if (gpu_index >= 0) { + size_t most = 0; + size_t s = 0; + cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(), + l.srcTensorDesc16, + l.weightDesc16, + l.convDesc, + l.dstTensorDesc16, + l.fw_algo16, + &s); + if (s > most) most = s; + cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(), + l.srcTensorDesc16, + l.ddstTensorDesc16, + l.convDesc, + l.dweightDesc16, + l.bf_algo16, + &s); + if (s > most) most = s; + cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(), + l.weightDesc16, + l.ddstTensorDesc16, + l.convDesc, + l.dsrcTensorDesc16, + l.bd_algo16, + &s); + if (s > most) most = s; + return most; + } +#endif + return 0; + //if (l.xnor) return (size_t)l.bit_align*l.size*l.size*l.c * sizeof(float); + //return (size_t)l.out_h*l.out_w*l.size*l.size*l.c * sizeof(float); +} + #ifdef GPU #ifdef CUDNN void cudnn_convolutional_setup(layer *l, int cudnn_preference) @@ -177,7 +214,7 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w); cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); -#ifdef CUDNN_HALF +//#ifdef CUDNN_HALF // backward delta cudnnSetTensor4dDescriptor(l->dsrcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w); cudnnSetTensor4dDescriptor(l->ddstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w); @@ -190,7 +227,7 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) // batch norm cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w); -#endif +//#endif // batch norm cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); @@ -429,6 +466,8 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int } #endif l.workspace_size = get_workspace_size(l); + size_t workspace_size16 = get_workspace_size16(l); + if (workspace_size16 > l.workspace_size) l.workspace_size = workspace_size16; l.activation = activation; //fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c); @@ -532,6 +571,8 @@ void resize_convolutional_layer(convolutional_layer *l, int w, int h) #endif #endif l->workspace_size = get_workspace_size(*l); + size_t workspace_size16 = get_workspace_size16(*l); + if (workspace_size16 > l->workspace_size) l->workspace_size = workspace_size16; #ifdef CUDNN // check for excessive memory consumption @@ -542,6 +583,8 @@ void resize_convolutional_layer(convolutional_layer *l, int w, int h) printf(" used slow CUDNN algo without Workspace! Need memory: %zu, available: %zu\n", l->workspace_size, (free_byte < total_byte/2) ? free_byte : total_byte/2); cudnn_convolutional_setup(l, cudnn_smallest); l->workspace_size = get_workspace_size(*l); + size_t workspace_size16 = get_workspace_size16(*l); + if (workspace_size16 > l->workspace_size) l->workspace_size = workspace_size16; } #endif }