From cb998db9496fecd82ce7a1192b21c3cffaa95d48 Mon Sep 17 00:00:00 2001
From: AlexeyAB <alexeyab84@gmail.com>
Date: Tue, 11 Dec 2018 21:16:18 +0300
Subject: [PATCH] Some fix for CUDNN_HALF

---
 src/convolutional_kernels.cu |  6 +++--
 src/convolutional_layer.c    | 47 ++++++++++++++++++++++++++++++++++--
 2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu
index 12c88c4d..af685631 100644
--- a/src/convolutional_kernels.cu
+++ b/src/convolutional_kernels.cu
@@ -295,7 +295,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
 //#ifdef CUDNN_HALF
     //if (state.use_mixed_precision) {
     int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
-    if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 2*state.net.burn_in))
+    if (state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in))
+        //if(state.index != 0)
     {
         //printf("\n CUDNN_HALF!!! state.index = %d \n", state.index);
 
@@ -475,7 +476,8 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
 
 //#ifdef CUDNN_HALF
     int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
-    if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 2*state.net.burn_in))
+    if (state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in))
+        //if (state.index != 0)
     {
 
         const size_t input16_size = l.batch*l.c*l.w*l.h;
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index 9fc33c0c..b05b3d00 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -136,6 +136,43 @@ size_t get_workspace_size(layer l){
     return (size_t)l.out_h*l.out_w*l.size*l.size*l.c*sizeof(float);
 }
 
+size_t get_workspace_size16(layer l) {
+#if defined(CUDNN) && defined(CUDNN_HALF)
+    if (gpu_index >= 0) {
+        size_t most = 0;
+        size_t s = 0;
+        cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
+            l.srcTensorDesc16,
+            l.weightDesc16,
+            l.convDesc,
+            l.dstTensorDesc16,
+            l.fw_algo16,
+            &s);
+        if (s > most) most = s;
+        cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(),
+            l.srcTensorDesc16,
+            l.ddstTensorDesc16,
+            l.convDesc,
+            l.dweightDesc16,
+            l.bf_algo16,
+            &s);
+        if (s > most) most = s;
+        cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(),
+            l.weightDesc16,
+            l.ddstTensorDesc16,
+            l.convDesc,
+            l.dsrcTensorDesc16,
+            l.bd_algo16,
+            &s);
+        if (s > most) most = s;
+        return most;
+    }
+#endif
+    return 0;
+    //if (l.xnor) return (size_t)l.bit_align*l.size*l.size*l.c * sizeof(float);
+    //return (size_t)l.out_h*l.out_w*l.size*l.size*l.c * sizeof(float);
+}
+
 #ifdef GPU
 #ifdef CUDNN
 void cudnn_convolutional_setup(layer *l, int cudnn_preference)
@@ -177,7 +214,7 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference)
     cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
     cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
 
-#ifdef CUDNN_HALF
+//#ifdef CUDNN_HALF
     // backward delta
     cudnnSetTensor4dDescriptor(l->dsrcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w);
     cudnnSetTensor4dDescriptor(l->ddstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w);
@@ -190,7 +227,7 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference)
 
     // batch norm
     cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w);
-#endif
+//#endif
 
     // batch norm
     cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
@@ -429,6 +466,8 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
     }
 #endif
     l.workspace_size = get_workspace_size(l);
+    size_t workspace_size16 = get_workspace_size16(l);
+    if (workspace_size16 > l.workspace_size) l.workspace_size = workspace_size16;
     l.activation = activation;
 
     //fprintf(stderr, "conv  %5d %2d x%2d /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
@@ -532,6 +571,8 @@ void resize_convolutional_layer(convolutional_layer *l, int w, int h)
 #endif
 #endif
     l->workspace_size = get_workspace_size(*l);
+    size_t workspace_size16 = get_workspace_size16(*l);
+    if (workspace_size16 > l->workspace_size) l->workspace_size = workspace_size16;
 
 #ifdef CUDNN
     // check for excessive memory consumption
@@ -542,6 +583,8 @@ void resize_convolutional_layer(convolutional_layer *l, int w, int h)
         printf(" used slow CUDNN algo without Workspace! Need memory: %zu, available: %zu\n", l->workspace_size, (free_byte < total_byte/2) ? free_byte : total_byte/2);
         cudnn_convolutional_setup(l, cudnn_smallest);
         l->workspace_size = get_workspace_size(*l);
+        size_t workspace_size16 = get_workspace_size16(*l);
+        if (workspace_size16 > l->workspace_size) l->workspace_size = workspace_size16;
     }
 #endif
 }