From d11caf486da34a562347db53c140900aef42140f Mon Sep 17 00:00:00 2001
From: AlexeyAB <alexeyab84@gmail.com>
Date: Sat, 14 Mar 2020 03:24:46 +0300
Subject: [PATCH] Fixed BiFPN and label_smothing for Detection

---
 include/darknet.h         |  2 ++
 src/activation_kernels.cu |  2 ++
 src/blas.h                |  1 +
 src/blas_kernels.cu       | 64 +++++++++++++++++++++++++++------------
 src/darknet.c             |  2 +-
 src/detector.c            |  2 +-
 src/gaussian_yolo_layer.c | 23 ++++++--------
 src/network.c             |  2 +-
 src/network_kernels.cu    |  6 ++--
 src/parser.c              | 10 +++++-
 src/shortcut_layer.c      | 17 ++++++++++-
 src/yolo_layer.c          | 26 ++++++++--------
 12 files changed, 102 insertions(+), 55 deletions(-)

diff --git a/include/darknet.h b/include/darknet.h
index 0c6761a0..ecc06402 100644
--- a/include/darknet.h
+++ b/include/darknet.h
@@ -324,6 +324,8 @@ struct layer {
 
     int onlyforward;
     int stopbackward;
+    int dont_update;
+    int burnin_update;
     int dontload;
     int dontsave;
     int dontloadscales;
diff --git a/src/activation_kernels.cu b/src/activation_kernels.cu
index c7cd49f2..ef2ddbe4 100644
--- a/src/activation_kernels.cu
+++ b/src/activation_kernels.cu
@@ -493,6 +493,7 @@ __global__ void activate_array_normalize_channels_softmax_kernel(float *x, int s
         for (k = 0; k < channels; ++k) {
             float val = x[wh_i + k * wh_step + b*wh_step*channels];
             val = expf(val - max_val) / sum;
+            if (isnan(val) || isinf(val)) val = 0;
             output_gpu[wh_i + k * wh_step + b*wh_step*channels] = val;
         }
     }
@@ -535,6 +536,7 @@ __global__ void gradient_array_normalize_channels_softmax_kernel(float *x, int s
             float delta = delta_gpu[index];
             float grad = x[index] * (1 - x[index]);
             delta = delta * grad;
+            if (isnan(delta) || isinf(delta)) delta = 0;
             delta_gpu[index] = delta;
         }
     }
diff --git a/src/blas.h b/src/blas.h
index 9cf665d8..22048ec0 100644
--- a/src/blas.h
+++ b/src/blas.h
@@ -60,6 +60,7 @@ void fix_nan_and_inf_cpu(float *input, size_t size);
 
 #ifdef GPU
 
+void constrain_weight_updates_ongpu(int N, float learning_rate, float *weights_gpu, float *weight_updates_gpu);
 void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
 void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
 void simple_copy_ongpu(int size, float *src, float *dst);
diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu
index 77c6efdf..30354333 100644
--- a/src/blas_kernels.cu
+++ b/src/blas_kernels.cu
@@ -418,6 +418,24 @@ __global__ void reorg_kernel(int N, float *x, int w, int h, int c, int batch, in
     //else out[0] = x[0];
 }
 
+__global__ void constrain_weight_updates_kernel(int N, float learning_rate, float *weights_gpu, float *weight_updates_gpu)
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (i < N) {
+        const float w = weights_gpu[i];
+        const float wu = weight_updates_gpu[i];
+        const float wu_sign = (wu == 0) ? 0 : (wu / wu);
+        const float abs_limit = fabs(w*learning_rate);
+        if (fabs(wu) > abs_limit) weight_updates_gpu[i] = abs_limit * wu_sign;
+    }
+}
+
+extern "C" void constrain_weight_updates_ongpu(int N, float learning_rate, float *weights_gpu, float *weight_updates_gpu)
+{
+    constrain_weight_updates_kernel << <cuda_gridsize(N), BLOCK, 0, get_cuda_stream() >> >(N, learning_rate, weights_gpu, weight_updates_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
 __global__ void axpy_kernel(int N, float ALPHA, float *X, int OFFX, int INCX,  float *Y, int OFFY, int INCY)
 {
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
@@ -801,6 +819,12 @@ __device__ float relu(float src) {
     return 0;
 }
 
+__device__ float lrelu(float src) {
+    const float eps = 0.001;
+    if (src > eps) return src;
+    return eps;
+}
+
 __global__ void shortcut_singlelayer_simple_kernel(int size, int src_outputs, int batch, int n, int *outputs_of_layers_gpu, float **layers_output_gpu, float *out, float *in, float *weights_gpu, int nweights, WEIGHTS_NORMALIZATION_T weights_normalizion)
 {
     const int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
@@ -852,7 +876,7 @@ __global__ void shortcut_multilayer_kernel(int size, int src_outputs, int batch,
         for (int i = 0; i < (n + 1); ++i) {
             const int weights_index = src_i / step + i*layer_step;  // [0 or c or (c, h ,w)]
             const float w = weights_gpu[weights_index];
-            if (weights_normalizion == RELU_NORMALIZATION) sum += relu(w);
+            if (weights_normalizion == RELU_NORMALIZATION) sum += lrelu(w);
             else if (weights_normalizion == SOFTMAX_NORMALIZATION) sum += expf(w - max_val);
         }
     }
@@ -861,7 +885,7 @@ __global__ void shortcut_multilayer_kernel(int size, int src_outputs, int batch,
 
     if (weights_gpu) {
         float w = weights_gpu[src_i / step];
-        if (weights_normalizion == RELU_NORMALIZATION) w = relu(w) / sum;
+        if (weights_normalizion == RELU_NORMALIZATION) w = lrelu(w) / sum;
         else if (weights_normalizion == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
 
         out_val = in[id] * w; // [0 or c or (c, h ,w)]
@@ -879,7 +903,7 @@ __global__ void shortcut_multilayer_kernel(int size, int src_outputs, int batch,
             if (weights_gpu) {
                 const int weights_index = src_i / step + (i + 1)*layer_step;  // [0 or c or (c, h ,w)]
                 float w = weights_gpu[weights_index];
-                if (weights_normalizion == RELU_NORMALIZATION) w = relu(w) / sum;
+                if (weights_normalizion == RELU_NORMALIZATION) w = lrelu(w) / sum;
                 else if (weights_normalizion == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
 
                 out_val += add[add_index] * w; // [0 or c or (c, h ,w)]
@@ -935,34 +959,27 @@ __global__ void backward_shortcut_multilayer_kernel(int size, int src_outputs, i
         for (i = 0; i < (n + 1); ++i) {
             const int weights_index = src_i / step + i*layer_step;  // [0 or c or (c, h ,w)]
             const float w = weights_gpu[weights_index];
-            if (weights_normalizion == RELU_NORMALIZATION) sum += relu(w);
+            if (weights_normalizion == RELU_NORMALIZATION) sum += lrelu(w);
             else if (weights_normalizion == SOFTMAX_NORMALIZATION) sum += expf(w - max_val);
         }
 
-        /*
-        grad = 0;
-        for (i = 0; i < (n + 1); ++i) {
-            const int weights_index = src_i / step + i*layer_step;  // [0 or c or (c, h ,w)]
-            const float delta_w = delta_in[id] * in[id];
-            const float w = weights_gpu[weights_index];
-            if (weights_normalizion == RELU_NORMALIZATION) grad += delta_w * relu(w) / sum;
-            else if (weights_normalizion == SOFTMAX_NORMALIZATION) grad += delta_w * expf(w - max_val) / sum;
-        }
-        */
     }
 
     if (weights_gpu) {
         float w = weights_gpu[src_i / step];
-        if (weights_normalizion == RELU_NORMALIZATION) w = relu(w) / sum;
+        if (weights_normalizion == RELU_NORMALIZATION) w = lrelu(w) / sum;
         else if (weights_normalizion == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
 
         if (weights_normalizion == RELU_NORMALIZATION) grad = w;
         else if (weights_normalizion == SOFTMAX_NORMALIZATION) grad = w*(1-w);
 
         delta_out[id] += delta_in[id] * w; // [0 or c or (c, h ,w)]
-        float weights_update_tmp = delta_in[id] * in[id] * grad / step;
+        float weights_update_tmp = delta_in[id] * in[id] * grad;// / step;
 
         if (layer_step == 1 && (size/32) > (id/32 + 1)) {
+            if (isnan(weights_update_tmp) || isinf(weights_update_tmp)) {
+                weights_update_tmp = 0;
+            }
             float wu = warpAllReduceSum(weights_update_tmp);
             if (threadIdx.x % 32 == 0) {
                 if (!isnan(wu) && !isinf(wu))
@@ -997,13 +1014,18 @@ __global__ void backward_shortcut_multilayer_kernel(int size, int src_outputs, i
                 else if (weights_normalizion == SOFTMAX_NORMALIZATION) grad = w*(1 - w);
 
                 layer_delta[add_index] += delta_in[id] * w;
-                float weights_update_tmp = delta_in[id] * add[add_index] * grad / step;
+                float weights_update_tmp = delta_in[id] * add[add_index] * grad;// / step;
 
                 if (layer_step == 1 && (size / 32) > (id / 32 + 1)) {
+                    if (isnan(weights_update_tmp) || isinf(weights_update_tmp)) {
+                        weights_update_tmp = 0;
+                    }
                     float wu = warpAllReduceSum(weights_update_tmp);
                     if (threadIdx.x % 32 == 0) {
                         if (!isnan(wu) && !isinf(wu))
                             atomicAdd(&weight_updates_gpu[weights_index], wu);
+                        //if(weights_gpu[weights_index] != 1) printf(" wu = %f, weights_update_tmp = %f, w = %f, weights_gpu[weights_index] = %f, grad = %f, weights_normalizion = %d ",
+                        //    wu, weights_update_tmp, w, weights_gpu[weights_index], grad, weights_normalizion);
                     }
                 }
                 else {
@@ -1360,8 +1382,9 @@ __global__ void fix_nan_and_inf_kernel(float *input, size_t size)
     const int index = blockIdx.x*blockDim.x + threadIdx.x;
     if (index < size) {
         float val = input[index];
-        if (isnan(val) || isinf(val))
-            input[index] = 1.0f / index;  // pseudo random value
+        if (isnan(val) || isinf(val)) {
+            input[index] = 1.0f / (fabs((float)index) + 1);  // pseudo random value
+        }
     }
 }
 
@@ -1380,8 +1403,9 @@ __global__ void reset_nan_and_inf_kernel(float *input, size_t size)
     const int index = blockIdx.x*blockDim.x + threadIdx.x;
     if (index < size) {
         float val = input[index];
-        if (isnan(val) || isinf(val))
+        if (isnan(val) || isinf(val)) {
             input[index] = 0;
+        }
     }
 }
 
diff --git a/src/darknet.c b/src/darknet.c
index cba4ebcd..93d88b1b 100644
--- a/src/darknet.c
+++ b/src/darknet.c
@@ -172,7 +172,7 @@ void oneoff(char *cfgfile, char *weightfile, char *outfile)
 void partial(char *cfgfile, char *weightfile, char *outfile, int max)
 {
     gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
+    network net = parse_network_cfg_custom(cfgfile, 1, 1);
     if(weightfile){
         load_weights_upto(&net, weightfile, max);
     }
diff --git a/src/detector.c b/src/detector.c
index 536befb8..d403f38c 100644
--- a/src/detector.c
+++ b/src/detector.c
@@ -356,7 +356,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
             save_weights(net, buff);
         }
 
-        if (iteration >= (iter_save_last + 100) || iteration % 100 == 0) {
+        if (iteration >= (iter_save_last + 100) || (iteration % 100 == 0 && iteration > 1)) {
             iter_save_last = iteration;
 #ifdef GPU
             if (ngpus != 1) sync_nets(nets, ngpus, 0);
diff --git a/src/gaussian_yolo_layer.c b/src/gaussian_yolo_layer.c
index 101005ee..d3bd3f58 100644
--- a/src/gaussian_yolo_layer.c
+++ b/src/gaussian_yolo_layer.c
@@ -371,25 +371,20 @@ void delta_gaussian_yolo_class(float *output, float *delta, int index, int class
 {
     int n;
     if (delta[index]){
-        if (label_smooth_eps > 0) {
-            float out_val = output[index + stride*class_id] * (1 - label_smooth_eps) + 0.5*label_smooth_eps;
-            delta[index + stride*class_id] = 1 - out_val;
-        }
-        else {
-            delta[index + stride*class_id] = 1 - output[index + stride*class_id];
-        }
+        float y_true = 1;
+        if (label_smooth_eps) y_true = y_true *  (1 - label_smooth_eps) + 0.5*label_smooth_eps;
+        delta[index + stride*class_id] = y_true - output[index + stride*class_id];
+        //delta[index + stride*class_id] = 1 - output[index + stride*class_id];
+
         if (classes_multipliers) delta[index + stride*class_id] *= classes_multipliers[class_id];
         if(avg_cat) *avg_cat += output[index + stride*class_id];
         return;
     }
     for(n = 0; n < classes; ++n){
-        if (label_smooth_eps > 0) {
-            float out_val = output[index + stride*class_id] * (1 - label_smooth_eps) + 0.5*label_smooth_eps;
-            delta[index + stride*n] = ((n == class_id) ? 1 : 0) - out_val;
-        }
-        else {
-            delta[index + stride*n] = ((n == class_id) ? 1 : 0) - output[index + stride*n];
-        }
+        float y_true = ((n == class_id) ? 1 : 0);
+        if (label_smooth_eps) y_true = y_true *  (1 - label_smooth_eps) + 0.5*label_smooth_eps;
+        delta[index + stride*n] = y_true - output[index + stride*n];
+
         if (classes_multipliers && n == class_id) delta[index + stride*class_id] *= classes_multipliers[class_id];
         if(n == class_id && avg_cat) *avg_cat += output[index + stride*n];
     }
diff --git a/src/network.c b/src/network.c
index 9f587082..8f09dab7 100644
--- a/src/network.c
+++ b/src/network.c
@@ -1135,7 +1135,7 @@ void fuse_conv_batchnorm(network net)
                 //cuda_pull_array(l.weights_gpu, l.weights, l.nweights);
                 int i;
                 for (i = 0; i < l->nweights; ++i) printf(" w = %f,", l->weights[i]);
-                printf(" l->nweights = %d \n", l->nweights);
+                printf(" l->nweights = %d, j = %d \n", l->nweights, j);
             }
 
             // nweights - l.n or l.n*l.c or (l.n*l.c*l.h*l.w)
diff --git a/src/network_kernels.cu b/src/network_kernels.cu
index ba65ff04..75b3d6a2 100644
--- a/src/network_kernels.cu
+++ b/src/network_kernels.cu
@@ -169,7 +169,8 @@ void backward_network_gpu(network net, network_state state)
     for(i = net.n-1; i >= 0; --i){
         state.index = i;
         layer l = net.layers[i];
-        if (l.stopbackward) break;
+        if (l.stopbackward == 1) break;
+        if (l.stopbackward > get_current_iteration(net)) break;
         if(i == 0){
             state.input = original_input;
             state.delta = original_delta;
@@ -247,7 +248,8 @@ void update_network_gpu(network net)
         layer l = net.layers[i];
         l.t = get_current_batch(net);
         if (iteration_num > (net.max_batches * 1 / 2)) l.deform = 0;
-        if(l.update_gpu){
+        if (l.burnin_update && (l.burnin_update*net.burn_in > iteration_num)) continue;
+        if(l.update_gpu && l.dont_update < iteration_num){
             l.update_gpu(l, update_batch, rate, net.momentum, net.decay, net.loss_scale);
         }
     }
diff --git a/src/parser.c b/src/parser.c
index cd3f3d24..426e1481 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -1365,6 +1365,8 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps)
         l.clip = option_find_float_quiet(options, "clip", 0);
         l.dynamic_minibatch = net.dynamic_minibatch;
         l.onlyforward = option_find_int_quiet(options, "onlyforward", 0);
+        l.dont_update = option_find_int_quiet(options, "dont_update", 0);
+        l.burnin_update = option_find_int_quiet(options, "burnin_update", 0);
         l.stopbackward = option_find_int_quiet(options, "stopbackward", 0);
         l.dontload = option_find_int_quiet(options, "dontload", 0);
         l.dontloadscales = option_find_int_quiet(options, "dontloadscales", 0);
@@ -1556,8 +1558,14 @@ void save_shortcut_weights(layer l, FILE *fp)
 #ifdef GPU
     if (gpu_index >= 0) {
         pull_shortcut_layer(l);
+        printf("\n pull_shortcut_layer \n");
     }
 #endif
+    for (int i = 0; i < l.nweights; ++i) printf(" %f, ", l.weight_updates[i]);
+    printf(" l.nweights = %d - update \n", l.nweights);
+    for (int i = 0; i < l.nweights; ++i) printf(" %f, ", l.weights[i]);
+    printf(" l.nweights = %d \n\n", l.nweights);
+
     int num = l.nweights;
     fwrite(l.weights, sizeof(float), num, fp);
 }
@@ -1843,7 +1851,7 @@ void load_shortcut_weights(layer l, FILE *fp)
     read_bytes = fread(l.weights, sizeof(float), num, fp);
     if (read_bytes > 0 && read_bytes < num) printf("\n Warning: Unexpected end of wights-file! l.weights - l.index = %d \n", l.index);
     //for (int i = 0; i < l.nweights; ++i) printf(" %f, ", l.weights[i]);
-    //printf("\n\n");
+    //printf(" read_bytes = %d \n\n", read_bytes);
 #ifdef GPU
     if (gpu_index >= 0) {
         push_shortcut_layer(l);
diff --git a/src/shortcut_layer.c b/src/shortcut_layer.c
index fedb9d8d..3e2521b3 100644
--- a/src/shortcut_layer.c
+++ b/src/shortcut_layer.c
@@ -256,10 +256,24 @@ void update_shortcut_layer_gpu(layer l, int batch, float learning_rate_init, flo
         reset_nan_and_inf(l.weight_updates_gpu, l.nweights);
         fix_nan_and_inf(l.weights_gpu, l.nweights);
 
-        axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
+        constrain_weight_updates_ongpu(l.nweights, 0.001, l.weights_gpu, l.weight_updates_gpu);
+
+        /*
+        cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights);
+        cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, l.nweights);
+        CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
+        for (int i = 0; i < l.nweights; ++i) printf(" %f, ", l.weight_updates[i]);
+        printf(" l.nweights = %d - updates \n", l.nweights);
+        for (int i = 0; i < l.nweights; ++i) printf(" %f, ", l.weights[i]);
+        printf(" l.nweights = %d \n\n", l.nweights);
+        */
+
+        //axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
         axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
         scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1);
 
+        //fill_ongpu(l.nweights, 0, l.weight_updates_gpu, 1);
+
         //if (l.clip) {
         //    constrain_ongpu(l.nweights, l.clip, l.weights_gpu, 1);
         //}
@@ -268,6 +282,7 @@ void update_shortcut_layer_gpu(layer l, int batch, float learning_rate_init, flo
 
 void pull_shortcut_layer(layer l)
 {
+    cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, l.nweights);
     cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights);
     CHECK_CUDA(cudaPeekAtLastError());
     CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
diff --git a/src/yolo_layer.c b/src/yolo_layer.c
index 8d25cd57..31fe7249 100644
--- a/src/yolo_layer.c
+++ b/src/yolo_layer.c
@@ -257,13 +257,12 @@ void delta_yolo_class(float *output, float *delta, int index, int class_id, int
 {
     int n;
     if (delta[index + stride*class_id]){
-        if (label_smooth_eps > 0) {
-            float out_val = output[index + stride*class_id] * (1 - label_smooth_eps) + 0.5*label_smooth_eps;
-            delta[index + stride*class_id] = 1 - out_val;
-        }
-        else {
-            delta[index + stride*class_id] = 1 - output[index + stride*class_id];
-        }
+        float y_true = 1;
+        if(label_smooth_eps) y_true = y_true *  (1 - label_smooth_eps) + 0.5*label_smooth_eps;
+        float result_delta = y_true - output[index + stride*class_id];
+        if(!isnan(result_delta) && !isinf(result_delta)) delta[index + stride*class_id] = result_delta;
+        //delta[index + stride*class_id] = 1 - output[index + stride*class_id];
+
         if (classes_multipliers) delta[index + stride*class_id] *= classes_multipliers[class_id];
         if(avg_cat) *avg_cat += output[index + stride*class_id];
         return;
@@ -291,13 +290,11 @@ void delta_yolo_class(float *output, float *delta, int index, int class_id, int
     else {
         // default
         for (n = 0; n < classes; ++n) {
-            if (label_smooth_eps > 0) {
-                float out_val = output[index + stride*class_id] * (1 - label_smooth_eps) + 0.5*label_smooth_eps;
-                delta[index + stride*n] = ((n == class_id) ? 1 : 0) - out_val;
-            }
-            else {
-                delta[index + stride*n] = ((n == class_id) ? 1 : 0) - output[index + stride*n];
-            }
+            float y_true = ((n == class_id) ? 1 : 0);
+            if (label_smooth_eps) y_true = y_true *  (1 - label_smooth_eps) + 0.5*label_smooth_eps;
+            float result_delta = y_true - output[index + stride*n];
+            if (!isnan(result_delta) && !isinf(result_delta)) delta[index + stride*n] = result_delta;
+
             if (classes_multipliers && n == class_id) delta[index + stride*class_id] *= classes_multipliers[class_id];
             if (n == class_id && avg_cat) *avg_cat += output[index + stride*n];
         }
@@ -385,6 +382,7 @@ void forward_yolo_layer(const layer l, network_state state)
                         int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1);
                         int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4);
                         float objectness = l.output[obj_index];
+                        if (isnan(objectness) || isinf(objectness)) l.output[obj_index] = 0;
                         int class_id_match = compare_yolo_class(l.output, l.classes, class_index, l.w*l.h, objectness, class_id, 0.25f);
 
                         float iou = box_iou(pred, truth);