__global__ void shortcut_singlelayer_simple_kernel(int size, int src_outputs, int batch, int n, int *outputs_of_layers_gpu, float **layers_output_gpu, float *out, float *in, float *weights_gpu, int nweights, WEIGHTS_NORMALIZATION_T weights_normalizion)
{
const int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
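The only functional change in the hunks below is the swap from relu() to lrelu() in the RELU-normalization paths. A minimal sketch of the two device helpers as they presumably appear elsewhere in blas_kernels.cu (the exact eps floor is an assumption, not taken from this diff):

__device__ float relu(float src) {
    if (src > 0) return src;
    return 0;
}

// Clamps to a small positive floor instead of zero, so a normalization
// denominator built from these values can never vanish.
__device__ float lrelu(float src) {
    const float eps = 0.001f;  // assumed floor value
    if (src > eps) return src;
    return eps;
}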
@@ -852,7 +876,7 @@ __global__ void shortcut_multilayer_kernel(int size, int src_outputs, int batch,
for (int i = 0; i < (n + 1); ++i) {
const int weights_index = src_i / step + i*layer_step; // [0 or c or (c, h ,w)]
const float w = weights_gpu[weights_index];
- if (weights_normalizion == RELU_NORMALIZATION) sum += relu(w);
+ if (weights_normalizion == RELU_NORMALIZATION) sum += lrelu(w);
else if (weights_normalizion == SOFTMAX_NORMALIZATION) sum += expf(w - max_val);
}
}
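The loop above only accumulates the denominator sum; the normalized weight itself is formed in the later hunks. A per-element reference of what the two modes compute (illustrative sketch, not part of the patch; reuses the lrelu helper sketched earlier):

// RELU_NORMALIZATION:    w_norm[i] = lrelu(w[i]) / sum_j lrelu(w[j])
// SOFTMAX_NORMALIZATION: w_norm[i] = expf(w[i] - max_val) / sum_j expf(w[j] - max_val)
__device__ void normalize_weights_sketch(const float *w, float *w_norm, int n,
                                         WEIGHTS_NORMALIZATION_T kind, float max_val)
{
    float sum = 0;  // every term below is strictly positive, so no /0 guard is needed
    for (int i = 0; i < n; ++i)
        sum += (kind == RELU_NORMALIZATION) ? lrelu(w[i]) : expf(w[i] - max_val);
    for (int i = 0; i < n; ++i)
        w_norm[i] = (kind == RELU_NORMALIZATION) ? lrelu(w[i]) / sum
                                                 : expf(w[i] - max_val) / sum;
}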
@@ -861,7 +885,7 @@ __global__ void shortcut_multilayer_kernel(int size, int src_outputs, int batch,
if (weights_gpu) {
float w = weights_gpu[src_i / step];
- if (weights_normalizion == RELU_NORMALIZATION) w = relu(w) / sum;
+ if (weights_normalizion == RELU_NORMALIZATION) w = lrelu(w) / sum;
else if (weights_normalizion == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
out_val = in[id] * w; // [0 or c or (c, h ,w)]
@@ -879,7 +903,7 @@ __global__ void shortcut_multilayer_kernel(int size, int src_outputs, int batch,
if (weights_gpu) {
const int weights_index = src_i / step + (i + 1)*layer_step; // [0 or c or (c, h ,w)]
float w = weights_gpu[weights_index];
- if (weights_normalizion == RELU_NORMALIZATION) w = relu(w) / sum;
+ if (weights_normalizion == RELU_NORMALIZATION) w = lrelu(w) / sum;
else if (weights_normalizion == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
out_val += add[add_index] * w; // [0 or c or (c, h ,w)]
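Taken together, the forward hunks compute a weighted residual: the input tensor gets the first normalized weight and each routed source layer gets one of the remaining n. Schematically, per output element (illustrative names: w_norm stands for the inline lrelu/softmax normalization above, add_index for the per-layer mapping of id into that layer's output):

float out_val = in[id] * w_norm[0];
for (int i = 0; i < n; ++i)
    out_val += layers_output_gpu[i][add_index] * w_norm[i + 1];
out[id] = out_val;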
@@ -935,34 +959,27 @@ __global__ void backward_shortcut_multilayer_kernel(int size, int src_outputs, i
for (i = 0; i < (n + 1); ++i) {
const int weights_index = src_i / step + i*layer_step; // [0 or c or (c, h ,w)]
const float w = weights_gpu[weights_index];
- if (weights_normalizion == RELU_NORMALIZATION) sum += relu(w);
+ if (weights_normalizion == RELU_NORMALIZATION) sum += lrelu(w);
else if (weights_normalizion == SOFTMAX_NORMALIZATION) sum += expf(w - max_val);
}
/*
grad = 0;
for (i = 0; i < (n + 1); ++i) {
const int weights_index = src_i / step + i*layer_step; // [0 or c or (c, h ,w)]
const float delta_w = delta_in[id] * in[id];
const float w = weights_gpu[weights_index];
if (weights_normalizion == RELU_NORMALIZATION) grad += delta_w * relu(w) / sum;
else if (weights_normalizion == SOFTMAX_NORMALIZATION) grad += delta_w * expf(w - max_val) / sum;
}
*/
}
if (weights_gpu) {
float w = weights_gpu[src_i / step];
- if (weights_normalizion == RELU_NORMALIZATION) w = relu(w) / sum;
+ if (weights_normalizion == RELU_NORMALIZATION) w = lrelu(w) / sum;
else if (weights_normalizion == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
if (weights_normalizion == RELU_NORMALIZATION) grad = w;
else if (weights_normalizion == SOFTMAX_NORMALIZATION) grad = w*(1-w);
delta_out[id] += delta_in[id] * w; // [0 or c or (c, h ,w)]
float weights_update_tmp = delta_in[id] * in[id] * grad / step;
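The backward path approximates each weight's gradient with its per-weight (diagonal) term and drops the cross terms between weights: for softmax normalization this is the exact diagonal of the Jacobian, d(w_j)/d(x_j) = w_j * (1 - w_j), which is the grad = w*(1-w) line above, while the lrelu path simply reuses the normalized weight w as the gradient factor. Condensed into one place (illustrative, mirrors the kernel lines above):

// grad ~ d(normalized weight)/d(raw weight), diagonal term only;
// delta_out accumulates the gradient w.r.t. the layer input,
// weights_update_tmp the gradient w.r.t. the raw weight.
float grad = (weights_normalizion == SOFTMAX_NORMALIZATION) ? w * (1 - w) : w;
delta_out[id] += delta_in[id] * w;
float weights_update_tmp = delta_in[id] * in[id] * grad / step;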