diff --git a/README.md b/README.md index 632f8962..a00d8924 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Yolo-v3 and Yolo-v2 for Windows and Linux -### (neural network for object detection) - Tensor Cores can be used on [Linux](https://github.com/AlexeyAB/darknet#how-to-compile-on-linux) and [Windows](https://github.com/AlexeyAB/darknet#how-to-compile-on-windows-using-vcpkg) +### (neural network for object detection) - Tensor Cores can be used on [Linux](https://github.com/AlexeyAB/darknet#how-to-compile-on-linux) and [Windows](https://github.com/AlexeyAB/darknet#how-to-compile-on-windows-using-cmake-gui) More details: http://pjreddie.com/darknet/yolo/ @@ -8,7 +8,8 @@ More details: http://pjreddie.com/darknet/yolo/ [![TravisCI](https://travis-ci.org/AlexeyAB/darknet.svg?branch=master)](https://travis-ci.org/AlexeyAB/darknet) [![AppveyorCI](https://ci.appveyor.com/api/projects/status/594bwb5uoc1fxwiu/branch/master?svg=true)](https://ci.appveyor.com/project/AlexeyAB/darknet/branch/master) [![Contributors](https://img.shields.io/github/contributors/AlexeyAB/Darknet.svg)](https://github.com/AlexeyAB/darknet/graphs/contributors) -[![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)](https://github.com/AlexeyAB/darknet/blob/master/LICENSE) +[![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)](https://github.com/AlexeyAB/darknet/blob/master/LICENSE) +[![DOI](https://zenodo.org/badge/75388965.svg)](https://zenodo.org/badge/latestdoi/75388965) * [Requirements (and how to install dependencies)](#requirements) @@ -395,7 +396,7 @@ Training Yolo v3: * https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L610 * https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L696 * https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L783 - * change [`filters=255`] to filters=(classes + 5)x3 in the 3 `[convolutional]` before each `[yolo]` layer + * change [`filters=255`] to filters=(classes + 5)x3 in the 3 `[convolutional]` before each `[yolo]` layer; keep in mind that only the last `[convolutional]` before each `[yolo]` layer needs this change.
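  * for example, if classes=1 the filters value is (1 + 5)x3 = 18; if classes=2 it is 21; the default classes=80 gives the original filters=255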
* https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L603 * https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L689 * https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L776 diff --git a/include/darknet.h b/include/darknet.h index a13168da..ecc06402 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -34,6 +34,8 @@ #define SECRET_NUM -1234 +typedef enum { UNUSED_DEF_VAL } UNUSED_ENUM_TYPE; + #ifdef GPU #include @@ -42,8 +44,8 @@ #ifdef CUDNN #include -#endif -#endif +#endif // CUDNN +#endif // GPU #ifdef __cplusplus extern "C" { @@ -209,13 +211,14 @@ struct layer { void(*update) (struct layer, int, float, float, float); void(*forward_gpu) (struct layer, struct network_state); void(*backward_gpu) (struct layer, struct network_state); - void(*update_gpu) (struct layer, int, float, float, float); + void(*update_gpu) (struct layer, int, float, float, float, float); layer *share_layer; int train; int avgpool; int batch_normalize; int shortcut; int batch; + int dynamic_minibatch; int forced; int flipped; int inputs; @@ -321,6 +324,8 @@ struct layer { int onlyforward; int stopbackward; + int dont_update; + int burnin_update; int dontload; int dontsave; int dontloadscales; @@ -495,7 +500,7 @@ struct layer { size_t workspace_size; -#ifdef GPU +//#ifdef GPU int *indexes_gpu; float *z_gpu; @@ -610,8 +615,21 @@ struct layer { cudnnConvolutionBwdDataAlgo_t bd_algo, bd_algo16; cudnnConvolutionBwdFilterAlgo_t bf_algo, bf_algo16; cudnnPoolingDescriptor_t poolingDesc; +#else // CUDNN + void* srcTensorDesc, *dstTensorDesc; + void* srcTensorDesc16, *dstTensorDesc16; + void* dsrcTensorDesc, *ddstTensorDesc; + void* dsrcTensorDesc16, *ddstTensorDesc16; + void* normTensorDesc, *normDstTensorDesc, *normDstTensorDescF16; + void* weightDesc, *weightDesc16; + void* dweightDesc, *dweightDesc16; + void* convDesc; + UNUSED_ENUM_TYPE fw_algo, fw_algo16; + UNUSED_ENUM_TYPE bd_algo, bd_algo16; + UNUSED_ENUM_TYPE bf_algo, bf_algo16; + void* poolingDesc; #endif // CUDNN -#endif // GPU +//#endif // GPU }; @@ -625,6 +643,8 @@ typedef struct network { int n; int batch; uint64_t *seen; + int *cur_iteration; + float loss_scale; int *t; float epoch; int subdivisions; @@ -701,7 +721,7 @@ typedef struct network { float *cost; float clip; -#ifdef GPU +//#ifdef GPU //float *input_gpu; //float *truth_gpu; float *delta_gpu; @@ -722,8 +742,9 @@ typedef struct network { float *global_delta_gpu; float *state_delta_gpu; size_t max_delta_gpu_size; -#endif +//#endif // GPU int optimized_memory; + int dynamic_minibatch; size_t workspace_size_limit; } network; diff --git a/src/activation_kernels.cu b/src/activation_kernels.cu index c7cd49f2..ef2ddbe4 100644 --- a/src/activation_kernels.cu +++ b/src/activation_kernels.cu @@ -493,6 +493,7 @@ __global__ void activate_array_normalize_channels_softmax_kernel(float *x, int s for (k = 0; k < channels; ++k) { float val = x[wh_i + k * wh_step + b*wh_step*channels]; val = expf(val - max_val) / sum; + if (isnan(val) || isinf(val)) val = 0; output_gpu[wh_i + k * wh_step + b*wh_step*channels] = val; } } @@ -535,6 +536,7 @@ __global__ void gradient_array_normalize_channels_softmax_kernel(float *x, int s float delta = delta_gpu[index]; float grad = x[index] * (1 - x[index]); delta = delta * grad; + if (isnan(delta) || isinf(delta)) delta = 0; delta_gpu[index] = delta; } } diff --git a/src/batchnorm_layer.c b/src/batchnorm_layer.c index 7d9b172d..7432434c 100644 --- 
a/src/batchnorm_layer.c +++ b/src/batchnorm_layer.c @@ -258,15 +258,17 @@ void forward_batchnorm_layer_gpu(layer l, network_state state) fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu); //fast_v_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.v_cbn_gpu); - int minibatch_index = state.net.current_subdivision + 1; - float alpha = 0.01; + const int minibatch_index = state.net.current_subdivision + 1; + const int max_minibatch_index = state.net.subdivisions; + //printf("\n minibatch_index = %d, max_minibatch_index = %d \n", minibatch_index, max_minibatch_index); + const float alpha = 0.01; int inverse_variance = 0; #ifdef CUDNN inverse_variance = 1; #endif // CUDNN - fast_v_cbn_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, minibatch_index, l.m_cbn_avg_gpu, l.v_cbn_avg_gpu, l.variance_gpu, + fast_v_cbn_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, minibatch_index, max_minibatch_index, l.m_cbn_avg_gpu, l.v_cbn_avg_gpu, l.variance_gpu, alpha, l.rolling_mean_gpu, l.rolling_variance_gpu, inverse_variance, .00001); normalize_scale_bias_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.scales_gpu, l.biases_gpu, l.batch, l.out_c, l.out_h*l.out_w, inverse_variance, .00001f); @@ -385,9 +387,9 @@ void backward_batchnorm_layer_gpu(layer l, network_state state) } } -void update_batchnorm_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay) +void update_batchnorm_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale) { - float learning_rate = learning_rate_init*l.learning_rate_scale; + float learning_rate = learning_rate_init * l.learning_rate_scale / loss_scale; //float momentum = a.momentum; //float decay = a.decay; //int batch = a.batch; diff --git a/src/batchnorm_layer.h b/src/batchnorm_layer.h index f5137465..afdc54b7 100644 --- a/src/batchnorm_layer.h +++ b/src/batchnorm_layer.h @@ -18,7 +18,7 @@ void resize_batchnorm_layer(layer *l, int w, int h); #ifdef GPU void forward_batchnorm_layer_gpu(layer l, network_state state); void backward_batchnorm_layer_gpu(layer l, network_state state); -void update_batchnorm_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay); +void update_batchnorm_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale); void pull_batchnorm_layer(layer l); void push_batchnorm_layer(layer l); #endif diff --git a/src/blas.h b/src/blas.h index 252497a8..2f914516 100644 --- a/src/blas.h +++ b/src/blas.h @@ -60,6 +60,7 @@ void fix_nan_and_inf_cpu(float *input, size_t size); #ifdef GPU +void constrain_weight_updates_ongpu(int N, float coef, float *weights_gpu, float *weight_updates_gpu); void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY); void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY); void simple_copy_ongpu(int size, float *src, float *dst); @@ -87,7 +88,7 @@ void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *varianc void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean); void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance); -void fast_v_cbn_gpu(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, float *m_avg, float *v_avg, float *variance, +void fast_v_cbn_gpu(const float *x, float *mean, int batch, int filters, int spatial, int 
minibatch_index, int max_minibatch_index, float *m_avg, float *v_avg, float *variance, const float alpha, float *rolling_mean_gpu, float *rolling_variance_gpu, int inverse_variance, float epsilon); void normalize_scale_bias_gpu(float *x, float *mean, float *variance, float *scales, float *biases, int batch, int filters, int spatial, int inverse_variance, float epsilon); void compare_2_arrays_gpu(float *one, float *two, int size); diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu index d7ad75ea..a35177b5 100644 --- a/src/blas_kernels.cu +++ b/src/blas_kernels.cu @@ -418,6 +418,24 @@ __global__ void reorg_kernel(int N, float *x, int w, int h, int c, int batch, in //else out[0] = x[0]; } +__global__ void constrain_weight_updates_kernel(int N, float coef, float *weights_gpu, float *weight_updates_gpu) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if (i < N) { + const float w = weights_gpu[i]; + const float wu = weight_updates_gpu[i]; + const float wu_sign = (wu == 0) ? 0 : (fabs(wu) / wu); + const float abs_limit = fabs(w * coef); + if (fabs(wu) > abs_limit) weight_updates_gpu[i] = abs_limit * wu_sign; + } +} + +extern "C" void constrain_weight_updates_ongpu(int N, float coef, float *weights_gpu, float *weight_updates_gpu) +{ + constrain_weight_updates_kernel << > >(N, coef, weights_gpu, weight_updates_gpu); + CHECK_CUDA(cudaPeekAtLastError()); +} + __global__ void axpy_kernel(int N, float ALPHA, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY) { int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; @@ -572,7 +590,7 @@ extern "C" void fast_variance_gpu(float *x, float *mean, int batch, int filters, } -__global__ void fast_v_cbn_kernel(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, float *m_avg, float *v_avg, float *variance, +__global__ void fast_v_cbn_kernel(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, int max_minibatch_index, float *m_avg, float *v_avg, float *variance, const float alpha, float *rolling_mean_gpu, float *rolling_variance_gpu, int inverse_variance, float epsilon) { const int threads = BLOCK; @@ -615,16 +633,19 @@ __global__ void fast_v_cbn_kernel(const float *x, float *mean, int batch, int f if (inverse_variance) variance[filter] = 1.0f / sqrtf(variance_tmp + epsilon); else variance[filter] = variance_tmp; - rolling_mean_gpu[filter] = alpha * mean[filter] + (1 - alpha) * rolling_mean_gpu[filter]; + //if (max_minibatch_index == minibatch_index) + { + rolling_mean_gpu[filter] = alpha * mean[filter] + (1 - alpha) * rolling_mean_gpu[filter]; - rolling_variance_gpu[filter] = alpha * variance_tmp + (1 - alpha) * rolling_variance_gpu[filter]; + rolling_variance_gpu[filter] = alpha * variance_tmp + (1 - alpha) * rolling_variance_gpu[filter]; + } } } -extern "C" void fast_v_cbn_gpu(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, float *m_avg, float *v_avg, float *variance, +extern "C" void fast_v_cbn_gpu(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, int max_minibatch_index, float *m_avg, float *v_avg, float *variance, const float alpha, float *rolling_mean_gpu, float *rolling_variance_gpu, int inverse_variance, float epsilon) { - fast_v_cbn_kernel << > >(x, mean, batch, filters, spatial, minibatch_index, m_avg, v_avg, variance, alpha, rolling_mean_gpu, rolling_variance_gpu, inverse_variance, epsilon); + fast_v_cbn_kernel << > >(x, mean, batch, filters, spatial, 
minibatch_index, max_minibatch_index, m_avg, v_avg, variance, alpha, rolling_mean_gpu, rolling_variance_gpu, inverse_variance, epsilon); CHECK_CUDA(cudaPeekAtLastError()); } @@ -798,6 +819,12 @@ __device__ float relu(float src) { return 0; } +__device__ float lrelu(float src) { + const float eps = 0.001; + if (src > eps) return src; + return eps; +} + __global__ void shortcut_singlelayer_simple_kernel(int size, int src_outputs, int batch, int n, int *outputs_of_layers_gpu, float **layers_output_gpu, float *out, float *in, float *weights_gpu, int nweights, WEIGHTS_NORMALIZATION_T weights_normalizion) { const int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; @@ -849,7 +876,7 @@ __global__ void shortcut_multilayer_kernel(int size, int src_outputs, int batch, for (int i = 0; i < (n + 1); ++i) { const int weights_index = src_i / step + i*layer_step; // [0 or c or (c, h ,w)] const float w = weights_gpu[weights_index]; - if (weights_normalizion == RELU_NORMALIZATION) sum += relu(w); + if (weights_normalizion == RELU_NORMALIZATION) sum += lrelu(w); else if (weights_normalizion == SOFTMAX_NORMALIZATION) sum += expf(w - max_val); } } @@ -858,7 +885,7 @@ __global__ void shortcut_multilayer_kernel(int size, int src_outputs, int batch, if (weights_gpu) { float w = weights_gpu[src_i / step]; - if (weights_normalizion == RELU_NORMALIZATION) w = relu(w) / sum; + if (weights_normalizion == RELU_NORMALIZATION) w = lrelu(w) / sum; else if (weights_normalizion == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum; out_val = in[id] * w; // [0 or c or (c, h ,w)] @@ -876,7 +903,7 @@ __global__ void shortcut_multilayer_kernel(int size, int src_outputs, int batch, if (weights_gpu) { const int weights_index = src_i / step + (i + 1)*layer_step; // [0 or c or (c, h ,w)] float w = weights_gpu[weights_index]; - if (weights_normalizion == RELU_NORMALIZATION) w = relu(w) / sum; + if (weights_normalizion == RELU_NORMALIZATION) w = lrelu(w) / sum; else if (weights_normalizion == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum; out_val += add[add_index] * w; // [0 or c or (c, h ,w)] @@ -932,34 +959,27 @@ __global__ void backward_shortcut_multilayer_kernel(int size, int src_outputs, i for (i = 0; i < (n + 1); ++i) { const int weights_index = src_i / step + i*layer_step; // [0 or c or (c, h ,w)] const float w = weights_gpu[weights_index]; - if (weights_normalizion == RELU_NORMALIZATION) sum += relu(w); + if (weights_normalizion == RELU_NORMALIZATION) sum += lrelu(w); else if (weights_normalizion == SOFTMAX_NORMALIZATION) sum += expf(w - max_val); } - /* - grad = 0; - for (i = 0; i < (n + 1); ++i) { - const int weights_index = src_i / step + i*layer_step; // [0 or c or (c, h ,w)] - const float delta_w = delta_in[id] * in[id]; - const float w = weights_gpu[weights_index]; - if (weights_normalizion == RELU_NORMALIZATION) grad += delta_w * relu(w) / sum; - else if (weights_normalizion == SOFTMAX_NORMALIZATION) grad += delta_w * expf(w - max_val) / sum; - } - */ } if (weights_gpu) { float w = weights_gpu[src_i / step]; - if (weights_normalizion == RELU_NORMALIZATION) w = relu(w) / sum; + if (weights_normalizion == RELU_NORMALIZATION) w = lrelu(w) / sum; else if (weights_normalizion == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum; if (weights_normalizion == RELU_NORMALIZATION) grad = w; else if (weights_normalizion == SOFTMAX_NORMALIZATION) grad = w*(1-w); delta_out[id] += delta_in[id] * w; // [0 or c or (c, h ,w)] - float weights_update_tmp = delta_in[id] * in[id] * grad; + float weights_update_tmp 
= delta_in[id] * in[id] * grad;// / step; if (layer_step == 1 && (size/32) > (id/32 + 1)) { + if (isnan(weights_update_tmp) || isinf(weights_update_tmp)) { + weights_update_tmp = 0; + } float wu = warpAllReduceSum(weights_update_tmp); if (threadIdx.x % 32 == 0) { if (!isnan(wu) && !isinf(wu)) @@ -994,13 +1014,18 @@ __global__ void backward_shortcut_multilayer_kernel(int size, int src_outputs, i else if (weights_normalizion == SOFTMAX_NORMALIZATION) grad = w*(1 - w); layer_delta[add_index] += delta_in[id] * w; - float weights_update_tmp = delta_in[id] * add[add_index] * grad; + float weights_update_tmp = delta_in[id] * add[add_index] * grad;// / step; if (layer_step == 1 && (size / 32) > (id / 32 + 1)) { + if (isnan(weights_update_tmp) || isinf(weights_update_tmp)) { + weights_update_tmp = 0; + } float wu = warpAllReduceSum(weights_update_tmp); if (threadIdx.x % 32 == 0) { if (!isnan(wu) && !isinf(wu)) atomicAdd(&weight_updates_gpu[weights_index], wu); + //if(weights_gpu[weights_index] != 1) printf(" wu = %f, weights_update_tmp = %f, w = %f, weights_gpu[weights_index] = %f, grad = %f, weights_normalizion = %d ", + // wu, weights_update_tmp, w, weights_gpu[weights_index], grad, weights_normalizion); } } else { @@ -1357,8 +1382,9 @@ __global__ void fix_nan_and_inf_kernel(float *input, size_t size) const int index = blockIdx.x*blockDim.x + threadIdx.x; if (index < size) { float val = input[index]; - if (isnan(val) || isinf(val)) - input[index] = 1.0f / index; // pseudo random value + if (isnan(val) || isinf(val)) { + input[index] = 1.0f / (fabs((float)index) + 1); // pseudo random value + } } } @@ -1377,8 +1403,9 @@ __global__ void reset_nan_and_inf_kernel(float *input, size_t size) const int index = blockIdx.x*blockDim.x + threadIdx.x; if (index < size) { float val = input[index]; - if (isnan(val) || isinf(val)) + if (isnan(val) || isinf(val)) { input[index] = 0; + } } } diff --git a/src/classifier.c b/src/classifier.c index 4cfbedb0..a0c5f734 100644 --- a/src/classifier.c +++ b/src/classifier.c @@ -48,7 +48,10 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus, if(weightfile){ load_weights(&nets[i], weightfile); } - if(clear) *nets[i].seen = 0; + if (clear) { + *nets[i].seen = 0; + *nets[i].cur_iteration = 0; + } nets[i].learning_rate *= ngpus; } srand(time(0)); @@ -166,8 +169,8 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus, calc_topk_for_each = fmax(calc_topk_for_each, 100); if (i % 10 == 0) { if (calc_topk) { - fprintf(stderr, "\n (next TOP5 calculation at %d iterations) ", calc_topk_for_each); - if (topk > 0) fprintf(stderr, " Last accuracy TOP5 = %2.2f %% \n", topk * 100); + fprintf(stderr, "\n (next TOP%d calculation at %d iterations) ", topk_data, calc_topk_for_each); + if (topk > 0) fprintf(stderr, " Last accuracy TOP%d = %2.2f %% \n", topk_data, topk * 100); } if (net.cudnn_half) { @@ -179,7 +182,7 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int draw_precision = 0; if (calc_topk && (i >= calc_topk_for_each || i == net.max_batches)) { iter_topk = i; - topk = validate_classifier_single(datacfg, cfgfile, weightfile, &net, topk_data); // calc TOP5 + topk = validate_classifier_single(datacfg, cfgfile, weightfile, &net, topk_data); // calc TOP-n printf("\n accuracy %s = %f \n", topk_buff, topk); draw_precision = 1; } @@ -767,13 +770,14 @@ void try_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filena float *predictions = network_predict(net, X); layer l = 
net.layers[layer_num]; - for(int i = 0; i < l.c; ++i){ + int i; + for(i = 0; i < l.c; ++i){ if(l.rolling_mean) printf("%f %f %f\n", l.rolling_mean[i], l.rolling_variance[i], l.scales[i]); } #ifdef GPU cuda_pull_array(l.output_gpu, l.output, l.outputs); #endif - for(int i = 0; i < l.outputs; ++i){ + for(i = 0; i < l.outputs; ++i){ printf("%f\n", l.output[i]); } /* @@ -791,7 +795,7 @@ void try_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filena top_predictions(net, top, indexes); printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); - for(int i = 0; i < top; ++i){ + for(i = 0; i < top; ++i){ int index = indexes[i]; printf("%s: %f\n", names[index], predictions[index]); } diff --git a/src/connected_layer.c b/src/connected_layer.c index 08a16aaa..25a5ffa1 100644 --- a/src/connected_layer.c +++ b/src/connected_layer.c @@ -308,8 +308,17 @@ void push_connected_layer(connected_layer l) CHECK_CUDA(cudaPeekAtLastError()); } -void update_connected_layer_gpu(connected_layer l, int batch, float learning_rate, float momentum, float decay) +void update_connected_layer_gpu(connected_layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale) { + float learning_rate = learning_rate_init * l.learning_rate_scale; + + // Loss scale for Mixed-Precision on Tensor-Cores + if (loss_scale != 1.0) { + scal_ongpu(l.inputs*l.outputs, 1.0 / loss_scale, l.weight_updates_gpu, 1); + scal_ongpu(l.outputs, 1.0 / loss_scale, l.bias_updates_gpu, 1); + scal_ongpu(l.outputs, 1.0 / loss_scale, l.scale_updates_gpu, 1); + } + axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1); diff --git a/src/connected_layer.h b/src/connected_layer.h index 80805cc2..284c5125 100644 --- a/src/connected_layer.h +++ b/src/connected_layer.h @@ -22,7 +22,7 @@ void statistics_connected_layer(layer l); #ifdef GPU void forward_connected_layer_gpu(connected_layer layer, network_state state); void backward_connected_layer_gpu(connected_layer layer, network_state state); -void update_connected_layer_gpu(connected_layer layer, int batch, float learning_rate, float momentum, float decay); +void update_connected_layer_gpu(connected_layer layer, int batch, float learning_rate, float momentum, float decay, float loss_scale); void push_connected_layer(connected_layer layer); void pull_connected_layer(connected_layer layer); #endif diff --git a/src/conv_lstm_layer.c b/src/conv_lstm_layer.c index bc53848a..cb7e3fe1 100644 --- a/src/conv_lstm_layer.c +++ b/src/conv_lstm_layer.c @@ -791,21 +791,21 @@ void push_conv_lstm_layer(layer l) push_convolutional_layer(*(l.uo)); } -void update_conv_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) +void update_conv_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale) { if (l.peephole) { - update_convolutional_layer_gpu(*(l.vf), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.vi), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.vo), batch, learning_rate, momentum, decay); + update_convolutional_layer_gpu(*(l.vf), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.vi), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.vo), batch, learning_rate, momentum, decay, loss_scale); } - update_convolutional_layer_gpu(*(l.wf), batch, learning_rate, momentum, decay); - 
update_convolutional_layer_gpu(*(l.wi), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.wg), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.wo), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.uf), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.ui), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.ug), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.uo), batch, learning_rate, momentum, decay); + update_convolutional_layer_gpu(*(l.wf), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.wi), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.wg), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.wo), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.uf), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.ui), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.ug), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.uo), batch, learning_rate, momentum, decay, loss_scale); } void forward_conv_lstm_layer_gpu(layer l, network_state state) diff --git a/src/conv_lstm_layer.h b/src/conv_lstm_layer.h index 0a0438b0..14232cab 100644 --- a/src/conv_lstm_layer.h +++ b/src/conv_lstm_layer.h @@ -23,7 +23,7 @@ void update_conv_lstm_layer(layer l, int batch, float learning_rate, float momen #ifdef GPU void forward_conv_lstm_layer_gpu(layer l, network_state state); void backward_conv_lstm_layer_gpu(layer l, network_state state); -void update_conv_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay); +void update_conv_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale); #endif #ifdef __cplusplus diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index 00d3961c..3b4a343a 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -419,8 +419,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) //#ifdef CUDNN_HALF //if (state.use_mixed_precision) { - int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); - if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in) && + int iteration_num = get_current_iteration(state.net); // (*state.net.seen) / (state.net.batch*state.net.subdivisions); + if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3 * state.net.burn_in) && (l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && !state.train && l.groups <= 1 && l.size > 1) { //printf("\n CUDNN_HALF!!! 
state.index = %d \n", state.index); @@ -671,8 +671,8 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state float alpha = 1, beta = 0; //#ifdef CUDNN_HALF - int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); - if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in) && + int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions); + if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3 * state.net.burn_in) && (l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && !state.train && l.groups <= 1 && l.size > 1) { const size_t input16_size = l.batch*l.c*l.w*l.h; @@ -717,7 +717,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state l.normDstTensorDescF16, delta16, // input l.normDstTensorDescF16, - l.x_norm_gpu, // output (new delta) + l.output_gpu, //l.x_norm_gpu, // output (new delta) l.normTensorDesc, l.scales_gpu, // input (should be FP32) l.scale_updates_gpu, // output (should be FP32) @@ -726,7 +726,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state l.mean_gpu, // input (should be FP32) l.variance_gpu)); // input (should be FP32) - simple_copy_ongpu(l.outputs*l.batch / 2, l.x_norm_gpu, delta16); + simple_copy_ongpu(l.outputs*l.batch / 2, l.output_gpu, delta16); //copy_ongpu(l.outputs*l.batch / 2, l.x_norm_gpu, 1, delta16, 1); //cudaMemcpyAsync(delta16, l.x_norm_gpu, l.outputs*l.batch * sizeof(half), cudaMemcpyDefault, get_cuda_stream()); } @@ -978,7 +978,7 @@ void assisted_activation2_gpu(float alpha, float *output, float *gt_gpu, float * void assisted_excitation_forward_gpu(convolutional_layer l, network_state state) { - const int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); + const int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions); // epoch //const float epoch = (float)(*state.net.seen) / state.net.train_images_num; @@ -1188,7 +1188,7 @@ void push_convolutional_layer(convolutional_layer l) CHECK_CUDA(cudaPeekAtLastError()); } -void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay) +void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale) { /* @@ -1232,6 +1232,13 @@ void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init //float decay = a.decay; //int batch = a.batch; + // Loss scale for Mixed-Precision on Tensor-Cores + if (loss_scale != 1.0) { + if (l.weight_updates_gpu && l.nweights > 0) scal_ongpu(l.nweights, 1.0 / loss_scale, l.weight_updates_gpu, 1); + if (l.bias_updates_gpu && l.n > 0) scal_ongpu(l.n, 1.0 / loss_scale, l.bias_updates_gpu, 1); + if (l.scale_updates_gpu && l.n > 0) scal_ongpu(l.n, 1.0 / loss_scale, l.scale_updates_gpu, 1); + } + reset_nan_and_inf(l.weight_updates_gpu, l.nweights); fix_nan_and_inf(l.weights_gpu, l.nweights); @@ -1280,9 +1287,9 @@ void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init //} } - //if (l.clip) { - // constrain_gpu(l.nweights, l.clip, l.weights_gpu, 1); - //} + if (l.clip) { + constrain_ongpu(l.nweights, l.clip, l.weights_gpu, 1); + } } /* diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index f6c4d3dc..0e0f614e 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ 
-786,7 +786,7 @@ void resize_convolutional_layer(convolutional_layer *l, int w, int h) if (l->activation == SWISH || l->activation == MISH) l->activation_input = (float*)realloc(l->activation_input, total_batch*l->outputs * sizeof(float)); #ifdef GPU - if (old_w < w || old_h < h) { + if (old_w < w || old_h < h || l->dynamic_minibatch) { if (l->train) { cuda_free(l->delta_gpu); l->delta_gpu = cuda_make_array(l->delta, total_batch*l->outputs); diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h index 18135937..e83ca873 100644 --- a/src/convolutional_layer.h +++ b/src/convolutional_layer.h @@ -15,7 +15,7 @@ extern "C" { #ifdef GPU void forward_convolutional_layer_gpu(convolutional_layer layer, network_state state); void backward_convolutional_layer_gpu(convolutional_layer layer, network_state state); -void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay); +void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay, float loss_scale); void push_convolutional_layer(convolutional_layer layer); void pull_convolutional_layer(convolutional_layer layer); diff --git a/src/crnn_layer.c b/src/crnn_layer.c index 8b9ed2f9..84646b4a 100644 --- a/src/crnn_layer.c +++ b/src/crnn_layer.c @@ -265,11 +265,11 @@ void push_crnn_layer(layer l) push_convolutional_layer(*(l.output_layer)); } -void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) +void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale) { - update_convolutional_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay); + update_convolutional_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay, loss_scale); } void forward_crnn_layer_gpu(layer l, network_state state) diff --git a/src/crnn_layer.h b/src/crnn_layer.h index c2dc7a25..b85df6ff 100644 --- a/src/crnn_layer.h +++ b/src/crnn_layer.h @@ -20,7 +20,7 @@ void update_crnn_layer(layer l, int batch, float learning_rate, float momentum, #ifdef GPU void forward_crnn_layer_gpu(layer l, network_state state); void backward_crnn_layer_gpu(layer l, network_state state); -void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay); +void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale); void push_crnn_layer(layer l); void pull_crnn_layer(layer l); #endif diff --git a/src/dark_cuda.c b/src/dark_cuda.c index e51617af..e7594309 100644 --- a/src/dark_cuda.c +++ b/src/dark_cuda.c @@ -1,6 +1,7 @@ #ifdef __cplusplus extern "C" { #endif +int cuda_debug_sync = 0; int gpu_index = 0; #ifdef __cplusplus } @@ -19,6 +20,7 @@ int gpu_index = 0; #pragma comment(lib, "cuda.lib") + #ifdef CUDNN #ifndef USE_CMAKE_LIBS #pragma comment(lib, "cudnn.lib") @@ -29,6 +31,7 @@ int gpu_index = 0; #error "If you set CUDNN_HALF=1 then you must set CUDNN=1" #endif + void cuda_set_device(int n) { gpu_index = n; @@ -86,10 +89,13 @@ void check_error_extended(cudaError_t status, const char *file, 
int line, const check_error(status); } #if defined(DEBUG) || defined(CUDA_DEBUG) - status = cudaDeviceSynchronize(); - if (status != cudaSuccess) - printf("CUDA status = cudaDeviceSynchronize() Error: file: %s() : line: %d : build time: %s \n", file, line, date_time); + cuda_debug_sync = 1; #endif + if (cuda_debug_sync) { + status = cudaDeviceSynchronize(); + if (status != cudaSuccess) + printf("CUDA status = cudaDeviceSynchronize() Error: file: %s() : line: %d : build time: %s \n", file, line, date_time); + } check_error(status); } @@ -173,6 +179,9 @@ void cudnn_check_error(cudnnStatus_t status) #if defined(DEBUG) || defined(CUDA_DEBUG) cudaDeviceSynchronize(); #endif + if (cuda_debug_sync) { + cudaDeviceSynchronize(); + } cudnnStatus_t status2 = CUDNN_STATUS_SUCCESS; #ifdef CUDNN_ERRQUERY_RAWCODE cudnnStatus_t status_tmp = cudnnQueryRuntimeError(cudnn_handle(), &status2, CUDNN_ERRQUERY_RAWCODE, NULL); @@ -208,10 +217,13 @@ void cudnn_check_error_extended(cudnnStatus_t status, const char *file, int line cudnn_check_error(status); } #if defined(DEBUG) || defined(CUDA_DEBUG) - status = cudaDeviceSynchronize(); - if (status != CUDNN_STATUS_SUCCESS) - printf("\n cuDNN status = cudaDeviceSynchronize() Error in: file: %s() : line: %d : build time: %s \n", file, line, date_time); + cuda_debug_sync = 1; #endif + if (cuda_debug_sync) { + cudaError_t status = cudaDeviceSynchronize(); + if (status != CUDNN_STATUS_SUCCESS) + printf("\n cudaError_t status = cudaDeviceSynchronize() Error in: file: %s() : line: %d : build time: %s \n", file, line, date_time); + } cudnn_check_error(status); } #endif diff --git a/src/dark_cuda.h b/src/dark_cuda.h index 4798d729..5f6978ae 100644 --- a/src/dark_cuda.h +++ b/src/dark_cuda.h @@ -6,6 +6,8 @@ extern "C" { #endif + +extern int cuda_debug_sync; extern int gpu_index; #ifdef __cplusplus } diff --git a/src/darknet.c b/src/darknet.c index ec47b05a..93d88b1b 100644 --- a/src/darknet.c +++ b/src/darknet.c @@ -165,17 +165,19 @@ void oneoff(char *cfgfile, char *weightfile, char *outfile) copy_cpu(l.n/3*l.c, l.weights, 1, l.weights + l.n/3*l.c, 1); copy_cpu(l.n/3*l.c, l.weights, 1, l.weights + 2*l.n/3*l.c, 1); *net.seen = 0; + *net.cur_iteration = 0; save_weights(net, outfile); } void partial(char *cfgfile, char *weightfile, char *outfile, int max) { gpu_index = -1; - network net = parse_network_cfg(cfgfile); + network net = parse_network_cfg_custom(cfgfile, 1, 1); if(weightfile){ load_weights_upto(&net, weightfile, max); } *net.seen = 0; + *net.cur_iteration = 0; save_weights_upto(net, outfile, max); } @@ -455,15 +457,22 @@ int main(int argc, char **argv) #ifndef GPU gpu_index = -1; + printf(" GPU isn't used \n"); init_cpu(); -#else +#else // GPU if(gpu_index >= 0){ cuda_set_device(gpu_index); CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); } show_cuda_cudnn_info(); -#endif + cuda_debug_sync = find_arg(argc, argv, "-cuda_debug_sync"); + +#ifdef CUDNN_HALF + printf(" CUDNN_HALF=1 \n"); +#endif // CUDNN_HALF + +#endif // GPU show_opencv_info(); diff --git a/src/data.c b/src/data.c index 9e564e2c..a103cc5a 100644 --- a/src/data.c +++ b/src/data.c @@ -8,6 +8,8 @@ #include #include +extern int check_mistakes; + #define NUMCHARS 37 pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; @@ -178,7 +180,6 @@ matrix load_image_augment_paths(char **paths, int n, int use_flip, int min, int return X; } -extern int check_mistakes; box_label *read_boxes(char *filename, int *n) { @@ -950,7 +951,10 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, 
int bo const int random_index = random_gen(); c = c ? c : 3; - assert(use_mixup != 2); + if (use_mixup == 2) { + printf("\n cutmix=1 - isn't supported for Detector \n"); + exit(0); + } if (use_mixup == 3 && letter_box) { printf("\n Combination: letter_box=1 & mosaic=1 - isn't supported, use only 1 of these parameters \n"); exit(0); @@ -1211,7 +1215,15 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, int bo if(track) random_paths = get_sequential_paths(paths, n, m, mini_batch, augment_speed); else random_paths = get_random_paths(paths, n, m); - assert(use_mixup < 2); + //assert(use_mixup < 2); + if (use_mixup == 2) { + printf("\n cutmix=1 - isn't supported for Detector \n"); + exit(0); + } + if (use_mixup == 3) { + printf("\n mosaic=1 - compile Darknet with OpenCV for using mosaic=1 \n"); + exit(0); + } int mixup = use_mixup ? random_gen() % 2 : 0; //printf("\n mixup = %d \n", mixup); if (mixup) { diff --git a/src/detector.c b/src/detector.c index 3ececa5f..08874179 100644 --- a/src/detector.c +++ b/src/detector.c @@ -66,19 +66,22 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i srand(time(0)); int seed = rand(); - int i; - for (i = 0; i < ngpus; ++i) { + int k; + for (k = 0; k < ngpus; ++k) { srand(seed); #ifdef GPU - cuda_set_device(gpus[i]); + cuda_set_device(gpus[k]); #endif - nets[i] = parse_network_cfg(cfgfile); - nets[i].benchmark_layers = benchmark_layers; + nets[k] = parse_network_cfg(cfgfile); + nets[k].benchmark_layers = benchmark_layers; if (weightfile) { - load_weights(&nets[i], weightfile); + load_weights(&nets[k], weightfile); + } + if (clear) { + *nets[k].seen = 0; + *nets[k].cur_iteration = 0; } - if (clear) *nets[i].seen = 0; - nets[i].learning_rate *= ngpus; + nets[k].learning_rate *= ngpus; } srand(time(0)); network net = nets[0]; @@ -105,12 +108,13 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i int train_images_num = plist->size; char **paths = (char **)list_to_array(plist); - int init_w = net.w; - int init_h = net.h; + const int init_w = net.w; + const int init_h = net.h; + const int init_b = net.batch; int iter_save, iter_save_last, iter_map; - iter_save = get_current_batch(net); - iter_save_last = get_current_batch(net); - iter_map = get_current_batch(net); + iter_save = get_current_iteration(net); + iter_save_last = get_current_iteration(net); + iter_map = get_current_iteration(net); float mean_average_precision = -1; float best_map = mean_average_precision; @@ -169,7 +173,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i start = what_time_is_it_now(); //while(i*imgs < N*120){ - while (get_current_batch(net) < net.max_batches) { + while (get_current_iteration(net) < net.max_batches) { if (l.random && count++ % 10 == 0) { float rand_coef = 1.4; if (l.random != 1.0) rand_coef = l.random; @@ -179,26 +183,48 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i int dim_h = roundl(random_val*init_h / net.resize_step + 1) * net.resize_step; if (random_val < 1 && (dim_w > init_w || dim_h > init_h)) dim_w = init_w, dim_h = init_h; - // at the beginning - if (avg_loss < 0) { - dim_w = roundl(rand_coef*init_w / net.resize_step + 1) * net.resize_step; - dim_h = roundl(rand_coef*init_h / net.resize_step + 1) * net.resize_step; + int max_dim_w = roundl(rand_coef*init_w / net.resize_step + 1) * net.resize_step; + int max_dim_h = roundl(rand_coef*init_h / net.resize_step + 1) * net.resize_step; + + // at the beginning 
(check if enough memory) and at the end (calc rolling mean/variance) + if (avg_loss < 0 || get_current_iteration(net) > net.max_batches - 100) { + dim_w = max_dim_w; + dim_h = max_dim_h; } if (dim_w < net.resize_step) dim_w = net.resize_step; if (dim_h < net.resize_step) dim_h = net.resize_step; + int dim_b = (init_b * max_dim_w * max_dim_h) / (dim_w * dim_h); + int new_dim_b = (int)(dim_b * 0.8); + if (new_dim_b > init_b) dim_b = new_dim_b; - printf("%d x %d \n", dim_w, dim_h); args.w = dim_w; args.h = dim_h; + int k; + if (net.dynamic_minibatch) { + for (k = 0; k < ngpus; ++k) { + (*nets[k].seen) = init_b * net.subdivisions * get_current_iteration(net); // remove this line, when you will save to weights-file both: seen & cur_iteration + nets[k].batch = dim_b; + int j; + for (j = 0; j < nets[k].n; ++j) + nets[k].layers[j].batch = dim_b; + } + net.batch = dim_b; + imgs = net.batch * net.subdivisions * ngpus; + args.n = imgs; + printf("\n %d x %d (batch = %d) \n", dim_w, dim_h, net.batch); + } + else + printf("\n %d x %d \n", dim_w, dim_h); + pthread_join(load_thread, 0); train = buffer; free_data(train); load_thread = load_data(args); - for (i = 0; i < ngpus; ++i) { - resize_network(nets + i, dim_w, dim_h); + for (k = 0; k < ngpus; ++k) { + resize_network(nets + k, dim_w, dim_h); } net = nets[0]; } @@ -250,7 +276,8 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i if (avg_loss < 0 || avg_loss != avg_loss) avg_loss = loss; // if(-inf or nan) avg_loss = avg_loss*.9 + loss*.1; - i = get_current_batch(net); + const int iteration = get_current_iteration(net); + //i = get_current_batch(net); int calc_map_for_each = 4 * train_images_num / (net.batch * net.subdivisions); // calculate mAP for each 4 Epochs calc_map_for_each = fmax(calc_map_for_each, 100); @@ -263,22 +290,36 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i } if (net.cudnn_half) { - if (i < net.burn_in * 3) fprintf(stderr, "\n Tensor Cores are disabled until the first %d iterations are reached.", 3 * net.burn_in); + if (iteration < net.burn_in * 3) fprintf(stderr, "\n Tensor Cores are disabled until the first %d iterations are reached.", 3 * net.burn_in); else fprintf(stderr, "\n Tensor Cores are used."); } - printf("\n %d: %f, %f avg loss, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), (what_time_is_it_now() - time), i*imgs); + printf("\n %d: %f, %f avg loss, %f rate, %lf seconds, %d images\n", iteration, loss, avg_loss, get_current_rate(net), (what_time_is_it_now() - time), iteration*imgs); int draw_precision = 0; - if (calc_map && (i >= next_map_calc || i == net.max_batches)) { + if (calc_map && (iteration >= next_map_calc || iteration == net.max_batches)) { if (l.random) { - printf("Resizing to initial size: %d x %d \n", init_w, init_h); + printf("Resizing to initial size: %d x %d ", init_w, init_h); args.w = init_w; args.h = init_h; + int k; + if (net.dynamic_minibatch) { + for (k = 0; k < ngpus; ++k) { + for (k = 0; k < ngpus; ++k) { + nets[k].batch = init_b; + int j; + for (j = 0; j < nets[k].n; ++j) + nets[k].layers[j].batch = init_b; + } + } + net.batch = init_b; + imgs = init_b * net.subdivisions * ngpus; + args.n = imgs; + printf("\n %d x %d (batch = %d) \n", init_w, init_h, init_b); + } pthread_join(load_thread, 0); free_data(train); train = buffer; load_thread = load_data(args); - int k; for (k = 0; k < ngpus; ++k) { resize_network(nets + k, init_w, init_h); } @@ -290,7 +331,7 @@ void 
train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i // combine Training and Validation networks //network net_combined = combine_train_valid_networks(net, net_map); - iter_map = i; + iter_map = iteration; mean_average_precision = validate_detector_map(datacfg, cfgfile, weightfile, 0.25, 0.5, 0, net.letter_box, &net_map);// &net_combined); printf("\n mean_average_precision (mAP@0.5) = %f \n", mean_average_precision); if (mean_average_precision > best_map) { @@ -319,18 +360,18 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i //if (i % 1000 == 0 || (i < 1000 && i % 100 == 0)) { //if (i % 100 == 0) { - if (i >= (iter_save + 1000) || i % 1000 == 0) { - iter_save = i; + if (iteration >= (iter_save + 1000) || iteration % 1000 == 0) { + iter_save = iteration; #ifdef GPU if (ngpus != 1) sync_nets(nets, ngpus, 0); #endif char buff[256]; - sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i); + sprintf(buff, "%s/%s_%d.weights", backup_directory, base, iteration); save_weights(net, buff); } - if (i >= (iter_save_last + 100) || i % 100 == 0) { - iter_save_last = i; + if (iteration >= (iter_save_last + 100) || (iteration % 100 == 0 && iteration > 1)) { + iter_save_last = iteration; #ifdef GPU if (ngpus != 1) sync_nets(nets, ngpus, 0); #endif @@ -364,7 +405,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i free_list_contents_kvp(options); free_list(options); - for (i = 0; i < ngpus; ++i) free_network(nets[i]); + for (k = 0; k < ngpus; ++k) free_network(nets[k]); free(nets); //free_network(net); diff --git a/src/dropout_layer.c b/src/dropout_layer.c index dccf7eeb..3cc73ed2 100644 --- a/src/dropout_layer.c +++ b/src/dropout_layer.c @@ -33,8 +33,10 @@ dropout_layer make_dropout_layer(int batch, int inputs, float probability, int d l.forward_gpu = forward_dropout_layer_gpu; l.backward_gpu = backward_dropout_layer_gpu; l.rand_gpu = cuda_make_array(l.rand, inputs*batch); - l.drop_blocks_scale = cuda_make_array_pinned(l.rand, l.batch); - l.drop_blocks_scale_gpu = cuda_make_array(l.rand, l.batch); + if (l.dropblock) { + l.drop_blocks_scale = cuda_make_array_pinned(l.rand, l.batch); + l.drop_blocks_scale_gpu = cuda_make_array(l.rand, l.batch); + } #endif if (l.dropblock) { if(l.dropblock_size_abs) fprintf(stderr, "dropblock p = %.3f l.dropblock_size_abs = %d %4d -> %4d\n", probability, l.dropblock_size_abs, inputs, inputs); @@ -48,11 +50,18 @@ void resize_dropout_layer(dropout_layer *l, int inputs) { l->inputs = l->outputs = inputs; l->rand = (float*)xrealloc(l->rand, l->inputs * l->batch * sizeof(float)); - #ifdef GPU +#ifdef GPU cuda_free(l->rand_gpu); - l->rand_gpu = cuda_make_array(l->rand, l->inputs*l->batch); - #endif + + if (l->dropblock) { + cudaFreeHost(l->drop_blocks_scale); + l->drop_blocks_scale = cuda_make_array_pinned(l->rand, l->batch); + + cuda_free(l->drop_blocks_scale_gpu); + l->drop_blocks_scale_gpu = cuda_make_array(l->rand, l->batch); + } +#endif } void forward_dropout_layer(dropout_layer l, network_state state) diff --git a/src/dropout_layer_kernels.cu b/src/dropout_layer_kernels.cu index 346de4bb..a963235e 100644 --- a/src/dropout_layer_kernels.cu +++ b/src/dropout_layer_kernels.cu @@ -74,6 +74,15 @@ __global__ void dropblock_fast_kernel(float *rand, float prob, int w, int h, int } +__global__ void set_scales_dropblock_kernel(float *drop_blocks_scale, int block_size_w, int block_size_h, int outputs, int batch) +{ + const int index = blockIdx.x*blockDim.x + threadIdx.x; + if (index >= 
batch) return; + + const float prob = drop_blocks_scale[index] * block_size_w * block_size_h / (float)outputs; + const float scale = 1.0f / (1.0f - prob); + drop_blocks_scale[index] = scale; +} __global__ void scale_dropblock_kernel(float *output, int size, int outputs, float *drop_blocks_scale) { @@ -95,7 +104,7 @@ __global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand void forward_dropout_layer_gpu(dropout_layer l, network_state state) { if (!state.train) return; - int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); + int iteration_num = get_current_iteration(state.net); // (*state.net.seen) / (state.net.batch*state.net.subdivisions); //if (iteration_num < state.net.burn_in) return; // We gradually increase the block size and the probability of dropout - during the first half of the training @@ -136,18 +145,24 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state) dropblock_fast_kernel << > > (l.rand_gpu, block_prob, l.w, l.h, l.w*l.h, l.c, block_size, l.drop_blocks_scale_gpu, state.input); CHECK_CUDA(cudaPeekAtLastError()); + num_blocks = get_number_of_blocks(l.batch, BLOCK); + set_scales_dropblock_kernel << > > (l.drop_blocks_scale_gpu, block_size, block_size, l.outputs, l.batch); + CHECK_CUDA(cudaPeekAtLastError()); + + /* cuda_pull_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch); for (int b = 0; b < l.batch; ++b) { const float prob = l.drop_blocks_scale[b] * block_size * block_size / (float)l.outputs; const float scale = 1.0f / (1.0f - prob); - printf(" %d x %d - block_size = %d, block_size*block_size = %d , ", l.w, l.h, block_size, block_size*block_size); - printf(" , l.drop_blocks_scale[b] = %f, prob = %f, calc scale = %f \t cur_prob = %f, cur_scale = %f \n", - l.drop_blocks_scale[b], prob, scale, cur_prob, cur_scale); + //printf(" %d x %d - block_size = %d, block_size*block_size = %d , ", l.w, l.h, block_size, block_size*block_size); + //printf(" , l.drop_blocks_scale[b] = %f, prob = %f, calc scale = %f \t cur_prob = %f, cur_scale = %f \n", + // l.drop_blocks_scale[b], prob, scale, cur_prob, cur_scale); l.drop_blocks_scale[b] = scale; } cuda_push_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch); + */ num_blocks = get_number_of_blocks(l.outputs * l.batch, BLOCK); scale_dropblock_kernel << > > (state.input, l.outputs * l.batch, l.outputs, l.drop_blocks_scale_gpu); @@ -176,14 +191,14 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state) void backward_dropout_layer_gpu(dropout_layer l, network_state state) { if(!state.delta) return; - //int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); + //int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions); //if (iteration_num < state.net.burn_in) return; int size = l.inputs*l.batch; // dropblock if (l.dropblock) { - int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); + int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions); float multiplier = 1.0; if (iteration_num < (state.net.max_batches*0.85)) multiplier = (iteration_num / (float)(state.net.max_batches*0.85)); diff --git a/src/gaussian_yolo_layer.c b/src/gaussian_yolo_layer.c index 7e9efb21..d3bd3f58 100644 --- a/src/gaussian_yolo_layer.c +++ b/src/gaussian_yolo_layer.c @@ -371,13 +371,20 @@ void delta_gaussian_yolo_class(float *output, float *delta, int index, int class { int n; if 
(delta[index]){ - delta[index + stride*class_id] = (1 - label_smooth_eps) - output[index + stride*class_id]; + float y_true = 1; + if (label_smooth_eps) y_true = y_true * (1 - label_smooth_eps) + 0.5*label_smooth_eps; + delta[index + stride*class_id] = y_true - output[index + stride*class_id]; + //delta[index + stride*class_id] = 1 - output[index + stride*class_id]; + if (classes_multipliers) delta[index + stride*class_id] *= classes_multipliers[class_id]; if(avg_cat) *avg_cat += output[index + stride*class_id]; return; } for(n = 0; n < classes; ++n){ - delta[index + stride*n] = ((n == class_id) ? (1 - label_smooth_eps) : (0 + label_smooth_eps/classes)) - output[index + stride*n]; + float y_true = ((n == class_id) ? 1 : 0); + if (label_smooth_eps) y_true = y_true * (1 - label_smooth_eps) + 0.5*label_smooth_eps; + delta[index + stride*n] = y_true - output[index + stride*n]; + if (classes_multipliers && n == class_id) delta[index + stride*class_id] *= classes_multipliers[class_id]; if(n == class_id && avg_cat) *avg_cat += output[index + stride*n]; } diff --git a/src/gru_layer.c b/src/gru_layer.c index 787f46ea..de301df3 100644 --- a/src/gru_layer.c +++ b/src/gru_layer.c @@ -206,14 +206,14 @@ void push_gru_layer(layer l) { } -void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) +void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale) { - update_connected_layer_gpu(*(l.input_r_layer), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.input_z_layer), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.input_h_layer), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.state_r_layer), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.state_z_layer), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.state_h_layer), batch, learning_rate, momentum, decay); + update_connected_layer_gpu(*(l.input_r_layer), batch, learning_rate, momentum, decay, loss_scale); + update_connected_layer_gpu(*(l.input_z_layer), batch, learning_rate, momentum, decay, loss_scale); + update_connected_layer_gpu(*(l.input_h_layer), batch, learning_rate, momentum, decay, loss_scale); + update_connected_layer_gpu(*(l.state_r_layer), batch, learning_rate, momentum, decay, loss_scale); + update_connected_layer_gpu(*(l.state_z_layer), batch, learning_rate, momentum, decay, loss_scale); + update_connected_layer_gpu(*(l.state_h_layer), batch, learning_rate, momentum, decay, loss_scale); } void forward_gru_layer_gpu(layer l, network_state state) diff --git a/src/gru_layer.h b/src/gru_layer.h index ae6b55f4..c13e46e8 100644 --- a/src/gru_layer.h +++ b/src/gru_layer.h @@ -18,7 +18,7 @@ void update_gru_layer(layer l, int batch, float learning_rate, float momentum, f #ifdef GPU void forward_gru_layer_gpu(layer l, network_state state); void backward_gru_layer_gpu(layer l, network_state state); -void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay); +void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale); void push_gru_layer(layer l); void pull_gru_layer(layer l); #endif diff --git a/src/http_stream.cpp b/src/http_stream.cpp index 1f3d138d..c71a7f47 100644 --- a/src/http_stream.cpp +++ b/src/http_stream.cpp @@ -258,10 +258,11 @@ public: //_write(s, head, 0); if (!close_all_sockets) _write(s, ", \n", 0); int n = _write(s, outputbuf, 
outlen); - if (n < outlen) + if (n < (int)outlen) { cerr << "JSON_sender: kill client " << s << endl; - ::shutdown(s, 2); + close_socket(s); + //::shutdown(s, 2); FD_CLR(s, &master); } @@ -448,7 +449,7 @@ public: cv::imencode(".jpg", frame, outbuf, params); //REMOVED FOR COMPATIBILITY // https://docs.opencv.org/3.4/d4/da8/group__imgcodecs.html#ga292d81be8d76901bff7988d18d2b42ac //std::cerr << "cv::imencode call disabled!" << std::endl; - size_t outlen = outbuf.size(); + int outlen = static_cast(outbuf.size()); #ifdef _WIN32 for (unsigned i = 0; in; ++i){ layer l = net->layers[i]; - //printf(" %d: layer = %d,", i, l.type); + //printf(" (resize %d: layer = %d) , ", i, l.type); if(l.type == CONVOLUTIONAL){ resize_convolutional_layer(&l, w, h); } @@ -1048,6 +1060,7 @@ void free_network(network net) free(net.scales); free(net.steps); free(net.seen); + free(net.cur_iteration); #ifdef GPU if (gpu_index >= 0) cuda_free(net.workspace); @@ -1120,8 +1133,9 @@ void fuse_conv_batchnorm(network net) { if (l->nweights > 0) { //cuda_pull_array(l.weights_gpu, l.weights, l.nweights); - for (int i = 0; i < l->nweights; ++i) printf(" w = %f,", l->weights[i]); - printf(" l->nweights = %d \n", l->nweights); + int i; + for (i = 0; i < l->nweights; ++i) printf(" w = %f,", l->weights[i]); + printf(" l->nweights = %d, j = %d \n", l->nweights, j); } // nweights - l.n or l.n*l.c or (l.n*l.c*l.h*l.w) diff --git a/src/network.h b/src/network.h index 7e51c37c..aad50215 100644 --- a/src/network.h +++ b/src/network.h @@ -108,6 +108,7 @@ float get_current_seq_subdivisions(network net); int get_sequence_value(network net); float get_current_rate(network net); int get_current_batch(network net); +int64_t get_current_iteration(network net); //void free_network(network net); // darknet.h void compare_networks(network n1, network n2, data d); char *get_layer_string(LAYER_TYPE a); diff --git a/src/network_kernels.cu b/src/network_kernels.cu index 34d9ec6a..75b3d6a2 100644 --- a/src/network_kernels.cu +++ b/src/network_kernels.cu @@ -169,7 +169,8 @@ void backward_network_gpu(network net, network_state state) for(i = net.n-1; i >= 0; --i){ state.index = i; layer l = net.layers[i]; - if (l.stopbackward) break; + if (l.stopbackward == 1) break; + if (l.stopbackward > get_current_iteration(net)) break; if(i == 0){ state.input = original_input; state.delta = original_delta; @@ -247,8 +248,9 @@ void update_network_gpu(network net) layer l = net.layers[i]; l.t = get_current_batch(net); if (iteration_num > (net.max_batches * 1 / 2)) l.deform = 0; - if(l.update_gpu){ - l.update_gpu(l, update_batch, rate, net.momentum, net.decay); + if (l.burnin_update && (l.burnin_update*net.burn_in > iteration_num)) continue; + if(l.update_gpu && l.dont_update < iteration_num){ + l.update_gpu(l, update_batch, rate, net.momentum, net.decay, net.loss_scale); } } } @@ -318,7 +320,7 @@ float train_network_datum_gpu(network net, float *x, float *y) float error = get_network_cost(net); //if (((*net.seen) / net.batch) % net.subdivisions == 0) update_network_gpu(net); const int sequence = get_sequence_value(net); - if (((*net.seen) / net.batch) % (net.subdivisions*sequence) == 0) update_network_gpu(net); + //if (((*net.seen) / net.batch) % (net.subdivisions*sequence) == 0) update_network_gpu(net); return error; } @@ -379,7 +381,7 @@ void update_layer(layer l, network net) float rate = get_current_rate(net); l.t = get_current_batch(net); if(l.update_gpu){ - l.update_gpu(l, update_batch, rate, net.momentum, net.decay); + l.update_gpu(l, update_batch, rate, 
     }
 }
@@ -564,7 +566,9 @@ float train_networks(network *nets, int n, data d, int interval)
         sum += errors[i];
     }
     //cudaDeviceSynchronize();
-    if (get_current_batch(nets[0]) % interval == 0) {
+    *nets[0].cur_iteration += (n - 1);
+    if (get_current_iteration(nets[0]) % interval == 0)
+    {
         printf("Syncing... ");
         fflush(stdout);
         sync_nets(nets, n, interval);
diff --git a/src/parser.c b/src/parser.c
index f6f0fa0e..a2084378 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -997,6 +997,7 @@ route_layer parse_route(list *options, size_params params)
         if(next.out_w == first.out_w && next.out_h == first.out_h){
             layer.out_c += next.out_c;
         }else{
+            fprintf(stderr, " The width and height of the input layers are different. \n");
             layer.out_h = layer.out_w = layer.out_c = 0;
         }
     }
@@ -1053,6 +1054,10 @@ void parse_net_options(list *options, network *net)
     net->batch *= net->time_steps;
     net->subdivisions = subdivs;
 
+    *net->seen = 0;
+    *net->cur_iteration = 0;
+    net->loss_scale = option_find_float_quiet(options, "loss_scale", 1);
+    net->dynamic_minibatch = option_find_int_quiet(options, "dynamic_minibatch", 0);
     net->optimized_memory = option_find_int_quiet(options, "optimized_memory", 0);
     net->workspace_size_limit = (size_t)1024*1024 * option_find_float_quiet(options, "workspace_size_limit_MB", 1024);  // 1024 MB by default
@@ -1093,15 +1098,17 @@ void parse_net_options(list *options, network *net)
     char *policy_s = option_find_str(options, "policy", "constant");
     net->policy = get_policy(policy_s);
     net->burn_in = option_find_int_quiet(options, "burn_in", 0);
-#ifdef CUDNN_HALF
+#ifdef GPU
     if (net->gpu_index >= 0) {
         int compute_capability = get_gpu_compute_capability(net->gpu_index);
-        if (get_gpu_compute_capability(net->gpu_index) >= 700) net->cudnn_half = 1;
+#ifdef CUDNN_HALF
+        if (compute_capability >= 700) net->cudnn_half = 1;
         else net->cudnn_half = 0;
+#endif// CUDNN_HALF
         fprintf(stderr, " compute_capability = %d, cudnn_half = %d \n", compute_capability, net->cudnn_half);
     }
     else fprintf(stderr, " GPU isn't used \n");
-#endif
+#endif// GPU
     if(net->policy == STEP){
         net->step = option_find_int(options, "step", 1);
         net->scale = option_find_float(options, "scale", 1);
@@ -1201,7 +1208,7 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps)
     params.batch = net.batch;
     params.time_steps = net.time_steps;
     params.net = net;
-    printf("batch = %d, time_steps = %d, train = %d \n", net.batch, net.time_steps, params.train);
+    printf("mini_batch = %d, batch = %d, time_steps = %d, train = %d \n", net.batch, net.batch * net.subdivisions, net.time_steps, params.train);
     int avg_outputs = 0;
     float bflops = 0;
@@ -1357,7 +1364,11 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps)
         }
 #endif // GPU
+        l.clip = option_find_float_quiet(options, "clip", 0);
+        l.dynamic_minibatch = net.dynamic_minibatch;
         l.onlyforward = option_find_int_quiet(options, "onlyforward", 0);
+        l.dont_update = option_find_int_quiet(options, "dont_update", 0);
+        l.burnin_update = option_find_int_quiet(options, "burnin_update", 0);
         l.stopbackward = option_find_int_quiet(options, "stopbackward", 0);
         l.dontload = option_find_int_quiet(options, "dontload", 0);
         l.dontloadscales = option_find_int_quiet(options, "dontloadscales", 0);
@@ -1549,8 +1560,15 @@ void save_shortcut_weights(layer l, FILE *fp)
 #ifdef GPU
     if (gpu_index >= 0) {
         pull_shortcut_layer(l);
+        printf("\n pull_shortcut_layer \n");
     }
 #endif
+    int i;
+    for (i = 0; i < l.nweights; ++i) printf(" %f, ", l.weight_updates[i]);
+    printf(" l.nweights = %d - update \n", l.nweights);
+    for (i = 0; i < l.nweights; ++i) printf(" %f, ", l.weights[i]);
+    printf(" l.nweights = %d \n\n", l.nweights);
+
     int num = l.nweights;
     fwrite(l.weights, sizeof(float), num, fp);
 }
@@ -1626,6 +1644,7 @@ void save_weights_upto(network net, char *filename, int cutoff)
     fwrite(&major, sizeof(int), 1, fp);
     fwrite(&minor, sizeof(int), 1, fp);
     fwrite(&revision, sizeof(int), 1, fp);
+    //(*net.seen) = (*net.cur_iteration) * net.batch * net.subdivisions;
     fwrite(net.seen, sizeof(uint64_t), 1, fp);
 
     int i;
@@ -1835,7 +1854,7 @@ void load_shortcut_weights(layer l, FILE *fp)
     read_bytes = fread(l.weights, sizeof(float), num, fp);
     if (read_bytes > 0 && read_bytes < num) printf("\n Warning: Unexpected end of wights-file! l.weights - l.index = %d \n", l.index);
     //for (int i = 0; i < l.nweights; ++i) printf(" %f, ", l.weights[i]);
-    //printf("\n\n");
+    //printf(" read_bytes = %d \n\n", read_bytes);
 #ifdef GPU
     if (gpu_index >= 0) {
         push_shortcut_layer(l);
@@ -1873,6 +1892,7 @@ void load_weights_upto(network *net, char *filename, int cutoff)
         fread(&iseen, sizeof(uint32_t), 1, fp);
         *net->seen = iseen;
     }
+    *net->cur_iteration = get_current_batch(*net);
     printf(", trained: %.0f K-images (%.0f Kilo-batches_64) \n", (float)(*net->seen / 1000), (float)(*net->seen / 64000));
     int transpose = (major > 1000) || (minor > 1000);
@@ -1968,7 +1988,10 @@ network *load_network_custom(char *cfg, char *weights, int clear, int batch)
         load_weights(net, weights);
     }
     fuse_conv_batchnorm(*net);
-    if (clear) (*net->seen) = 0;
+    if (clear) {
+        (*net->seen) = 0;
+        (*net->cur_iteration) = 0;
+    }
     return net;
 }
@@ -1982,6 +2005,9 @@ network *load_network(char *cfg, char *weights, int clear)
         printf(" Try to load weights: %s \n", weights);
         load_weights(net, weights);
     }
-    if (clear) (*net->seen) = 0;
+    if (clear) {
+        (*net->seen) = 0;
+        (*net->cur_iteration) = 0;
+    }
     return net;
 }
diff --git a/src/region_layer.c b/src/region_layer.c
index 091b42f9..3825b425 100644
--- a/src/region_layer.c
+++ b/src/region_layer.c
@@ -67,7 +67,8 @@ void resize_region_layer(layer *l, int w, int h)
     l->delta = (float*)xrealloc(l->delta, l->batch * l->outputs * sizeof(float));
 
 #ifdef GPU
-    if (old_w < w || old_h < h) {
+    //if (old_w < w || old_h < h)
+    {
         cuda_free(l->delta_gpu);
         cuda_free(l->output_gpu);
diff --git a/src/rnn.c b/src/rnn.c
index 6c9b38c8..53fb8f32 100644
--- a/src/rnn.c
+++ b/src/rnn.c
@@ -155,7 +155,10 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear,
     fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
     int batch = net.batch;
     int steps = net.time_steps;
-    if(clear) *net.seen = 0;
+    if (clear) {
+        *net.seen = 0;
+        *net.cur_iteration = 0;
+    }
     int i = (*net.seen)/net.batch;
     int streams = batch/steps;
diff --git a/src/rnn_layer.c b/src/rnn_layer.c
index 328bcbf0..98f0d489 100644
--- a/src/rnn_layer.c
+++ b/src/rnn_layer.c
@@ -196,11 +196,11 @@ void push_rnn_layer(layer l)
     push_connected_layer(*(l.output_layer));
 }
 
-void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale)
 {
-    update_connected_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay);
+    update_connected_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay, loss_scale);
+    update_connected_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay, loss_scale);
+    update_connected_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay, loss_scale);
 }
 
 void forward_rnn_layer_gpu(layer l, network_state state)
diff --git a/src/rnn_layer.h b/src/rnn_layer.h
index 43b57d82..a2aa0f9e 100644
--- a/src/rnn_layer.h
+++ b/src/rnn_layer.h
@@ -19,7 +19,7 @@ void update_rnn_layer(layer l, int batch, float learning_rate, float momentum, f
 #ifdef GPU
 void forward_rnn_layer_gpu(layer l, network_state state);
 void backward_rnn_layer_gpu(layer l, network_state state);
-void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay);
+void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale);
 void push_rnn_layer(layer l);
 void pull_rnn_layer(layer l);
 #endif
diff --git a/src/shortcut_layer.c b/src/shortcut_layer.c
index 0acb138a..17f0df94 100644
--- a/src/shortcut_layer.c
+++ b/src/shortcut_layer.c
@@ -53,7 +53,7 @@ layer make_shortcut_layer(int batch, int n, int *input_layers, int* input_sizes,
     if (l.nweights > 0) {
         l.weights = (float*)calloc(l.nweights, sizeof(float));
         float scale = sqrt(2. / l.nweights);
-        for (i = 0; i < l.nweights; ++i) l.weights[i] = 1 + 0.01*rand_uniform(-1, 1);// scale*rand_uniform(-1, 1); // rand_normal();
+        for (i = 0; i < l.nweights; ++i) l.weights[i] = 1;// +0.01*rand_uniform(-1, 1);// scale*rand_uniform(-1, 1); // rand_normal();
 
         if (train) l.weight_updates = (float*)calloc(l.nweights, sizeof(float));
         l.update = update_shortcut_layer;
@@ -240,7 +240,7 @@ void backward_shortcut_layer_gpu(const layer l, network_state state)
     //shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu, l.w, l.h, l.c, state.net.layers[l.index].delta_gpu);
 }
 
-void update_shortcut_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay)
+void update_shortcut_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale)
 {
     if (l.nweights > 0) {
         float learning_rate = learning_rate_init*l.learning_rate_scale;
@@ -248,21 +248,43 @@ void update_shortcut_layer_gpu(layer l, int batch, float learning_rate_init, flo
         //float decay = a.decay;
         //int batch = a.batch;
 
+        // Loss scale for Mixed-Precision on Tensor-Cores
+        if (loss_scale != 1.0) {
+            if(l.weight_updates_gpu && l.nweights > 0) scal_ongpu(l.nweights, 1.0 / loss_scale, l.weight_updates_gpu, 1);
+        }
+
         reset_nan_and_inf(l.weight_updates_gpu, l.nweights);
         fix_nan_and_inf(l.weights_gpu, l.nweights);
 
-        axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
+        //constrain_weight_updates_ongpu(l.nweights, 1, l.weights_gpu, l.weight_updates_gpu);
+        constrain_ongpu(l.nweights, 1, l.weight_updates_gpu, 1);
+
+        /*
+        cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights);
+        cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, l.nweights);
+        CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
+        for (int i = 0; i < l.nweights; ++i) printf(" %f, ", l.weight_updates[i]);
+        printf(" l.nweights = %d - updates \n", l.nweights);
+        for (int i = 0; i < l.nweights; ++i) printf(" %f, ", l.weights[i]);
+        printf(" l.nweights = %d \n\n", l.nweights);
+        */
+
+        //axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
         axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
         scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1);
+        //fill_ongpu(l.nweights, 0, l.weight_updates_gpu, 1);
+
         //if (l.clip) {
-        //    constrain_gpu(l.nweights, l.clip, l.weights_gpu, 1);
+        //    constrain_ongpu(l.nweights, l.clip, l.weights_gpu, 1);
         //}
     }
 }
 
 void pull_shortcut_layer(layer l)
 {
+    constrain_ongpu(l.nweights, 1, l.weight_updates_gpu, 1);
+    cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, l.nweights);
     cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights);
     CHECK_CUDA(cudaPeekAtLastError());
     CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
diff --git a/src/shortcut_layer.h b/src/shortcut_layer.h
index 29191620..f0bf33a7 100644
--- a/src/shortcut_layer.h
+++ b/src/shortcut_layer.h
@@ -18,7 +18,7 @@ void resize_shortcut_layer(layer *l, int w, int h, network *net);
 #ifdef GPU
 void forward_shortcut_layer_gpu(const layer l, network_state state);
 void backward_shortcut_layer_gpu(const layer l, network_state state);
-void update_shortcut_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay);
+void update_shortcut_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale);
 void pull_shortcut_layer(layer l);
 void push_shortcut_layer(layer l);
 #endif
diff --git a/src/tag.c b/src/tag.c
index ec9327af..d7e1349a 100644
--- a/src/tag.c
+++ b/src/tag.c
@@ -13,7 +13,10 @@ void train_tag(char *cfgfile, char *weightfile, int clear)
     if(weightfile){
         load_weights(&net, weightfile);
     }
-    if(clear) *net.seen = 0;
+    if (clear) {
+        *net.seen = 0;
+        *net.cur_iteration = 0;
+    }
     printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
     int imgs = 1024;
     list* plist = get_paths("tag/train.list");
diff --git a/src/yolo_console_dll.cpp b/src/yolo_console_dll.cpp
index 8f73d869..6d1bb2a5 100644
--- a/src/yolo_console_dll.cpp
+++ b/src/yolo_console_dll.cpp
@@ -124,7 +124,7 @@ cv::Mat zed_capture_rgb(sl::Camera &zed) {
 cv::Mat zed_capture_3d(sl::Camera &zed) {
     sl::Mat cur_cloud;
-    zed.retrieveMeasure(cur_cloud, 
+    zed.retrieveMeasure(cur_cloud,
 #ifdef ZED_STEREO_2_COMPAT_MODE
         sl::MEASURE_XYZ
 #else
@@ -151,11 +151,13 @@ std::vector get_3d_coordinates(std::vector bbox_vect, cv::Mat xy
 #ifndef USE_CMAKE_LIBS
 #pragma comment(lib, "opencv_world" OPENCV_VERSION ".lib")
 #ifdef TRACK_OPTFLOW
+/*
 #pragma comment(lib, "opencv_cudaoptflow" OPENCV_VERSION ".lib")
 #pragma comment(lib, "opencv_cudaimgproc" OPENCV_VERSION ".lib")
 #pragma comment(lib, "opencv_core" OPENCV_VERSION ".lib")
 #pragma comment(lib, "opencv_imgproc" OPENCV_VERSION ".lib")
 #pragma comment(lib, "opencv_highgui" OPENCV_VERSION ".lib")
+*/
 #endif // TRACK_OPTFLOW
 #endif // USE_CMAKE_LIBS
 #else // OpenCV 2.x
@@ -342,7 +344,7 @@ int main(int argc, char *argv[])
 #endif
     //init_params.sdk_cuda_ctx = (CUcontext)detector.get_cuda_context();
     init_params.sdk_gpu_id = detector.cur_gpu_id;
-    
+
     if (filename == "zed_camera" || file_ext == "svo") {
         std::cout << "ZED 3D Camera " << zed.open(init_params) << std::endl;
         if (!zed.isOpened()) {
@@ -411,7 +413,7 @@ int main(int argc, char *argv[])
             if (use_zed_camera) {
                 while (zed.grab() !=
 #ifdef ZED_STEREO_2_COMPAT_MODE
-                    sl::SUCCESS 
+                    sl::SUCCESS
 #else
                     sl::ERROR_CODE::SUCCESS
 #endif
diff --git a/src/yolo_layer.c b/src/yolo_layer.c
index 86a78895..1ea6bf4d 100644
--- a/src/yolo_layer.c
+++ b/src/yolo_layer.c
@@ -257,7 +257,12 @@ void delta_yolo_class(float *output, float *delta, int index, int class_id, int
 {
     int n;
     if (delta[index + stride*class_id]){
-        delta[index + stride*class_id] = (1 - label_smooth_eps) - output[index + stride*class_id];
+        float y_true = 1;
+        if(label_smooth_eps) y_true = y_true * (1 - label_smooth_eps) + 0.5*label_smooth_eps;
+        float result_delta = y_true - output[index + stride*class_id];
+        if(!isnan(result_delta) && !isinf(result_delta)) delta[index + stride*class_id] = result_delta;
+        //delta[index + stride*class_id] = 1 - output[index + stride*class_id];
+        if (classes_multipliers) delta[index + stride*class_id] *= classes_multipliers[class_id];
         if(avg_cat) *avg_cat += output[index + stride*class_id];
         return;
@@ -285,7 +290,11 @@ void delta_yolo_class(float *output, float *delta, int index, int class_id, int
     else {
         // default
         for (n = 0; n < classes; ++n) {
-            delta[index + stride*n] = ((n == class_id) ? (1 - label_smooth_eps) : (0 + label_smooth_eps/classes)) - output[index + stride*n];
+            float y_true = ((n == class_id) ? 1 : 0);
+            if (label_smooth_eps) y_true = y_true * (1 - label_smooth_eps) + 0.5*label_smooth_eps;
+            float result_delta = y_true - output[index + stride*n];
+            if (!isnan(result_delta) && !isinf(result_delta)) delta[index + stride*n] = result_delta;
+            if (classes_multipliers && n == class_id) delta[index + stride*class_id] *= classes_multipliers[class_id];
             if (n == class_id && avg_cat) *avg_cat += output[index + stride*n];
         }
@@ -362,17 +371,18 @@ void forward_yolo_layer(const layer l, network_state state)
                     for (t = 0; t < l.max_boxes; ++t) {
                         box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1);
                         int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
-                        if (class_id >= l.classes) {
+                        if (class_id >= l.classes || class_id < 0) {
                             printf("\n Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes - 1);
                             printf("\n truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f, class_id = %d \n", truth.x, truth.y, truth.w, truth.h, class_id);
                             if (check_mistakes) getchar();
-                            continue; // if label contains class_id more than number of classes in the cfg-file
+                            continue; // if label contains class_id more than number of classes in the cfg-file and class_id check garbage value
                         }
                         if (!truth.x) break; // continue;
                         int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1);
                         int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4);
                         float objectness = l.output[obj_index];
+                        if (isnan(objectness) || isinf(objectness)) l.output[obj_index] = 0;
                         int class_id_match = compare_yolo_class(l.output, l.classes, class_index, l.w*l.h, objectness, class_id, 0.25f);
 
                         float iou = box_iou(pred, truth);
@@ -415,7 +425,7 @@ void forward_yolo_layer(const layer l, network_state state)
                 system(buff);
             }
             int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
-            if (class_id >= l.classes) continue; // if label contains class_id more than number of classes in the cfg-file
+            if (class_id >= l.classes || class_id < 0) continue; // if label contains class_id more than number of classes in the cfg-file and class_id check garbage value
             if (!truth.x) break; // continue;
 
             float best_iou = 0;
@@ -531,6 +541,9 @@ void forward_yolo_layer(const layer l, network_state state)
         }
     }
 
+    if (count == 0) count = 1;
+    if (class_count == 0) class_count = 1;
+
     //*(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
     //printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d\n", state.index, avg_iou / count, avg_cat / class_count, avg_obj / count, avg_anyobj / (l.w*l.h*l.n*l.batch), recall / count, recall75 / count, count);
@@ -810,6 +823,6 @@ void forward_yolo_layer_gpu(const layer l, network_state state)
 
 void backward_yolo_layer_gpu(const layer l, network_state state)
 {
-    axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, state.delta, 1);
+    axpy_ongpu(l.batch*l.inputs, state.net.loss_scale, l.delta_gpu, 1, state.delta, 1);
 }
 #endif
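
/*
 * Illustrative sketch (not part of the patch above): the delta_yolo_class()
 * hunks, and the matching Gaussian-YOLO hunk at the start of this section,
 * replace the hard 1/0 class targets with a smoothed target
 *     y_true = y * (1 - label_smooth_eps) + 0.5 * label_smooth_eps
 * and drop the update whenever the resulting delta is NaN/Inf. The helper
 * below restates that per-class computation in plain C; the function name is
 * hypothetical and is not part of the darknet API.
 */
#include <math.h>

static float smoothed_class_delta(float predicted, int is_target_class, float label_smooth_eps)
{
    float y_true = is_target_class ? 1.0f : 0.0f;    /* hard one-hot target */
    if (label_smooth_eps) {
        /* label smoothing pulls both targets toward 0.5 by eps */
        y_true = y_true * (1.0f - label_smooth_eps) + 0.5f * label_smooth_eps;
    }
    float delta = y_true - predicted;
    if (isnan(delta) || isinf(delta)) delta = 0.0f;  /* keep NaN/Inf out of the gradients */
    return delta;
}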
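
/*
 * Illustrative sketch (not part of the patch above): the loss_scale argument
 * threaded through update_gru_layer_gpu(), update_rnn_layer_gpu(),
 * update_shortcut_layer_gpu(), etc. follows the usual mixed-precision recipe:
 * backward_yolo_layer_gpu() multiplies the propagated deltas by
 * state.net.loss_scale, and the update step divides the accumulated weight
 * gradients by the same factor before applying SGD (see the scal_ongpu call in
 * update_shortcut_layer_gpu above). The plain-C helper below mirrors that
 * sequence for a generic weight buffer; the function name is hypothetical and
 * is not part of the darknet API.
 */
#include <stddef.h>

static void sgd_update_with_loss_scale(float *weights, float *weight_updates, size_t n,
                                       int batch, float learning_rate, float momentum,
                                       float decay, float loss_scale)
{
    size_t i;
    if (loss_scale != 1.0f) {
        /* undo the scaling that was applied to the deltas in the backward pass */
        for (i = 0; i < n; ++i) weight_updates[i] /= loss_scale;
    }
    for (i = 0; i < n; ++i) {
        weight_updates[i] -= decay * batch * weights[i];            /* L2 weight decay */
        weights[i] += (learning_rate / batch) * weight_updates[i];  /* SGD step, batch-averaged */
        weight_updates[i] *= momentum;                              /* momentum carry-over */
    }
}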