diff --git a/README.md b/README.md index 632f8962..a00d8924 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Yolo-v3 and Yolo-v2 for Windows and Linux -### (neural network for object detection) - Tensor Cores can be used on [Linux](https://github.com/AlexeyAB/darknet#how-to-compile-on-linux) and [Windows](https://github.com/AlexeyAB/darknet#how-to-compile-on-windows-using-vcpkg) +### (neural network for object detection) - Tensor Cores can be used on [Linux](https://github.com/AlexeyAB/darknet#how-to-compile-on-linux) and [Windows](https://github.com/AlexeyAB/darknet#how-to-compile-on-windows-using-cmake-gui) More details: http://pjreddie.com/darknet/yolo/ @@ -8,7 +8,8 @@ More details: http://pjreddie.com/darknet/yolo/ [![TravisCI](https://travis-ci.org/AlexeyAB/darknet.svg?branch=master)](https://travis-ci.org/AlexeyAB/darknet) [![AppveyorCI](https://ci.appveyor.com/api/projects/status/594bwb5uoc1fxwiu/branch/master?svg=true)](https://ci.appveyor.com/project/AlexeyAB/darknet/branch/master) [![Contributors](https://img.shields.io/github/contributors/AlexeyAB/Darknet.svg)](https://github.com/AlexeyAB/darknet/graphs/contributors) -[![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)](https://github.com/AlexeyAB/darknet/blob/master/LICENSE) +[![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)](https://github.com/AlexeyAB/darknet/blob/master/LICENSE) +[![DOI](https://zenodo.org/badge/75388965.svg)](https://zenodo.org/badge/latestdoi/75388965) * [Requirements (and how to install dependencies)](#requirements) @@ -395,7 +396,7 @@ Training Yolo v3: * https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L610 * https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L696 * https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L783 - * change [`filters=255`] to filters=(classes + 5)x3 in the 3 `[convolutional]` before each `[yolo]` layer + * change [`filters=255`] to filters=(classes + 5)x3 in the 3 `[convolutional]` before each `[yolo]` layer; keep in mind that only the last `[convolutional]` before each `[yolo]` layer needs this change.
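  * for example, if classes=1 the filters value is (1 + 5)x3 = 18; if classes=2 it is 21; the default classes=80 gives the original filters=255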
* https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L603 * https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L689 * https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L776 diff --git a/include/darknet.h b/include/darknet.h index a13168da..ecc06402 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -34,6 +34,8 @@ #define SECRET_NUM -1234 +typedef enum { UNUSED_DEF_VAL } UNUSED_ENUM_TYPE; + #ifdef GPU #include @@ -42,8 +44,8 @@ #ifdef CUDNN #include -#endif -#endif +#endif // CUDNN +#endif // GPU #ifdef __cplusplus extern "C" { @@ -209,13 +211,14 @@ struct layer { void(*update) (struct layer, int, float, float, float); void(*forward_gpu) (struct layer, struct network_state); void(*backward_gpu) (struct layer, struct network_state); - void(*update_gpu) (struct layer, int, float, float, float); + void(*update_gpu) (struct layer, int, float, float, float, float); layer *share_layer; int train; int avgpool; int batch_normalize; int shortcut; int batch; + int dynamic_minibatch; int forced; int flipped; int inputs; @@ -321,6 +324,8 @@ struct layer { int onlyforward; int stopbackward; + int dont_update; + int burnin_update; int dontload; int dontsave; int dontloadscales; @@ -495,7 +500,7 @@ struct layer { size_t workspace_size; -#ifdef GPU +//#ifdef GPU int *indexes_gpu; float *z_gpu; @@ -610,8 +615,21 @@ struct layer { cudnnConvolutionBwdDataAlgo_t bd_algo, bd_algo16; cudnnConvolutionBwdFilterAlgo_t bf_algo, bf_algo16; cudnnPoolingDescriptor_t poolingDesc; +#else // CUDNN + void* srcTensorDesc, *dstTensorDesc; + void* srcTensorDesc16, *dstTensorDesc16; + void* dsrcTensorDesc, *ddstTensorDesc; + void* dsrcTensorDesc16, *ddstTensorDesc16; + void* normTensorDesc, *normDstTensorDesc, *normDstTensorDescF16; + void* weightDesc, *weightDesc16; + void* dweightDesc, *dweightDesc16; + void* convDesc; + UNUSED_ENUM_TYPE fw_algo, fw_algo16; + UNUSED_ENUM_TYPE bd_algo, bd_algo16; + UNUSED_ENUM_TYPE bf_algo, bf_algo16; + void* poolingDesc; #endif // CUDNN -#endif // GPU +//#endif // GPU }; @@ -625,6 +643,8 @@ typedef struct network { int n; int batch; uint64_t *seen; + int *cur_iteration; + float loss_scale; int *t; float epoch; int subdivisions; @@ -701,7 +721,7 @@ typedef struct network { float *cost; float clip; -#ifdef GPU +//#ifdef GPU //float *input_gpu; //float *truth_gpu; float *delta_gpu; @@ -722,8 +742,9 @@ typedef struct network { float *global_delta_gpu; float *state_delta_gpu; size_t max_delta_gpu_size; -#endif +//#endif // GPU int optimized_memory; + int dynamic_minibatch; size_t workspace_size_limit; } network; diff --git a/src/activation_kernels.cu b/src/activation_kernels.cu index c7cd49f2..ef2ddbe4 100644 --- a/src/activation_kernels.cu +++ b/src/activation_kernels.cu @@ -493,6 +493,7 @@ __global__ void activate_array_normalize_channels_softmax_kernel(float *x, int s for (k = 0; k < channels; ++k) { float val = x[wh_i + k * wh_step + b*wh_step*channels]; val = expf(val - max_val) / sum; + if (isnan(val) || isinf(val)) val = 0; output_gpu[wh_i + k * wh_step + b*wh_step*channels] = val; } } @@ -535,6 +536,7 @@ __global__ void gradient_array_normalize_channels_softmax_kernel(float *x, int s float delta = delta_gpu[index]; float grad = x[index] * (1 - x[index]); delta = delta * grad; + if (isnan(delta) || isinf(delta)) delta = 0; delta_gpu[index] = delta; } } diff --git a/src/batchnorm_layer.c b/src/batchnorm_layer.c index 7d9b172d..7432434c 100644 --- 
a/src/batchnorm_layer.c +++ b/src/batchnorm_layer.c @@ -258,15 +258,17 @@ void forward_batchnorm_layer_gpu(layer l, network_state state) fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu); //fast_v_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.v_cbn_gpu); - int minibatch_index = state.net.current_subdivision + 1; - float alpha = 0.01; + const int minibatch_index = state.net.current_subdivision + 1; + const int max_minibatch_index = state.net.subdivisions; + //printf("\n minibatch_index = %d, max_minibatch_index = %d \n", minibatch_index, max_minibatch_index); + const float alpha = 0.01; int inverse_variance = 0; #ifdef CUDNN inverse_variance = 1; #endif // CUDNN - fast_v_cbn_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, minibatch_index, l.m_cbn_avg_gpu, l.v_cbn_avg_gpu, l.variance_gpu, + fast_v_cbn_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, minibatch_index, max_minibatch_index, l.m_cbn_avg_gpu, l.v_cbn_avg_gpu, l.variance_gpu, alpha, l.rolling_mean_gpu, l.rolling_variance_gpu, inverse_variance, .00001); normalize_scale_bias_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.scales_gpu, l.biases_gpu, l.batch, l.out_c, l.out_h*l.out_w, inverse_variance, .00001f); @@ -385,9 +387,9 @@ void backward_batchnorm_layer_gpu(layer l, network_state state) } } -void update_batchnorm_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay) +void update_batchnorm_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale) { - float learning_rate = learning_rate_init*l.learning_rate_scale; + float learning_rate = learning_rate_init * l.learning_rate_scale / loss_scale; //float momentum = a.momentum; //float decay = a.decay; //int batch = a.batch; diff --git a/src/batchnorm_layer.h b/src/batchnorm_layer.h index f5137465..afdc54b7 100644 --- a/src/batchnorm_layer.h +++ b/src/batchnorm_layer.h @@ -18,7 +18,7 @@ void resize_batchnorm_layer(layer *l, int w, int h); #ifdef GPU void forward_batchnorm_layer_gpu(layer l, network_state state); void backward_batchnorm_layer_gpu(layer l, network_state state); -void update_batchnorm_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay); +void update_batchnorm_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale); void pull_batchnorm_layer(layer l); void push_batchnorm_layer(layer l); #endif diff --git a/src/blas.h b/src/blas.h index 252497a8..2f914516 100644 --- a/src/blas.h +++ b/src/blas.h @@ -60,6 +60,7 @@ void fix_nan_and_inf_cpu(float *input, size_t size); #ifdef GPU +void constrain_weight_updates_ongpu(int N, float coef, float *weights_gpu, float *weight_updates_gpu); void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY); void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY); void simple_copy_ongpu(int size, float *src, float *dst); @@ -87,7 +88,7 @@ void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *varianc void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean); void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance); -void fast_v_cbn_gpu(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, float *m_avg, float *v_avg, float *variance, +void fast_v_cbn_gpu(const float *x, float *mean, int batch, int filters, int spatial, int 
minibatch_index, int max_minibatch_index, float *m_avg, float *v_avg, float *variance, const float alpha, float *rolling_mean_gpu, float *rolling_variance_gpu, int inverse_variance, float epsilon); void normalize_scale_bias_gpu(float *x, float *mean, float *variance, float *scales, float *biases, int batch, int filters, int spatial, int inverse_variance, float epsilon); void compare_2_arrays_gpu(float *one, float *two, int size); diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu index d7ad75ea..a35177b5 100644 --- a/src/blas_kernels.cu +++ b/src/blas_kernels.cu @@ -418,6 +418,24 @@ __global__ void reorg_kernel(int N, float *x, int w, int h, int c, int batch, in //else out[0] = x[0]; } +__global__ void constrain_weight_updates_kernel(int N, float coef, float *weights_gpu, float *weight_updates_gpu) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if (i < N) { + const float w = weights_gpu[i]; + const float wu = weight_updates_gpu[i]; + const float wu_sign = (wu == 0) ? 0 : (fabs(wu) / wu); + const float abs_limit = fabs(w * coef); + if (fabs(wu) > abs_limit) weight_updates_gpu[i] = abs_limit * wu_sign; + } +} + +extern "C" void constrain_weight_updates_ongpu(int N, float coef, float *weights_gpu, float *weight_updates_gpu) +{ + constrain_weight_updates_kernel << > >(N, coef, weights_gpu, weight_updates_gpu); + CHECK_CUDA(cudaPeekAtLastError()); +} + __global__ void axpy_kernel(int N, float ALPHA, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY) { int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; @@ -572,7 +590,7 @@ extern "C" void fast_variance_gpu(float *x, float *mean, int batch, int filters, } -__global__ void fast_v_cbn_kernel(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, float *m_avg, float *v_avg, float *variance, +__global__ void fast_v_cbn_kernel(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, int max_minibatch_index, float *m_avg, float *v_avg, float *variance, const float alpha, float *rolling_mean_gpu, float *rolling_variance_gpu, int inverse_variance, float epsilon) { const int threads = BLOCK; @@ -615,16 +633,19 @@ __global__ void fast_v_cbn_kernel(const float *x, float *mean, int batch, int f if (inverse_variance) variance[filter] = 1.0f / sqrtf(variance_tmp + epsilon); else variance[filter] = variance_tmp; - rolling_mean_gpu[filter] = alpha * mean[filter] + (1 - alpha) * rolling_mean_gpu[filter]; + //if (max_minibatch_index == minibatch_index) + { + rolling_mean_gpu[filter] = alpha * mean[filter] + (1 - alpha) * rolling_mean_gpu[filter]; - rolling_variance_gpu[filter] = alpha * variance_tmp + (1 - alpha) * rolling_variance_gpu[filter]; + rolling_variance_gpu[filter] = alpha * variance_tmp + (1 - alpha) * rolling_variance_gpu[filter]; + } } } -extern "C" void fast_v_cbn_gpu(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, float *m_avg, float *v_avg, float *variance, +extern "C" void fast_v_cbn_gpu(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, int max_minibatch_index, float *m_avg, float *v_avg, float *variance, const float alpha, float *rolling_mean_gpu, float *rolling_variance_gpu, int inverse_variance, float epsilon) { - fast_v_cbn_kernel << > >(x, mean, batch, filters, spatial, minibatch_index, m_avg, v_avg, variance, alpha, rolling_mean_gpu, rolling_variance_gpu, inverse_variance, epsilon); + fast_v_cbn_kernel << > >(x, mean, batch, filters, spatial, 
minibatch_index, max_minibatch_index, m_avg, v_avg, variance, alpha, rolling_mean_gpu, rolling_variance_gpu, inverse_variance, epsilon); CHECK_CUDA(cudaPeekAtLastError()); } @@ -798,6 +819,12 @@ __device__ float relu(float src) { return 0; } +__device__ float lrelu(float src) { + const float eps = 0.001; + if (src > eps) return src; + return eps; +} + __global__ void shortcut_singlelayer_simple_kernel(int size, int src_outputs, int batch, int n, int *outputs_of_layers_gpu, float **layers_output_gpu, float *out, float *in, float *weights_gpu, int nweights, WEIGHTS_NORMALIZATION_T weights_normalizion) { const int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; @@ -849,7 +876,7 @@ __global__ void shortcut_multilayer_kernel(int size, int src_outputs, int batch, for (int i = 0; i < (n + 1); ++i) { const int weights_index = src_i / step + i*layer_step; // [0 or c or (c, h ,w)] const float w = weights_gpu[weights_index]; - if (weights_normalizion == RELU_NORMALIZATION) sum += relu(w); + if (weights_normalizion == RELU_NORMALIZATION) sum += lrelu(w); else if (weights_normalizion == SOFTMAX_NORMALIZATION) sum += expf(w - max_val); } } @@ -858,7 +885,7 @@ __global__ void shortcut_multilayer_kernel(int size, int src_outputs, int batch, if (weights_gpu) { float w = weights_gpu[src_i / step]; - if (weights_normalizion == RELU_NORMALIZATION) w = relu(w) / sum; + if (weights_normalizion == RELU_NORMALIZATION) w = lrelu(w) / sum; else if (weights_normalizion == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum; out_val = in[id] * w; // [0 or c or (c, h ,w)] @@ -876,7 +903,7 @@ __global__ void shortcut_multilayer_kernel(int size, int src_outputs, int batch, if (weights_gpu) { const int weights_index = src_i / step + (i + 1)*layer_step; // [0 or c or (c, h ,w)] float w = weights_gpu[weights_index]; - if (weights_normalizion == RELU_NORMALIZATION) w = relu(w) / sum; + if (weights_normalizion == RELU_NORMALIZATION) w = lrelu(w) / sum; else if (weights_normalizion == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum; out_val += add[add_index] * w; // [0 or c or (c, h ,w)] @@ -932,34 +959,27 @@ __global__ void backward_shortcut_multilayer_kernel(int size, int src_outputs, i for (i = 0; i < (n + 1); ++i) { const int weights_index = src_i / step + i*layer_step; // [0 or c or (c, h ,w)] const float w = weights_gpu[weights_index]; - if (weights_normalizion == RELU_NORMALIZATION) sum += relu(w); + if (weights_normalizion == RELU_NORMALIZATION) sum += lrelu(w); else if (weights_normalizion == SOFTMAX_NORMALIZATION) sum += expf(w - max_val); } - /* - grad = 0; - for (i = 0; i < (n + 1); ++i) { - const int weights_index = src_i / step + i*layer_step; // [0 or c or (c, h ,w)] - const float delta_w = delta_in[id] * in[id]; - const float w = weights_gpu[weights_index]; - if (weights_normalizion == RELU_NORMALIZATION) grad += delta_w * relu(w) / sum; - else if (weights_normalizion == SOFTMAX_NORMALIZATION) grad += delta_w * expf(w - max_val) / sum; - } - */ } if (weights_gpu) { float w = weights_gpu[src_i / step]; - if (weights_normalizion == RELU_NORMALIZATION) w = relu(w) / sum; + if (weights_normalizion == RELU_NORMALIZATION) w = lrelu(w) / sum; else if (weights_normalizion == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum; if (weights_normalizion == RELU_NORMALIZATION) grad = w; else if (weights_normalizion == SOFTMAX_NORMALIZATION) grad = w*(1-w); delta_out[id] += delta_in[id] * w; // [0 or c or (c, h ,w)] - float weights_update_tmp = delta_in[id] * in[id] * grad; + float weights_update_tmp 
= delta_in[id] * in[id] * grad;// / step; if (layer_step == 1 && (size/32) > (id/32 + 1)) { + if (isnan(weights_update_tmp) || isinf(weights_update_tmp)) { + weights_update_tmp = 0; + } float wu = warpAllReduceSum(weights_update_tmp); if (threadIdx.x % 32 == 0) { if (!isnan(wu) && !isinf(wu)) @@ -994,13 +1014,18 @@ __global__ void backward_shortcut_multilayer_kernel(int size, int src_outputs, i else if (weights_normalizion == SOFTMAX_NORMALIZATION) grad = w*(1 - w); layer_delta[add_index] += delta_in[id] * w; - float weights_update_tmp = delta_in[id] * add[add_index] * grad; + float weights_update_tmp = delta_in[id] * add[add_index] * grad;// / step; if (layer_step == 1 && (size / 32) > (id / 32 + 1)) { + if (isnan(weights_update_tmp) || isinf(weights_update_tmp)) { + weights_update_tmp = 0; + } float wu = warpAllReduceSum(weights_update_tmp); if (threadIdx.x % 32 == 0) { if (!isnan(wu) && !isinf(wu)) atomicAdd(&weight_updates_gpu[weights_index], wu); + //if(weights_gpu[weights_index] != 1) printf(" wu = %f, weights_update_tmp = %f, w = %f, weights_gpu[weights_index] = %f, grad = %f, weights_normalizion = %d ", + // wu, weights_update_tmp, w, weights_gpu[weights_index], grad, weights_normalizion); } } else { @@ -1357,8 +1382,9 @@ __global__ void fix_nan_and_inf_kernel(float *input, size_t size) const int index = blockIdx.x*blockDim.x + threadIdx.x; if (index < size) { float val = input[index]; - if (isnan(val) || isinf(val)) - input[index] = 1.0f / index; // pseudo random value + if (isnan(val) || isinf(val)) { + input[index] = 1.0f / (fabs((float)index) + 1); // pseudo random value + } } } @@ -1377,8 +1403,9 @@ __global__ void reset_nan_and_inf_kernel(float *input, size_t size) const int index = blockIdx.x*blockDim.x + threadIdx.x; if (index < size) { float val = input[index]; - if (isnan(val) || isinf(val)) + if (isnan(val) || isinf(val)) { input[index] = 0; + } } } diff --git a/src/classifier.c b/src/classifier.c index 4cfbedb0..a0c5f734 100644 --- a/src/classifier.c +++ b/src/classifier.c @@ -48,7 +48,10 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus, if(weightfile){ load_weights(&nets[i], weightfile); } - if(clear) *nets[i].seen = 0; + if (clear) { + *nets[i].seen = 0; + *nets[i].cur_iteration = 0; + } nets[i].learning_rate *= ngpus; } srand(time(0)); @@ -166,8 +169,8 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus, calc_topk_for_each = fmax(calc_topk_for_each, 100); if (i % 10 == 0) { if (calc_topk) { - fprintf(stderr, "\n (next TOP5 calculation at %d iterations) ", calc_topk_for_each); - if (topk > 0) fprintf(stderr, " Last accuracy TOP5 = %2.2f %% \n", topk * 100); + fprintf(stderr, "\n (next TOP%d calculation at %d iterations) ", topk_data, calc_topk_for_each); + if (topk > 0) fprintf(stderr, " Last accuracy TOP%d = %2.2f %% \n", topk_data, topk * 100); } if (net.cudnn_half) { @@ -179,7 +182,7 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int draw_precision = 0; if (calc_topk && (i >= calc_topk_for_each || i == net.max_batches)) { iter_topk = i; - topk = validate_classifier_single(datacfg, cfgfile, weightfile, &net, topk_data); // calc TOP5 + topk = validate_classifier_single(datacfg, cfgfile, weightfile, &net, topk_data); // calc TOP-n printf("\n accuracy %s = %f \n", topk_buff, topk); draw_precision = 1; } @@ -767,13 +770,14 @@ void try_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filena float *predictions = network_predict(net, X); layer l = 
net.layers[layer_num]; - for(int i = 0; i < l.c; ++i){ + int i; + for(i = 0; i < l.c; ++i){ if(l.rolling_mean) printf("%f %f %f\n", l.rolling_mean[i], l.rolling_variance[i], l.scales[i]); } #ifdef GPU cuda_pull_array(l.output_gpu, l.output, l.outputs); #endif - for(int i = 0; i < l.outputs; ++i){ + for(i = 0; i < l.outputs; ++i){ printf("%f\n", l.output[i]); } /* @@ -791,7 +795,7 @@ void try_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filena top_predictions(net, top, indexes); printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); - for(int i = 0; i < top; ++i){ + for(i = 0; i < top; ++i){ int index = indexes[i]; printf("%s: %f\n", names[index], predictions[index]); } diff --git a/src/connected_layer.c b/src/connected_layer.c index 08a16aaa..25a5ffa1 100644 --- a/src/connected_layer.c +++ b/src/connected_layer.c @@ -308,8 +308,17 @@ void push_connected_layer(connected_layer l) CHECK_CUDA(cudaPeekAtLastError()); } -void update_connected_layer_gpu(connected_layer l, int batch, float learning_rate, float momentum, float decay) +void update_connected_layer_gpu(connected_layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale) { + float learning_rate = learning_rate_init * l.learning_rate_scale; + + // Loss scale for Mixed-Precision on Tensor-Cores + if (loss_scale != 1.0) { + scal_ongpu(l.inputs*l.outputs, 1.0 / loss_scale, l.weight_updates_gpu, 1); + scal_ongpu(l.outputs, 1.0 / loss_scale, l.bias_updates_gpu, 1); + scal_ongpu(l.outputs, 1.0 / loss_scale, l.scale_updates_gpu, 1); + } + axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1); diff --git a/src/connected_layer.h b/src/connected_layer.h index 80805cc2..284c5125 100644 --- a/src/connected_layer.h +++ b/src/connected_layer.h @@ -22,7 +22,7 @@ void statistics_connected_layer(layer l); #ifdef GPU void forward_connected_layer_gpu(connected_layer layer, network_state state); void backward_connected_layer_gpu(connected_layer layer, network_state state); -void update_connected_layer_gpu(connected_layer layer, int batch, float learning_rate, float momentum, float decay); +void update_connected_layer_gpu(connected_layer layer, int batch, float learning_rate, float momentum, float decay, float loss_scale); void push_connected_layer(connected_layer layer); void pull_connected_layer(connected_layer layer); #endif diff --git a/src/conv_lstm_layer.c b/src/conv_lstm_layer.c index bc53848a..cb7e3fe1 100644 --- a/src/conv_lstm_layer.c +++ b/src/conv_lstm_layer.c @@ -791,21 +791,21 @@ void push_conv_lstm_layer(layer l) push_convolutional_layer(*(l.uo)); } -void update_conv_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) +void update_conv_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale) { if (l.peephole) { - update_convolutional_layer_gpu(*(l.vf), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.vi), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.vo), batch, learning_rate, momentum, decay); + update_convolutional_layer_gpu(*(l.vf), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.vi), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.vo), batch, learning_rate, momentum, decay, loss_scale); } - update_convolutional_layer_gpu(*(l.wf), batch, learning_rate, momentum, decay); - 
update_convolutional_layer_gpu(*(l.wi), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.wg), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.wo), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.uf), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.ui), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.ug), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.uo), batch, learning_rate, momentum, decay); + update_convolutional_layer_gpu(*(l.wf), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.wi), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.wg), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.wo), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.uf), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.ui), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.ug), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.uo), batch, learning_rate, momentum, decay, loss_scale); } void forward_conv_lstm_layer_gpu(layer l, network_state state) diff --git a/src/conv_lstm_layer.h b/src/conv_lstm_layer.h index 0a0438b0..14232cab 100644 --- a/src/conv_lstm_layer.h +++ b/src/conv_lstm_layer.h @@ -23,7 +23,7 @@ void update_conv_lstm_layer(layer l, int batch, float learning_rate, float momen #ifdef GPU void forward_conv_lstm_layer_gpu(layer l, network_state state); void backward_conv_lstm_layer_gpu(layer l, network_state state); -void update_conv_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay); +void update_conv_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale); #endif #ifdef __cplusplus diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index 00d3961c..3b4a343a 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -419,8 +419,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) //#ifdef CUDNN_HALF //if (state.use_mixed_precision) { - int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); - if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in) && + int iteration_num = get_current_iteration(state.net); // (*state.net.seen) / (state.net.batch*state.net.subdivisions); + if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3 * state.net.burn_in) && (l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && !state.train && l.groups <= 1 && l.size > 1) { //printf("\n CUDNN_HALF!!! 
state.index = %d \n", state.index); @@ -671,8 +671,8 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state float alpha = 1, beta = 0; //#ifdef CUDNN_HALF - int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); - if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in) && + int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions); + if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3 * state.net.burn_in) && (l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && !state.train && l.groups <= 1 && l.size > 1) { const size_t input16_size = l.batch*l.c*l.w*l.h; @@ -717,7 +717,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state l.normDstTensorDescF16, delta16, // input l.normDstTensorDescF16, - l.x_norm_gpu, // output (new delta) + l.output_gpu, //l.x_norm_gpu, // output (new delta) l.normTensorDesc, l.scales_gpu, // input (should be FP32) l.scale_updates_gpu, // output (should be FP32) @@ -726,7 +726,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state l.mean_gpu, // input (should be FP32) l.variance_gpu)); // input (should be FP32) - simple_copy_ongpu(l.outputs*l.batch / 2, l.x_norm_gpu, delta16); + simple_copy_ongpu(l.outputs*l.batch / 2, l.output_gpu, delta16); //copy_ongpu(l.outputs*l.batch / 2, l.x_norm_gpu, 1, delta16, 1); //cudaMemcpyAsync(delta16, l.x_norm_gpu, l.outputs*l.batch * sizeof(half), cudaMemcpyDefault, get_cuda_stream()); } @@ -978,7 +978,7 @@ void assisted_activation2_gpu(float alpha, float *output, float *gt_gpu, float * void assisted_excitation_forward_gpu(convolutional_layer l, network_state state) { - const int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); + const int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions); // epoch //const float epoch = (float)(*state.net.seen) / state.net.train_images_num; @@ -1188,7 +1188,7 @@ void push_convolutional_layer(convolutional_layer l) CHECK_CUDA(cudaPeekAtLastError()); } -void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay) +void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale) { /* @@ -1232,6 +1232,13 @@ void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init //float decay = a.decay; //int batch = a.batch; + // Loss scale for Mixed-Precision on Tensor-Cores + if (loss_scale != 1.0) { + if (l.weight_updates_gpu && l.nweights > 0) scal_ongpu(l.nweights, 1.0 / loss_scale, l.weight_updates_gpu, 1); + if (l.bias_updates_gpu && l.n > 0) scal_ongpu(l.n, 1.0 / loss_scale, l.bias_updates_gpu, 1); + if (l.scale_updates_gpu && l.n > 0) scal_ongpu(l.n, 1.0 / loss_scale, l.scale_updates_gpu, 1); + } + reset_nan_and_inf(l.weight_updates_gpu, l.nweights); fix_nan_and_inf(l.weights_gpu, l.nweights); @@ -1280,9 +1287,9 @@ void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init //} } - //if (l.clip) { - // constrain_gpu(l.nweights, l.clip, l.weights_gpu, 1); - //} + if (l.clip) { + constrain_ongpu(l.nweights, l.clip, l.weights_gpu, 1); + } } /* diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index f6c4d3dc..0e0f614e 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ 
-786,7 +786,7 @@ void resize_convolutional_layer(convolutional_layer *l, int w, int h) if (l->activation == SWISH || l->activation == MISH) l->activation_input = (float*)realloc(l->activation_input, total_batch*l->outputs * sizeof(float)); #ifdef GPU - if (old_w < w || old_h < h) { + if (old_w < w || old_h < h || l->dynamic_minibatch) { if (l->train) { cuda_free(l->delta_gpu); l->delta_gpu = cuda_make_array(l->delta, total_batch*l->outputs); diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h index 18135937..e83ca873 100644 --- a/src/convolutional_layer.h +++ b/src/convolutional_layer.h @@ -15,7 +15,7 @@ extern "C" { #ifdef GPU void forward_convolutional_layer_gpu(convolutional_layer layer, network_state state); void backward_convolutional_layer_gpu(convolutional_layer layer, network_state state); -void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay); +void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay, float loss_scale); void push_convolutional_layer(convolutional_layer layer); void pull_convolutional_layer(convolutional_layer layer); diff --git a/src/crnn_layer.c b/src/crnn_layer.c index 8b9ed2f9..84646b4a 100644 --- a/src/crnn_layer.c +++ b/src/crnn_layer.c @@ -265,11 +265,11 @@ void push_crnn_layer(layer l) push_convolutional_layer(*(l.output_layer)); } -void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) +void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale) { - update_convolutional_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay); + update_convolutional_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay, loss_scale); + update_convolutional_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay, loss_scale); } void forward_crnn_layer_gpu(layer l, network_state state) diff --git a/src/crnn_layer.h b/src/crnn_layer.h index c2dc7a25..b85df6ff 100644 --- a/src/crnn_layer.h +++ b/src/crnn_layer.h @@ -20,7 +20,7 @@ void update_crnn_layer(layer l, int batch, float learning_rate, float momentum, #ifdef GPU void forward_crnn_layer_gpu(layer l, network_state state); void backward_crnn_layer_gpu(layer l, network_state state); -void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay); +void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale); void push_crnn_layer(layer l); void pull_crnn_layer(layer l); #endif diff --git a/src/dark_cuda.c b/src/dark_cuda.c index e51617af..e7594309 100644 --- a/src/dark_cuda.c +++ b/src/dark_cuda.c @@ -1,6 +1,7 @@ #ifdef __cplusplus extern "C" { #endif +int cuda_debug_sync = 0; int gpu_index = 0; #ifdef __cplusplus } @@ -19,6 +20,7 @@ int gpu_index = 0; #pragma comment(lib, "cuda.lib") + #ifdef CUDNN #ifndef USE_CMAKE_LIBS #pragma comment(lib, "cudnn.lib") @@ -29,6 +31,7 @@ int gpu_index = 0; #error "If you set CUDNN_HALF=1 then you must set CUDNN=1" #endif + void cuda_set_device(int n) { gpu_index = n; @@ -86,10 +89,13 @@ void check_error_extended(cudaError_t status, const char *file, 
int line, const check_error(status); } #if defined(DEBUG) || defined(CUDA_DEBUG) - status = cudaDeviceSynchronize(); - if (status != cudaSuccess) - printf("CUDA status = cudaDeviceSynchronize() Error: file: %s() : line: %d : build time: %s \n", file, line, date_time); + cuda_debug_sync = 1; #endif + if (cuda_debug_sync) { + status = cudaDeviceSynchronize(); + if (status != cudaSuccess) + printf("CUDA status = cudaDeviceSynchronize() Error: file: %s() : line: %d : build time: %s \n", file, line, date_time); + } check_error(status); } @@ -173,6 +179,9 @@ void cudnn_check_error(cudnnStatus_t status) #if defined(DEBUG) || defined(CUDA_DEBUG) cudaDeviceSynchronize(); #endif + if (cuda_debug_sync) { + cudaDeviceSynchronize(); + } cudnnStatus_t status2 = CUDNN_STATUS_SUCCESS; #ifdef CUDNN_ERRQUERY_RAWCODE cudnnStatus_t status_tmp = cudnnQueryRuntimeError(cudnn_handle(), &status2, CUDNN_ERRQUERY_RAWCODE, NULL); @@ -208,10 +217,13 @@ void cudnn_check_error_extended(cudnnStatus_t status, const char *file, int line cudnn_check_error(status); } #if defined(DEBUG) || defined(CUDA_DEBUG) - status = cudaDeviceSynchronize(); - if (status != CUDNN_STATUS_SUCCESS) - printf("\n cuDNN status = cudaDeviceSynchronize() Error in: file: %s() : line: %d : build time: %s \n", file, line, date_time); + cuda_debug_sync = 1; #endif + if (cuda_debug_sync) { + cudaError_t status = cudaDeviceSynchronize(); + if (status != CUDNN_STATUS_SUCCESS) + printf("\n cudaError_t status = cudaDeviceSynchronize() Error in: file: %s() : line: %d : build time: %s \n", file, line, date_time); + } cudnn_check_error(status); } #endif diff --git a/src/dark_cuda.h b/src/dark_cuda.h index 4798d729..5f6978ae 100644 --- a/src/dark_cuda.h +++ b/src/dark_cuda.h @@ -6,6 +6,8 @@ extern "C" { #endif + +extern int cuda_debug_sync; extern int gpu_index; #ifdef __cplusplus } diff --git a/src/darknet.c b/src/darknet.c index ec47b05a..93d88b1b 100644 --- a/src/darknet.c +++ b/src/darknet.c @@ -165,17 +165,19 @@ void oneoff(char *cfgfile, char *weightfile, char *outfile) copy_cpu(l.n/3*l.c, l.weights, 1, l.weights + l.n/3*l.c, 1); copy_cpu(l.n/3*l.c, l.weights, 1, l.weights + 2*l.n/3*l.c, 1); *net.seen = 0; + *net.cur_iteration = 0; save_weights(net, outfile); } void partial(char *cfgfile, char *weightfile, char *outfile, int max) { gpu_index = -1; - network net = parse_network_cfg(cfgfile); + network net = parse_network_cfg_custom(cfgfile, 1, 1); if(weightfile){ load_weights_upto(&net, weightfile, max); } *net.seen = 0; + *net.cur_iteration = 0; save_weights_upto(net, outfile, max); } @@ -455,15 +457,22 @@ int main(int argc, char **argv) #ifndef GPU gpu_index = -1; + printf(" GPU isn't used \n"); init_cpu(); -#else +#else // GPU if(gpu_index >= 0){ cuda_set_device(gpu_index); CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); } show_cuda_cudnn_info(); -#endif + cuda_debug_sync = find_arg(argc, argv, "-cuda_debug_sync"); + +#ifdef CUDNN_HALF + printf(" CUDNN_HALF=1 \n"); +#endif // CUDNN_HALF + +#endif // GPU show_opencv_info(); diff --git a/src/data.c b/src/data.c index 9e564e2c..a103cc5a 100644 --- a/src/data.c +++ b/src/data.c @@ -8,6 +8,8 @@ #include #include +extern int check_mistakes; + #define NUMCHARS 37 pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; @@ -178,7 +180,6 @@ matrix load_image_augment_paths(char **paths, int n, int use_flip, int min, int return X; } -extern int check_mistakes; box_label *read_boxes(char *filename, int *n) { @@ -950,7 +951,10 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, 
int bo const int random_index = random_gen(); c = c ? c : 3; - assert(use_mixup != 2); + if (use_mixup == 2) { + printf("\n cutmix=1 - isn't supported for Detector \n"); + exit(0); + } if (use_mixup == 3 && letter_box) { printf("\n Combination: letter_box=1 & mosaic=1 - isn't supported, use only 1 of these parameters \n"); exit(0); @@ -1211,7 +1215,15 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, int bo if(track) random_paths = get_sequential_paths(paths, n, m, mini_batch, augment_speed); else random_paths = get_random_paths(paths, n, m); - assert(use_mixup < 2); + //assert(use_mixup < 2); + if (use_mixup == 2) { + printf("\n cutmix=1 - isn't supported for Detector \n"); + exit(0); + } + if (use_mixup == 3) { + printf("\n mosaic=1 - compile Darknet with OpenCV for using mosaic=1 \n"); + exit(0); + } int mixup = use_mixup ? random_gen() % 2 : 0; //printf("\n mixup = %d \n", mixup); if (mixup) { diff --git a/src/detector.c b/src/detector.c index 3ececa5f..08874179 100644 --- a/src/detector.c +++ b/src/detector.c @@ -66,19 +66,22 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i srand(time(0)); int seed = rand(); - int i; - for (i = 0; i < ngpus; ++i) { + int k; + for (k = 0; k < ngpus; ++k) { srand(seed); #ifdef GPU - cuda_set_device(gpus[i]); + cuda_set_device(gpus[k]); #endif - nets[i] = parse_network_cfg(cfgfile); - nets[i].benchmark_layers = benchmark_layers; + nets[k] = parse_network_cfg(cfgfile); + nets[k].benchmark_layers = benchmark_layers; if (weightfile) { - load_weights(&nets[i], weightfile); + load_weights(&nets[k], weightfile); + } + if (clear) { + *nets[k].seen = 0; + *nets[k].cur_iteration = 0; } - if (clear) *nets[i].seen = 0; - nets[i].learning_rate *= ngpus; + nets[k].learning_rate *= ngpus; } srand(time(0)); network net = nets[0]; @@ -105,12 +108,13 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i int train_images_num = plist->size; char **paths = (char **)list_to_array(plist); - int init_w = net.w; - int init_h = net.h; + const int init_w = net.w; + const int init_h = net.h; + const int init_b = net.batch; int iter_save, iter_save_last, iter_map; - iter_save = get_current_batch(net); - iter_save_last = get_current_batch(net); - iter_map = get_current_batch(net); + iter_save = get_current_iteration(net); + iter_save_last = get_current_iteration(net); + iter_map = get_current_iteration(net); float mean_average_precision = -1; float best_map = mean_average_precision; @@ -169,7 +173,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i start = what_time_is_it_now(); //while(i*imgs < N*120){ - while (get_current_batch(net) < net.max_batches) { + while (get_current_iteration(net) < net.max_batches) { if (l.random && count++ % 10 == 0) { float rand_coef = 1.4; if (l.random != 1.0) rand_coef = l.random; @@ -179,26 +183,48 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i int dim_h = roundl(random_val*init_h / net.resize_step + 1) * net.resize_step; if (random_val < 1 && (dim_w > init_w || dim_h > init_h)) dim_w = init_w, dim_h = init_h; - // at the beginning - if (avg_loss < 0) { - dim_w = roundl(rand_coef*init_w / net.resize_step + 1) * net.resize_step; - dim_h = roundl(rand_coef*init_h / net.resize_step + 1) * net.resize_step; + int max_dim_w = roundl(rand_coef*init_w / net.resize_step + 1) * net.resize_step; + int max_dim_h = roundl(rand_coef*init_h / net.resize_step + 1) * net.resize_step; + + // at the beginning 
(check if enough memory) and at the end (calc rolling mean/variance) + if (avg_loss < 0 || get_current_iteration(net) > net.max_batches - 100) { + dim_w = max_dim_w; + dim_h = max_dim_h; } if (dim_w < net.resize_step) dim_w = net.resize_step; if (dim_h < net.resize_step) dim_h = net.resize_step; + int dim_b = (init_b * max_dim_w * max_dim_h) / (dim_w * dim_h); + int new_dim_b = (int)(dim_b * 0.8); + if (new_dim_b > init_b) dim_b = new_dim_b; - printf("%d x %d \n", dim_w, dim_h); args.w = dim_w; args.h = dim_h; + int k; + if (net.dynamic_minibatch) { + for (k = 0; k < ngpus; ++k) { + (*nets[k].seen) = init_b * net.subdivisions * get_current_iteration(net); // remove this line, when you will save to weights-file both: seen & cur_iteration + nets[k].batch = dim_b; + int j; + for (j = 0; j < nets[k].n; ++j) + nets[k].layers[j].batch = dim_b; + } + net.batch = dim_b; + imgs = net.batch * net.subdivisions * ngpus; + args.n = imgs; + printf("\n %d x %d (batch = %d) \n", dim_w, dim_h, net.batch); + } + else + printf("\n %d x %d \n", dim_w, dim_h); + pthread_join(load_thread, 0); train = buffer; free_data(train); load_thread = load_data(args); - for (i = 0; i < ngpus; ++i) { - resize_network(nets + i, dim_w, dim_h); + for (k = 0; k < ngpus; ++k) { + resize_network(nets + k, dim_w, dim_h); } net = nets[0]; } @@ -250,7 +276,8 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i if (avg_loss < 0 || avg_loss != avg_loss) avg_loss = loss; // if(-inf or nan) avg_loss = avg_loss*.9 + loss*.1; - i = get_current_batch(net); + const int iteration = get_current_iteration(net); + //i = get_current_batch(net); int calc_map_for_each = 4 * train_images_num / (net.batch * net.subdivisions); // calculate mAP for each 4 Epochs calc_map_for_each = fmax(calc_map_for_each, 100); @@ -263,22 +290,36 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i } if (net.cudnn_half) { - if (i < net.burn_in * 3) fprintf(stderr, "\n Tensor Cores are disabled until the first %d iterations are reached.", 3 * net.burn_in); + if (iteration < net.burn_in * 3) fprintf(stderr, "\n Tensor Cores are disabled until the first %d iterations are reached.", 3 * net.burn_in); else fprintf(stderr, "\n Tensor Cores are used."); } - printf("\n %d: %f, %f avg loss, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), (what_time_is_it_now() - time), i*imgs); + printf("\n %d: %f, %f avg loss, %f rate, %lf seconds, %d images\n", iteration, loss, avg_loss, get_current_rate(net), (what_time_is_it_now() - time), iteration*imgs); int draw_precision = 0; - if (calc_map && (i >= next_map_calc || i == net.max_batches)) { + if (calc_map && (iteration >= next_map_calc || iteration == net.max_batches)) { if (l.random) { - printf("Resizing to initial size: %d x %d \n", init_w, init_h); + printf("Resizing to initial size: %d x %d ", init_w, init_h); args.w = init_w; args.h = init_h; + int k; + if (net.dynamic_minibatch) { + for (k = 0; k < ngpus; ++k) { + for (k = 0; k < ngpus; ++k) { + nets[k].batch = init_b; + int j; + for (j = 0; j < nets[k].n; ++j) + nets[k].layers[j].batch = init_b; + } + } + net.batch = init_b; + imgs = init_b * net.subdivisions * ngpus; + args.n = imgs; + printf("\n %d x %d (batch = %d) \n", init_w, init_h, init_b); + } pthread_join(load_thread, 0); free_data(train); train = buffer; load_thread = load_data(args); - int k; for (k = 0; k < ngpus; ++k) { resize_network(nets + k, init_w, init_h); } @@ -290,7 +331,7 @@ void 
train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i // combine Training and Validation networks //network net_combined = combine_train_valid_networks(net, net_map); - iter_map = i; + iter_map = iteration; mean_average_precision = validate_detector_map(datacfg, cfgfile, weightfile, 0.25, 0.5, 0, net.letter_box, &net_map);// &net_combined); printf("\n mean_average_precision (mAP@0.5) = %f \n", mean_average_precision); if (mean_average_precision > best_map) { @@ -319,18 +360,18 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i //if (i % 1000 == 0 || (i < 1000 && i % 100 == 0)) { //if (i % 100 == 0) { - if (i >= (iter_save + 1000) || i % 1000 == 0) { - iter_save = i; + if (iteration >= (iter_save + 1000) || iteration % 1000 == 0) { + iter_save = iteration; #ifdef GPU if (ngpus != 1) sync_nets(nets, ngpus, 0); #endif char buff[256]; - sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i); + sprintf(buff, "%s/%s_%d.weights", backup_directory, base, iteration); save_weights(net, buff); } - if (i >= (iter_save_last + 100) || i % 100 == 0) { - iter_save_last = i; + if (iteration >= (iter_save_last + 100) || (iteration % 100 == 0 && iteration > 1)) { + iter_save_last = iteration; #ifdef GPU if (ngpus != 1) sync_nets(nets, ngpus, 0); #endif @@ -364,7 +405,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i free_list_contents_kvp(options); free_list(options); - for (i = 0; i < ngpus; ++i) free_network(nets[i]); + for (k = 0; k < ngpus; ++k) free_network(nets[k]); free(nets); //free_network(net); diff --git a/src/dropout_layer.c b/src/dropout_layer.c index dccf7eeb..3cc73ed2 100644 --- a/src/dropout_layer.c +++ b/src/dropout_layer.c @@ -33,8 +33,10 @@ dropout_layer make_dropout_layer(int batch, int inputs, float probability, int d l.forward_gpu = forward_dropout_layer_gpu; l.backward_gpu = backward_dropout_layer_gpu; l.rand_gpu = cuda_make_array(l.rand, inputs*batch); - l.drop_blocks_scale = cuda_make_array_pinned(l.rand, l.batch); - l.drop_blocks_scale_gpu = cuda_make_array(l.rand, l.batch); + if (l.dropblock) { + l.drop_blocks_scale = cuda_make_array_pinned(l.rand, l.batch); + l.drop_blocks_scale_gpu = cuda_make_array(l.rand, l.batch); + } #endif if (l.dropblock) { if(l.dropblock_size_abs) fprintf(stderr, "dropblock p = %.3f l.dropblock_size_abs = %d %4d -> %4d\n", probability, l.dropblock_size_abs, inputs, inputs); @@ -48,11 +50,18 @@ void resize_dropout_layer(dropout_layer *l, int inputs) { l->inputs = l->outputs = inputs; l->rand = (float*)xrealloc(l->rand, l->inputs * l->batch * sizeof(float)); - #ifdef GPU +#ifdef GPU cuda_free(l->rand_gpu); - l->rand_gpu = cuda_make_array(l->rand, l->inputs*l->batch); - #endif + + if (l->dropblock) { + cudaFreeHost(l->drop_blocks_scale); + l->drop_blocks_scale = cuda_make_array_pinned(l->rand, l->batch); + + cuda_free(l->drop_blocks_scale_gpu); + l->drop_blocks_scale_gpu = cuda_make_array(l->rand, l->batch); + } +#endif } void forward_dropout_layer(dropout_layer l, network_state state) diff --git a/src/dropout_layer_kernels.cu b/src/dropout_layer_kernels.cu index 346de4bb..a963235e 100644 --- a/src/dropout_layer_kernels.cu +++ b/src/dropout_layer_kernels.cu @@ -74,6 +74,15 @@ __global__ void dropblock_fast_kernel(float *rand, float prob, int w, int h, int } +__global__ void set_scales_dropblock_kernel(float *drop_blocks_scale, int block_size_w, int block_size_h, int outputs, int batch) +{ + const int index = blockIdx.x*blockDim.x + threadIdx.x; + if (index >= 
batch) return; + + const float prob = drop_blocks_scale[index] * block_size_w * block_size_h / (float)outputs; + const float scale = 1.0f / (1.0f - prob); + drop_blocks_scale[index] = scale; +} __global__ void scale_dropblock_kernel(float *output, int size, int outputs, float *drop_blocks_scale) { @@ -95,7 +104,7 @@ __global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand void forward_dropout_layer_gpu(dropout_layer l, network_state state) { if (!state.train) return; - int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); + int iteration_num = get_current_iteration(state.net); // (*state.net.seen) / (state.net.batch*state.net.subdivisions); //if (iteration_num < state.net.burn_in) return; // We gradually increase the block size and the probability of dropout - during the first half of the training @@ -136,18 +145,24 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state) dropblock_fast_kernel << > > (l.rand_gpu, block_prob, l.w, l.h, l.w*l.h, l.c, block_size, l.drop_blocks_scale_gpu, state.input); CHECK_CUDA(cudaPeekAtLastError()); + num_blocks = get_number_of_blocks(l.batch, BLOCK); + set_scales_dropblock_kernel << > > (l.drop_blocks_scale_gpu, block_size, block_size, l.outputs, l.batch); + CHECK_CUDA(cudaPeekAtLastError()); + + /* cuda_pull_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch); for (int b = 0; b < l.batch; ++b) { const float prob = l.drop_blocks_scale[b] * block_size * block_size / (float)l.outputs; const float scale = 1.0f / (1.0f - prob); - printf(" %d x %d - block_size = %d, block_size*block_size = %d , ", l.w, l.h, block_size, block_size*block_size); - printf(" , l.drop_blocks_scale[b] = %f, prob = %f, calc scale = %f \t cur_prob = %f, cur_scale = %f \n", - l.drop_blocks_scale[b], prob, scale, cur_prob, cur_scale); + //printf(" %d x %d - block_size = %d, block_size*block_size = %d , ", l.w, l.h, block_size, block_size*block_size); + //printf(" , l.drop_blocks_scale[b] = %f, prob = %f, calc scale = %f \t cur_prob = %f, cur_scale = %f \n", + // l.drop_blocks_scale[b], prob, scale, cur_prob, cur_scale); l.drop_blocks_scale[b] = scale; } cuda_push_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch); + */ num_blocks = get_number_of_blocks(l.outputs * l.batch, BLOCK); scale_dropblock_kernel << > > (state.input, l.outputs * l.batch, l.outputs, l.drop_blocks_scale_gpu); @@ -176,14 +191,14 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state) void backward_dropout_layer_gpu(dropout_layer l, network_state state) { if(!state.delta) return; - //int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); + //int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions); //if (iteration_num < state.net.burn_in) return; int size = l.inputs*l.batch; // dropblock if (l.dropblock) { - int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); + int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions); float multiplier = 1.0; if (iteration_num < (state.net.max_batches*0.85)) multiplier = (iteration_num / (float)(state.net.max_batches*0.85)); diff --git a/src/gaussian_yolo_layer.c b/src/gaussian_yolo_layer.c index 7e9efb21..d3bd3f58 100644 --- a/src/gaussian_yolo_layer.c +++ b/src/gaussian_yolo_layer.c @@ -371,13 +371,20 @@ void delta_gaussian_yolo_class(float *output, float *delta, int index, int class { int n; if 
(delta[index]){ - delta[index + stride*class_id] = (1 - label_smooth_eps) - output[index + stride*class_id]; + float y_true = 1; + if (label_smooth_eps) y_true = y_true * (1 - label_smooth_eps) + 0.5*label_smooth_eps; + delta[index + stride*class_id] = y_true - output[index + stride*class_id]; + //delta[index + stride*class_id] = 1 - output[index + stride*class_id]; + if (classes_multipliers) delta[index + stride*class_id] *= classes_multipliers[class_id]; if(avg_cat) *avg_cat += output[index + stride*class_id]; return; } for(n = 0; n < classes; ++n){ - delta[index + stride*n] = ((n == class_id) ? (1 - label_smooth_eps) : (0 + label_smooth_eps/classes)) - output[index + stride*n]; + float y_true = ((n == class_id) ? 1 : 0); + if (label_smooth_eps) y_true = y_true * (1 - label_smooth_eps) + 0.5*label_smooth_eps; + delta[index + stride*n] = y_true - output[index + stride*n]; + if (classes_multipliers && n == class_id) delta[index + stride*class_id] *= classes_multipliers[class_id]; if(n == class_id && avg_cat) *avg_cat += output[index + stride*n]; } diff --git a/src/gru_layer.c b/src/gru_layer.c index 787f46ea..de301df3 100644 --- a/src/gru_layer.c +++ b/src/gru_layer.c @@ -206,14 +206,14 @@ void push_gru_layer(layer l) { } -void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) +void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale) { - update_connected_layer_gpu(*(l.input_r_layer), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.input_z_layer), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.input_h_layer), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.state_r_layer), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.state_z_layer), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.state_h_layer), batch, learning_rate, momentum, decay); + update_connected_layer_gpu(*(l.input_r_layer), batch, learning_rate, momentum, decay, loss_scale); + update_connected_layer_gpu(*(l.input_z_layer), batch, learning_rate, momentum, decay, loss_scale); + update_connected_layer_gpu(*(l.input_h_layer), batch, learning_rate, momentum, decay, loss_scale); + update_connected_layer_gpu(*(l.state_r_layer), batch, learning_rate, momentum, decay, loss_scale); + update_connected_layer_gpu(*(l.state_z_layer), batch, learning_rate, momentum, decay, loss_scale); + update_connected_layer_gpu(*(l.state_h_layer), batch, learning_rate, momentum, decay, loss_scale); } void forward_gru_layer_gpu(layer l, network_state state) diff --git a/src/gru_layer.h b/src/gru_layer.h index ae6b55f4..c13e46e8 100644 --- a/src/gru_layer.h +++ b/src/gru_layer.h @@ -18,7 +18,7 @@ void update_gru_layer(layer l, int batch, float learning_rate, float momentum, f #ifdef GPU void forward_gru_layer_gpu(layer l, network_state state); void backward_gru_layer_gpu(layer l, network_state state); -void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay); +void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale); void push_gru_layer(layer l); void pull_gru_layer(layer l); #endif diff --git a/src/http_stream.cpp b/src/http_stream.cpp index 1f3d138d..c71a7f47 100644 --- a/src/http_stream.cpp +++ b/src/http_stream.cpp @@ -258,10 +258,11 @@ public: //_write(s, head, 0); if (!close_all_sockets) _write(s, ", \n", 0); int n = _write(s, outputbuf, 
outlen); - if (n < outlen) + if (n < (int)outlen) { cerr << "JSON_sender: kill client " << s << endl; - ::shutdown(s, 2); + close_socket(s); + //::shutdown(s, 2); FD_CLR(s, &master); } @@ -448,7 +449,7 @@ public: cv::imencode(".jpg", frame, outbuf, params); //REMOVED FOR COMPATIBILITY // https://docs.opencv.org/3.4/d4/da8/group__imgcodecs.html#ga292d81be8d76901bff7988d18d2b42ac //std::cerr << "cv::imencode call disabled!" << std::endl; - size_t outlen = outbuf.size(); + int outlen = static_cast(outbuf.size()); #ifdef _WIN32 for (unsigned i = 0; in; ++i){ layer l = net->layers[i]; - //printf(" %d: layer = %d,", i, l.type); + //printf(" (resize %d: layer = %d) , ", i, l.type); if(l.type == CONVOLUTIONAL){ resize_convolutional_layer(&l, w, h); } @@ -1048,6 +1060,7 @@ void free_network(network net) free(net.scales); free(net.steps); free(net.seen); + free(net.cur_iteration); #ifdef GPU if (gpu_index >= 0) cuda_free(net.workspace); @@ -1120,8 +1133,9 @@ void fuse_conv_batchnorm(network net) { if (l->nweights > 0) { //cuda_pull_array(l.weights_gpu, l.weights, l.nweights); - for (int i = 0; i < l->nweights; ++i) printf(" w = %f,", l->weights[i]); - printf(" l->nweights = %d \n", l->nweights); + int i; + for (i = 0; i < l->nweights; ++i) printf(" w = %f,", l->weights[i]); + printf(" l->nweights = %d, j = %d \n", l->nweights, j); } // nweights - l.n or l.n*l.c or (l.n*l.c*l.h*l.w) diff --git a/src/network.h b/src/network.h index 7e51c37c..aad50215 100644 --- a/src/network.h +++ b/src/network.h @@ -108,6 +108,7 @@ float get_current_seq_subdivisions(network net); int get_sequence_value(network net); float get_current_rate(network net); int get_current_batch(network net); +int64_t get_current_iteration(network net); //void free_network(network net); // darknet.h void compare_networks(network n1, network n2, data d); char *get_layer_string(LAYER_TYPE a); diff --git a/src/network_kernels.cu b/src/network_kernels.cu index 34d9ec6a..75b3d6a2 100644 --- a/src/network_kernels.cu +++ b/src/network_kernels.cu @@ -169,7 +169,8 @@ void backward_network_gpu(network net, network_state state) for(i = net.n-1; i >= 0; --i){ state.index = i; layer l = net.layers[i]; - if (l.stopbackward) break; + if (l.stopbackward == 1) break; + if (l.stopbackward > get_current_iteration(net)) break; if(i == 0){ state.input = original_input; state.delta = original_delta; @@ -247,8 +248,9 @@ void update_network_gpu(network net) layer l = net.layers[i]; l.t = get_current_batch(net); if (iteration_num > (net.max_batches * 1 / 2)) l.deform = 0; - if(l.update_gpu){ - l.update_gpu(l, update_batch, rate, net.momentum, net.decay); + if (l.burnin_update && (l.burnin_update*net.burn_in > iteration_num)) continue; + if(l.update_gpu && l.dont_update < iteration_num){ + l.update_gpu(l, update_batch, rate, net.momentum, net.decay, net.loss_scale); } } } @@ -318,7 +320,7 @@ float train_network_datum_gpu(network net, float *x, float *y) float error = get_network_cost(net); //if (((*net.seen) / net.batch) % net.subdivisions == 0) update_network_gpu(net); const int sequence = get_sequence_value(net); - if (((*net.seen) / net.batch) % (net.subdivisions*sequence) == 0) update_network_gpu(net); + //if (((*net.seen) / net.batch) % (net.subdivisions*sequence) == 0) update_network_gpu(net); return error; } @@ -379,7 +381,7 @@ void update_layer(layer l, network net) float rate = get_current_rate(net); l.t = get_current_batch(net); if(l.update_gpu){ - l.update_gpu(l, update_batch, rate, net.momentum, net.decay); + l.update_gpu(l, update_batch, rate, 
     }
 }
@@ -564,7 +566,9 @@ float train_networks(network *nets, int n, data d, int interval)
         sum += errors[i];
     }
     //cudaDeviceSynchronize();
-    if (get_current_batch(nets[0]) % interval == 0) {
+    *nets[0].cur_iteration += (n - 1);
+    if (get_current_iteration(nets[0]) % interval == 0)
+    {
         printf("Syncing... ");
         fflush(stdout);
         sync_nets(nets, n, interval);
diff --git a/src/parser.c b/src/parser.c
index f6f0fa0e..a2084378 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -997,6 +997,7 @@ route_layer parse_route(list *options, size_params params)
         if(next.out_w == first.out_w && next.out_h == first.out_h){
             layer.out_c += next.out_c;
         }else{
+            fprintf(stderr, " The width and height of the input layers are different. \n");
             layer.out_h = layer.out_w = layer.out_c = 0;
         }
     }
@@ -1053,6 +1054,10 @@ void parse_net_options(list *options, network *net)
     net->batch *= net->time_steps;
     net->subdivisions = subdivs;
 
+    *net->seen = 0;
+    *net->cur_iteration = 0;
+    net->loss_scale = option_find_float_quiet(options, "loss_scale", 1);
+    net->dynamic_minibatch = option_find_int_quiet(options, "dynamic_minibatch", 0);
     net->optimized_memory = option_find_int_quiet(options, "optimized_memory", 0);
     net->workspace_size_limit = (size_t)1024*1024 * option_find_float_quiet(options, "workspace_size_limit_MB", 1024);  // 1024 MB by default
@@ -1093,15 +1098,17 @@ void parse_net_options(list *options, network *net)
     char *policy_s = option_find_str(options, "policy", "constant");
     net->policy = get_policy(policy_s);
     net->burn_in = option_find_int_quiet(options, "burn_in", 0);
-#ifdef CUDNN_HALF
+#ifdef GPU
     if (net->gpu_index >= 0) {
         int compute_capability = get_gpu_compute_capability(net->gpu_index);
-        if (get_gpu_compute_capability(net->gpu_index) >= 700) net->cudnn_half = 1;
+#ifdef CUDNN_HALF
+        if (compute_capability >= 700) net->cudnn_half = 1;
         else net->cudnn_half = 0;
+#endif// CUDNN_HALF
         fprintf(stderr, " compute_capability = %d, cudnn_half = %d \n", compute_capability, net->cudnn_half);
     }
     else fprintf(stderr, " GPU isn't used \n");
-#endif
+#endif// GPU
     if(net->policy == STEP){
         net->step = option_find_int(options, "step", 1);
         net->scale = option_find_float(options, "scale", 1);
@@ -1201,7 +1208,7 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps)
     params.batch = net.batch;
     params.time_steps = net.time_steps;
     params.net = net;
-    printf("batch = %d, time_steps = %d, train = %d \n", net.batch, net.time_steps, params.train);
+    printf("mini_batch = %d, batch = %d, time_steps = %d, train = %d \n", net.batch, net.batch * net.subdivisions, net.time_steps, params.train);
     int avg_outputs = 0;
     float bflops = 0;
@@ -1357,7 +1364,11 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps)
         }
 #endif // GPU
+        l.clip = option_find_float_quiet(options, "clip", 0);
+        l.dynamic_minibatch = net.dynamic_minibatch;
         l.onlyforward = option_find_int_quiet(options, "onlyforward", 0);
+        l.dont_update = option_find_int_quiet(options, "dont_update", 0);
+        l.burnin_update = option_find_int_quiet(options, "burnin_update", 0);
         l.stopbackward = option_find_int_quiet(options, "stopbackward", 0);
         l.dontload = option_find_int_quiet(options, "dontload", 0);
         l.dontloadscales = option_find_int_quiet(options, "dontloadscales", 0);
@@ -1549,8 +1560,15 @@ void save_shortcut_weights(layer l, FILE *fp)
 #ifdef GPU
     if (gpu_index >= 0) {
         pull_shortcut_layer(l);
+        printf("\n pull_shortcut_layer \n");
     }
 #endif
+    int i;
+    for (i = 0; i < l.nweights; ++i) printf(" %f, ", l.weight_updates[i]);
+    printf(" l.nweights = %d - update \n", l.nweights);
+    for (i = 0; i < l.nweights; ++i) printf(" %f, ", l.weights[i]);
+    printf(" l.nweights = %d \n\n", l.nweights);
+
     int num = l.nweights;
     fwrite(l.weights, sizeof(float), num, fp);
 }
@@ -1626,6 +1644,7 @@ void save_weights_upto(network net, char *filename, int cutoff)
     fwrite(&major, sizeof(int), 1, fp);
     fwrite(&minor, sizeof(int), 1, fp);
     fwrite(&revision, sizeof(int), 1, fp);
+    //(*net.seen) = (*net.cur_iteration) * net.batch * net.subdivisions;
     fwrite(net.seen, sizeof(uint64_t), 1, fp);
 
     int i;
@@ -1835,7 +1854,7 @@ void load_shortcut_weights(layer l, FILE *fp)
     read_bytes = fread(l.weights, sizeof(float), num, fp);
     if (read_bytes > 0 && read_bytes < num) printf("\n Warning: Unexpected end of wights-file! l.weights - l.index = %d \n", l.index);
     //for (int i = 0; i < l.nweights; ++i) printf(" %f, ", l.weights[i]);
-    //printf("\n\n");
+    //printf(" read_bytes = %d \n\n", read_bytes);
 #ifdef GPU
     if (gpu_index >= 0) {
         push_shortcut_layer(l);
@@ -1873,6 +1892,7 @@ void load_weights_upto(network *net, char *filename, int cutoff)
         fread(&iseen, sizeof(uint32_t), 1, fp);
         *net->seen = iseen;
     }
+    *net->cur_iteration = get_current_batch(*net);
     printf(", trained: %.0f K-images (%.0f Kilo-batches_64) \n", (float)(*net->seen / 1000), (float)(*net->seen / 64000));
     int transpose = (major > 1000) || (minor > 1000);
@@ -1968,7 +1988,10 @@ network *load_network_custom(char *cfg, char *weights, int clear, int batch)
         load_weights(net, weights);
     }
     fuse_conv_batchnorm(*net);
-    if (clear) (*net->seen) = 0;
+    if (clear) {
+        (*net->seen) = 0;
+        (*net->cur_iteration) = 0;
+    }
     return net;
 }
@@ -1982,6 +2005,9 @@ network *load_network(char *cfg, char *weights, int clear)
         printf(" Try to load weights: %s \n", weights);
         load_weights(net, weights);
     }
-    if (clear) (*net->seen) = 0;
+    if (clear) {
+        (*net->seen) = 0;
+        (*net->cur_iteration) = 0;
+    }
     return net;
 }
diff --git a/src/region_layer.c b/src/region_layer.c
index 091b42f9..3825b425 100644
--- a/src/region_layer.c
+++ b/src/region_layer.c
@@ -67,7 +67,8 @@ void resize_region_layer(layer *l, int w, int h)
     l->delta = (float*)xrealloc(l->delta, l->batch * l->outputs * sizeof(float));
 
 #ifdef GPU
-    if (old_w < w || old_h < h) {
+    //if (old_w < w || old_h < h)
+    {
         cuda_free(l->delta_gpu);
         cuda_free(l->output_gpu);
diff --git a/src/rnn.c b/src/rnn.c
index 6c9b38c8..53fb8f32 100644
--- a/src/rnn.c
+++ b/src/rnn.c
@@ -155,7 +155,10 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear,
     fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
     int batch = net.batch;
     int steps = net.time_steps;
-    if(clear) *net.seen = 0;
+    if (clear) {
+        *net.seen = 0;
+        *net.cur_iteration = 0;
+    }
     int i = (*net.seen)/net.batch;
     int streams = batch/steps;
diff --git a/src/rnn_layer.c b/src/rnn_layer.c
index 328bcbf0..98f0d489 100644
--- a/src/rnn_layer.c
+++ b/src/rnn_layer.c
@@ -196,11 +196,11 @@ void push_rnn_layer(layer l)
     push_connected_layer(*(l.output_layer));
 }
 
-void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale)
 {
-    update_connected_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay);
+    update_connected_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay, loss_scale);
+    update_connected_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay, loss_scale);
+    update_connected_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay, loss_scale);
 }
 
 void forward_rnn_layer_gpu(layer l, network_state state)
diff --git a/src/rnn_layer.h b/src/rnn_layer.h
index 43b57d82..a2aa0f9e 100644
--- a/src/rnn_layer.h
+++ b/src/rnn_layer.h
@@ -19,7 +19,7 @@ void update_rnn_layer(layer l, int batch, float learning_rate, float momentum, f
 #ifdef GPU
 void forward_rnn_layer_gpu(layer l, network_state state);
 void backward_rnn_layer_gpu(layer l, network_state state);
-void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay);
+void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale);
 void push_rnn_layer(layer l);
 void pull_rnn_layer(layer l);
 #endif
diff --git a/src/shortcut_layer.c b/src/shortcut_layer.c
index 0acb138a..17f0df94 100644
--- a/src/shortcut_layer.c
+++ b/src/shortcut_layer.c
@@ -53,7 +53,7 @@ layer make_shortcut_layer(int batch, int n, int *input_layers, int* input_sizes,
     if (l.nweights > 0) {
         l.weights = (float*)calloc(l.nweights, sizeof(float));
         float scale = sqrt(2. / l.nweights);
-        for (i = 0; i < l.nweights; ++i) l.weights[i] = 1 + 0.01*rand_uniform(-1, 1);// scale*rand_uniform(-1, 1); // rand_normal();
+        for (i = 0; i < l.nweights; ++i) l.weights[i] = 1;// +0.01*rand_uniform(-1, 1);// scale*rand_uniform(-1, 1); // rand_normal();
 
         if (train) l.weight_updates = (float*)calloc(l.nweights, sizeof(float));
         l.update = update_shortcut_layer;
@@ -240,7 +240,7 @@ void backward_shortcut_layer_gpu(const layer l, network_state state)
     //shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu, l.w, l.h, l.c, state.net.layers[l.index].delta_gpu);
 }
 
-void update_shortcut_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay)
+void update_shortcut_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale)
 {
     if (l.nweights > 0) {
         float learning_rate = learning_rate_init*l.learning_rate_scale;
@@ -248,21 +248,43 @@ void update_shortcut_layer_gpu(layer l, int batch, float learning_rate_init, flo
         //float decay = a.decay;
         //int batch = a.batch;
 
+        // Loss scale for Mixed-Precision on Tensor-Cores
+        if (loss_scale != 1.0) {
+            if(l.weight_updates_gpu && l.nweights > 0) scal_ongpu(l.nweights, 1.0 / loss_scale, l.weight_updates_gpu, 1);
+        }
+
         reset_nan_and_inf(l.weight_updates_gpu, l.nweights);
         fix_nan_and_inf(l.weights_gpu, l.nweights);
 
-        axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
+        //constrain_weight_updates_ongpu(l.nweights, 1, l.weights_gpu, l.weight_updates_gpu);
+        constrain_ongpu(l.nweights, 1, l.weight_updates_gpu, 1);
+
+        /*
+        cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights);
+        cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, l.nweights);
+        CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
+        for (int i = 0; i < l.nweights; ++i) printf(" %f, ", l.weight_updates[i]);
+        printf(" l.nweights = %d - updates \n", l.nweights);
+        for (int i = 0; i < l.nweights; ++i) printf(" %f, ", l.weights[i]);
+        printf(" l.nweights = %d \n\n", l.nweights);
+        */
+
+        //axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
         axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
         scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1);
+        //fill_ongpu(l.nweights, 0, l.weight_updates_gpu, 1);
+
         //if (l.clip) {
-        //    constrain_gpu(l.nweights, l.clip, l.weights_gpu, 1);
+        //    constrain_ongpu(l.nweights, l.clip, l.weights_gpu, 1);
         //}
     }
 }
 
 void pull_shortcut_layer(layer l)
 {
+    constrain_ongpu(l.nweights, 1, l.weight_updates_gpu, 1);
+    cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, l.nweights);
     cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights);
     CHECK_CUDA(cudaPeekAtLastError());
     CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream()));
diff --git a/src/shortcut_layer.h b/src/shortcut_layer.h
index 29191620..f0bf33a7 100644
--- a/src/shortcut_layer.h
+++ b/src/shortcut_layer.h
@@ -18,7 +18,7 @@ void resize_shortcut_layer(layer *l, int w, int h, network *net);
 #ifdef GPU
 void forward_shortcut_layer_gpu(const layer l, network_state state);
 void backward_shortcut_layer_gpu(const layer l, network_state state);
-void update_shortcut_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay);
+void update_shortcut_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale);
 void pull_shortcut_layer(layer l);
 void push_shortcut_layer(layer l);
 #endif
diff --git a/src/tag.c b/src/tag.c
index ec9327af..d7e1349a 100644
--- a/src/tag.c
+++ b/src/tag.c
@@ -13,7 +13,10 @@ void train_tag(char *cfgfile, char *weightfile, int clear)
     if(weightfile){
         load_weights(&net, weightfile);
     }
-    if(clear) *net.seen = 0;
+    if (clear) {
+        *net.seen = 0;
+        *net.cur_iteration = 0;
+    }
     printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
     int imgs = 1024;
     list* plist = get_paths("tag/train.list");
diff --git a/src/yolo_console_dll.cpp b/src/yolo_console_dll.cpp
index 8f73d869..6d1bb2a5 100644
--- a/src/yolo_console_dll.cpp
+++ b/src/yolo_console_dll.cpp
@@ -124,7 +124,7 @@ cv::Mat zed_capture_rgb(sl::Camera &zed) {
 cv::Mat zed_capture_3d(sl::Camera &zed) {
     sl::Mat cur_cloud;
-    zed.retrieveMeasure(cur_cloud, 
+    zed.retrieveMeasure(cur_cloud,
 #ifdef ZED_STEREO_2_COMPAT_MODE
         sl::MEASURE_XYZ
 #else
@@ -151,11 +151,13 @@ std::vector get_3d_coordinates(std::vector bbox_vect, cv::Mat xy
 #ifndef USE_CMAKE_LIBS
 #pragma comment(lib, "opencv_world" OPENCV_VERSION ".lib")
 #ifdef TRACK_OPTFLOW
+/*
 #pragma comment(lib, "opencv_cudaoptflow" OPENCV_VERSION ".lib")
 #pragma comment(lib, "opencv_cudaimgproc" OPENCV_VERSION ".lib")
 #pragma comment(lib, "opencv_core" OPENCV_VERSION ".lib")
 #pragma comment(lib, "opencv_imgproc" OPENCV_VERSION ".lib")
 #pragma comment(lib, "opencv_highgui" OPENCV_VERSION ".lib")
+*/
 #endif // TRACK_OPTFLOW
 #endif // USE_CMAKE_LIBS
 #else // OpenCV 2.x
@@ -342,7 +344,7 @@ int main(int argc, char *argv[])
 #endif
     //init_params.sdk_cuda_ctx = (CUcontext)detector.get_cuda_context();
     init_params.sdk_gpu_id = detector.cur_gpu_id;
-    
+
     if (filename == "zed_camera" || file_ext == "svo") {
         std::cout << "ZED 3D Camera " << zed.open(init_params) << std::endl;
         if (!zed.isOpened()) {
@@ -411,7 +413,7 @@ int main(int argc, char *argv[])
             if (use_zed_camera) {
                 while (zed.grab() !=
 #ifdef ZED_STEREO_2_COMPAT_MODE
-                    sl::SUCCESS 
+                    sl::SUCCESS
 #else
                     sl::ERROR_CODE::SUCCESS
 #endif
diff --git a/src/yolo_layer.c b/src/yolo_layer.c
index 86a78895..1ea6bf4d 100644
--- a/src/yolo_layer.c
+++ b/src/yolo_layer.c
@@ -257,7 +257,12 @@ void delta_yolo_class(float *output, float *delta, int index, int class_id, int
 {
     int n;
     if (delta[index + stride*class_id]){
-        delta[index + stride*class_id] = (1 - label_smooth_eps) - output[index + stride*class_id];
+        float y_true = 1;
+        if(label_smooth_eps) y_true = y_true * (1 - label_smooth_eps) + 0.5*label_smooth_eps;
+        float result_delta = y_true - output[index + stride*class_id];
+        if(!isnan(result_delta) && !isinf(result_delta)) delta[index + stride*class_id] = result_delta;
+        //delta[index + stride*class_id] = 1 - output[index + stride*class_id];
+        if (classes_multipliers) delta[index + stride*class_id] *= classes_multipliers[class_id];
         if(avg_cat) *avg_cat += output[index + stride*class_id];
         return;
@@ -285,7 +290,11 @@ void delta_yolo_class(float *output, float *delta, int index, int class_id, int
     else {
         // default
         for (n = 0; n < classes; ++n) {
-            delta[index + stride*n] = ((n == class_id) ? (1 - label_smooth_eps) : (0 + label_smooth_eps/classes)) - output[index + stride*n];
+            float y_true = ((n == class_id) ? 1 : 0);
+            if (label_smooth_eps) y_true = y_true * (1 - label_smooth_eps) + 0.5*label_smooth_eps;
+            float result_delta = y_true - output[index + stride*n];
+            if (!isnan(result_delta) && !isinf(result_delta)) delta[index + stride*n] = result_delta;
+            if (classes_multipliers && n == class_id) delta[index + stride*class_id] *= classes_multipliers[class_id];
             if (n == class_id && avg_cat) *avg_cat += output[index + stride*n];
         }
@@ -362,17 +371,18 @@ void forward_yolo_layer(const layer l, network_state state)
                     for (t = 0; t < l.max_boxes; ++t) {
                         box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1);
                         int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
-                        if (class_id >= l.classes) {
+                        if (class_id >= l.classes || class_id < 0) {
                             printf("\n Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes - 1);
                             printf("\n truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f, class_id = %d \n", truth.x, truth.y, truth.w, truth.h, class_id);
                             if (check_mistakes) getchar();
-                            continue; // if label contains class_id more than number of classes in the cfg-file
+                            continue; // if label contains class_id more than number of classes in the cfg-file and class_id check garbage value
                         }
                         if (!truth.x) break; // continue;
                         int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1);
                         int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4);
                         float objectness = l.output[obj_index];
+                        if (isnan(objectness) || isinf(objectness)) l.output[obj_index] = 0;
                         int class_id_match = compare_yolo_class(l.output, l.classes, class_index, l.w*l.h, objectness, class_id, 0.25f);
 
                         float iou = box_iou(pred, truth);
@@ -415,7 +425,7 @@ void forward_yolo_layer(const layer l, network_state state)
                 system(buff);
             }
             int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
-            if (class_id >= l.classes) continue; // if label contains class_id more than number of classes in the cfg-file
+            if (class_id >= l.classes || class_id < 0) continue; // if label contains class_id more than number of classes in the cfg-file and class_id check garbage value
             if (!truth.x) break; // continue;
 
             float best_iou = 0;
@@ -531,6 +541,9 @@ void forward_yolo_layer(const layer l, network_state state)
         }
     }
 
+    if (count == 0) count = 1;
+    if (class_count == 0) class_count = 1;
+
     //*(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
     //printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d\n", state.index, avg_iou / count, avg_cat / class_count, avg_obj / count, avg_anyobj / (l.w*l.h*l.n*l.batch), recall / count, recall75 / count, count);
@@ -810,6 +823,6 @@ void forward_yolo_layer_gpu(const layer l, network_state state)
 
 void backward_yolo_layer_gpu(const layer l, network_state state)
 {
-    axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, state.delta, 1);
+    axpy_ongpu(l.batch*l.inputs, state.net.loss_scale, l.delta_gpu, 1, state.delta, 1);
 }
 #endif
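
/*
 * Illustrative sketch (not part of the patch above): the delta_yolo_class()
 * hunks, and the matching Gaussian-YOLO hunk at the start of this section,
 * replace the hard 1/0 class targets with a smoothed target
 *     y_true = y * (1 - label_smooth_eps) + 0.5 * label_smooth_eps
 * and drop the update whenever the resulting delta is NaN/Inf. The helper
 * below restates that per-class computation in plain C; the function name is
 * hypothetical and is not part of the darknet API.
 */
#include <math.h>

static float smoothed_class_delta(float predicted, int is_target_class, float label_smooth_eps)
{
    float y_true = is_target_class ? 1.0f : 0.0f;    /* hard one-hot target */
    if (label_smooth_eps) {
        /* label smoothing pulls both targets toward 0.5 by eps */
        y_true = y_true * (1.0f - label_smooth_eps) + 0.5f * label_smooth_eps;
    }
    float delta = y_true - predicted;
    if (isnan(delta) || isinf(delta)) delta = 0.0f;  /* keep NaN/Inf out of the gradients */
    return delta;
}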
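
/*
 * Illustrative sketch (not part of the patch above): the loss_scale argument
 * threaded through update_gru_layer_gpu(), update_rnn_layer_gpu(),
 * update_shortcut_layer_gpu(), etc. follows the usual mixed-precision recipe:
 * backward_yolo_layer_gpu() multiplies the propagated deltas by
 * state.net.loss_scale, and the update step divides the accumulated weight
 * gradients by the same factor before applying SGD (see the scal_ongpu call in
 * update_shortcut_layer_gpu above). The plain-C helper below mirrors that
 * sequence for a generic weight buffer; the function name is hypothetical and
 * is not part of the darknet API.
 */
#include <stddef.h>

static void sgd_update_with_loss_scale(float *weights, float *weight_updates, size_t n,
                                       int batch, float learning_rate, float momentum,
                                       float decay, float loss_scale)
{
    size_t i;
    if (loss_scale != 1.0f) {
        /* undo the scaling that was applied to the deltas in the backward pass */
        for (i = 0; i < n; ++i) weight_updates[i] /= loss_scale;
    }
    for (i = 0; i < n; ++i) {
        weight_updates[i] -= decay * batch * weights[i];            /* L2 weight decay */
        weights[i] += (learning_rate / batch) * weight_updates[i];  /* SGD step, batch-averaged */
        weight_updates[i] *= momentum;                              /* momentum carry-over */
    }
}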