diff --git a/include/darknet.h b/include/darknet.h
index 2be89f05..ac50e49e 100644
--- a/include/darknet.h
+++ b/include/darknet.h
@@ -551,6 +551,8 @@ struct layer {
     float * mean_gpu;
     float * variance_gpu;
+    float * m_cbn_avg_gpu;
+    float * v_cbn_avg_gpu;
     float * rolling_mean_gpu;
     float * rolling_variance_gpu;
diff --git a/src/batchnorm_layer.c b/src/batchnorm_layer.c
index 333db97f..1b16dd5f 100644
--- a/src/batchnorm_layer.c
+++ b/src/batchnorm_layer.c
@@ -251,51 +251,81 @@ void forward_batchnorm_layer_gpu(layer l, network_state state)
     if (state.train) {
         simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.x_gpu);
+
+        // cbn
+        if (l.batch_normalize == 2) {
+
+            fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
+
+            //fast_v_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.v_cbn_gpu);
+            int minibatch_index = state.net.current_subdivision + 1;
+            float alpha = 0.01;
+
+            int inverse_variance = 0;
 #ifdef CUDNN
-        float one = 1;
-        float zero = 0;
-        cudnnBatchNormalizationForwardTraining(cudnn_handle(),
-            CUDNN_BATCHNORM_SPATIAL,
-            &one,
-            &zero,
-            l.normDstTensorDesc,
-            l.x_gpu,                // input
-            l.normDstTensorDesc,
-            l.output_gpu,           // output
-            l.normTensorDesc,
-            l.scales_gpu,
-            l.biases_gpu,
-            .01,
-            l.rolling_mean_gpu,     // output (should be FP32)
-            l.rolling_variance_gpu, // output (should be FP32)
-            .00001,
-            l.mean_gpu,             // output (should be FP32)
-            l.variance_gpu);        // output (should be FP32)
-
-        if (state.net.try_fix_nan) {
-            fix_nan_and_inf(l.scales_gpu, l.n);
-            fix_nan_and_inf(l.biases_gpu, l.n);
-            fix_nan_and_inf(l.mean_gpu, l.n);
-            fix_nan_and_inf(l.variance_gpu, l.n);
-            fix_nan_and_inf(l.rolling_mean_gpu, l.n);
-            fix_nan_and_inf(l.rolling_variance_gpu, l.n);
+            inverse_variance = 1;
+#endif  // CUDNN
+
+            fast_v_cbn_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, minibatch_index, l.m_cbn_avg_gpu, l.v_cbn_avg_gpu, l.variance_gpu,
+                alpha, l.rolling_mean_gpu, l.rolling_variance_gpu, inverse_variance, .00001);
+
+            normalize_scale_bias_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.scales_gpu, l.biases_gpu, l.batch, l.out_c, l.out_h*l.out_w, .00001f);
+
+#ifndef CUDNN
+            simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.x_norm_gpu);
+#endif  // CUDNN
+
+            //printf("\n CBN \n");
         }
+        else {
+#ifdef CUDNN
+            float one = 1;
+            float zero = 0;
+            cudnnBatchNormalizationForwardTraining(cudnn_handle(),
+                CUDNN_BATCHNORM_SPATIAL,
+                &one,
+                &zero,
+                l.normDstTensorDesc,
+                l.x_gpu,                // input
+                l.normDstTensorDesc,
+                l.output_gpu,           // output
+                l.normTensorDesc,
+                l.scales_gpu,
+                l.biases_gpu,
+                .01,
+                l.rolling_mean_gpu,     // output (should be FP32)
+                l.rolling_variance_gpu, // output (should be FP32)
+                .00001,
+                l.mean_gpu,             // output (should be FP32)
+                l.variance_gpu);        // output (should be FP32)
+
+            if (state.net.try_fix_nan) {
+                fix_nan_and_inf(l.scales_gpu, l.n);
+                fix_nan_and_inf(l.biases_gpu, l.n);
+                fix_nan_and_inf(l.mean_gpu, l.n);
+                fix_nan_and_inf(l.variance_gpu, l.n);
+                fix_nan_and_inf(l.rolling_mean_gpu, l.n);
+                fix_nan_and_inf(l.rolling_variance_gpu, l.n);
+            }
+
+            //simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.x_norm_gpu);
 #else   // CUDNN
-        fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
-        fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);
+            fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
+            fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);
 
-        scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1);
-        axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
-        scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1);
-        axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);
+            scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1);
+            axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
+            scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1);
+            axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);
 
-        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
-        normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
-        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
+            copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
+            normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+            copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
 
-        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
-        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
+            scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+            add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
 #endif  // CUDNN
+        }
     }
     else {
         normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
diff --git a/src/blas.h b/src/blas.h
index aafff998..fd2cf0bb 100644
--- a/src/blas.h
+++ b/src/blas.h
@@ -85,14 +85,17 @@ void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_del
 void fast_mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta);
 void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta);
 
-void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
 void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
+void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
+void fast_v_cbn_gpu(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, float *m_avg, float *v_avg, float *variance,
+    const float alpha, float *rolling_mean_gpu, float *rolling_variance_gpu, int inverse_variance, float epsilon);
+void normalize_scale_bias_gpu(float *x, float *mean, float *variance, float *scales, float *biases, int batch, int filters, int spatial, float epsilon);
+void compare_2_arrays_gpu(float *one, float *two, int size);
 
 void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
 void shortcut_multilayer_gpu(int src_outputs, int batch, int n, int *outputs_of_layers_gpu, float **layers_output_gpu, float *out, float *in, float *weights_gpu, int nweights, WEIGHTS_NORMALIZATION_T weights_normalizion);
 void backward_shortcut_multilayer_gpu(int src_outputs, int batch, int n, int *outputs_of_layers_gpu, float **layers_delta_gpu, float *delta_out, float *delta_in, float *weights, float *weight_updates, int nweights, float *in, float **layers_output, WEIGHTS_NORMALIZATION_T weights_normalizion);
 void input_shortcut_gpu(float *in, int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
-void scale_bias_gpu(float *output, float *biases, int batch, int n, int size);
 void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates);
 void scale_bias_gpu(float *output, float *biases, int batch, int n, int size);
 void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu
index eac136ca..7014e3fa 100644
--- a/src/blas_kernels.cu
+++ b/src/blas_kernels.cu
@@ -10,6 +10,24 @@
 #include "tree.h"
 
+__global__ void compare_2_arrays_kernel(float *one, float *two, int size)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index >= size) return;
+
+    if (one[index] != two[index]) printf(" i: %d - one = %f, two = %f \n", index, one[index], two[index]);
+}
+
+void compare_2_arrays_gpu(float *one, float *two, int size)
+{
+    const int num_blocks = get_number_of_blocks(size, BLOCK);
+
+    compare_2_arrays_kernel<<<num_blocks, BLOCK, 0, get_cuda_stream()>>>(one, two, size);
+    CHECK_CUDA(cudaPeekAtLastError());
+    CHECK_CUDA(cudaDeviceSynchronize());
+}
+
+
 __global__ void scale_bias_kernel(float *output, float *scale, int batch, int filters, int spatial, int current_size)
 {
     const int index = blockIdx.x*blockDim.x + threadIdx.x;
@@ -179,7 +197,7 @@ __global__ void normalize_kernel(int N, float *x, float *mean, float *variance,
     if (index >= N) return;
     int f = (index / spatial) % filters;
 
-    x[index] = (x[index] - mean[f]) / (sqrtf(variance[f] + .000001f));
+    x[index] = (x[index] - mean[f]) / (sqrtf(variance[f] + .00001f));
 }
 
 extern "C" void normalize_gpu(float *x, float *mean, float *variance, int batch, int filters, int spatial)
@@ -470,8 +488,6 @@ __global__ void mul_kernel(int N, float *X, int INCX, float *Y, int INCY)
 }
 
-
-
 __global__ void fast_mean_kernel(float *x, int batch, int filters, int spatial, float *mean)
 {
     const int threads = BLOCK;
@@ -492,14 +508,21 @@ __global__ void fast_mean_kernel(float *x, int batch, int filters, int spatial,
     __syncthreads();
 
     if(id == 0){
-        mean[filter] = 0;
+        float mean_tmp = 0;
         for(i = 0; i < threads; ++i){
-            mean[filter] += local[i];
+            mean_tmp += local[i];
         }
-        mean[filter] /= spatial * batch;
+        mean_tmp /= spatial * batch;
+        mean[filter] = mean_tmp;
     }
 }
 
+extern "C" void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean)
+{
+    fast_mean_kernel<<<filters, BLOCK, 0, get_cuda_stream()>>>(x, batch, filters, spatial, mean);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
 __global__ void fast_variance_kernel(float *x, float *mean, int batch, int filters, int spatial, float *variance)
 {
     const int threads = BLOCK;
@@ -521,27 +544,100 @@ __global__ void fast_variance_kernel(float *x, float *mean, int batch, int filt
     __syncthreads();
 
     if(id == 0){
-        variance[filter] = 0;
+        float variance_tmp = 0;
         for(i = 0; i < threads; ++i){
-            variance[filter] += local[i];
+            variance_tmp += local[i];
         }
-        variance[filter] /= (spatial * batch - 1);
+        variance_tmp /= (spatial * batch);// -1);
+        variance[filter] = variance_tmp;
     }
 }
 
-extern "C" void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean)
+extern "C" void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance)
 {
-    fast_mean_kernel<<<filters, BLOCK, 0, get_cuda_stream()>>>(x, batch, filters, spatial, mean);
+    fast_variance_kernel<<<filters, BLOCK, 0, get_cuda_stream()>>>(x, mean, batch, filters, spatial, variance);
     CHECK_CUDA(cudaPeekAtLastError());
 }
 
-extern "C" void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance)
+
+__global__ void fast_v_cbn_kernel(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, float *m_avg, float *v_avg, float *variance,
+    const float alpha, float *rolling_mean_gpu, float *rolling_variance_gpu, int inverse_variance, float epsilon)
 {
-    fast_variance_kernel<<<filters, BLOCK, 0, get_cuda_stream()>>>(x, mean, batch, filters, spatial, variance);
+    const int threads = BLOCK;
+    __shared__ float local[threads];
+
+    int id = threadIdx.x;
+    local[id] = 0;
+
+    int filter = blockIdx.x;
+
+    int i, j;
+    for (j = 0; j < batch; ++j) {
+        for (i = 0; i < spatial; i += threads) {
+            int index = j*spatial*filters + filter*spatial + i + id;
+
+            local[id] += (i + id < spatial) ? powf(x[index], 2) : 0;
+        }
+    }
+    __syncthreads();
+
+    if (id == 0) {
+        float v_tmp = 0;
+        for (i = 0; i < threads; ++i) {
+            v_tmp += local[i];
+        }
+        v_tmp /= (spatial * batch - 1);
+
+        v_tmp = fmax(v_tmp, powf(mean[filter], 2));
+
+        const float alpha_cbn = 1.0f / minibatch_index;
+
+        m_avg[filter] = alpha_cbn * mean[filter] + (1 - alpha_cbn) * m_avg[filter];
+        mean[filter] = m_avg[filter];
+
+        v_avg[filter] = alpha_cbn * v_tmp + (1 - alpha_cbn) * v_avg[filter];
+
+        float variance_tmp = fmax(0.0f, v_avg[filter] - powf(m_avg[filter], 2));
+        if (inverse_variance) variance_tmp = 1.0f / sqrtf(variance_tmp + epsilon);
+        variance[filter] = variance_tmp;
+
+        rolling_mean_gpu[filter] = alpha * mean[filter] + (1 - alpha) * rolling_mean_gpu[filter];
+
+        rolling_variance_gpu[filter] = alpha * variance[filter] + (1 - alpha) * rolling_variance_gpu[filter];
+    }
+}
+
+extern "C" void fast_v_cbn_gpu(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, float *m_avg, float *v_avg, float *variance,
+    const float alpha, float *rolling_mean_gpu, float *rolling_variance_gpu, int inverse_variance, float epsilon)
+{
+    fast_v_cbn_kernel<<<filters, BLOCK, 0, get_cuda_stream()>>>(x, mean, batch, filters, spatial, minibatch_index, m_avg, v_avg, variance, alpha, rolling_mean_gpu, rolling_variance_gpu, inverse_variance, epsilon);
     CHECK_CUDA(cudaPeekAtLastError());
 }
 
+__global__ void normalize_scale_bias_kernel(int N, float *x, float *mean, float *variance, float *scales, float *biases, int batch, int filters, int spatial, float epsilon)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index >= N) return;
+    int f = (index / spatial) % filters;
+
+    float val = (x[index] - mean[f]) / (sqrtf(variance[f] + epsilon)) * scales[f] + biases[f];
+
+    if (!isnan(val) && !isinf(val))
+        x[index] = val;
+}
+
+extern "C" void normalize_scale_bias_gpu(float *x, float *mean, float *variance, float *scales, float *biases, int batch, int filters, int spatial, float epsilon)
+{
+    const int current_size = batch * filters * spatial;
+    const int num_blocks = get_number_of_blocks(current_size, BLOCK);
+
+    normalize_scale_bias_kernel<<<num_blocks, BLOCK, 0, get_cuda_stream()>>>(current_size, x, mean, variance, scales, biases, batch, filters, spatial, epsilon);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
 extern "C" void mean_gpu(float *x, int batch, int filters, int spatial, float *mean)
 {
     mean_kernel<<<cuda_gridsize(filters), BLOCK, 0, get_cuda_stream()>>>(x, batch, filters, spatial, mean);
diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu
index 80c31026..00d3961c 100644
--- a/src/convolutional_kernels.cu
+++ b/src/convolutional_kernels.cu
@@ -578,7 +578,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
                 l.c / l.groups,         // input channels
                 l.h, l.w,               // input size (h, w)
                 l.size, l.size,         // kernel size (h, w)
-                l.pad, l.pad,           // padding (h, w)
+                l.pad * l.dilation, l.pad * l.dilation,   // padding (h, w)
                 l.stride_y, l.stride_x, // stride (h, w)
                 l.dilation, l.dilation, // dilation (h, w)
                 state.workspace);       // output
@@ -858,7 +858,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)
                 l.c / l.groups,         // input channels
                 l.h, l.w,               // input size (h, w)
                 l.size, l.size,         // kernel size (h, w)
-                l.pad, l.pad,           // padding (h, w)
+                l.pad * l.dilation, l.pad * l.dilation,   // padding (h, w)
                 l.stride_y, l.stride_x, // stride (h, w)
                 l.dilation, l.dilation, // dilation (h, w)
                 state.workspace);       // output
@@ -883,7 +883,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)
                 l.c / l.groups,         // input channels
                 l.h, l.w,               // input size (h, w)
                 l.size, l.size,         // kernel size (h, w)
-                l.pad, l.pad,           // padding size (h, w)
+                l.pad * l.dilation, l.pad * l.dilation,   // padding size (h, w)
                 l.stride_y, l.stride_x, // stride size (h, w)
                 l.dilation, l.dilation, // dilation size (h, w)
                 delta);                 // output (delta)
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index ff7831d6..d92a7838 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -416,7 +416,7 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w,
 
     if (l.share_layer) {
         if (l.size != l.share_layer->size || l.nweights != l.share_layer->nweights || l.c != l.share_layer->c || l.n != l.share_layer->n) {
-            printf("Layer size, nweights, channels or filters don't match for the share_layer");
+            printf(" Layer size, nweights, channels or filters don't match for the share_layer");
             getchar();
         }
 
@@ -611,6 +611,8 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w,
 
             l.mean_gpu = cuda_make_array(l.mean, n);
             l.variance_gpu = cuda_make_array(l.variance, n);
+            l.m_cbn_avg_gpu = cuda_make_array(l.mean, n);
+            l.v_cbn_avg_gpu = cuda_make_array(l.variance, n);
 #ifndef CUDNN
             l.mean_delta_gpu = cuda_make_array(l.mean, n);
             l.variance_delta_gpu = cuda_make_array(l.variance, n);
@@ -1238,7 +1240,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
                         l.c / l.groups,         // input channels
                         l.h, l.w,               // input size (h, w)
                         l.size, l.size,         // kernel size (h, w)
-                        l.pad, l.pad,           // padding (h, w)
+                        l.pad * l.dilation, l.pad * l.dilation,   // padding (h, w)
                         l.stride_y, l.stride_x, // stride (h, w)
                         l.dilation, l.dilation, // dilation (h, w)
                         b);                     // output
@@ -1429,7 +1431,7 @@ void backward_convolutional_layer(convolutional_layer l, network_state state)
                     l.c / l.groups,         // input channels
                     l.h, l.w,               // input size (h, w)
                     l.size, l.size,         // kernel size (h, w)
-                    l.pad, l.pad,           // padding (h, w)
+                    l.pad * l.dilation, l.pad * l.dilation,   // padding (h, w)
                     l.stride_y, l.stride_x, // stride (h, w)
                     l.dilation, l.dilation, // dilation (h, w)
                     b);                     // output
@@ -1451,7 +1453,7 @@ void backward_convolutional_layer(convolutional_layer l, network_state state)
                    l.c / l.groups,         // input channels (h, w)
                    l.h, l.w,               // input size (h, w)
                    l.size, l.size,         // kernel size (h, w)
-                   l.pad, l.pad,           // padding (h, w)
+                   l.pad * l.dilation, l.pad * l.dilation,   // padding (h, w)
                    l.stride_y, l.stride_x, // stride (h, w)
                    l.dilation, l.dilation, // dilation (h, w)
                    state.delta + (i*l.groups + j)* (l.c / l.groups)*l.h*l.w); // output (delta)
diff --git a/src/layer.c b/src/layer.c
index ae72da6a..d00e2213 100644
--- a/src/layer.c
+++ b/src/layer.c
@@ -158,6 +158,8 @@ void free_layer_custom(layer l, int keep_cudnn_desc)
         if (l.binary_weights_gpu)       cuda_free(l.binary_weights_gpu);
         if (l.mean_gpu)                 cuda_free(l.mean_gpu), l.mean_gpu = NULL;
         if (l.variance_gpu)             cuda_free(l.variance_gpu), l.variance_gpu = NULL;
+        if (l.m_cbn_avg_gpu)            cuda_free(l.m_cbn_avg_gpu), l.m_cbn_avg_gpu = NULL;
+        if (l.v_cbn_avg_gpu)            cuda_free(l.v_cbn_avg_gpu), l.v_cbn_avg_gpu = NULL;
         if (l.rolling_mean_gpu)         cuda_free(l.rolling_mean_gpu), l.rolling_mean_gpu = NULL;
         if (l.rolling_variance_gpu)     cuda_free(l.rolling_variance_gpu), l.rolling_variance_gpu = NULL;
         if (l.variance_delta_gpu)       cuda_free(l.variance_delta_gpu), l.variance_delta_gpu = NULL;
diff --git a/src/network.c b/src/network.c
index ee8004e3..239b3d82 100644
--- a/src/network.c
+++ b/src/network.c
@@ -1089,14 +1089,14 @@ void fuse_conv_batchnorm(network net)
                 int f;
                 for (f = 0; f < l->n; ++f)
                 {
-                    l->biases[f] = l->biases[f] - (double)l->scales[f] * l->rolling_mean[f] / (sqrt((double)l->rolling_variance[f] + .000001));
+                    l->biases[f] = l->biases[f] - (double)l->scales[f] * l->rolling_mean[f] / (sqrt((double)l->rolling_variance[f] + .00001));
 
                     const size_t filter_size = l->size*l->size*l->c / l->groups;
                     int i;
                     for (i = 0; i < filter_size; ++i)
                     {
                         int w_index = f*filter_size + i;
 
-                        l->weights[w_index] = (double)l->weights[w_index] * l->scales[f] / (sqrt((double)l->rolling_variance[f] + .000001));
+                        l->weights[w_index] = (double)l->weights[w_index] * l->scales[f] / (sqrt((double)l->rolling_variance[f] + .00001));
                     }
                 }
diff --git a/src/parser.c b/src/parser.c
index 42443cfd..f6f0fa0e 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -198,6 +198,8 @@ convolutional_layer parse_convolutional(list *options, size_params params)
     batch=params.batch;
     if(!(h && w && c)) error("Layer before convolutional layer must output image.");
     int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
+    int cbn = option_find_int_quiet(options, "cbn", 0);
+    if (cbn) batch_normalize = 2;
    int binary = option_find_int_quiet(options, "binary", 0);
    int xnor = option_find_int_quiet(options, "xnor", 0);
    int use_bin_output = option_find_int_quiet(options, "bin_output", 0);
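
Notes on the CBN path (commentary, not part of the patch): setting cbn=1 in a [convolutional] section of the .cfg makes parse_convolutional() set batch_normalize = 2, which selects the Cross-Iteration Batch Normalization branch in forward_batchnorm_layer_gpu(). Instead of normalizing with the statistics of the current sub-batch alone, fast_v_cbn_kernel keeps running estimates of the first and second moments across the subdivisions of the current large batch, weighting sub-batch k by 1/k, and derives the variance as Var[x] = E[x^2] - E[x]^2. A minimal single-threaded C sketch of the per-filter update, assuming sum_x2 is the per-filter sum of squares already reduced from the feature map (cbn_update_filter is a hypothetical helper; the names mirror the kernel's arguments):

#include <math.h>

/* Sketch of the per-filter CBN statistics update done by fast_v_cbn_kernel
   once per sub-batch. m_avg and v_avg hold running estimates of E[x] and
   E[x^2]; minibatch_index counts sub-batches from 1 within the current
   large batch, so the first sub-batch overwrites the averages and later
   ones blend in with weight 1/k. n is spatial * batch. */
static void cbn_update_filter(float *mean, float *m_avg, float *v_avg,
                              float *variance, float sum_x2, int n,
                              int minibatch_index)
{
    float v_tmp = sum_x2 / (n - 1);              /* raw E[x^2] estimate   */
    v_tmp = fmaxf(v_tmp, (*mean) * (*mean));     /* keep E[x^2] >= E[x]^2 */

    const float a = 1.0f / minibatch_index;      /* 1, 1/2, 1/3, ...      */
    *m_avg = a * (*mean) + (1 - a) * (*m_avg);   /* running first moment  */
    *mean  = *m_avg;
    *v_avg = a * v_tmp + (1 - a) * (*v_avg);     /* running second moment */

    /* variance from the two moments, clamped at zero for stability */
    *variance = fmaxf(0.0f, *v_avg - (*m_avg) * (*m_avg));
}

The kernel also folds the rolling (inference-time) statistics update into the same pass, rolling = alpha * current + (1 - alpha) * rolling with alpha = 0.01, and in CUDNN builds stores 1/sqrt(var + epsilon) instead of the raw variance (inverse_variance = 1), presumably to match the saved inverse variance the cuDNN backward pass expects.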
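
The l.pad -> l.pad * l.dilation changes to the padding arguments in convolutional_kernels.cu and convolutional_layer.c follow from the effective extent of a dilated kernel: a kernel of size k with dilation d covers d*(k-1)+1 input pixels, so keeping the output the same size at stride 1 requires padding d*(k-1)/2, which equals pad * dilation when pad = k/2. A quick self-contained check of the standard output-size formula (conv_out_size is a hypothetical helper, not part of the patch):

#include <stdio.h>

/* Output size of a dilated convolution along one axis:
   out = (in + 2*pad - (dilation*(size-1) + 1)) / stride + 1 */
static int conv_out_size(int in, int size, int stride, int pad, int dilation)
{
    int extent = dilation * (size - 1) + 1;   /* effective kernel extent */
    return (in + 2 * pad - extent) / stride + 1;
}

int main(void)
{
    /* size=3, stride=1, dilation=2: plain pad=1 shrinks the map,
       pad * dilation = 2 preserves it. */
    printf("%d\n", conv_out_size(16, 3, 1, 1, 2));   /* prints 14 */
    printf("%d\n", conv_out_size(16, 3, 1, 2, 2));   /* prints 16 */
    return 0;
}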