diff --git a/include/darknet.h b/include/darknet.h
index 07839ad6..ffc5539a 100644
--- a/include/darknet.h
+++ b/include/darknet.h
@@ -223,6 +223,7 @@ struct layer {
     int flipped;
     int inputs;
     int outputs;
+    float mean_alpha;
     int nweights;
     int nbiases;
     int extra;
@@ -595,6 +596,7 @@ struct layer {
     float * input_antialiasing_gpu;
     float * output_gpu;
+    float * output_avg_gpu;
     float * activation_input_gpu;
     float * loss_gpu;
     float * delta_gpu;
diff --git a/src/blas.h b/src/blas.h
index 819130a6..60c5d7c2 100644
--- a/src/blas.h
+++ b/src/blas.h
@@ -100,6 +100,7 @@ void backward_shortcut_multilayer_gpu(int src_outputs, int batch, int n, int *ou
     float *weights, float *weight_updates, int nweights, float *in, float **layers_output, WEIGHTS_NORMALIZATION_T weights_normalizion);
 void input_shortcut_gpu(float *in, int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
 void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates);
+void mean_array_gpu(float *src, int size, float alpha, float *avg);
 void scale_bias_gpu(float *output, float *biases, int batch, int n, int size);
 void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
 void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size);
diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu
index 4eacc9e8..e36f0a7d 100644
--- a/src/blas_kernels.cu
+++ b/src/blas_kernels.cu
@@ -39,6 +39,24 @@ void compare_2_arrays_gpu(float *one, float *two, int size)
     CHECK_CUDA(cudaDeviceSynchronize());
 }
 
+__global__ void mean_array_kernel(float *src, int size, float alpha, float *avg)
+{
+    const int i = blockIdx.x*blockDim.x + threadIdx.x;
+    if (i >= size) return;
+
+    avg[i] = avg[i] * (1 - alpha) + src[i] * alpha;
+    src[i] = avg[i];
+}
+
+
+void mean_array_gpu(float *src, int size, float alpha, float *avg)
+{
+    const int num_blocks = get_number_of_blocks(size, BLOCK);
+
+    mean_array_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(src, size, alpha, avg);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
 __global__ void scale_bias_kernel(float *output, float *scale, int batch, int filters, int spatial, int current_size)
 {
diff --git a/src/demo.c b/src/demo.c
index 72a0ef4b..8abcd72c 100644
--- a/src/demo.c
+++ b/src/demo.c
@@ -101,9 +101,11 @@ void *detect_in_thread(void *ptr)
     float *X = det_s.data;
     float *prediction = network_predict(net, X);
 
-    memcpy(predictions[demo_index], prediction, l.outputs * sizeof(float));
-    mean_arrays(predictions, NFRAMES, l.outputs, avg);
-    l.output = avg;
+    int i;
+    for (i = 0; i < net.n; ++i) {
+        layer *lc = &net.layers[i];
+        if (lc->type == YOLO) lc->mean_alpha = 1.0 / NFRAMES;
+    }
 
     cv_images[demo_index] = det_img;
     det_img = cv_images[(demo_index + NFRAMES / 2 + 1) % NFRAMES];
diff --git a/src/layer.c b/src/layer.c
index 4bc84441..026d58e9 100644
--- a/src/layer.c
+++ b/src/layer.c
@@ -190,6 +190,7 @@ void free_layer_custom(layer l, int keep_cudnn_desc)
         if (l.optimized_memory < 2) {
             if (l.x_gpu) cuda_free(l.x_gpu); l.x_gpu = NULL;
             if (l.output_gpu) cuda_free(l.output_gpu), l.output_gpu = NULL;
+            if (l.output_avg_gpu) cuda_free(l.output_avg_gpu), l.output_avg_gpu = NULL;
             if (l.activation_input_gpu) cuda_free(l.activation_input_gpu), l.activation_input_gpu = NULL;
         }
         if (l.delta_gpu && l.keep_delta_gpu && l.optimized_memory < 3) cuda_free(l.delta_gpu), l.delta_gpu = NULL;
diff --git a/src/utils.c b/src/utils.c
index 7448af89..7242d999 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -578,10 +578,12 @@ void mean_arrays(float **a, int n, int els, float *avg)
     int j;
     memset(avg, 0, els*sizeof(float));
     for(j = 0; j < n; ++j){
+        #pragma omp parallel for
         for(i = 0; i < els; ++i){
             avg[i] += a[j][i];
         }
     }
+    #pragma omp parallel for
     for(i = 0; i < els; ++i){
         avg[i] /= n;
     }
diff --git a/src/yolo_layer.c b/src/yolo_layer.c
index 486b17aa..2ba39d1e 100644
--- a/src/yolo_layer.c
+++ b/src/yolo_layer.c
@@ -113,6 +113,7 @@ void resize_yolo_layer(layer *l, int w, int h)
     l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
     l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+    l->output_avg_gpu = cuda_make_array(l->output, l->batch*l->outputs);
 #endif
 }
@@ -871,6 +872,7 @@ void forward_yolo_layer_gpu(const layer l, network_state state)
     }
     if(!state.train || l.onlyforward){
         //cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
+        if (l.mean_alpha && l.output_avg_gpu) mean_array_gpu(l.output_gpu, l.batch*l.outputs, l.mean_alpha, l.output_avg_gpu);
         cuda_pull_array_async(l.output_gpu, l.output, l.batch*l.outputs);
         CHECK_CUDA(cudaPeekAtLastError());
         return;
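
Note on the change: mean_array_gpu() replaces the CPU-side averaging that demo.c previously did with mean_arrays() over NFRAMES stored predictions. The kernel keeps one running buffer (output_avg_gpu) per YOLO layer and applies an exponential moving average, avg = avg * (1 - alpha) + src * alpha, with alpha = 1.0/NFRAMES set in demo.c, then overwrites the layer output with the smoothed values before they are pulled back to the host. A minimal CPU sketch of the same update (hypothetical helper name, not part of this patch), which can be used to sanity-check the kernel against buffers copied back with cuda_pull_array():

    /* Hypothetical host-side reference for mean_array_gpu(): the same
     * element-wise exponential moving average the kernel applies.
     * avg[] holds the running average across calls; src[] is overwritten
     * with it, mirroring the kernel's "src[i] = avg[i]" so detections are
     * decoded from the smoothed output. */
    static void mean_array_cpu_ref(float *src, int size, float alpha, float *avg)
    {
        int i;
        for (i = 0; i < size; ++i) {
            avg[i] = avg[i] * (1.0f - alpha) + src[i] * alpha;
            src[i] = avg[i];
        }
    }

With alpha = 1/NFRAMES this behaves like a smoothed window over roughly the last NFRAMES frames, which is why the predictions[]/avg host buffers and the memcpy in detect_in_thread() can be dropped.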