Demo speed fixed

pull/5550/head
AlexeyAB 5 years ago
parent 1f60ca57ad
commit ccafff912b
  1. 2
      include/darknet.h
  2. 1
      src/blas.h
  3. 18
      src/blas_kernels.cu
  4. 8
      src/demo.c
  5. 1
      src/layer.c
  6. 2
      src/utils.c
  7. 2
      src/yolo_layer.c

@@ -223,6 +223,7 @@ struct layer {
int flipped;
int inputs;
int outputs;
float mean_alpha;
int nweights;
int nbiases;
int extra;
@@ -595,6 +596,7 @@ struct layer {
float * input_antialiasing_gpu;
float * output_gpu;
float * output_avg_gpu;
float * activation_input_gpu;
float * loss_gpu;
float * delta_gpu;

@@ -100,6 +100,7 @@ void backward_shortcut_multilayer_gpu(int src_outputs, int batch, int n, int *ou
float *weights, float *weight_updates, int nweights, float *in, float **layers_output, WEIGHTS_NORMALIZATION_T weights_normalizion);
void input_shortcut_gpu(float *in, int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates);
void mean_array_gpu(float *src, int size, float alpha, float *avg);
void scale_bias_gpu(float *output, float *biases, int batch, int n, int size);
void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size);

@@ -39,6 +39,24 @@ void compare_2_arrays_gpu(float *one, float *two, int size)
CHECK_CUDA(cudaDeviceSynchronize());
}
// Elementwise exponential moving average, updated in place:
//   avg[idx] = avg[idx] * (1 - alpha) + src[idx] * alpha
// and src[idx] is then overwritten with the freshly updated average.
// Expects a 1D launch whose grid covers at least `size` threads; threads
// past the end exit via the bounds guard.
__global__ void mean_array_kernel(float *src, int size, float alpha, float *avg)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < size) {
        // Same arithmetic as avg*(1-alpha) + src*alpha; addition is commutative.
        const float updated = src[idx] * alpha + avg[idx] * (1 - alpha);
        avg[idx] = updated;
        src[idx] = updated;
    }
}
// Host launcher for mean_array_kernel: folds `src` into the running average
// `avg` with weight `alpha` (both device pointers, `size` floats each) and
// mirrors the result back into `src`. The launch is asynchronous on the
// project's CUDA stream; only the launch status is checked here
// (cudaPeekAtLastError reads the error without clearing it).
void mean_array_gpu(float *src, int size, float alpha, float *avg)
{
    const int grid = get_number_of_blocks(size, BLOCK);
    mean_array_kernel<<<grid, BLOCK, 0, get_cuda_stream()>>>(src, size, alpha, avg);
    CHECK_CUDA(cudaPeekAtLastError());
}
__global__ void scale_bias_kernel(float *output, float *scale, int batch, int filters, int spatial, int current_size)
{

@@ -101,9 +101,11 @@ void *detect_in_thread(void *ptr)
float *X = det_s.data;
float *prediction = network_predict(net, X);
memcpy(predictions[demo_index], prediction, l.outputs * sizeof(float));
mean_arrays(predictions, NFRAMES, l.outputs, avg);
l.output = avg;
int i;
for (i = 0; i < net.n; ++i) {
layer l = net.layers[i];
if (l.type == YOLO) l.mean_alpha = 1.0 / NFRAMES;
}
cv_images[demo_index] = det_img;
det_img = cv_images[(demo_index + NFRAMES / 2 + 1) % NFRAMES];

@@ -190,6 +190,7 @@ void free_layer_custom(layer l, int keep_cudnn_desc)
if (l.optimized_memory < 2) {
if (l.x_gpu) cuda_free(l.x_gpu); l.x_gpu = NULL;
if (l.output_gpu) cuda_free(l.output_gpu), l.output_gpu = NULL;
if (l.output_avg_gpu) cuda_free(l.output_avg_gpu), l.output_avg_gpu = NULL;
if (l.activation_input_gpu) cuda_free(l.activation_input_gpu), l.activation_input_gpu = NULL;
}
if (l.delta_gpu && l.keep_delta_gpu && l.optimized_memory < 3) cuda_free(l.delta_gpu), l.delta_gpu = NULL;

@@ -578,10 +578,12 @@ void mean_arrays(float **a, int n, int els, float *avg)
int j;
memset(avg, 0, els*sizeof(float));
for(j = 0; j < n; ++j){
#pragma omp parallel for
for(i = 0; i < els; ++i){
avg[i] += a[j][i];
}
}
#pragma omp parallel for
for(i = 0; i < els; ++i){
avg[i] /= n;
}

@@ -113,6 +113,7 @@ void resize_yolo_layer(layer *l, int w, int h)
l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
l->output_avg_gpu = cuda_make_array(l->output, l->batch*l->outputs);
#endif
}
@@ -871,6 +872,7 @@ void forward_yolo_layer_gpu(const layer l, network_state state)
}
if(!state.train || l.onlyforward){
//cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
if (l.mean_alpha && l.output_avg_gpu) mean_array_gpu(l.output_gpu, l.batch*l.outputs, l.mean_alpha, l.output_avg_gpu);
cuda_pull_array_async(l.output_gpu, l.output, l.batch*l.outputs);
CHECK_CUDA(cudaPeekAtLastError());
return;

Loading…
Cancel
Save