diff --git a/build/darknet/darknet.vcxproj b/build/darknet/darknet.vcxproj
index b5fff824..8c154ed5 100644
--- a/build/darknet/darknet.vcxproj
+++ b/build/darknet/darknet.vcxproj
@@ -132,7 +132,7 @@
true
true
C:\opencv_2.4.9\opencv\build\include;..\..\3rdparty\include;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir);$(cudnn)\include
- OPENCV;_TIMESPEC_DEFINED;_CRT_SECURE_NO_WARNINGS;GPU;WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)
+ CUDNN;OPENCV;_TIMESPEC_DEFINED;_CRT_SECURE_NO_WARNINGS;GPU;WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)
c11
c++1y
CompileAsCpp
diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu
index d9401766..79fc1c1d 100644
--- a/src/blas_kernels.cu
+++ b/src/blas_kernels.cu
@@ -223,6 +223,7 @@ __global__ void fast_mean_delta_kernel(float *delta, float *variance, int batch,
local[id] += (i+id < spatial) ? delta[index] : 0;
}
}
+ __syncthreads();
if(id == 0){
mean_delta[filter] = 0;
@@ -251,6 +252,7 @@ __global__ void fast_variance_delta_kernel(float *x, float *delta, float *mean,
local[id] += (i+id < spatial) ? delta[index]*(x[index] - mean[filter]) : 0;
}
}
+ __syncthreads();
if(id == 0){
variance_delta[filter] = 0;
@@ -446,6 +448,7 @@ __global__ void fast_mean_kernel(float *x, int batch, int filters, int spatial,
local[id] += (i+id < spatial) ? x[index] : 0;
}
}
+ __syncthreads();
if(id == 0){
mean[filter] = 0;
@@ -474,6 +477,7 @@ __global__ void fast_variance_kernel(float *x, float *mean, int batch, int filt
local[id] += (i+id < spatial) ? pow((x[index] - mean[filter]), 2) : 0;
}
}
+ __syncthreads();
if(id == 0){
variance[filter] = 0;
diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu
index 005269b0..03c9ab79 100644
--- a/src/convolutional_kernels.cu
+++ b/src/convolutional_kernels.cu
@@ -127,6 +127,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
//if(l.dot > 0) dot_error_gpu(l);
if(l.binary || l.xnor) swap_binary(&l);
+ //cudaDeviceSynchronize(); // uncomment to get accurate timings when profiling performance
}
void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)