Minor fixes

pull/2352/head
AlexeyAB 6 years ago
parent 5446d19576
commit d767e8ca38
Changed files:

  1. include/darknet.h (2 changed lines)
  2. src/activations.h (4 changed lines)
  3. src/connected_layer.c (15 changed lines)
  4. src/convolutional_kernels.cu (4 changed lines)
  5. src/im2col_kernels.cu (12 changed lines)
  6. src/yolo_layer.c (1 changed line)

include/darknet.h
@@ -45,7 +45,7 @@ struct network;
 typedef struct network network;

 struct network_state;
-typedef struct network_state;
+typedef struct network_state network_state;

 struct layer;
 typedef struct layer layer;
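A note on the darknet.h fix above: a typedef needs a declarator, so the old `typedef struct network_state;` only re-declared the struct tag and never introduced the `network_state` alias that the rest of the headers use. A minimal illustration of the difference (the struct body and `main` below are invented for this demo, not darknet code):

    struct network_state;                         /* forward declaration of the tag          */
    typedef struct network_state network_state;   /* the alias the old line failed to create */

    struct network_state { int index; };          /* hypothetical body, for this demo only   */

    int main(void) {
        network_state s = { 42 };                 /* usable without the `struct` keyword     */
        return s.index == 42 ? 0 : 1;
    }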

src/activations.h
@@ -76,11 +76,11 @@ static inline float loggy_gradient(float x)
 static inline float stair_gradient(float x)
 {
     if (floor(x) == x) return 0;
-    return 1;
+    return 1.0f;
 }
 static inline float relu_gradient(float x){return (x>0);}
 static inline float elu_gradient(float x){return (x >= 0) + (x < 0)*(x + 1);}
-static inline float selu_gradient(float x) { return (x >= 0)*1.0507 + (x < 0)*(x + 1.0507f*1.6732f); }
+static inline float selu_gradient(float x) { return (x >= 0)*1.0507f + (x < 0)*(x + 1.0507f*1.6732f); }
 static inline float relie_gradient(float x){return (x>0) ? 1 : .01f;}
 static inline float ramp_gradient(float x){return (x>0)+.1f;}
 static inline float leaky_gradient(float x){return (x>0) ? 1 : .1f;}
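The activations.h edits, like the `.95f`, `.05f`, `2.f` and `.000001f` changes in connected_layer.c below, add the `f` suffix so the constants are single-precision `float` rather than `double`. A small sketch of what the suffix changes, relying only on standard C literal rules (the function names are illustrative):

    #include <stdio.h>

    /* Without the suffix, 1.0507 is a double constant: x is promoted, the multiply runs in
       double precision, and the result is narrowed back to float on return. With 1.0507f
       the whole expression stays in float, which also avoids double-to-float conversion
       warnings (e.g. MSVC C4244, gcc -Wdouble-promotion). */
    static inline float scaled_double_math(float x) { return x * 1.0507; }
    static inline float scaled_float_math(float x)  { return x * 1.0507f; }

    int main(void) {
        float x = 3.14159f;
        printf("%.9f vs %.9f\n", scaled_double_math(x), scaled_float_math(x));  /* may differ in the last bits */
        return 0;
    }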

src/connected_layer.c
@@ -84,7 +84,7 @@ connected_layer make_connected_layer(int batch, int steps, int inputs, int outpu
     l.update = update_connected_layer;

     //float scale = 1./sqrt(inputs);
-    float scale = sqrt(2./inputs);
+    float scale = sqrt(2.f/inputs);
     for(i = 0; i < outputs*inputs; ++i){
         l.weights[i] = scale*rand_uniform(-1, 1);
     }
@@ -182,10 +182,10 @@ void forward_connected_layer(connected_layer l, network_state state)
         mean_cpu(l.output, l.batch, l.outputs, 1, l.mean);
         variance_cpu(l.output, l.mean, l.batch, l.outputs, 1, l.variance);

-        scal_cpu(l.outputs, .95, l.rolling_mean, 1);
-        axpy_cpu(l.outputs, .05, l.mean, 1, l.rolling_mean, 1);
-        scal_cpu(l.outputs, .95, l.rolling_variance, 1);
-        axpy_cpu(l.outputs, .05, l.variance, 1, l.rolling_variance, 1);
+        scal_cpu(l.outputs, .95f, l.rolling_mean, 1);
+        axpy_cpu(l.outputs, .05f, l.mean, 1, l.rolling_mean, 1);
+        scal_cpu(l.outputs, .95f, l.rolling_variance, 1);
+        axpy_cpu(l.outputs, .05f, l.variance, 1, l.rolling_variance, 1);

         copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
         normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1);
@@ -242,7 +242,7 @@ void denormalize_connected_layer(layer l)
 {
     int i, j;
     for(i = 0; i < l.outputs; ++i){
-        float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .000001);
+        float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .000001f);
         for(j = 0; j < l.inputs; ++j){
             l.weights[i*l.inputs + j] *= scale;
         }
@@ -285,6 +285,7 @@ void pull_connected_layer(connected_layer l)
         cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.outputs);
         cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.outputs);
     }
+    CHECK_CUDA(cudaPeekAtLastError());
 }

 void push_connected_layer(connected_layer l)
@@ -298,6 +299,7 @@ void push_connected_layer(connected_layer l)
         cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.outputs);
         cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.outputs);
     }
+    CHECK_CUDA(cudaPeekAtLastError());
 }

 void update_connected_layer_gpu(connected_layer l, int batch, float learning_rate, float momentum, float decay)
@@ -317,7 +319,6 @@ void update_connected_layer_gpu(connected_layer l, int batch, float learning_rat

 void forward_connected_layer_gpu(connected_layer l, network_state state)
 {
-    int i;
     fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);

     int m = l.batch;
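The `CHECK_CUDA(cudaPeekAtLastError())` lines added to pull_connected_layer and push_connected_layer (and, further down, to the convolutional and YOLO layers) follow the usual CUDA error-checking idiom: peek at the sticky error state after issuing a batch of copies so a failure is reported near its source instead of surfacing later. A stand-in sketch, assuming only that darknet's `CHECK_CUDA` wraps a `cudaError_t` check; the macro below is illustrative, not the project's actual definition:

    #include <cuda_runtime.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define CHECK_CUDA_SKETCH(call)                                          \
        do {                                                                 \
            cudaError_t status_ = (call);                                    \
            if (status_ != cudaSuccess) {                                    \
                fprintf(stderr, "CUDA error: %s at %s:%d\n",                 \
                        cudaGetErrorString(status_), __FILE__, __LINE__);    \
                exit(EXIT_FAILURE);                                          \
            }                                                                \
        } while (0)

    int main(void) {
        float *buf = NULL;
        CHECK_CUDA_SKETCH(cudaMalloc(&buf, 1024 * sizeof(float)));
        /* cudaPeekAtLastError() returns the last error without clearing it, so it can
           flag a bad async copy or kernel launch issued just above this point. */
        CHECK_CUDA_SKETCH(cudaPeekAtLastError());
        CHECK_CUDA_SKETCH(cudaFree(buf));
        return 0;
    }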

src/convolutional_kernels.cu
@@ -84,7 +84,7 @@ __global__ void set_zero_kernel(float *src, int size)
 __inline__ __device__
 float warpAllReduceSum(float val) {
     for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2)
-#if CUDA_VERSION >= 9000
+#if CUDART_VERSION >= 9000
         val += __shfl_xor_sync(0xffffffff, val, mask);
 #else
         val += __shfl_xor(val, mask);
@@ -807,6 +807,7 @@ void pull_convolutional_layer(convolutional_layer layer)
         cuda_pull_array_async(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
         cuda_pull_array_async(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
     }
+    CHECK_CUDA(cudaPeekAtLastError());
     cudaStreamSynchronize(get_cuda_stream());
 }
@@ -828,6 +829,7 @@ void push_convolutional_layer(convolutional_layer layer)
         cuda_push_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
         cuda_push_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
     }
+    CHECK_CUDA(cudaPeekAtLastError());
 }

 void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay)
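The `CUDA_VERSION` to `CUDART_VERSION` switch in this file and in im2col_kernels.cu is presumably about where the macros live: `CUDART_VERSION` comes from the runtime header (`cuda_runtime_api.h`) that nvcc-compiled .cu files see, while `CUDA_VERSION` is defined in the driver header `cuda.h`. If that header is not included, the preprocessor treats the undefined macro as 0, and `#if CUDA_VERSION >= 9000` silently selects the pre-CUDA-9 shuffle path even on newer toolkits. A self-contained sketch of the guarded warp reduction (names are illustrative, not darknet's):

    #include <cuda_runtime.h>
    #include <stdio.h>

    #define WARP_SIZE 32

    __inline__ __device__ float warp_all_reduce_sum_sketch(float val) {
        for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2)
    #if CUDART_VERSION >= 9000
            val += __shfl_xor_sync(0xffffffff, val, mask);  // CUDA 9+: explicit mask of participating lanes
    #else
            val += __shfl_xor(val, mask);                   // legacy intrinsic for older toolkits
    #endif
        return val;
    }

    __global__ void warp_sum_kernel(float *out) {
        float sum = warp_all_reduce_sum_sketch((float)threadIdx.x);  // every lane ends up with 0+1+...+31
        if (threadIdx.x == 0) *out = sum;
    }

    int main(void) {
        float *d_out = NULL, h_out = 0.f;
        cudaMalloc(&d_out, sizeof(float));
        warp_sum_kernel<<<1, WARP_SIZE>>>(d_out);
        cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
        printf("warp sum = %.0f (expected 496)\n", h_out);
        cudaFree(d_out);
        return 0;
    }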

src/im2col_kernels.cu
@@ -17,7 +17,7 @@ extern "C" {
 template<typename T1, typename T2>
 __device__ inline T1 __shfl_custom(T1 val, T2 lane) {
-#if CUDA_VERSION >= 9000
+#if CUDART_VERSION >= 9000
     return __shfl_sync(FULL_MASK, val, lane);
 #else
     return __shfl(val, lane);
@@ -26,7 +26,7 @@ __device__ inline T1 __shfl_custom(T1 val, T2 lane) {
 template<typename T>
 __device__ inline uint32_t __ballot_custom(T val) {
-#if CUDA_VERSION >= 9000
+#if CUDART_VERSION >= 9000
     return __ballot_sync(FULL_MASK, val);
 #else
     return __ballot(val);
@@ -1223,7 +1223,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
 __inline__ __device__
 int warpAllReduceSum(int val) {
     for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2)
-#if CUDA_VERSION >= 9000
+#if CUDART_VERSION >= 9000
         val += __shfl_xor_sync(FULL_MASK, val, mask);
 #else
         val += __shfl_xor(val, mask);
@@ -1233,7 +1233,7 @@ int warpAllReduceSum(int val) {
 }

 // Tensor Cores binary (CC >= 7.3 && CUDA >= 10.0) - __CUDA_SUBBYTE_IMMA__
-#if CUDA_VERSION >= 10000
+#if CUDART_VERSION >= 10000
 #include <mma.h>
 #define WMMA_M 8
@@ -1779,7 +1779,7 @@ void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
     //if (M % 8 == 0 && N % 8 == 0 && M == 128)
     //if (M >= 32) // l.n >= 32
-#if CUDA_VERSION >= 10000
+#if CUDART_VERSION >= 10000
     if (1)
     {
         const int M_aligned = M + (8 - (M % 8));
@@ -1800,7 +1800,7 @@
         //getchar();
     }
     else
-#endif //# CUDA_VERSION >= 10000
+#endif //# CUDART_VERSION >= 10000
     {
         gemm_nn_custom_bin_mean_transposed_gpu_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (
             M, N, K,
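The same guard appears around the `__ballot_custom` wrapper above: on CUDA 9+ the `_sync` intrinsics take an explicit mask of participating lanes (darknet's `FULL_MASK`, assumed here to be `0xffffffff`). A small sketch of what the guarded ballot returns, with an invented kernel just for the demo:

    #include <cuda_runtime.h>
    #include <stdio.h>

    #define FULL_MASK 0xffffffff

    __device__ inline unsigned int ballot_custom_sketch(int pred) {
    #if CUDART_VERSION >= 9000
        return __ballot_sync(FULL_MASK, pred);  // CUDA 9+: synchronizing variant with a lane mask
    #else
        return __ballot(pred);                  // legacy variant
    #endif
    }

    __global__ void ballot_kernel(unsigned int *out) {
        // Even-numbered lanes vote 1, odd lanes vote 0, so the mask is 0x55555555.
        unsigned int mask = ballot_custom_sketch((threadIdx.x % 2) == 0);
        if (threadIdx.x == 0) *out = mask;
    }

    int main(void) {
        unsigned int *d_out = NULL, h_out = 0;
        cudaMalloc(&d_out, sizeof(unsigned int));
        ballot_kernel<<<1, 32>>>(d_out);
        cudaMemcpy(&h_out, d_out, sizeof(unsigned int), cudaMemcpyDeviceToHost);
        printf("ballot mask = 0x%08x (expected 0x55555555)\n", h_out);
        cudaFree(d_out);
        return 0;
    }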

src/yolo_layer.c
@@ -454,6 +454,7 @@ void forward_yolo_layer_gpu(const layer l, network_state state)
     if(!state.train || l.onlyforward){
         //cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
         cuda_pull_array_async(l.output_gpu, l.output, l.batch*l.outputs);
+        CHECK_CUDA(cudaPeekAtLastError());
         return;
     }
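The yolo_layer.c addition pairs the asynchronous pull with an immediate error peek. A sketch of that idiom, assuming `cuda_pull_array_async` amounts to a `cudaMemcpyAsync` on darknet's stream: the peek catches a bad enqueue or an earlier failed launch right away, while the copied data is only valid once the stream is synchronized later (as pull_convolutional_layer does with `cudaStreamSynchronize`). All names below are illustrative:

    #include <cuda_runtime.h>
    #include <stdio.h>

    int main(void) {
        const int n = 1024;
        float *d_buf = NULL, *h_buf = NULL;
        cudaStream_t stream;

        cudaStreamCreate(&stream);
        cudaMalloc(&d_buf, n * sizeof(float));
        cudaMallocHost(&h_buf, n * sizeof(float));   // pinned host memory so the copy can be truly async
        cudaMemsetAsync(d_buf, 0, n * sizeof(float), stream);

        // Analogue of cuda_pull_array_async(): device-to-host copy queued on the stream.
        cudaMemcpyAsync(h_buf, d_buf, n * sizeof(float), cudaMemcpyDeviceToHost, stream);
        printf("peek: %s\n", cudaGetErrorString(cudaPeekAtLastError()));  // reports enqueue/launch errors

        cudaStreamSynchronize(stream);               // only now is h_buf guaranteed to be filled
        printf("h_buf[0] = %f\n", h_buf[0]);

        cudaFreeHost(h_buf);
        cudaFree(d_buf);
        cudaStreamDestroy(stream);
        return 0;
    }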
