@@ -400,6 +400,10 @@ extern "C" void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta
    else if (a == HARDTAN) gradient_array_hardtan_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n, delta);
    else if (a == RELU) gradient_array_relu_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n, delta);
    else if (a == NORM_CHAN) gradient_array_relu_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n, delta);
    else if (a == NORM_CHAN_SOFTMAX) {
        printf(" Error: a custom NORM_CHAN_SOFTMAX function must be used for the gradient \n");
        exit(0);
    }
    else if (a == SELU) gradient_array_selu_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(x, n, delta);
    else
        gradient_array_kernel << <cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >> > (x, n, a, delta);
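
// Note: NORM_CHAN reuses the ReLU gradient kernel, while NORM_CHAN_SOFTMAX
// deliberately aborts here; its gradient must instead go through the dedicated
// gradient_array_normalize_channels_softmax_ongpu() added further down in this patch.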

@@ -456,4 +460,79 @@ extern "C" void activate_array_normalize_channels_ongpu(float *x, int n, int bat
    activate_array_normalize_channels_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (x, size, batch, channels, wh_step, output_gpu);
    CHECK_CUDA(cudaPeekAtLastError());
}
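
// The next kernel implements the NORM_CHAN_SOFTMAX forward pass: an independent
// softmax over the channel dimension at every spatial position, i.e. for each
// (b, wh_i) pair:
//
//     output[k] = expf(x[k]) / (eps + sum_j expf(x[j]))
//
// eps keeps the denominator non-zero. Note that the inputs are not shifted by the
// per-position channel max first, so very large activations can overflow expf.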

__global__ void activate_array_normalize_channels_softmax_kernel(float *x, int size, int batch, int channels, int wh_step, float *output_gpu)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    int wh_i = i % wh_step;
    int b = i / wh_step;

    const float eps = 0.0001;
    if (i < size) {
        float sum = eps;
        int k;
        for (k = 0; k < channels; ++k) {
            float val = x[wh_i + k * wh_step + b*wh_step*channels];
            sum += expf(val);
        }
        for (k = 0; k < channels; ++k) {
            float val = x[wh_i + k * wh_step + b*wh_step*channels];
            val = expf(val) / sum;
            output_gpu[wh_i + k * wh_step + b*wh_step*channels] = val;
        }
    }
}
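
// Host-side wrapper: launches one thread per spatial position (size = n / channels
// = w*h*batch); each thread loops over its channels twice (sum, then normalize),
// so no inter-thread reduction is required.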

extern "C" void activate_array_normalize_channels_softmax_ongpu(float *x, int n, int batch, int channels, int wh_step, float *output_gpu)
{
    // n = w*h*c*batch
    // size = w*h*batch
    int size = n / channels;

    const int num_blocks = get_number_of_blocks(size, BLOCK);

    activate_array_normalize_channels_softmax_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (x, size, batch, channels, wh_step, output_gpu);
    CHECK_CUDA(cudaPeekAtLastError());
}
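
// Backward pass for NORM_CHAN_SOFTMAX. Here x holds the forward *outputs* (the
// wrapper below passes output_gpu). Each thread accumulates
//
//     grad = eps + sum_k out[k] * delta[k]
//
// over the channels of its position, then rescales every delta[k] in place by
// that factor. Note this is a uniform per-position rescaling rather than the
// full softmax Jacobian-vector product.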

__global__ void gradient_array_normalize_channels_softmax_kernel(float *x, int size, int batch, int channels, int wh_step, float *delta_gpu)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    int wh_i = i % wh_step;
    int b = i / wh_step;

    const float eps = 0.0001;
    if (i < size) {
        float grad = eps;
        int k;
        for (k = 0; k < channels; ++k) {
            float out = x[wh_i + k * wh_step + b*wh_step*channels];
            float delta = delta_gpu[wh_i + k * wh_step + b*wh_step*channels];
            grad += out*delta;
        }
        for (k = 0; k < channels; ++k) {
            float delta = delta_gpu[wh_i + k * wh_step + b*wh_step*channels];
            delta = delta * grad;
            delta_gpu[wh_i + k * wh_step + b*wh_step*channels] = delta;
        }
    }
}

extern "C" void gradient_array_normalize_channels_softmax_ongpu(float *output_gpu, int n, int batch, int channels, int wh_step, float *delta_gpu)
{
    // n = w*h*c*batch
    // size = w*h*batch
    int size = n / channels;

    const int num_blocks = get_number_of_blocks(size, BLOCK);

    gradient_array_normalize_channels_softmax_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (output_gpu, size, batch, channels, wh_step, delta_gpu);
    CHECK_CUDA(cudaPeekAtLastError());
}
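
// Illustrative pairing (a sketch, not part of this patch; the layer fields are
// hypothetical but follow the usual darknet naming):
//
//     // forward: softmax-normalize the raw activations across channels, in place
//     activate_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu);
//     ...
//     // backward: use the saved outputs to rescale the incoming deltas in place
//     gradient_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);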