@@ -7,6 +7,7 @@ extern "C" {
#include "blas.h"
#include "cuda.h"
#include "utils.h"
#include "tree.h"
}
__global__ void scale_bias_kernel(float *output, float *biases, int n, int size)
@@ -419,7 +420,13 @@ __global__ void fill_kernel(int N, float ALPHA, float *X, int INCX)
if(i < N) X[i*INCX] = ALPHA;
}
__global__ void mask_kernel_new_api(int n, float *x, float mask_num, float *mask, float val)
{
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if (i < n && mask[i] == mask_num) x[i] = val;
}
__global__ void mask_kernel(int n, float *x, float mask_num, float *mask)
{
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if(i < n && mask[i] == mask_num) x[i] = mask_num;
@@ -592,6 +599,12 @@ extern "C" void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride
check_error(cudaPeekAtLastError());
}
extern "C" void mask_gpu_new_api(int N, float * X, float mask_num, float * mask, float val)
{
mask_kernel_new_api<<<cuda_gridsize(N), BLOCK>>>(N, X, mask_num, mask, val);
check_error(cudaPeekAtLastError());
}
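/* Usage sketch (illustrative; buffer names are hypothetical): zero out every
   element of a device buffer d_x whose matching entry in d_mask equals a
   sentinel value, rather than writing the sentinel itself as mask_ongpu
   below does:

       mask_gpu_new_api(n, d_x, -1234.f, d_mask, 0.f);
*/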
extern "C" void mask_ongpu(int N, float * X, float mask_num, float * mask)
{
mask_kernel<<<cuda_gridsize(N), BLOCK, 0, get_cuda_stream() >>>(N, X, mask_num, mask);
@@ -687,6 +700,23 @@ extern "C" void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, fl
check_error(cudaPeekAtLastError());
}
__global__ void softmax_x_ent_kernel(int n, float *pred, float *truth, float *delta, float *error)
{
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if (i < n) {
float t = truth[i];
float p = pred[i];
error[i] = (t) ? -logf(p) : 0; // loss contribution only where truth is nonzero
delta[i] = t - p;              // softmax + cross-entropy delta
}
}
extern "C" void softmax_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error)
{
softmax_x_ent_kernel<<<cuda_gridsize(n), BLOCK>>>(n, pred, truth, delta, error);
check_error(cudaPeekAtLastError());
}
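/* Usage sketch (illustrative; buffer names are hypothetical): with d_pred
   holding softmax probabilities and d_truth a one-hot target, both n floats
   on the device, this fills d_error with -log(p) at the hot index (0
   elsewhere) and d_delta with t - p, the negative gradient of the
   cross-entropy loss with respect to the logits:

       softmax_x_ent_gpu(n, d_pred, d_truth, d_delta, d_error);
*/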
__global__ void l2_kernel(int n, float *pred, float *truth, float *delta, float *error)
{
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
@@ -784,6 +814,40 @@ extern "C" void softmax_gpu(float *input, int n, int offset, int groups, float t
check_error(cudaPeekAtLastError());
}
__device__ void softmax_device_new_api(float *input, int n, float temp, int stride, float *output)
{
int i;
float sum = 0;
float largest = -INFINITY;
// pass 1: find the maximum input, for numerical stability
for (i = 0; i < n; ++i) {
float val = input[i*stride];
largest = (val > largest) ? val : largest;
}
// pass 2: exponentiate the shifted, temperature-scaled values and accumulate the normalizer
for (i = 0; i < n; ++i) {
float e = expf(input[i*stride] / temp - largest / temp);
sum += e;
output[i*stride] = e;
}
// pass 3: normalize so the outputs sum to 1
for (i = 0; i < n; ++i) {
output[i*stride] /= sum;
}
}
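/* The temperature divides the logits before normalization, i.e.
   softmax_T(x)_i = exp(x_i / T) / sum_j exp(x_j / T): T > 1 flattens the
   distribution, T < 1 sharpens it, and T = 1 gives the standard softmax.
   Subtracting the per-group maximum first leaves the result unchanged but
   keeps every exponent <= 0, so expf cannot overflow. */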
__global__ void softmax_kernel_new_api(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
{
int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if (id >= batch*groups) return;
int b = id / groups; // batch index
int g = id % groups; // group index within that batch entry
softmax_device_new_api(input + b*batch_offset + g*group_offset, n, temp, stride, output + b*batch_offset + g*group_offset);
}
extern "C" void softmax_gpu_new_api(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
{
softmax_kernel_new_api<<<cuda_gridsize(batch*groups), BLOCK>>>(input, n, batch, batch_offset, groups, group_offset, stride, temp, output);
check_error(cudaPeekAtLastError());
}
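/* Usage sketch (illustrative; buffer names are hypothetical): a per-image
   softmax over n contiguous class scores for each of `batch` images uses
   batch_offset = n, a single group, and unit stride:

       softmax_gpu_new_api(d_scores, n, batch, n, 1, 0, 1, 1.0f, d_probs);

   For a channel-wise softmax over a w*h*n feature map, groups = w*h,
   group_offset = 1, and stride = w*h make each thread walk the n channel
   values of one spatial cell. */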
__global__ void upsample_kernel(size_t N, float *x, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
{
@@ -814,4 +878,36 @@ extern "C" void upsample_gpu(float *in, int w, int h, int c, int batch, int stri
size_t size = w*h*c*batch*stride*stride;
upsample_kernel<<<cuda_gridsize(size), BLOCK>>>(size, in, w, h, c, batch, stride, forward, scale, out);
check_error(cudaPeekAtLastError());
}
__global__ void softmax_tree_kernel(float *input, int spatial, int batch, int stride, float temp, float *output, int groups, int *group_size, int *group_offset)
{
int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if (id >= spatial*batch*groups) return;
int s = id % spatial; // spatial position within the feature map
id = id / spatial;
int g = id % groups; // sibling group of the tree
int b = id / groups; // image in the batch
int goff = group_offset[g] * spatial; // offset to this group's first channel
int boff = b*stride; // offset to this image
softmax_device_new_api(input + goff + boff + s, group_size[g], temp, spatial, output + goff + boff + s);
}
extern "C" void softmax_tree_gpu(float *input, int spatial, int batch, int stride, float temp, float *output, tree hier)
{
// copy the tree's per-group sizes and offsets to the device for this call
int *tree_groups_size = cuda_make_int_array_new_api(hier.group_size, hier.groups);
int *tree_groups_offset = cuda_make_int_array_new_api(hier.group_offset, hier.groups);
/*
static int *tree_groups_size = 0;
static int *tree_groups_offset = 0;
if(!tree_groups_size){
tree_groups_size = cuda_make_int_array(hier.group_size, hier.groups);
tree_groups_offset = cuda_make_int_array(hier.group_offset, hier.groups);
}
*/
int num = spatial*batch*hier.groups;
softmax_tree_kernel<<<cuda_gridsize(num), BLOCK>>>(input, spatial, batch, stride, temp, output, hier.groups, tree_groups_size, tree_groups_offset);
check_error(cudaPeekAtLastError());
cuda_free((float *)tree_groups_size);
cuda_free((float *)tree_groups_offset);
}
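/* Usage note: `hier` typically comes from read_tree() and partitions the
   class axis into hier.groups sets of siblings; each thread applies an
   independent softmax over one group at one spatial position. The two index
   arrays are copied to the device and freed on every call here, whereas the
   commented-out static variant above cached them across calls, at the cost
   of never reflecting a changed tree. */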