mirror of https://github.com/AlexeyAB/darknet.git
parent 76ee68f96d
commit 787d534560
21 changed files with 641 additions and 91 deletions
src/mini_blas.c
@@ -1,14 +1,124 @@
 #include "mini_blas.h"
 
-void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
+inline void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
 {
     int i;
     for(i = 0; i < N; ++i) Y[i*INCY] += ALPHA*X[i*INCX];
 }
 
-void scal_cpu(int N, float ALPHA, float *X, int INCX)
+inline void scal_cpu(int N, float ALPHA, float *X, int INCX)
 {
     int i;
     for(i = 0; i < N; ++i) X[i*INCX] *= ALPHA;
 }
 
+inline void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
+{
+    int i;
+    for(i = 0; i < N; ++i) Y[i*INCY] = X[i*INCX];
+}
+
+inline float dot_cpu(int N, float *X, int INCX, float *Y, int INCY)
+{
+    int i;
+    float dot = 0;
+    for(i = 0; i < N; ++i) dot += X[i*INCX] * Y[i*INCY];
+    return dot;
+}
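These are the standard BLAS level-1 operations (axpy, scal, copy, dot) with explicit strides, so the same routine can walk contiguous buffers or every k-th element. A minimal sketch of how they compose, assuming mini_blas.h declares the four functions above:

#include <stdio.h>
#include "mini_blas.h"

int main()
{
    float x[6] = {1, 2, 3, 4, 5, 6};
    float y[3] = {0, 0, 0};

    copy_cpu(3, x, 2, y, 1);        /* y = {1, 3, 5}: every other element of x */
    scal_cpu(3, 2.0f, y, 1);        /* y = {2, 6, 10} */
    axpy_cpu(3, -1.0f, x, 2, y, 1); /* y += -1 * x[::2], back to {1, 3, 5} */
    printf("%f\n", dot_cpu(3, y, 1, y, 1)); /* 1 + 9 + 25 = 35 */
    return 0;
}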
+
+#ifdef GPU
+#include "opencl.h"
+
+cl_kernel get_axpy_kernel()
+{
+    static int init = 0;
+    static cl_kernel kernel;
+    if(!init){
+        kernel = get_kernel("src/axpy.cl", "axpy", 0);
+        init = 1;
+    }
+    return kernel;
+}
+
+cl_kernel get_copy_kernel()
+{
+    static int init = 0;
+    static cl_kernel kernel;
+    if(!init){
+        kernel = get_kernel("src/axpy.cl", "copy", 0);
+        init = 1;
+    }
+    return kernel;
+}
+
+cl_kernel get_scal_kernel()
+{
+    static int init = 0;
+    static cl_kernel kernel;
+    if(!init){
+        kernel = get_kernel("src/axpy.cl", "scal", 0);
+        init = 1;
+    }
+    return kernel;
+}
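Each getter compiles its kernel once and caches it in a static local (note the cache is not thread-safe). The three bodies differ only in the kernel name; a hypothetical consolidation, not part of the commit, reusing the get_kernel call exactly as written above:

/* Hypothetical refactor: one helper shared by the three getters. */
static cl_kernel get_cached_kernel(const char *name, cl_kernel *kernel, int *init)
{
    if(!*init){
        *kernel = get_kernel("src/axpy.cl", name, 0);
        *init = 1;
    }
    return *kernel;
}

cl_kernel get_axpy_kernel()
{
    static int init = 0;
    static cl_kernel kernel;
    return get_cached_kernel("axpy", &kernel, &init);
}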
+
+void axpy_ongpu(int N, float ALPHA, cl_mem X, int INCX, cl_mem Y, int INCY)
+{
+    cl_setup();
+    cl_kernel kernel = get_axpy_kernel();
+    cl_command_queue queue = cl.queue;
+
+    cl_uint i = 0;
+    cl.error = clSetKernelArg(kernel, i++, sizeof(N), (void*) &N);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(ALPHA), (void*) &ALPHA);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(X), (void*) &X);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(INCX), (void*) &INCX);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(Y), (void*) &Y);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(INCY), (void*) &INCY);
+    check_error(cl);
+
+    const size_t global_size[] = {N};
+
+    /* record the status so check_error(cl) validates the enqueue */
+    cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0);
+    check_error(cl);
+}
+
+void copy_ongpu(int N, cl_mem X, int INCX, cl_mem Y, int INCY)
+{
+    cl_setup();
+    cl_kernel kernel = get_copy_kernel();
+    cl_command_queue queue = cl.queue;
+
+    cl_uint i = 0;
+    cl.error = clSetKernelArg(kernel, i++, sizeof(N), (void*) &N);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(X), (void*) &X);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(INCX), (void*) &INCX);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(Y), (void*) &Y);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(INCY), (void*) &INCY);
+    check_error(cl);
+
+    const size_t global_size[] = {N};
+
+    cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0);
+    check_error(cl);
+}
+
+void scal_ongpu(int N, float ALPHA, cl_mem X, int INCX)
+{
+    cl_setup();
+    cl_kernel kernel = get_scal_kernel();
+    cl_command_queue queue = cl.queue;
+
+    cl_uint i = 0;
+    cl.error = clSetKernelArg(kernel, i++, sizeof(N), (void*) &N);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(ALPHA), (void*) &ALPHA);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(X), (void*) &X);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(INCX), (void*) &INCX);
+    check_error(cl);
+
+    const size_t global_size[] = {N};
+
+    cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0);
+    check_error(cl);
+}
+#endif
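The host wrappers all follow one pattern: lazily fetch the cached kernel, bind arguments in order, then enqueue a 1-D range of exactly N work-items. A sketch of the intended calling sequence, using cl_make_array and cl_read_array as they appear in cost_layer.c later in this commit (their exact signatures are inferred from those call sites, so treat this as illustrative):

#ifdef GPU
/* Illustrative driver: upload, run y += 2*x on the device, download. */
void example_axpy_gpu(float *x, float *y, int n)
{
    cl_mem x_cl = cl_make_array(x, n);
    cl_mem y_cl = cl_make_array(y, n);
    axpy_ongpu(n, 2.0f, x_cl, 1, y_cl, 1);
    cl_read_array(y_cl, y, n);
}
#endif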
src/axpy.cl
@@ -0,0 +1,18 @@
+__kernel void axpy(int N, float ALPHA, __global float *X, int INCX, __global float *Y, int INCY)
+{
+    int i = get_global_id(0);
+    Y[i*INCY] += ALPHA*X[i*INCX];
+}
+
+__kernel void scal(int N, float ALPHA, __global float *X, int INCX)
+{
+    int i = get_global_id(0);
+    X[i*INCX] *= ALPHA;
+}
+
+__kernel void copy(int N, __global float *X, int INCX, __global float *Y, int INCY)
+{
+    int i = get_global_id(0);
+    Y[i*INCY] = X[i*INCX];
+}
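Each kernel maps one work-item to one element, which is why the host side launches exactly N work-items and the N argument goes unused. If the global size were ever rounded up to a multiple of the work-group size, the kernels would need a bounds guard; a hedged variant of axpy:

__kernel void axpy(int N, float ALPHA, __global float *X, int INCX, __global float *Y, int INCY)
{
    int i = get_global_id(0);
    if(i >= N) return;   /* guard: only needed if global size > N */
    Y[i*INCY] += ALPHA*X[i*INCX];
}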
src/col2im.cl
@@ -1,41 +1,46 @@
-int index(int row, int col)
-{
-
-}
-
 __kernel void col2im(__global float *data_col, int batch,
         int channels, int height, int width,
         int ksize, int stride, int pad, __global float *data_im)
 {
+    int height_col = (height - ksize) / stride + 1;
+    int width_col = (width - ksize) / stride + 1;
+    if (pad){
+        height_col = 1 + (height-1) / stride;
+        width_col = 1 + (width-1) / stride;
+        pad = ksize/2;
+    }
+
     int id = get_global_id(0);
     int index = id;
-    int w = id%width;
+    int w = id%width + pad;
     id /= width;
-    int h = id%height;
+    int h = id%height + pad;
     id /= height;
     int c = id%channels;
     id /= channels;
     int b = id%batch;
 
-    int height_col = (height - ksize) / stride + 1;
-    int width_col = (width - ksize) / stride + 1;
+    int w_start = (w<ksize)?0:(w-ksize)/stride + 1;
+    int w_end = w/stride + 1;
+    if(width_col < w_end) w_end = width_col;
+
+    int h_start = (h<ksize)?0:(h-ksize)/stride+1;
+    int h_end = h/stride + 1;
+    if(height_col < h_end) h_end = height_col;
+
     int rows = channels * ksize * ksize;
-    if (pad){
-        height_col = 1 + (height-1) / stride;
-        width_col = 1 + (width-1) / stride;
-        pad = ksize/2;
-    }
     int cols = height_col*width_col;
-    int batch_offset = b*cols*rows;
-    int channel_offset = c*cols*ksize*ksize;
-    data_col[index] = 0;
-    int i,j;
-    for(i = 0; i < ksize; ++i){
-        row_offset = i*height_col*width_col;
-        for(j = 0; j < ksize; ++j){
-            col_offset =
+    int offset = (c*ksize*ksize + h * ksize + w)*height_col*width_col;
+    offset += b*cols*rows;
+    int h_coeff = (1-stride*ksize*height_col)*width_col;
+    int w_coeff = 1-stride*height_col*width_col;
+    float val = 0;
+    int h_col, w_col;
+    for(h_col = h_start; h_col < h_end; ++h_col){
+        for(w_col = w_start; w_col < w_end; ++w_col){
+            val += data_col[offset + h_col*h_coeff + w_col*w_coeff];
+        }
+    }
 
-    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, b, im_row, im_col, c_im, pad);
+    data_im[index] = val;
 }
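The rewritten kernel inverts im2col directly: one work-item per image pixel sums every data_col entry that pixel contributed to, replacing the old, unfinished scatter loop. The closed-form offset, h_coeff, and w_coeff fall out of expanding the usual im2col column index ((c*ksize*ksize + i*ksize + j)*height_col + h_col)*width_col + w_col with i = h - h_col*stride and j = w - w_col*stride. A small host-side check of that algebra (hypothetical helper, assuming this layout):

#include <assert.h>

/* Column index of pixel (c,h,w) seen through kernel cell (i,j)
 * at output position (h_col,w_col), in the assumed im2col layout. */
static int col_index(int c, int h, int w, int h_col, int w_col,
                     int k, int stride, int height_col, int width_col)
{
    int i = h - h_col*stride;
    int j = w - w_col*stride;
    return ((c*k*k + i*k + j)*height_col + h_col)*width_col + w_col;
}

static void check(int c, int h, int w, int h_col, int w_col,
                  int k, int stride, int height_col, int width_col)
{
    int offset  = (c*k*k + h*k + w)*height_col*width_col;
    int h_coeff = (1 - stride*k*height_col)*width_col;
    int w_coeff = 1 - stride*height_col*width_col;
    /* expanding col_index gives exactly the kernel's linear form */
    assert(col_index(c, h, w, h_col, w_col, k, stride, height_col, width_col)
           == offset + h_col*h_coeff + w_col*w_coeff);
}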
src/cost_layer.c
@@ -0,0 +1,49 @@
+#include "cost_layer.h"
+#include "mini_blas.h"
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+cost_layer *make_cost_layer(int batch, int inputs)
+{
+    fprintf(stderr, "Cost Layer: %d inputs\n", inputs);
+    cost_layer *layer = calloc(1, sizeof(cost_layer));
+    layer->batch = batch;
+    layer->inputs = inputs;
+    layer->delta = calloc(inputs*batch, sizeof(float));
+    layer->output = calloc(1, sizeof(float));
+#ifdef GPU
+    layer->delta_cl = cl_make_array(layer->delta, inputs*batch);
+#endif
+    return layer;
+}
+
+void forward_cost_layer(cost_layer layer, float *input, float *truth)
+{
+    if (!truth) return;
+    copy_cpu(layer.batch*layer.inputs, truth, 1, layer.delta, 1);
+    axpy_cpu(layer.batch*layer.inputs, -1, input, 1, layer.delta, 1);
+    *(layer.output) = dot_cpu(layer.batch*layer.inputs, layer.delta, 1, layer.delta, 1);
+}
+
+void backward_cost_layer(const cost_layer layer, float *input, float *delta)
+{
+    copy_cpu(layer.batch*layer.inputs, layer.delta, 1, delta, 1);
+}
+
+#ifdef GPU
+void forward_cost_layer_gpu(cost_layer layer, cl_mem input, cl_mem truth)
+{
+    if (!truth) return;
+    copy_ongpu(layer.batch*layer.inputs, truth, 1, layer.delta_cl, 1);
+    axpy_ongpu(layer.batch*layer.inputs, -1, input, 1, layer.delta_cl, 1);
+    cl_read_array(layer.delta_cl, layer.delta, layer.batch*layer.inputs);
+    *(layer.output) = dot_cpu(layer.batch*layer.inputs, layer.delta, 1, layer.delta, 1);
+}
+
+void backward_cost_layer_gpu(const cost_layer layer, cl_mem input, cl_mem delta)
+{
+    copy_ongpu(layer.batch*layer.inputs, layer.delta_cl, 1, delta, 1);
+}
+#endif
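forward_cost_layer computes delta = truth - input (a copy followed by axpy with ALPHA = -1) and stores dot(delta, delta), the sum of squared errors over the whole batch; backward simply hands that delta to the previous layer. A minimal numeric check, assuming only the functions above:

/* Hypothetical driver: with input = {1, 2} and truth = {3, 2},
 * delta = {2, 0} and *output = 2*2 + 0*0 = 4. */
#include <stdio.h>
#include "cost_layer.h"

int main()
{
    cost_layer *l = make_cost_layer(1, 2);
    float input[2] = {1, 2};
    float truth[2] = {3, 2};
    forward_cost_layer(*l, input, truth);
    printf("sse = %f\n", *(l->output));  /* prints 4.000000 */
    return 0;
}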
src/cost_layer.h
@@ -0,0 +1,24 @@
+#ifndef COST_LAYER_H
+#define COST_LAYER_H
+#include "opencl.h"
+
+typedef struct {
+    int inputs;
+    int batch;
+    float *delta;
+    float *output;
+#ifdef GPU
+    cl_mem delta_cl;
+#endif
+} cost_layer;
+
+cost_layer *make_cost_layer(int batch, int inputs);
+void forward_cost_layer(const cost_layer layer, float *input, float *truth);
+void backward_cost_layer(const cost_layer layer, float *input, float *delta);
+
+#ifdef GPU
+void forward_cost_layer_gpu(cost_layer layer, cl_mem input, cl_mem truth);
+void backward_cost_layer_gpu(const cost_layer layer, cl_mem input, cl_mem delta);
+#endif
+
+#endif
src/freeweight_layer.c
@@ -0,0 +1,24 @@
+#include "freeweight_layer.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+freeweight_layer *make_freeweight_layer(int batch, int inputs)
+{
+    fprintf(stderr, "Freeweight Layer: %d inputs\n", inputs);
+    freeweight_layer *layer = calloc(1, sizeof(freeweight_layer));
+    layer->inputs = inputs;
+    layer->batch = batch;
+    return layer;
+}
+
+void forward_freeweight_layer(freeweight_layer layer, float *input)
+{
+    int i;
+    for(i = 0; i < layer.batch * layer.inputs; ++i){
+        input[i] *= 2.*((float)rand()/RAND_MAX);
+    }
+}
+
+void backward_freeweight_layer(freeweight_layer layer, float *input, float *delta)
+{
+    // Don't do shit LULZ
+}
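forward_freeweight_layer scales every activation by a uniform random factor in [0, 2), so each input keeps its expected value (E[2u] = 1 for u ~ U[0,1)) while being randomly perturbed, a multiplicative-noise relative of dropout. The backward pass is intentionally a no-op. A hypothetical check that the perturbation is mean-preserving:

#include <stdio.h>
#include "freeweight_layer.h"

int main()
{
    freeweight_layer *l = make_freeweight_layer(1, 1);
    double sum = 0;
    int t, trials = 100000;
    for(t = 0; t < trials; ++t){
        float x = 1.0f;
        forward_freeweight_layer(*l, &x);  /* x *= 2*U[0,1) */
        sum += x;
    }
    printf("mean = %f\n", sum/trials);     /* should print ~1.0 */
    return 0;
}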
src/freeweight_layer.h
@@ -0,0 +1,14 @@
+#ifndef FREEWEIGHT_LAYER_H
+#define FREEWEIGHT_LAYER_H
+
+typedef struct{
+    int batch;
+    int inputs;
+} freeweight_layer;
+
+freeweight_layer *make_freeweight_layer(int batch, int inputs);
+
+void forward_freeweight_layer(freeweight_layer layer, float *input);
+void backward_freeweight_layer(freeweight_layer layer, float *input, float *delta);
+
+#endif