mirror of https://github.com/AlexeyAB/darknet.git
commit
b4b729a15e
31 changed files with 1891 additions and 739 deletions
@ -0,0 +1,86 @@ |
||||
#include "mini_blas.h" |
||||
|
||||
void cpu_gemm_nn(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb, |
||||
float BETA, |
||||
float *C, int ldc) |
||||
{ |
||||
int i,j,k; |
||||
for(i = 0; i < M; ++i){ |
||||
for(k = 0; k < K; ++k){ |
||||
register float A_PART = ALPHA*A[i*lda+k]; |
||||
for(j = 0; j < N; ++j){ |
||||
C[i*ldc+j] += A_PART*B[k*ldb+j]; |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
void cpu_gemm_nt(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb, |
||||
float BETA, |
||||
float *C, int ldc) |
||||
{ |
||||
int i,j,k; |
||||
for(i = 0; i < M; ++i){ |
||||
for(j = 0; j < N; ++j){ |
||||
register float sum = 0; |
||||
for(k = 0; k < K; ++k){ |
||||
sum += ALPHA*A[i*lda+k]*B[k+j*ldb]; |
||||
} |
||||
C[i*ldc+j] += sum; |
||||
} |
||||
} |
||||
} |
||||
|
||||
void cpu_gemm_tn(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb, |
||||
float BETA, |
||||
float *C, int ldc) |
||||
{ |
||||
int i,j,k; |
||||
for(i = 0; i < M; ++i){ |
||||
for(k = 0; k < K; ++k){ |
||||
register float A_PART = ALPHA*A[k*lda+i]; |
||||
for(j = 0; j < N; ++j){ |
||||
C[i*ldc+j] += A_PART*B[k*ldb+j]; |
||||
} |
||||
} |
||||
} |
||||
} |
||||
void cpu_gemm_tt(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb, |
||||
float BETA, |
||||
float *C, int ldc) |
||||
{ |
||||
int i,j,k; |
||||
for(i = 0; i < M; ++i){ |
||||
for(j = 0; j < N; ++j){ |
||||
for(k = 0; k < K; ++k){ |
||||
C[i*ldc+j] += ALPHA*A[i+k*lda]*B[k+j*ldb]; |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
|
||||
void cpu_gemm(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb, |
||||
float BETA, |
||||
float *C, int ldc) |
||||
{ |
||||
// Assume beta = 1 LULZ
|
||||
if(!TA && !TB) |
||||
cpu_gemm_nn( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc); |
||||
else if(TA && !TB) |
||||
cpu_gemm_tn( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc); |
||||
else if(!TA && TB) |
||||
cpu_gemm_nt( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc); |
||||
else |
||||
cpu_gemm_tt( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc); |
||||
} |
@ -0,0 +1,72 @@ |
||||
|
||||
|
||||
__kernel void gemm(int TA, int TB, int M, int N, int K, float ALPHA, |
||||
__global float *A, int lda, |
||||
__global float *B, int ldb, |
||||
float BETA, |
||||
__global float *C, int ldc) |
||||
{ |
||||
__local float Asub[BLOCK][BLOCK]; |
||||
__local float Bsub[BLOCK][BLOCK]; |
||||
|
||||
float val = 0; |
||||
|
||||
int row_block = get_group_id(0); |
||||
int col_block = get_group_id(1); |
||||
|
||||
int sub_row = get_local_id(0); |
||||
int sub_col = get_local_id(1); |
||||
|
||||
int row = row_block*BLOCK + sub_row; |
||||
int col = col_block*BLOCK + sub_col; |
||||
|
||||
int i,j; |
||||
for(i = 0; i < K; i += BLOCK){ |
||||
int arow = row_block*BLOCK + sub_row; |
||||
int acol = i + sub_col; |
||||
|
||||
int brow = i + sub_row; |
||||
int bcol = col_block*BLOCK + sub_col; |
||||
|
||||
Asub[sub_row][sub_col] = TA ? A[arow + acol*lda] : A[arow*lda + acol]; |
||||
Bsub[sub_row][sub_col] = TB ? B[brow + bcol*ldb] : B[brow*ldb + bcol]; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
for(j = 0; j < BLOCK && i+j<K; ++j){ |
||||
val += Asub[sub_row][j]*Bsub[j][sub_col]; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
if(row < M && col < N){ |
||||
C[row*ldc+col] = val; |
||||
} |
||||
} |
||||
|
||||
/* |
||||
__kernel void gemm_slow(int TA, int TB, int M, int N, int K, float ALPHA, |
||||
__global float *A, int lda, |
||||
__global float *B, int ldb, |
||||
float BETA, |
||||
__global float *C, int ldc) |
||||
{ |
||||
float val = 0; |
||||
int row = get_global_id(0); |
||||
int col = get_global_id(1); |
||||
int i; |
||||
for(i = 0; i < K; ++i){ |
||||
float Aval; |
||||
if(TA) Aval = A[i*lda+row]; |
||||
else Aval = A[row*lda+i]; |
||||
|
||||
float Bval; |
||||
if(TB) Bval = B[col*ldb+i]; |
||||
else Bval = B[col+i*ldb]; |
||||
|
||||
val += Aval*Bval; |
||||
} |
||||
C[row*ldc+col] = val; |
||||
} |
||||
|
||||
*/ |
@ -0,0 +1,153 @@ |
||||
#include <stdio.h> |
||||
#include <stdlib.h> |
||||
#include <string.h> |
||||
#include <time.h> |
||||
#include <math.h> |
||||
|
||||
#include "opencl.h" |
||||
#include "mini_blas.h" |
||||
|
||||
#define STR_HELPER(x) #x |
||||
#define STR(x) STR_HELPER(x) |
||||
|
||||
#define BLOCK 8 |
||||
|
||||
cl_kernel get_gemm_kernel() |
||||
{ |
||||
static int init = 0; |
||||
static cl_kernel gemm_kernel; |
||||
if(!init){ |
||||
gemm_kernel = get_kernel("src/gemm.cl", "gemm", "-D BLOCK=" STR(BLOCK) ); |
||||
init = 1; |
||||
} |
||||
return gemm_kernel; |
||||
} |
||||
|
||||
void gpu_gemm(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb, |
||||
float BETA, |
||||
float *C, int ldc) |
||||
{ |
||||
cl_setup(); |
||||
cl_kernel gemm_kernel = get_gemm_kernel(); |
||||
cl_context context = cl.context; |
||||
cl_command_queue queue = cl.queue; |
||||
|
||||
size_t size = sizeof(float)*(TA ? lda*K:lda*M); |
||||
cl_mem A_gpu = clCreateBuffer(context, |
||||
CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, |
||||
size, A, &cl.error); |
||||
check_error(cl); |
||||
|
||||
size = sizeof(float)*(TB ? ldb*N:ldb*K); |
||||
cl_mem B_gpu = clCreateBuffer(context, |
||||
CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, |
||||
size, B, &cl.error); |
||||
check_error(cl); |
||||
|
||||
size = sizeof(float)*(ldc*M); |
||||
cl_mem C_gpu = clCreateBuffer(context, |
||||
CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR, |
||||
size, C, &cl.error); |
||||
check_error(cl); |
||||
|
||||
cl_uint i = 0; |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TA), (void*) &TA); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TB), (void*) &TB); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(M), (void*) &M); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(N), (void*) &N); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(K), (void*) &K); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ALPHA), (void*) &ALPHA); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(A_gpu), (void*) &A_gpu); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(lda), (void*) &lda); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(B_gpu), (void*) &B_gpu); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldb), (void*) &ldb); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(BETA), (void*) &BETA); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(C_gpu), (void*) &C_gpu); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc); |
||||
check_error(cl); |
||||
|
||||
const size_t global_size[] = {ceil((float)M/BLOCK)*BLOCK, ceil((float)N/BLOCK)*BLOCK}; |
||||
const size_t local_size[] = {BLOCK, BLOCK}; |
||||
//printf("%zd %zd %zd %zd\n", global_size[0], global_size[1], local_size[0], local_size[1]);
|
||||
|
||||
clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, local_size, 0, 0, 0); |
||||
check_error(cl); |
||||
clEnqueueReadBuffer(queue, C_gpu, CL_TRUE, 0, size, C, 0, 0, 0); |
||||
check_error(cl); |
||||
|
||||
clReleaseMemObject(A_gpu); |
||||
clReleaseMemObject(B_gpu); |
||||
clReleaseMemObject(C_gpu); |
||||
|
||||
} |
||||
|
||||
/*
|
||||
cl_kernel get_gemm_kernel_slow() |
||||
{ |
||||
static int init = 0; |
||||
static cl_kernel gemm_kernel; |
||||
if(!init){ |
||||
gemm_kernel = get_kernel("src/gemm.cl", "gemm_slow"); |
||||
init = 1; |
||||
} |
||||
return gemm_kernel; |
||||
} |
||||
|
||||
void gpu_gemm_slow(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb, |
||||
float BETA, |
||||
float *C, int ldc) |
||||
{ |
||||
cl_setup(); |
||||
cl_kernel gemm_kernel = get_gemm_kernel_slow(); |
||||
cl_context context = cl.context; |
||||
cl_command_queue queue = cl.queue; |
||||
|
||||
size_t size = sizeof(float)*(TA ? lda*K:lda*M); |
||||
cl_mem A_gpu = clCreateBuffer(context, |
||||
CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, |
||||
size, A, &cl.error); |
||||
check_error(cl); |
||||
|
||||
size = sizeof(float)*(TB ? ldb*N:ldb*K); |
||||
cl_mem B_gpu = clCreateBuffer(context, |
||||
CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, |
||||
size, B, &cl.error); |
||||
check_error(cl); |
||||
|
||||
size = sizeof(float)*(ldc*M); |
||||
cl_mem C_gpu = clCreateBuffer(context, |
||||
CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, |
||||
size, C, &cl.error); |
||||
check_error(cl); |
||||
|
||||
cl_uint i = 0; |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TA), (void*) &TA); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TB), (void*) &TB); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(M), (void*) &M); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(N), (void*) &N); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(K), (void*) &K); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ALPHA), (void*) &ALPHA); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(A_gpu), (void*) &A_gpu); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(lda), (void*) &lda); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(B_gpu), (void*) &B_gpu); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldb), (void*) &ldb); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(BETA), (void*) &BETA); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(C_gpu), (void*) &C_gpu); |
||||
cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc); |
||||
check_error(cl); |
||||
|
||||
const size_t global_size[] = {M, N}; |
||||
|
||||
clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, 0, 0, 0, 0); |
||||
clEnqueueReadBuffer(queue, C_gpu, CL_TRUE, 0, size, C, 0, 0, 0); |
||||
|
||||
clReleaseMemObject(A_gpu); |
||||
clReleaseMemObject(B_gpu); |
||||
clReleaseMemObject(C_gpu); |
||||
|
||||
} |
||||
*/ |
@ -0,0 +1,96 @@ |
||||
#include "normalization_layer.h" |
||||
#include <stdio.h> |
||||
|
||||
image get_normalization_image(normalization_layer layer) |
||||
{ |
||||
int h = layer.h; |
||||
int w = layer.w; |
||||
int c = layer.c; |
||||
return float_to_image(h,w,c,layer.output); |
||||
} |
||||
|
||||
image get_normalization_delta(normalization_layer layer) |
||||
{ |
||||
int h = layer.h; |
||||
int w = layer.w; |
||||
int c = layer.c; |
||||
return float_to_image(h,w,c,layer.delta); |
||||
} |
||||
|
||||
normalization_layer *make_normalization_layer(int batch, int h, int w, int c, int size, float alpha, float beta, float kappa) |
||||
{ |
||||
fprintf(stderr, "Local Response Normalization Layer: %d x %d x %d image, %d size\n", h,w,c,size); |
||||
normalization_layer *layer = calloc(1, sizeof(normalization_layer)); |
||||
layer->batch = batch; |
||||
layer->h = h; |
||||
layer->w = w; |
||||
layer->c = c; |
||||
layer->kappa = kappa; |
||||
layer->size = size; |
||||
layer->alpha = alpha; |
||||
layer->beta = beta; |
||||
layer->output = calloc(h * w * c * batch, sizeof(float)); |
||||
layer->delta = calloc(h * w * c * batch, sizeof(float)); |
||||
layer->sums = calloc(h*w, sizeof(float)); |
||||
return layer; |
||||
} |
||||
|
||||
void resize_normalization_layer(normalization_layer *layer, int h, int w, int c) |
||||
{ |
||||
layer->h = h; |
||||
layer->w = w; |
||||
layer->c = c; |
||||
layer->output = realloc(layer->output, h * w * c * layer->batch * sizeof(float)); |
||||
layer->delta = realloc(layer->delta, h * w * c * layer->batch * sizeof(float)); |
||||
layer->sums = realloc(layer->sums, h*w * sizeof(float)); |
||||
} |
||||
|
||||
void add_square_array(float *src, float *dest, int n) |
||||
{ |
||||
int i; |
||||
for(i = 0; i < n; ++i){ |
||||
dest[i] += src[i]*src[i]; |
||||
} |
||||
} |
||||
void sub_square_array(float *src, float *dest, int n) |
||||
{ |
||||
int i; |
||||
for(i = 0; i < n; ++i){ |
||||
dest[i] -= src[i]*src[i]; |
||||
} |
||||
} |
||||
|
||||
void forward_normalization_layer(const normalization_layer layer, float *in) |
||||
{ |
||||
int i,j,k; |
||||
memset(layer.sums, 0, layer.h*layer.w*sizeof(float)); |
||||
int imsize = layer.h*layer.w; |
||||
for(j = 0; j < layer.size/2; ++j){ |
||||
if(j < layer.c) add_square_array(in+j*imsize, layer.sums, imsize); |
||||
} |
||||
for(k = 0; k < layer.c; ++k){ |
||||
int next = k+layer.size/2; |
||||
int prev = k-layer.size/2-1; |
||||
if(next < layer.c) add_square_array(in+next*imsize, layer.sums, imsize); |
||||
if(prev > 0) sub_square_array(in+prev*imsize, layer.sums, imsize); |
||||
for(i = 0; i < imsize; ++i){ |
||||
layer.output[k*imsize + i] = in[k*imsize+i] / pow(layer.kappa + layer.alpha * layer.sums[i], layer.beta); |
||||
} |
||||
} |
||||
} |
||||
|
||||
void backward_normalization_layer(const normalization_layer layer, float *in, float *delta) |
||||
{ |
||||
//TODO!
|
||||
} |
||||
|
||||
void visualize_normalization_layer(normalization_layer layer, char *window) |
||||
{ |
||||
image delta = get_normalization_image(layer); |
||||
image dc = collapse_image_layers(delta, 1); |
||||
char buff[256]; |
||||
sprintf(buff, "%s: Output", window); |
||||
show_image(dc, buff); |
||||
save_image(dc, buff); |
||||
free_image(dc); |
||||
} |
@ -0,0 +1,26 @@ |
||||
#ifndef NORMALIZATION_LAYER_H |
||||
#define NORMALIZATION_LAYER_H |
||||
|
||||
#include "image.h" |
||||
|
||||
typedef struct { |
||||
int batch; |
||||
int h,w,c; |
||||
int size; |
||||
float alpha; |
||||
float beta; |
||||
float kappa; |
||||
float *delta; |
||||
float *output; |
||||
float *sums; |
||||
} normalization_layer; |
||||
|
||||
image get_normalization_image(normalization_layer layer); |
||||
normalization_layer *make_normalization_layer(int batch, int h, int w, int c, int size, float alpha, float beta, float kappa); |
||||
void resize_normalization_layer(normalization_layer *layer, int h, int w, int c); |
||||
void forward_normalization_layer(const normalization_layer layer, float *in); |
||||
void backward_normalization_layer(const normalization_layer layer, float *in, float *delta); |
||||
void visualize_normalization_layer(normalization_layer layer, char *window); |
||||
|
||||
#endif |
||||
|
@ -0,0 +1,77 @@ |
||||
#include "opencl.h" |
||||
#include <stdio.h> |
||||
#include <stdlib.h> |
||||
#include <string.h> |
||||
|
||||
cl_info cl = {0}; |
||||
|
||||
void check_error(cl_info info) |
||||
{ |
||||
if (info.error != CL_SUCCESS) { |
||||
printf("\n Error number %d", info.error); |
||||
} |
||||
} |
||||
|
||||
cl_info cl_init() |
||||
{ |
||||
cl_info info; |
||||
info.initialized = 0; |
||||
cl_uint platforms, devices; |
||||
// Fetch the Platform and Device IDs; we only want one.
|
||||
info.error=clGetPlatformIDs(1, &info.platform, &platforms); |
||||
check_error(info); |
||||
info.error=clGetDeviceIDs(info.platform, CL_DEVICE_TYPE_ALL, 1, &info.device, &devices); |
||||
check_error(info); |
||||
|
||||
cl_context_properties properties[]={ |
||||
CL_CONTEXT_PLATFORM, (cl_context_properties)info.platform, |
||||
0}; |
||||
// Note that nVidia's OpenCL requires the platform property
|
||||
info.context=clCreateContext(properties, 1, &info.device, 0, 0, &info.error); |
||||
check_error(info); |
||||
info.queue = clCreateCommandQueue(info.context, info.device, 0, &info.error); |
||||
check_error(info); |
||||
info.initialized = 1; |
||||
return info; |
||||
} |
||||
|
||||
cl_program cl_fprog(char *filename, char *options, cl_info info) |
||||
{ |
||||
size_t srcsize; |
||||
char src[8192]; |
||||
memset(src, 0, 8192); |
||||
FILE *fil=fopen(filename,"r"); |
||||
srcsize=fread(src, sizeof src, 1, fil); |
||||
fclose(fil); |
||||
const char *srcptr[]={src}; |
||||
// Submit the source code of the example kernel to OpenCL
|
||||
cl_program prog=clCreateProgramWithSource(info.context,1, srcptr, &srcsize, &info.error); |
||||
check_error(info); |
||||
char build_c[4096]; |
||||
// and compile it (after this we could extract the compiled version)
|
||||
info.error=clBuildProgram(prog, 0, 0, options, 0, 0); |
||||
if ( info.error != CL_SUCCESS ) { |
||||
fprintf(stderr, "Error Building Program: %d\n", info.error); |
||||
clGetProgramBuildInfo( prog, info.device, CL_PROGRAM_BUILD_LOG, 4096, build_c, 0); |
||||
fprintf(stderr, "Build Log for %s program:\n%s\n", filename, build_c); |
||||
} |
||||
return prog; |
||||
} |
||||
|
||||
void cl_setup() |
||||
{ |
||||
if(!cl.initialized){ |
||||
cl = cl_init(); |
||||
} |
||||
} |
||||
|
||||
cl_kernel get_kernel(char *filename, char *kernelname, char *options) |
||||
{ |
||||
cl_setup(); |
||||
cl_program prog = cl_fprog(filename, options, cl); |
||||
cl_kernel kernel=clCreateKernel(prog, kernelname, &cl.error); |
||||
check_error(cl); |
||||
return kernel; |
||||
} |
||||
|
||||
|
@ -0,0 +1,21 @@ |
||||
#ifdef __APPLE__ |
||||
#include <OpenCL/opencl.h> |
||||
#else |
||||
#include <CL/cl.h> |
||||
#endif |
||||
|
||||
typedef struct { |
||||
int initialized; |
||||
cl_int error; |
||||
cl_platform_id platform; |
||||
cl_device_id device; |
||||
cl_context context; |
||||
cl_command_queue queue; |
||||
}cl_info; |
||||
|
||||
extern cl_info cl; |
||||
|
||||
void cl_setup(); |
||||
void check_error(cl_info info); |
||||
cl_kernel get_kernel(char *filename, char *kernelname, char *options); |
||||
|
File diff suppressed because it is too large
Load Diff
Before Width: | Height: | Size: 1.1 KiB |
Before Width: | Height: | Size: 30 KiB |
Before Width: | Height: | Size: 16 KiB |
Loading…
Reference in new issue