From 5ef74c2031a040f30a670dc7d60790fc6a9ec720 Mon Sep 17 00:00:00 2001 From: Joseph Redmon Date: Fri, 2 May 2014 15:20:34 -0700 Subject: [PATCH] Slowly refactoring and pushing to GPU --- Makefile | 16 +-- src/convolutional_layer.c | 17 ++- src/data.c | 2 +- src/gpu_gemm.c | 236 -------------------------------------- src/list.c | 2 + src/mini_blas.c | 75 +----------- src/mini_blas.h | 11 +- src/opencl.c | 5 +- 8 files changed, 31 insertions(+), 333 deletions(-) delete mode 100644 src/gpu_gemm.c diff --git a/Makefile b/Makefile index 3b01ab2d..445c7756 100644 --- a/Makefile +++ b/Makefile @@ -1,29 +1,29 @@ CC=gcc GPU=1 -COMMON=-Wall `pkg-config --cflags opencv` -I/usr/local/cuda/include/ +COMMON=-Wall -Werror -Wfatal-errors `pkg-config --cflags opencv` -I/usr/local/cuda/include/ +ifeq ($(GPU), 1) +COMMON+=-DGPU +else +endif UNAME = $(shell uname) -OPTS=-O3 +OPTS=-O3 -flto ifeq ($(UNAME), Darwin) COMMON+= -isystem /usr/local/Cellar/opencv/2.4.6.1/include/opencv -isystem /usr/local/Cellar/opencv/2.4.6.1/include ifeq ($(GPU), 1) LDFLAGS= -framework OpenCL endif else -OPTS+= -march=native ifeq ($(GPU), 1) LDFLAGS= -lOpenCL endif endif CFLAGS= $(COMMON) $(OPTS) -CFLAGS= $(COMMON) -O0 -g +#CFLAGS= $(COMMON) -O0 -g LDFLAGS+=`pkg-config --libs opencv` -lm VPATH=./src/ EXEC=cnn -OBJ=network.o image.o tests.o connected_layer.o maxpool_layer.o activations.o list.o option_list.o parser.o utils.o data.o matrix.o softmax_layer.o mini_blas.o convolutional_layer.o cpu_gemm.o normalization_layer.o -ifeq ($(GPU), 1) -OBJ+=gpu_gemm.o opencl.o -endif +OBJ=network.o image.o tests.o connected_layer.o maxpool_layer.o activations.o list.o option_list.o parser.o utils.o data.o matrix.o softmax_layer.o mini_blas.o convolutional_layer.o gemm.o normalization_layer.o opencl.o im2col.o col2im.o axpy.o all: $(EXEC) diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 45bb54a5..31a4af64 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -100,7 +100,7 @@ void forward_convolutional_layer(const convolutional_layer layer, float *in) float *b = layer.col_image; float *c = layer.output; for(i = 0; i < layer.batch; ++i){ - im2col_cpu(in+i*(n/layer.batch), layer.c, layer.h, layer.w, layer.size, layer.stride, b+i*(n/layer.batch)); + im2col_gpu(in+i*(n/layer.batch), layer.c, layer.h, layer.w, layer.size, layer.stride, b+i*(n/layer.batch)); } gemm(0,0,m,n,k,1,a,k,b,n,0,c,n); activate_array(layer.output, m*n, layer.activation); @@ -162,16 +162,13 @@ void backward_convolutional_layer(convolutional_layer layer, float *delta) void update_convolutional_layer(convolutional_layer layer, float step, float momentum, float decay) { - int i; int size = layer.size*layer.size*layer.c*layer.n; - for(i = 0; i < layer.n; ++i){ - layer.biases[i] += step*layer.bias_updates[i]; - layer.bias_updates[i] *= momentum; - } - for(i = 0; i < size; ++i){ - layer.filters[i] += step*(layer.filter_updates[i] - decay*layer.filters[i]); - layer.filter_updates[i] *= momentum; - } + axpy_cpu(layer.n, step, layer.bias_updates, 1, layer.biases, 1); + scal_cpu(layer.n, momentum, layer.bias_updates, 1); + + scal_cpu(size, 1.-step*decay, layer.filters, 1); + axpy_cpu(size, step, layer.filter_updates, 1, layer.filters, 1); + scal_cpu(size, momentum, layer.filter_updates, 1); } void test_convolutional_layer() diff --git a/src/data.c b/src/data.c index 39ece116..6d2061ed 100644 --- a/src/data.c +++ b/src/data.c @@ -123,7 +123,7 @@ data load_cifar10_data(char *filename) { data d; d.shallow = 0; - unsigned long i,j; + long i,j; matrix X = make_matrix(10000, 3072); matrix y = make_matrix(10000, 10); d.X = X; diff --git a/src/gpu_gemm.c b/src/gpu_gemm.c deleted file mode 100644 index 4a8aaca8..00000000 --- a/src/gpu_gemm.c +++ /dev/null @@ -1,236 +0,0 @@ -#include -#include -#include -#include -#include - -#include "opencl.h" -#include "mini_blas.h" - -#define STR_HELPER(x) #x -#define STR(x) STR_HELPER(x) - -#define BLOCK 8 - -cl_kernel get_gemm_kernel() -{ - static int init = 0; - static cl_kernel gemm_kernel; - if(!init){ - gemm_kernel = get_kernel("src/gemm.cl", "gemm", "-D BLOCK=" STR(BLOCK) ); - init = 1; - } - return gemm_kernel; -} - -void gpu_gemm(int TA, int TB, int M, int N, int K, float ALPHA, - float *A, int lda, - float *B, int ldb, - float BETA, - float *C, int ldc) -{ - cl_setup(); - cl_kernel gemm_kernel = get_gemm_kernel(); - cl_context context = cl.context; - cl_command_queue queue = cl.queue; - - size_t size = sizeof(float)*(TA ? lda*K:lda*M); - cl_mem A_gpu = clCreateBuffer(context, - CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, - size, A, &cl.error); - check_error(cl); - - size = sizeof(float)*(TB ? ldb*N:ldb*K); - cl_mem B_gpu = clCreateBuffer(context, - CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, - size, B, &cl.error); - check_error(cl); - - size = sizeof(float)*(ldc*M); - cl_mem C_gpu = clCreateBuffer(context, - CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR, - size, C, &cl.error); - check_error(cl); - - cl_uint i = 0; - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TA), (void*) &TA); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TB), (void*) &TB); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(M), (void*) &M); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(N), (void*) &N); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(K), (void*) &K); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ALPHA), (void*) &ALPHA); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(A_gpu), (void*) &A_gpu); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(lda), (void*) &lda); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(B_gpu), (void*) &B_gpu); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldb), (void*) &ldb); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(BETA), (void*) &BETA); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(C_gpu), (void*) &C_gpu); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc); - check_error(cl); - - const size_t global_size[] = {ceil((float)M/BLOCK)*BLOCK, ceil((float)N/BLOCK)*BLOCK}; - const size_t local_size[] = {BLOCK, BLOCK}; - //printf("%zd %zd %zd %zd\n", global_size[0], global_size[1], local_size[0], local_size[1]); - - clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, local_size, 0, 0, 0); - check_error(cl); - clEnqueueReadBuffer(queue, C_gpu, CL_TRUE, 0, size, C, 0, 0, 0); - check_error(cl); - - clReleaseMemObject(A_gpu); - clReleaseMemObject(B_gpu); - clReleaseMemObject(C_gpu); - -} - -void time_gpu_random_matrix(int TA, int TB, int m, int k, int n) -{ - float *a; - if(!TA) a = random_matrix(m,k); - else a = random_matrix(k,m); - int lda = (!TA)?k:m; - float *b; - if(!TB) b = random_matrix(k,n); - else b = random_matrix(n,k); - int ldb = (!TB)?n:k; - - float *c = random_matrix(m,n); - int i; - clock_t start = clock(), end; - for(i = 0; i<1000; ++i){ - gpu_gemm(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n); - } - end = clock(); - printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf ms\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC); - free(a); - free(b); - free(c); -} - -void test_gpu_accuracy(int TA, int TB, int m, int k, int n) -{ - srand(0); - float *a; - if(!TA) a = random_matrix(m,k); - else a = random_matrix(k,m); - int lda = (!TA)?k:m; - float *b; - if(!TB) b = random_matrix(k,n); - else b = random_matrix(n,k); - int ldb = (!TB)?n:k; - - float *c = random_matrix(m,n); - float *c_gpu = random_matrix(m,n); - memset(c, 0, m*n*sizeof(float)); - memset(c_gpu, 0, m*n*sizeof(float)); - int i; - //pm(m,k,b); - gpu_gemm(TA,TB,m,n,k,1,a,lda,b,ldb,1,c_gpu,n); - //pm(m, n, c_gpu); - cpu_gemm(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n); - //pm(m, n, c); - double sse = 0; - for(i = 0; i < m*n; ++i) { - //printf("%f %f\n", c[i], c_gpu[i]); - sse += pow(c[i]-c_gpu[i], 2); - } - printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %g MSE\n",m,k,k,n, TA, TB, sse/(m*n)); - free(a); - free(b); - free(c); -} - -void test_gpu_blas() -{ - test_gpu_accuracy(0,0,17,10,10); - test_gpu_accuracy(1,0,17,10,10); - test_gpu_accuracy(0,1,17,10,10); - test_gpu_accuracy(1,1,17,10,10); - - test_gpu_accuracy(0,0,1000,10,100); - test_gpu_accuracy(1,0,1000,10,100); - test_gpu_accuracy(0,1,1000,10,100); - test_gpu_accuracy(1,1,1000,10,100); - - time_gpu_random_matrix(0,0,1000,1000,100); - time_random_matrix(0,0,1000,1000,100); - - time_gpu_random_matrix(0,1,1000,1000,100); - time_random_matrix(0,1,1000,1000,100); - - time_gpu_random_matrix(1,0,1000,1000,100); - time_random_matrix(1,0,1000,1000,100); - - time_gpu_random_matrix(1,1,1000,1000,100); - time_random_matrix(1,1,1000,1000,100); - -} - -/* -cl_kernel get_gemm_kernel_slow() -{ - static int init = 0; - static cl_kernel gemm_kernel; - if(!init){ - gemm_kernel = get_kernel("src/gemm.cl", "gemm_slow"); - init = 1; - } - return gemm_kernel; -} - -void gpu_gemm_slow(int TA, int TB, int M, int N, int K, float ALPHA, - float *A, int lda, - float *B, int ldb, - float BETA, - float *C, int ldc) -{ - cl_setup(); - cl_kernel gemm_kernel = get_gemm_kernel_slow(); - cl_context context = cl.context; - cl_command_queue queue = cl.queue; - - size_t size = sizeof(float)*(TA ? lda*K:lda*M); - cl_mem A_gpu = clCreateBuffer(context, - CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, - size, A, &cl.error); - check_error(cl); - - size = sizeof(float)*(TB ? ldb*N:ldb*K); - cl_mem B_gpu = clCreateBuffer(context, - CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, - size, B, &cl.error); - check_error(cl); - - size = sizeof(float)*(ldc*M); - cl_mem C_gpu = clCreateBuffer(context, - CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, - size, C, &cl.error); - check_error(cl); - - cl_uint i = 0; - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TA), (void*) &TA); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TB), (void*) &TB); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(M), (void*) &M); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(N), (void*) &N); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(K), (void*) &K); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ALPHA), (void*) &ALPHA); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(A_gpu), (void*) &A_gpu); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(lda), (void*) &lda); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(B_gpu), (void*) &B_gpu); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldb), (void*) &ldb); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(BETA), (void*) &BETA); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(C_gpu), (void*) &C_gpu); - cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc); - check_error(cl); - - const size_t global_size[] = {M, N}; - - clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, 0, 0, 0, 0); - clEnqueueReadBuffer(queue, C_gpu, CL_TRUE, 0, size, C, 0, 0, 0); - - clReleaseMemObject(A_gpu); - clReleaseMemObject(B_gpu); - clReleaseMemObject(C_gpu); - -} -*/ diff --git a/src/list.c b/src/list.c index 948d960e..0e4165d3 100644 --- a/src/list.c +++ b/src/list.c @@ -11,6 +11,7 @@ list *make_list() return l; } +/* void transfer_node(list *s, list *d, node *n) { node *prev, *next; @@ -22,6 +23,7 @@ void transfer_node(list *s, list *d, node *n) if(s->front == n) s->front = next; if(s->back == n) s->back = prev; } +*/ void *list_pop(list *l){ if(!l->back) return 0; diff --git a/src/mini_blas.c b/src/mini_blas.c index 70dcb546..eb6953d7 100644 --- a/src/mini_blas.c +++ b/src/mini_blas.c @@ -1,4 +1,3 @@ - #include #include #include @@ -18,77 +17,7 @@ void pm(int M, int N, float *A) printf("\n"); } -void gemm(int TA, int TB, int M, int N, int K, float ALPHA, - float *A, int lda, - float *B, int ldb, - float BETA, - float *C, int ldc) -{ - gpu_gemm( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc); -} - -void im2row(float *image, int h, int w, int c, int size, int stride, float *matrix) -{ - int i; - int mc = c; - int mw = (size*size); - int mh = ((h-size)/stride+1)*((w-size)/stride+1); - int msize = mc*mw*mh; - for(i = 0; i < msize; ++i){ - int channel = i/(mh*mw); - int block = (i%(mh*mw))/mw; - int position = i%mw; - int block_h = block/((w-size)/stride+1); - int block_w = block%((w-size)/stride+1); - int ph, pw, pc; - ph = position/size+block_h; - pw = position%size+block_w; - pc = channel; - matrix[i] = image[pc*h*w+ph*w+pw]; - } -} -void im2col(float *image, int h, int w, int c, int size, int stride, float *matrix) -{ - int b,p; - int blocks = ((h-size)/stride+1)*((w-size)/stride+1); - int pixels = (size*size*c); - for(b = 0; b < blocks; ++b){ - int block_h = b/((w-size)/stride+1); - int block_w = b%((w-size)/stride+1); - for(p = 0; p < pixels; ++p){ - int ph, pw, pc; - int position = p%(size*size); - pc = p/(size*size); - ph = position/size+block_h; - pw = position%size+block_w; - matrix[b+p*blocks] = image[pc*h*w+ph*w+pw]; - } - } -} - -//From Berkeley Vision's Caffe! -void im2col_cpu(float* data_im, const int channels, - const int height, const int width, const int ksize, const int stride, - float* data_col) -{ - int c,h,w; - int height_col = (height - ksize) / stride + 1; - int width_col = (width - ksize) / stride + 1; - int channels_col = channels * ksize * ksize; - for ( c = 0; c < channels_col; ++c) { - int w_offset = c % ksize; - int h_offset = (c / ksize) % ksize; - int c_im = c / ksize / ksize; - for ( h = 0; h < height_col; ++h) { - for ( w = 0; w < width_col; ++w) { - data_col[(c * height_col + h) * width_col + w] = - data_im[(c_im * height + h * stride + h_offset) * width - + w * stride + w_offset]; - } - } - } -} - +//This one might be too, can't remember. void col2im_cpu(float* data_col, const int channels, const int height, const int width, const int ksize, const int stride, float* data_im) @@ -135,7 +64,7 @@ void time_random_matrix(int TA, int TB, int m, int k, int n) int i; clock_t start = clock(), end; for(i = 0; i<1000; ++i){ - cpu_gemm(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n); + gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n); } end = clock(); printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf ms\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC); diff --git a/src/mini_blas.h b/src/mini_blas.h index 31af193b..34f15de8 100644 --- a/src/mini_blas.h +++ b/src/mini_blas.h @@ -6,8 +6,9 @@ void gemm(int TA, int TB, int M, int N, int K, float ALPHA, float *C, int ldc); float *random_matrix(int rows, int cols); void time_random_matrix(int TA, int TB, int m, int k, int n); -void im2row(float *image, int h, int w, int c, int size, int stride, float *matrix); -void im2col(float *image, int h, int w, int c, int size, int stride, float *matrix); +void im2col_gpu(float* data_im, const int channels, + const int height, const int width, const int ksize, const int stride, + float* data_col); void im2col_cpu(float* data_im, const int channels, const int height, const int width, const int ksize, const int stride, float* data_col); @@ -16,14 +17,16 @@ void col2im_cpu(float* data_col, const int channels, float* data_im); void test_blas(); -void gpu_gemm(int TA, int TB, int M, int N, int K, float ALPHA, +void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float BETA, float *C, int ldc); -void cpu_gemm(int TA, int TB, int M, int N, int K, float ALPHA, +void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float BETA, float *C, int ldc); +void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY); +void scal_cpu(int N, float ALPHA, float *X, int INCX); void test_gpu_blas(); diff --git a/src/opencl.c b/src/opencl.c index 08bc8a73..0d645ba5 100644 --- a/src/opencl.c +++ b/src/opencl.c @@ -1,3 +1,4 @@ +#ifdef GPU #include "opencl.h" #include #include @@ -12,6 +13,7 @@ void check_error(cl_info info) { if (info.error != CL_SUCCESS) { printf("\n Error number %d", info.error); + exit(1); } } @@ -66,6 +68,7 @@ cl_program cl_fprog(char *filename, char *options, cl_info info) clGetProgramBuildInfo( prog, info.device, CL_PROGRAM_BUILD_LOG, 4096, build_c, 0); fprintf(stderr, "Build Log for %s program:\n%s\n", filename, build_c); } + check_error(info); return prog; } @@ -85,4 +88,4 @@ cl_kernel get_kernel(char *filename, char *kernelname, char *options) return kernel; } - +#endif