Added batch to col2im, padding option

pull/5299/head
Joseph Redmon 11 years ago
parent cd8d53df21
commit 70d622ea54
 Makefile                  |  10
 src/activations.cl        |   6
 src/cnn.c                 |  42
 src/col2im.c              |  47
 src/connected_layer.c     |  17
 src/convolutional_layer.c |  41
 src/convolutional_layer.h |   3
 src/data.c                |   8
 src/data.h                |   1
 src/detection_layer.c     |  72
 src/detection_layer.h     |  40
 src/gemm.cl               |   4
 src/im2col.c              |  32
 src/maxpool_layer.c       |  72
 src/mini_blas.c           |  22
 src/mini_blas.h           |   8
 src/network.c             | 110
 src/opencl.c              |  20
 src/opencl.h              |   2
 src/parser.c              |   3
 20 files changed

@@ -23,19 +23,21 @@ CFLAGS= $(COMMON) $(OPTS)
 LDFLAGS+=`pkg-config --libs opencv` -lm
 VPATH=./src/
 EXEC=cnn
-OBJ=network.o image.o tests.o connected_layer.o maxpool_layer.o activations.o list.o option_list.o parser.o utils.o data.o matrix.o softmax_layer.o mini_blas.o convolutional_layer.o gemm.o normalization_layer.o opencl.o im2col.o col2im.o axpy.o
+OBJDIR=./obj/
+OBJ=network.o image.o cnn.o connected_layer.o maxpool_layer.o activations.o list.o option_list.o parser.o utils.o data.o matrix.o softmax_layer.o mini_blas.o convolutional_layer.o gemm.o normalization_layer.o opencl.o im2col.o col2im.o axpy.o
+OBJS = $(addprefix $(OBJDIR), $(OBJ))
 all: $(EXEC)
-$(EXEC): $(OBJ)
+$(EXEC): $(OBJS)
 	$(CC) $(CFLAGS) $(LDFLAGS) $^ -o $@
-%.o: %.c
+$(OBJDIR)%.o: %.c
 	$(CC) $(CFLAGS) -c $< -o $@
 .PHONY: clean
 clean:
-	rm -rf $(OBJ) $(EXEC)
+	rm -rf $(OBJS) $(EXEC)

@@ -2,6 +2,12 @@ typedef enum{
     SIGMOID, RELU, LINEAR, RAMP, TANH
 }ACTIVATION;
+
+float linear_activate(float x){return x;}
+float sigmoid_activate(float x){return 1./(1. + exp(-x));}
+float relu_activate(float x){return x*(x>0);}
+float ramp_activate(float x){return x*(x>0)+.1*x;}
+float tanh_activate(float x){return (exp(2*x)-1)/(exp(2*x)+1);}
 float activate(float x, ACTIVATION a, float dropout)
 {
     //if((float)rand()/RAND_MAX < dropout) return 0;

@@ -52,7 +52,7 @@ void test_convolve_matrix()
     int i;
     clock_t start = clock(), end;
     for(i = 0; i < 1000; ++i){
-        im2col_cpu(dog.data, 1, dog.c, dog.h, dog.w, size, stride, matrix);
+        im2col_cpu(dog.data, 1, dog.c, dog.h, dog.w, size, stride, 0, matrix);
         gemm(0,0,n,mw,mh,1,filters,mh,matrix,mw,1,edge.data,mw);
     }
     end = clock();
@@ -76,7 +76,7 @@ void verify_convolutional_layer()
     int size = 3;
     float eps = .00000001;
     image test = make_random_image(5,5, 1);
-    convolutional_layer layer = *make_convolutional_layer(1,test.h,test.w,test.c, n, size, stride, RELU);
+    convolutional_layer layer = *make_convolutional_layer(1,test.h,test.w,test.c, n, size, stride, 0, RELU);
     image out = get_convolutional_image(layer);
     float **jacobian = calloc(test.h*test.w*test.c, sizeof(float));
@@ -301,7 +301,7 @@ void test_vince()
 void test_nist()
 {
     srand(444444);
-    srand(888888);
+    srand(222222);
     network net = parse_network_cfg("cfg/nist.cfg");
     data train = load_categorical_data_csv("data/mnist/mnist_train.csv", 0, 10);
     data test = load_categorical_data_csv("data/mnist/mnist_test.csv",0,10);
@@ -309,22 +309,26 @@ void test_nist()
     normalize_data_rows(test);
     //randomize_data(train);
     int count = 0;
-    float lr = .00005;
+    float lr = .000075;
     float momentum = .9;
     float decay = 0.0001;
     decay = 0;
     //clock_t start = clock(), end;
-    int batch = 10000;
-    while(++count <= 10000){
-        float loss = train_network_sgd(net, train, batch, lr, momentum, decay);
+    int iters = 100;
+    while(++count <= 10){
+        clock_t start = clock(), end;
+        float loss = train_network_sgd(net, train, iters, lr, momentum, decay);
+        end = clock();
         float test_acc = network_accuracy(net, test);
-        printf("%3d %5f %5f\n",count, loss, test_acc);
+        printf("%d: %f %f, Time: %lf seconds, LR: %f, Momentum: %f, Decay: %f\n", count, loss, test_acc,(float)(end-start)/CLOCKS_PER_SEC, lr, momentum, decay);
         //printf("%5d Training Loss: %lf, Params: %f %f %f, ",count*1000, loss, lr, momentum, decay);
         //end = clock();
         //printf("Time: %lf seconds\n", (float)(end-start)/CLOCKS_PER_SEC);
         //start=end;
         //lr *= .5;
     }
+    //save_network(net, "cfg/nist_basic_trained.cfg");
 }

 void test_ensemble()
@@ -431,7 +435,7 @@ void test_im2row()
     float *matrix = calloc(msize, sizeof(float));
     int i;
     for(i = 0; i < 1000; ++i){
-        im2col_cpu(test.data, 1, c, h, w, size, stride, matrix);
+        im2col_cpu(test.data, 1, c, h, w, size, stride, 0, matrix);
         //image render = float_to_image(mh, mw, mc, matrix);
     }
 }
@@ -442,34 +446,36 @@ void flip_network()
     save_network(net, "cfg/voc_imagenet_rev.cfg");
 }

-void train_VOC()
+void tune_VOC()
 {
     network net = parse_network_cfg("cfg/voc_start.cfg");
     srand(2222222);
     int i = 20;
     char *labels[] = {"aeroplane","bicycle","bird","boat","bottle","bus","car","cat","chair","cow","diningtable","dog","horse","motorbike","person","pottedplant","sheep","sofa","train","tvmonitor"};
-    float lr = .00001;
+    float lr = .000005;
     float momentum = .9;
-    float decay = 0.01;
+    float decay = 0.0001;
     while(i++ < 1000 || 1){
-        data train = load_data_image_pathfile_random("images/VOC2012/val_paths.txt", 1000, labels, 20, 300, 400);
-        image im = float_to_image(300, 400, 3,train.X.vals[0]);
+        data train = load_data_image_pathfile_random("/home/pjreddie/VOC2012/trainval_paths.txt", 10, labels, 20, 256, 256);
+        image im = float_to_image(256, 256, 3,train.X.vals[0]);
         show_image(im, "input");
         visualize_network(net);
         cvWaitKey(100);
-        normalize_data_rows(train);
+        translate_data_rows(train, -144);
         clock_t start = clock(), end;
-        float loss = train_network_sgd(net, train, 1000, lr, momentum, decay);
+        float loss = train_network_sgd(net, train, 10, lr, momentum, decay);
         end = clock();
         printf("%d: %f, Time: %lf seconds, LR: %f, Momentum: %f, Decay: %f\n", i, loss, (float)(end-start)/CLOCKS_PER_SEC, lr, momentum, decay);
         free_data(train);
+        /*
         if(i%10==0){
             char buff[256];
-            sprintf(buff, "cfg/voc_clean_ramp_%d.cfg", i);
+            sprintf(buff, "/home/pjreddie/voc_cfg/voc_ramp_%d.cfg", i);
             save_network(net, buff);
         }
+        */
         //lr *= .99;
     }
 }
@@ -778,7 +784,7 @@ int main(int argc, char *argv[])
     //test_cifar10();
     //test_vince();
     //test_full();
-    //train_VOC();
+    //tune_VOC();
     //features_VOC_image(argv[1], argv[2], argv[3], 0);
     //features_VOC_image(argv[1], argv[2], argv[3], 1);
     //features_VOC_image_size(argv[1], atoi(argv[2]), atoi(argv[3]));

@@ -0,0 +1,47 @@
+inline void col2im_set_pixel(float *im, int height, int width, int channels,
+                        int row, int col, int channel, int pad, float val)
+{
+    row -= pad;
+    col -= pad;
+    if (row < 0 || col < 0 ||
+        row >= height || col >= width) return;
+    im[col + width*(row + channel*height)] = val;
+}
+
+//This one might be too, can't remember.
+void col2im_cpu(float* data_col,
+        const int batch, const int channels, const int height, const int width,
+        const int ksize, const int stride, int pad, float* data_im)
+{
+    int c,h,w,b;
+    int height_col = (height - ksize) / stride + 1;
+    int width_col = (width - ksize) / stride + 1;
+    if (pad){
+        height_col = 1 + (height-1) / stride;
+        width_col = 1 + (width-1) / stride;
+        pad = ksize/2;
+    }
+    int channels_col = channels * ksize * ksize;
+    int im_size = height*width*channels;
+    int col_size = height_col*width_col*channels_col;
+    for (b = 0; b < batch; ++b) {
+        for (c = 0; c < channels_col; ++c) {
+            int w_offset = c % ksize;
+            int h_offset = (c / ksize) % ksize;
+            int c_im = c / ksize / ksize;
+            for (h = 0; h < height_col; ++h) {
+                for (w = 0; w < width_col; ++w) {
+                    int im_row = h_offset + h * stride;
+                    int im_col = w_offset + w * stride;
+                    double val = data_col[(c * height_col + h) * width_col + w];
+                    col2im_set_pixel(data_im, height, width, channels,
+                            im_row, im_col, c_im, pad, val);
+                }
+            }
+        }
+        data_im += im_size;
+        data_col += col_size;
+    }
+}
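The padded and unpadded paths above use two different output-size rules. A standalone sketch of the arithmetic (illustrative, not part of the commit):

    #include <stdio.h>
    int main()
    {
        int height = 28, ksize = 3, stride = 1;
        int valid = (height - ksize)/stride + 1; /* pad == 0: output shrinks to 26 */
        int same  = 1 + (height - 1)/stride;     /* pad != 0: output stays 28, pad forced to ksize/2 = 1 */
        printf("valid: %d, same: %d\n", valid, same);
        return 0;
    }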

@@ -57,8 +57,11 @@ void update_connected_layer(connected_layer layer, float step, float momentum, float decay)
 void forward_connected_layer(connected_layer layer, float *input, int train)
 {
+    int i;
     if(!train) layer.dropout = 0;
-    memcpy(layer.output, layer.biases, layer.outputs*sizeof(float));
+    for(i = 0; i < layer.batch; ++i){
+        memcpy(layer.output+i*layer.outputs, layer.biases, layer.outputs*sizeof(float));
+    }
     int m = layer.batch;
     int k = layer.inputs;
     int n = layer.outputs;
@@ -82,16 +85,16 @@ void backward_connected_layer(connected_layer layer, float *input, float *delta)
     float *a = input;
     float *b = layer.delta;
     float *c = layer.weight_updates;
-    gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
+    gemm(1,0,m,n,k,1,a,k,b,n,1,c,n);

-    m = layer.inputs;
+    m = layer.batch;
     k = layer.outputs;
-    n = layer.batch;
+    n = layer.inputs;

-    a = layer.weights;
-    b = layer.delta;
+    a = layer.delta;
+    b = layer.weights;
     c = delta;

-    if(c) gemm(0,0,m,n,k,1,a,k,b,n,0,c,n);
+    if(c) gemm(0,1,m,n,k,1,a,k,b,k,0,c,n);
 }
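Shape check for the new batched backward pass, assuming the row-major convention C[MxN] += op(A)[MxK] * op(B)[KxN] with TA/TB read as "transpose A/B":

    /* weight_updates[inputs x outputs] += input^T[inputs x batch] * delta[batch x outputs]
     *     -> gemm(1,0,...): transpose the batched input
     * prev delta[batch x inputs] = delta[batch x outputs] * weights^T[outputs x inputs]
     *     -> gemm(0,1,...): transpose the weights
     * Every term now carries the batch dimension instead of a single example. */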

@@ -5,12 +5,18 @@
 int convolutional_out_height(convolutional_layer layer)
 {
-    return (layer.h-layer.size)/layer.stride + 1;
+    int h = layer.h;
+    if (!layer.pad) h -= layer.size;
+    else h -= 1;
+    return h/layer.stride + 1;
 }

 int convolutional_out_width(convolutional_layer layer)
 {
-    return (layer.w-layer.size)/layer.stride + 1;
+    int w = layer.w;
+    if (!layer.pad) w -= layer.size;
+    else w -= 1;
+    return w/layer.stride + 1;
 }

 image get_convolutional_image(convolutional_layer layer)
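Worked example of the new output-size rule (h = 5, size = 3, stride = 1):

    /* pad == 0: (5 - 3)/1 + 1 = 3, the usual "valid" shrink
     * pad != 0: (5 - 1)/1 + 1 = 5, output matches the input ("same") */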
@@ -31,7 +37,7 @@ image get_convolutional_delta(convolutional_layer layer)
     return float_to_image(h,w,c,layer.delta);
 }

-convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation)
+convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation)
 {
     int i;
     size = 2*(size/2)+1; //HA! And you thought you'd use an even sized filter...
@@ -43,6 +49,7 @@ convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation)
     layer->batch = batch;
     layer->stride = stride;
     layer->size = size;
+    layer->pad = pad;

     layer->filters = calloc(c*n*size*size, sizeof(float));
     layer->filter_updates = calloc(c*n*size*size, sizeof(float));
@@ -64,6 +71,17 @@ convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation)
     layer->output = calloc(layer->batch*out_h * out_w * n, sizeof(float));
     layer->delta = calloc(layer->batch*out_h * out_w * n, sizeof(float));
 #ifdef GPU
+    layer->filters_cl = cl_make_array(layer->filters, c*n*size*size);
+    layer->filter_updates_cl = cl_make_array(layer->filter_updates, c*n*size*size);
+    layer->filter_momentum_cl = cl_make_array(layer->filter_momentum, c*n*size*size);
+
+    layer->biases_cl = cl_make_array(layer->biases, n);
+    layer->bias_updates_cl = cl_make_array(layer->bias_updates, n);
+    layer->bias_momentum_cl = cl_make_array(layer->bias_momentum, n);
+
+    layer->col_image_cl = cl_make_array(layer->col_image, layer->batch*out_h*out_w*size*size*c);
+    layer->delta_cl = cl_make_array(layer->delta, layer->batch*out_h*out_w*n);
+    layer->output_cl = cl_make_array(layer->output, layer->batch*out_h*out_w*n);
 #endif
     layer->activation = activation;
@@ -91,12 +109,14 @@ void resize_convolutional_layer(convolutional_layer *layer, int h, int w, int c)
 void bias_output(const convolutional_layer layer)
 {
-    int i,j;
+    int i,j,b;
     int out_h = convolutional_out_height(layer);
     int out_w = convolutional_out_width(layer);
-    for(i = 0; i < layer.n; ++i){
-        for(j = 0; j < out_h*out_w; ++j){
-            layer.output[i*out_h*out_w + j] = layer.biases[i];
+    for(b = 0; b < layer.batch; ++b){
+        for(i = 0; i < layer.n; ++i){
+            for(j = 0; j < out_h*out_w; ++j){
+                layer.output[(b*layer.n + i)*out_h*out_w + j] = layer.biases[i];
+            }
         }
     }
 }
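The new index spells out the batched output layout; a sketch of the addressing (illustrative):

    /* output is laid out [batch][filter][out_h*out_w], so pixel j of
     * filter i in image b lives at (b*layer.n + i)*out_h*out_w + j. */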
@@ -114,7 +134,7 @@ void forward_convolutional_layer(const convolutional_layer layer, float *in)
     float *b = layer.col_image;
     float *c = layer.output;
     im2col_cpu(in,layer.batch, layer.c, layer.h, layer.w,
-        layer.size, layer.stride, b);
+        layer.size, layer.stride, layer.pad, b);
     bias_output(layer);
     gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
     activate_array(layer.output, m*n, layer.activation, 0.);
@@ -169,7 +189,6 @@ void backward_convolutional_layer(convolutional_layer layer, float *delta)
     gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);

     if(delta){
-        int i;
         m = layer.size*layer.size*layer.c;
         k = layer.n;
         n = convolutional_out_height(layer)*
@@ -183,9 +202,7 @@ void backward_convolutional_layer(convolutional_layer layer, float *delta)
         gemm(1,0,m,n,k,1,a,m,b,n,0,c,n);

         memset(delta, 0, layer.batch*layer.h*layer.w*layer.c*sizeof(float));
-        for(i = 0; i < layer.batch; ++i){
-            col2im_cpu(c+i*n/layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, delta+i*n/layer.batch);
-        }
+        col2im_cpu(c, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta);
     }
 }

@@ -14,6 +14,7 @@ typedef struct {
     int n;
     int size;
     int stride;
+    int pad;
     float *filters;
     float *filter_updates;
     float *filter_momentum;
@@ -47,7 +48,7 @@ typedef struct {
 void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in);
 #endif

-convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation);
+convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation);
 void resize_convolutional_layer(convolutional_layer *layer, int h, int w, int c);
 void forward_convolutional_layer(const convolutional_layer layer, float *in);
 void update_convolutional_layer(convolutional_layer layer, float step, float momentum, float decay);

@@ -166,6 +166,14 @@ void scale_data_rows(data d, float s)
     }
 }

+void translate_data_rows(data d, float s)
+{
+    int i;
+    for(i = 0; i < d.X.rows; ++i){
+        translate_array(d.X.vals[i], d.X.cols, s);
+    }
+}
+
 void normalize_data_rows(data d)
 {
     int i;
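translate_array itself is not shown in this diff; a minimal sketch of what it is assumed to do (add s to every element), matching the call above:

    void translate_array(float *a, int n, float s)
    {
        int i;
        for(i = 0; i < n; ++i) a[i] += s;
    }

tune_VOC() calls translate_data_rows(train, -144) to roughly zero-center raw pixel values.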

@@ -22,6 +22,7 @@ list *get_paths(char *filename);
 data load_categorical_data_csv(char *filename, int target, int k);
 void normalize_data_rows(data d);
 void scale_data_rows(data d, float s);
+void translate_data_rows(data d, float s);
 void randomize_data(data d);
 data *split_data(data d, int part, int total);

@@ -0,0 +1,72 @@
+//(includes reconstructed for completeness; rand_uniform assumed to live in utils)
+#include "detection_layer.h"
+#include "utils.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+int detection_out_height(detection_layer layer)
+{
+    return layer.size + layer.h*layer.stride;
+}
+
+int detection_out_width(detection_layer layer)
+{
+    return layer.size + layer.w*layer.stride;
+}
+
+detection_layer *make_detection_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation)
+{
+    int i;
+    size = 2*(size/2)+1; //HA! And you thought you'd use an even sized filter...
+    detection_layer *layer = calloc(1, sizeof(detection_layer));
+    layer->h = h;
+    layer->w = w;
+    layer->c = c;
+    layer->n = n;
+    layer->batch = batch;
+    layer->stride = stride;
+    layer->size = size;
+    assert(c%n == 0);
+
+    layer->filters = calloc(c*size*size, sizeof(float));
+    layer->filter_updates = calloc(c*size*size, sizeof(float));
+    layer->filter_momentum = calloc(c*size*size, sizeof(float));
+
+    float scale = 1./(size*size*c);
+    for(i = 0; i < c*size*size; ++i) layer->filters[i] = scale*(rand_uniform());
+
+    int out_h = detection_out_height(*layer);
+    int out_w = detection_out_width(*layer);
+
+    layer->output = calloc(layer->batch * out_h * out_w * n, sizeof(float));
+    layer->delta = calloc(layer->batch * out_h * out_w * n, sizeof(float));
+
+    layer->activation = activation;
+
+    fprintf(stderr, "Detection Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);
+    srand(0);
+
+    return layer;
+}
+
+void forward_detection_layer(const detection_layer layer, float *in)
+{
+    int out_h = detection_out_height(layer);
+    int out_w = detection_out_width(layer);
+    int i,j,fh,fw,c;
+    memset(layer.output, 0, layer.batch*layer.n*out_h*out_w*sizeof(float));
+    for(c = 0; c < layer.c; ++c){
+        for(i = 0; i < layer.h; ++i){
+            for(j = 0; j < layer.w; ++j){
+                float val = in[j+(i + c*layer.h)*layer.w];
+                for(fh = 0; fh < layer.size; ++fh){
+                    for(fw = 0; fw < layer.size; ++fw){
+                        int h = i*layer.stride + fh;
+                        int w = j*layer.stride + fw;
+                        layer.output[w+(h+c/layer.n*out_h)*out_w] += val*layer.filters[fw+(fh+c*layer.size)*layer.size];
+                    }
+                }
+            }
+        }
+    }
+}
+
+void backward_detection_layer(const detection_layer layer, float *delta)
+{
+}
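Unlike a convolution, this layer grows its output. A quick check of the size rule:

    /* out_h = size + h*stride: e.g. size = 3, stride = 2, h = 4 -> 3 + 8 = 11.
     * Each input pixel smears a size x size filter onto a strided output grid,
     * which is why forward_detection_layer accumulates with += . */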

@@ -0,0 +1,40 @@
+#ifndef DETECTION_LAYER_H
+#define DETECTION_LAYER_H
+
+//(includes assumed; opencl.h guards its contents with #ifdef GPU)
+#include "activations.h"
+#include "opencl.h"
+
+typedef struct {
+    int batch;
+    int h,w,c;
+    int n;
+    int size;
+    int stride;
+    float *filters;
+    float *filter_updates;
+    float *filter_momentum;
+
+    float *biases;
+    float *bias_updates;
+    float *bias_momentum;
+
+    float *col_image;
+    float *delta;
+    float *output;
+
+    #ifdef GPU
+    cl_mem filters_cl;
+    cl_mem filter_updates_cl;
+    cl_mem filter_momentum_cl;
+
+    cl_mem biases_cl;
+    cl_mem bias_updates_cl;
+    cl_mem bias_momentum_cl;
+
+    cl_mem col_image_cl;
+    cl_mem delta_cl;
+    cl_mem output_cl;
+    #endif
+
+    ACTIVATION activation;
+} detection_layer;
+
+#endif

@@ -27,8 +27,8 @@ __kernel void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
         int brow = i + sub_row;
         int bcol = col_block*BLOCK + sub_col;

-        Asub[sub_row][sub_col] = TA ? A[arow + acol*lda] : A[arow*lda + acol];
-        Bsub[sub_row][sub_col] = TB ? B[brow + bcol*ldb] : B[brow*ldb + bcol];
+        if(arow < M && acol < K) Asub[sub_row][sub_col] = TA ? A[arow + acol*lda] : A[arow*lda + acol];
+        if(brow < K && bcol < N) Bsub[sub_row][sub_col] = TB ? B[brow + bcol*ldb] : B[brow*ldb + bcol];

         barrier(CLK_LOCAL_MEM_FENCE);
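The guards matter whenever M, N, or K is not a multiple of the BLOCK tile size: edge work-groups used to read past the ends of A and B. A sketch of the failing case (BLOCK of 16 assumed for illustration):

    /* M = K = 20, BLOCK = 16: the second row-block covers rows 16..31,
     * so arow = 20..31 indexes past A unless guarded by arow < M. */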

@@ -1,27 +1,45 @@
 #include "mini_blas.h"

+inline float im2col_get_pixel(float *im, int height, int width, int channels,
+                        int row, int col, int channel, int pad)
+{
+    row -= pad;
+    col -= pad;
+    if (row < 0 || col < 0 ||
+        row >= height || col >= width) return 0;
+    return im[col + width*(row + channel*height)];
+}
+
 //From Berkeley Vision's Caffe!
 //https://github.com/BVLC/caffe/blob/master/LICENSE
 void im2col_cpu(float* data_im,
         const int batch, const int channels, const int height, const int width,
-        const int ksize, const int stride, float* data_col)
+        const int ksize, const int stride, int pad, float* data_col)
 {
     int c,h,w,b;
     int height_col = (height - ksize) / stride + 1;
     int width_col = (width - ksize) / stride + 1;
+    if (pad){
+        height_col = 1 + (height-1) / stride;
+        width_col = 1 + (width-1) / stride;
+        pad = ksize/2;
+    }
     int channels_col = channels * ksize * ksize;
     int im_size = height*width*channels;
     int col_size = height_col*width_col*channels_col;
-    for(b = 0; b < batch; ++b){
-        for ( c = 0; c < channels_col; ++c) {
+    for (b = 0; b < batch; ++b) {
+        for (c = 0; c < channels_col; ++c) {
             int w_offset = c % ksize;
             int h_offset = (c / ksize) % ksize;
             int c_im = c / ksize / ksize;
-            for ( h = 0; h < height_col; ++h) {
-                for ( w = 0; w < width_col; ++w) {
+            for (h = 0; h < height_col; ++h) {
+                for (w = 0; w < width_col; ++w) {
+                    int im_row = h_offset + h * stride;
+                    int im_col = w_offset + w * stride;
                     data_col[(c * height_col + h) * width_col + w] =
-                        data_im[(c_im * height + h * stride + h_offset) * width
-                        + w * stride + w_offset];
+                        im2col_get_pixel(data_im, height, width, channels,
+                                im_row, im_col, c_im, pad);
                 }
             }
         }
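im2col_get_pixel turns out-of-range reads into zeros, which is what implements the padding. Illustrative use (not part of the commit), unrolling one 3x3 single-channel image with a 3x3 kernel, stride 1, padding on:

    #include <stdlib.h>
    void im2col_cpu(float *data_im, const int batch, const int channels,
            const int height, const int width, const int ksize,
            const int stride, int pad, float *data_col);
    int main()
    {
        float im[9] = {1,2,3,4,5,6,7,8,9};
        /* channels_col = 1*3*3 = 9, height_col = width_col = 3 -> 81 entries */
        float *col = calloc(9*3*3, sizeof(float));
        im2col_cpu(im, 1, 1, 3, 3, 3, 1, 1, col);
        /* entries whose source row/col fell outside the image come back as 0 */
        free(col);
        return 0;
    }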

@@ -19,7 +19,6 @@ image get_maxpool_delta(maxpool_layer layer)
 maxpool_layer *make_maxpool_layer(int batch, int h, int w, int c, int stride)
 {
-    c = c*batch;
     fprintf(stderr, "Maxpool Layer: %d x %d x %d image, %d stride\n", h,w,c,stride);
     maxpool_layer *layer = calloc(1, sizeof(maxpool_layer));
     layer->batch = batch;
@@ -27,8 +26,8 @@ maxpool_layer *make_maxpool_layer(int batch, int h, int w, int c, int stride)
     layer->w = w;
     layer->c = c;
     layer->stride = stride;
-    layer->output = calloc(((h-1)/stride+1) * ((w-1)/stride+1) * c, sizeof(float));
-    layer->delta = calloc(((h-1)/stride+1) * ((w-1)/stride+1) * c, sizeof(float));
+    layer->output = calloc(((h-1)/stride+1) * ((w-1)/stride+1) * c*batch, sizeof(float));
+    layer->delta = calloc(((h-1)/stride+1) * ((w-1)/stride+1) * c*batch, sizeof(float));
     return layer;
 }
@@ -37,22 +36,30 @@ void resize_maxpool_layer(maxpool_layer *layer, int h, int w, int c)
     layer->h = h;
     layer->w = w;
     layer->c = c;
-    layer->output = realloc(layer->output, ((h-1)/layer->stride+1) * ((w-1)/layer->stride+1) * c * sizeof(float));
-    layer->delta = realloc(layer->delta, ((h-1)/layer->stride+1) * ((w-1)/layer->stride+1) * c * sizeof(float));
+    layer->output = realloc(layer->output, ((h-1)/layer->stride+1) * ((w-1)/layer->stride+1) * c * layer->batch * sizeof(float));
+    layer->delta = realloc(layer->delta, ((h-1)/layer->stride+1) * ((w-1)/layer->stride+1) * c * layer->batch * sizeof(float));
 }

 void forward_maxpool_layer(const maxpool_layer layer, float *in)
 {
-    image input = float_to_image(layer.h, layer.w, layer.c, in);
-    image output = get_maxpool_image(layer);
-    int i,j,k;
-    for(i = 0; i < output.h*output.w*output.c; ++i) output.data[i] = -DBL_MAX;
-    for(k = 0; k < input.c; ++k){
-        for(i = 0; i < input.h; ++i){
-            for(j = 0; j < input.w; ++j){
-                float val = get_pixel(input, i, j, k);
-                float cur = get_pixel(output, i/layer.stride, j/layer.stride, k);
-                if(val > cur) set_pixel(output, i/layer.stride, j/layer.stride, k, val);
+    int b;
+    for(b = 0; b < layer.batch; ++b){
+        image input = float_to_image(layer.h, layer.w, layer.c, in+b*layer.h*layer.w*layer.c);
+
+        int h = (layer.h-1)/layer.stride + 1;
+        int w = (layer.w-1)/layer.stride + 1;
+        int c = layer.c;
+        image output = float_to_image(h,w,c,layer.output+b*h*w*c);
+
+        int i,j,k;
+        for(i = 0; i < output.h*output.w*output.c; ++i) output.data[i] = -DBL_MAX;
+        for(k = 0; k < input.c; ++k){
+            for(i = 0; i < input.h; ++i){
+                for(j = 0; j < input.w; ++j){
+                    float val = get_pixel(input, i, j, k);
+                    float cur = get_pixel(output, i/layer.stride, j/layer.stride, k);
+                    if(val > cur) set_pixel(output, i/layer.stride, j/layer.stride, k, val);
+                }
             }
         }
     }
 }
@@ -60,21 +67,28 @@ void forward_maxpool_layer(const maxpool_layer layer, float *in)
 void backward_maxpool_layer(const maxpool_layer layer, float *in, float *delta)
 {
-    image input = float_to_image(layer.h, layer.w, layer.c, in);
-    image input_delta = float_to_image(layer.h, layer.w, layer.c, delta);
-    image output_delta = get_maxpool_delta(layer);
-    image output = get_maxpool_image(layer);
-    int i,j,k;
-    for(k = 0; k < input.c; ++k){
-        for(i = 0; i < input.h; ++i){
-            for(j = 0; j < input.w; ++j){
-                float val = get_pixel(input, i, j, k);
-                float cur = get_pixel(output, i/layer.stride, j/layer.stride, k);
-                float d = get_pixel(output_delta, i/layer.stride, j/layer.stride, k);
-                if(val == cur) {
-                    set_pixel(input_delta, i, j, k, d);
+    int b;
+    for(b = 0; b < layer.batch; ++b){
+        image input = float_to_image(layer.h, layer.w, layer.c, in+b*layer.h*layer.w*layer.c);
+        image input_delta = float_to_image(layer.h, layer.w, layer.c, delta+b*layer.h*layer.w*layer.c);
+
+        int h = (layer.h-1)/layer.stride + 1;
+        int w = (layer.w-1)/layer.stride + 1;
+        int c = layer.c;
+        image output = float_to_image(h,w,c,layer.output+b*h*w*c);
+        image output_delta = float_to_image(h,w,c,layer.delta+b*h*w*c);
+
+        int i,j,k;
+        for(k = 0; k < input.c; ++k){
+            for(i = 0; i < input.h; ++i){
+                for(j = 0; j < input.w; ++j){
+                    float val = get_pixel(input, i, j, k);
+                    float cur = get_pixel(output, i/layer.stride, j/layer.stride, k);
+                    float d = get_pixel(output_delta, i/layer.stride, j/layer.stride, k);
+                    if(val == cur) {
+                        set_pixel(input_delta, i, j, k, d);
+                    }
+                    else set_pixel(input_delta, i, j, k, 0);
                 }
-                else set_pixel(input_delta, i, j, k, 0);
             }
         }
     }
 }
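Both passes now step through the batch with plain pointer offsets; a quick check of the per-image strides:

    /* image b starts at in + b*h*w*c; its pooled output at
     * output + b*out_h*out_w*c with out_h = (h-1)/stride + 1,
     * e.g. h = 28, stride = 2 -> out_h = 14. */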

@@ -17,28 +17,6 @@ void pm(int M, int N, float *A)
     printf("\n");
 }

-//This one might be too, can't remember.
-void col2im_cpu(float* data_col, const int channels,
-        const int height, const int width, const int ksize, const int stride,
-        float* data_im)
-{
-    int c,h,w;
-    int height_col = (height - ksize) / stride + 1;
-    int width_col = (width - ksize) / stride + 1;
-    int channels_col = channels * ksize * ksize;
-    for ( c = 0; c < channels_col; ++c) {
-        int w_offset = c % ksize;
-        int h_offset = (c / ksize) % ksize;
-        int c_im = c / ksize / ksize;
-        for ( h = 0; h < height_col; ++h) {
-            for ( w = 0; w < width_col; ++w) {
-                data_im[(c_im * height + h * stride + h_offset) * width
-                    + w * stride + w_offset] += data_col[(c * height_col + h) * width_col + w];
-            }
-        }
-    }
-}
-
 float *random_matrix(int rows, int cols)
 {
     int i;

@@ -27,11 +27,11 @@ void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
 void im2col_cpu(float* data_im,
         const int batch, const int channels, const int height, const int width,
-        const int ksize, const int stride, float* data_col);
+        const int ksize, const int stride, int pad, float* data_col);

-void col2im_cpu(float* data_col, const int channels,
-        const int height, const int width, const int ksize, const int stride,
-        float* data_im);
+void col2im_cpu(float* data_col,
+        const int batch, const int channels, const int height, const int width,
+        const int ksize, const int stride, int pad, float* data_im);

 void test_blas();

 void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,

@@ -113,10 +113,9 @@ void save_network(network net, char *filename)
     fclose(fp);
 }

+#ifdef GPU
 void forward_network(network net, float *input, int train)
 {
-    int i;
-#ifdef GPU
     cl_setup();
     size_t size = get_network_input_size(net);
     if(!net.input_cl){
@@ -126,16 +125,46 @@ void forward_network(network net, float *input, int train)
     }
     cl_write_array(net.input_cl, input, size);
     cl_mem input_cl = net.input_cl;
-#endif
+    int i;
     for(i = 0; i < net.n; ++i){
         if(net.types[i] == CONVOLUTIONAL){
             convolutional_layer layer = *(convolutional_layer *)net.layers[i];
-#ifdef GPU
             forward_convolutional_layer_gpu(layer, input_cl);
             input_cl = layer.output_cl;
-#else
+            input = layer.output;
+        }
+        else if(net.types[i] == CONNECTED){
+            connected_layer layer = *(connected_layer *)net.layers[i];
+            forward_connected_layer(layer, input, train);
+            input = layer.output;
+        }
+        else if(net.types[i] == SOFTMAX){
+            softmax_layer layer = *(softmax_layer *)net.layers[i];
+            forward_softmax_layer(layer, input);
+            input = layer.output;
+        }
+        else if(net.types[i] == MAXPOOL){
+            maxpool_layer layer = *(maxpool_layer *)net.layers[i];
+            forward_maxpool_layer(layer, input);
+            input = layer.output;
+        }
+        else if(net.types[i] == NORMALIZATION){
+            normalization_layer layer = *(normalization_layer *)net.layers[i];
+            forward_normalization_layer(layer, input);
+            input = layer.output;
+        }
+    }
+}
+#else
+void forward_network(network net, float *input, int train)
+{
+    int i;
+    for(i = 0; i < net.n; ++i){
+        if(net.types[i] == CONVOLUTIONAL){
+            convolutional_layer layer = *(convolutional_layer *)net.layers[i];
             forward_convolutional_layer(layer, input);
-#endif
             input = layer.output;
         }
         else if(net.types[i] == CONNECTED){
@@ -160,6 +189,7 @@ void forward_network(network net, float *input, int train)
         }
     }
 }
+#endif

 void update_network(network net, float step, float momentum, float decay)
 {
@@ -238,9 +268,10 @@ float calculate_error_network(network net, float *truth)
     float sum = 0;
     float *delta = get_network_delta(net);
     float *out = get_network_output(net);
-    int i, k = get_network_output_size(net);
-    for(i = 0; i < k; ++i){
-        //printf("%f, ", out[i]);
+    int i;
+    for(i = 0; i < get_network_output_size(net)*net.batch; ++i){
+        //if(i %get_network_output_size(net) == 0) printf("\n");
+        //printf("%5.2f %5.2f, ", out[i], truth[i]);
         delta[i] = truth[i] - out[i];
         sum += delta[i]*delta[i];
     }
@@ -305,20 +336,38 @@ float train_network_datum(network net, float *x, float *y, float step, float momentum, float decay)
 float train_network_sgd(network net, data d, int n, float step, float momentum,float decay)
 {
-    int i;
-    float error = 0;
-    int correct = 0;
-    int pos = 0;
+    int batch = net.batch;
+    float *X = calloc(batch*d.X.cols, sizeof(float));
+    float *y = calloc(batch*d.y.cols, sizeof(float));
+
+    int i,j;
+    float sum = 0;
     for(i = 0; i < n; ++i){
-        int index = rand()%d.X.rows;
-        float err = train_network_datum(net, d.X.vals[index], d.y.vals[index], step, momentum, decay);
+        for(j = 0; j < batch; ++j){
+            int index = rand()%d.X.rows;
+            memcpy(X+j*d.X.cols, d.X.vals[index], d.X.cols*sizeof(float));
+            memcpy(y+j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float));
+        }
+        float err = train_network_datum(net, X, y, step, momentum, decay);
+        sum += err;
+        //train_network_datum(net, X, y, step, momentum, decay);
+        /*
         float *y = d.y.vals[index];
         int class = get_predicted_class_network(net);
         correct += (y[class]?1:0);
-        if(y[1]){
-            error += err;
-            ++pos;
-        }
+        */
+
+        /*
+        for(j = 0; j < d.y.cols*batch; ++j){
+            printf("%6.3f ", y[j]);
+        }
+        printf("\n");
+        for(j = 0; j < d.y.cols*batch; ++j){
+            printf("%6.3f ", get_network_output(net)[j]);
+        }
+        printf("\n");
+        printf("\n");
+        */
         //printf("%d %f %f\n", i,net.output[0], d.y.vals[index][0]);
@@ -327,7 +376,9 @@ float train_network_sgd(network net, data d, int n, float step, float momentum,float decay)
         //}
     }
     //printf("Accuracy: %f\n",(float) correct/n);
-    return error/pos;
+    free(X);
+    free(y);
+    return (float)sum/(n*batch);
 }
 float train_network_batch(network net, data d, int n, float step, float momentum,float decay)
 {
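After this change n counts mini-batches rather than single examples, so one call to train_network_sgd now touches n*net.batch samples and returns the loss averaged over all of them. Typical call, mirroring test_nist above:

    float loss = train_network_sgd(net, train, 100, lr, momentum, decay); /* 100 batches of net.batch examples */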
@@ -448,7 +499,7 @@ int get_network_output_size(network net)
 int get_network_input_size(network net)
 {
-    return get_network_output_size_layer(net, 0);
+    return get_network_input_size_layer(net, 0);
 }

 image get_network_image_layer(network net, int i)
@@ -505,15 +556,24 @@ float *network_predict(network net, float *input)
 matrix network_predict_data(network net, data test)
 {
-    int i,j;
+    int i,j,b;
     int k = get_network_output_size(net);
     matrix pred = make_matrix(test.X.rows, k);
-    for(i = 0; i < test.X.rows; ++i){
-        float *out = network_predict(net, test.X.vals[i]);
-        for(j = 0; j < k; ++j){
-            pred.vals[i][j] = out[j];
+    float *X = calloc(net.batch*test.X.rows, sizeof(float));
+    for(i = 0; i < test.X.rows; i += net.batch){
+        for(b = 0; b < net.batch; ++b){
+            if(i+b == test.X.rows) break;
+            memcpy(X+b*test.X.cols, test.X.vals[i+b], test.X.cols*sizeof(float));
+        }
+        float *out = network_predict(net, X);
+        for(b = 0; b < net.batch; ++b){
+            if(i+b == test.X.rows) break;
+            for(j = 0; j < k; ++j){
+                pred.vals[i+b][j] = out[j+b*k];
+            }
         }
     }
+    free(X);
     return pred;
 }

@@ -32,7 +32,8 @@ cl_info cl_init()
     if(num_devices > MAX_DEVICES) num_devices = MAX_DEVICES;
     int index = getpid()%num_devices;
     printf("%d rand, %d devices, %d index\n", getpid(), num_devices, index);
-    info.device = devices[index];
+    //info.device = devices[index];
+    info.device = devices[1];
     fprintf(stderr, "Found %d device(s)\n", num_devices);
     check_error(info);
@@ -102,4 +103,21 @@ void cl_write_array(cl_mem mem, float *x, int n)
     check_error(cl);
 }

+void cl_copy_array(cl_mem src, cl_mem dst, int n)
+{
+    cl_setup();
+    clEnqueueCopyBuffer(cl.queue, src, dst, 0, 0, sizeof(float)*n,0,0,0);
+    check_error(cl);
+}
+
+cl_mem cl_make_array(float *x, int n)
+{
+    cl_setup();
+    cl_mem mem = clCreateBuffer(cl.context,
+            CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+            sizeof(float)*n, x, &cl.error);
+    check_error(cl);
+    return mem;
+}
+
 #endif
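cl_make_array gives a host buffer a device mirror in one call; CL_MEM_COPY_HOST_PTR makes clCreateBuffer copy the host data at creation time, so the device starts out in sync. This commit uses it when building convolutional layers, e.g.:

    layer->filters_cl = cl_make_array(layer->filters, c*n*size*size);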

@@ -23,5 +23,7 @@ void check_error(cl_info info);
 cl_kernel get_kernel(char *filename, char *kernelname, char *options);
 void cl_read_array(cl_mem mem, float *x, int n);
 void cl_write_array(cl_mem mem, float *x, int n);
+cl_mem cl_make_array(float *x, int n);
+void cl_copy_array(cl_mem src, cl_mem dst, int n);
 #endif
 #endif

@@ -48,6 +48,7 @@ convolutional_layer *parse_convolutional(list *options, network net, int count)
     int n = option_find_int(options, "filters",1);
     int size = option_find_int(options, "size",1);
     int stride = option_find_int(options, "stride",1);
+    int pad = option_find_int(options, "pad",0);
     char *activation_s = option_find_str(options, "activation", "sigmoid");
     ACTIVATION activation = get_activation(activation_s);
     if(count == 0){
@@ -62,7 +63,7 @@ convolutional_layer *parse_convolutional(list *options, network net, int count)
         c = m.c;
         if(h == 0) error("Layer before convolutional layer must output image.");
     }
-    convolutional_layer *layer = make_convolutional_layer(net.batch,h,w,c,n,size,stride, activation);
+    convolutional_layer *layer = make_convolutional_layer(net.batch,h,w,c,n,size,stride,pad,activation);
     char *data = option_find_str(options, "data", 0);
     if(data){
        char *curr = data;
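With the parser change, padding can be switched on per layer from the cfg file. A hedged sketch of a section it would accept (the "[conv]" section name and surrounding keys are assumed from the options read above, not shown in this diff):

    [conv]
    filters=32
    size=3
    stride=1
    pad=1
    activation=relu

pad defaults to 0, so existing cfg files keep their unpadded behavior.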
