From cc41339805d8b48edef67d0dcf4925b45c212277 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Wed, 19 Jun 2019 23:36:02 +0300 Subject: [PATCH] Added [scale_channels] layer for squeeze-and-excitation blocks --- Makefile | 2 +- build/darknet/darknet.vcxproj | 2 + include/darknet.h | 2 + src/blas.h | 5 ++ src/blas_kernels.cu | 46 +++++++++++++ src/classifier.c | 4 +- src/convolutional_kernels.cu | 7 +- src/dropout_layer_kernels.cu | 7 ++ src/network.c | 1 + src/network_kernels.cu | 1 + src/parser.c | 38 +++++++++++ src/scale_channels_layer.c | 118 ++++++++++++++++++++++++++++++++++ src/scale_channels_layer.h | 23 +++++++ src/shortcut_layer.c | 2 + 14 files changed, 253 insertions(+), 5 deletions(-) create mode 100644 src/scale_channels_layer.c create mode 100644 src/scale_channels_layer.h diff --git a/Makefile b/Makefile index 7f0547d3..dc880be9 100644 --- a/Makefile +++ b/Makefile @@ -118,7 +118,7 @@ LDFLAGS+= -L/usr/local/zed/lib -lsl_core -lsl_input -lsl_zed #-lstdc++ -D_GLIBCXX_USE_CXX11_ABI=0 endif -OBJ=image_opencv.o http_stream.o gemm.o utils.o dark_cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o reorg_old_layer.o super.o voxel.o tree.o yolo_layer.o upsample_layer.o lstm_layer.o conv_lstm_layer.o +OBJ=image_opencv.o http_stream.o gemm.o utils.o dark_cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o 
connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o reorg_old_layer.o super.o voxel.o tree.o yolo_layer.o upsample_layer.o lstm_layer.o conv_lstm_layer.o scale_channels_layer.o ifeq ($(GPU), 1) LDFLAGS+= -lstdc++ OBJ+=convolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o network_kernels.o avgpool_layer_kernels.o diff --git a/build/darknet/darknet.vcxproj b/build/darknet/darknet.vcxproj index 5a78ac60..019e02c6 100644 --- a/build/darknet/darknet.vcxproj +++ b/build/darknet/darknet.vcxproj @@ -226,6 +226,7 @@ + @@ -284,6 +285,7 @@ + diff --git a/include/darknet.h b/include/darknet.h index e807ff9b..25e702b2 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -136,6 +136,7 @@ typedef enum { AVGPOOL, LOCAL, SHORTCUT, + SCALE_CHANNELS, ACTIVE, RNN, GRU, @@ -153,6 +154,7 @@ typedef enum { UPSAMPLE, LOGXENT, L2NORM, + EMPTY, BLANK } LAYER_TYPE; diff --git a/src/blas.h b/src/blas.h index ca9cd588..2b46eba3 100644 --- a/src/blas.h +++ b/src/blas.h @@ -117,6 +117,11 @@ void add_3_arrays_activate(float *a1, float *a2, float *a3, size_t size, ACTIVAT void sum_of_mults(float *a1, float *a2, float *b1, float *b2, size_t size, float *dst); void activate_and_mult(float *a1, float *a2, size_t size, ACTIVATION a, float *dst); +void scale_channels_gpu(float *in_w_h_c, int size, int channel_size, float *scales_c, float *out); +void backward_scale_channels_gpu(float *in_w_h_c_delta, int size, int channel_size, + float *in_scales_c, float *out_from_delta, + float *in_from_output, float *out_state_delta); + 
#endif #ifdef __cplusplus } diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu index ed418c10..35f9a1ad 100644 --- a/src/blas_kernels.cu +++ b/src/blas_kernels.cu @@ -1099,3 +1099,49 @@ extern "C" void activate_and_mult(float *a1, float *a2, size_t size, ACTIVATION } activate_and_mult_kernel << <num_blocks, block_size, 0, get_cuda_stream() >> >(a1, a2, size, a, dst); } + + + +__global__ void scale_channels_kernel(float *in_w_h_c, int size, int channel_size, float *scales_c, float *out) +{ + const int index = blockIdx.x*blockDim.x + threadIdx.x; + if (index < size) { + out[index] = in_w_h_c[index] * scales_c[index / channel_size]; + } +} + +extern "C" void scale_channels_gpu(float *in_w_h_c, int size, int channel_size, float *scales_c, float *out) +{ + const int block_size = BLOCK; + const int num_blocks = get_number_of_blocks(size, block_size); + scale_channels_kernel << <num_blocks, block_size, 0, get_cuda_stream() >> >(in_w_h_c, size, channel_size, scales_c, out); + CHECK_CUDA(cudaPeekAtLastError()); +} + + +__global__ void backward_scale_channels_kernel(float *in_w_h_c_delta, int size, int channel_size, + float *in_scales_c, float *out_from_delta, + float *in_from_output, float *out_state_delta) +{ + const int index = blockIdx.x*blockDim.x + threadIdx.x; + if (index < size) { + out_state_delta[index / channel_size] += in_w_h_c_delta[index] * in_from_output[index]; // l.delta * from (should be divided by channel_size?)
+ out_from_delta[index] += in_scales_c[index / channel_size] * in_w_h_c_delta[index]; // input * l.delta + + //out_state_delta[index / channel_size] += in_w_h_c_delta[index] / channel_size; + //out_from_delta[index] = in_w_h_c_delta[index]; + } +} + +extern "C" void backward_scale_channels_gpu(float *in_w_h_c_delta, int size, int channel_size, + float *in_scales_c, float *out_from_delta, + float *in_from_output, float *out_state_delta) +{ + const int block_size = BLOCK; + const int num_blocks = get_number_of_blocks(size, block_size); + backward_scale_channels_kernel << <num_blocks, block_size, 0, get_cuda_stream() >> > (in_w_h_c_delta, size, channel_size, + in_scales_c, out_from_delta, + in_from_output, out_state_delta); + + CHECK_CUDA(cudaPeekAtLastError()); +} diff --git a/src/classifier.c b/src/classifier.c index f9708e44..f5fbfaeb 100644 --- a/src/classifier.c +++ b/src/classifier.c @@ -134,14 +134,14 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus, #else loss = train_network(net, train); #endif - if(avg_loss == -1) avg_loss = loss; + if(avg_loss == -1 || isnan(avg_loss) || isinf(avg_loss)) avg_loss = loss; avg_loss = avg_loss*.9 + loss*.1; i = get_current_batch(net); int calc_topk_for_each = iter_topk + 2 * train_images_num / (net.batch * net.subdivisions); // calculate TOPk for each 2 Epochs calc_topk_for_each = fmax(calc_topk_for_each, net.burn_in); - calc_topk_for_each = fmax(calc_topk_for_each, 1000); + calc_topk_for_each = fmax(calc_topk_for_each, 100); if (i % 10 == 0) { if (calc_topk) { fprintf(stderr, "\n (next TOP5 calculation at %d iterations) ", calc_topk_for_each); diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index ab8d7fa8..eaa6e803 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -415,7 +415,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) //if (state.use_mixed_precision) { int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); if
(state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in) && - l.c % 8 == 0 && l.n % 8 == 0) + (l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && !state.train) { //printf("\n CUDNN_HALF!!! state.index = %d \n", state.index); @@ -629,7 +629,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state //#ifdef CUDNN_HALF int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in) && - l.c % 8 == 0 && l.n % 8 == 0) + (l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && !state.train) { const size_t input16_size = l.batch*l.c*l.w*l.h; const size_t delta16_size = l.batch*l.n*l.out_w*l.out_h; @@ -910,6 +910,9 @@ void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init //float decay = a.decay; //int batch = a.batch; + fix_nan_and_inf(l.weight_updates_gpu, l.nweights); + fix_nan_and_inf(l.weights_gpu, l.nweights); + if (l.adam) { //adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t); adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.nweights, batch, l.t); diff --git a/src/dropout_layer_kernels.cu b/src/dropout_layer_kernels.cu index d3af0ea6..dadaa9fe 100644 --- a/src/dropout_layer_kernels.cu +++ b/src/dropout_layer_kernels.cu @@ -15,6 +15,10 @@ __global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand void forward_dropout_layer_gpu(dropout_layer layer, network_state state) { if (!state.train) return; + int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); + //if (iteration_num < state.net.burn_in) return; + + int size = layer.inputs*layer.batch; cuda_random(layer.rand_gpu, size); /* @@ -32,6 +36,9 @@ void forward_dropout_layer_gpu(dropout_layer layer, 
network_state state) void backward_dropout_layer_gpu(dropout_layer layer, network_state state) { if(!state.delta) return; + int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); + //if (iteration_num < state.net.burn_in) return; + int size = layer.inputs*layer.batch; yoloswag420blazeit360noscope<<>>(state.delta, size, layer.rand_gpu, layer.probability, layer.scale); diff --git a/src/network.c b/src/network.c index d6b8f5c7..0e655c78 100644 --- a/src/network.c +++ b/src/network.c @@ -318,6 +318,7 @@ void backward_network(network net, network_state state) } layer l = net.layers[i]; if (l.stopbackward) break; + if (l.onlyforward) continue; l.backward(l, state); } } diff --git a/src/network_kernels.cu b/src/network_kernels.cu index edaefc34..3ebf2900 100644 --- a/src/network_kernels.cu +++ b/src/network_kernels.cu @@ -106,6 +106,7 @@ void backward_network_gpu(network net, network_state state) state.input = prev.output_gpu; state.delta = prev.delta_gpu; } + if (l.onlyforward) continue; l.backward_gpu(l, state); /* diff --git a/src/parser.c b/src/parser.c index f3c02198..d45f625c 100644 --- a/src/parser.c +++ b/src/parser.c @@ -31,6 +31,7 @@ #include "rnn_layer.h" #include "route_layer.h" #include "shortcut_layer.h" +#include "scale_channels_layer.h" #include "softmax_layer.h" #include "utils.h" #include "upsample_layer.h" @@ -48,6 +49,7 @@ LAYER_TYPE string_to_layer_type(char * type) { if (strcmp(type, "[shortcut]")==0) return SHORTCUT; + if (strcmp(type, "[scale_channels]") == 0) return SCALE_CHANNELS; if (strcmp(type, "[crop]")==0) return CROP; if (strcmp(type, "[cost]")==0) return COST; if (strcmp(type, "[detection]")==0) return DETECTION; @@ -80,6 +82,7 @@ LAYER_TYPE string_to_layer_type(char * type) || strcmp(type, "[softmax]")==0) return SOFTMAX; if (strcmp(type, "[route]")==0) return ROUTE; if (strcmp(type, "[upsample]") == 0) return UPSAMPLE; + if (strcmp(type, "[empty]") == 0) return EMPTY; return BLANK; } @@ -600,6 +603,24 @@ layer 
parse_shortcut(list *options, size_params params, network net) } +layer parse_scale_channels(list *options, size_params params, network net) +{ + char *l = option_find(options, "from"); + int index = atoi(l); + if (index < 0) index = params.index + index; + + int batch = params.batch; + layer from = net.layers[index]; + + layer s = make_scale_channels_layer(batch, index, params.w, params.h, params.c, from.out_w, from.out_h, from.out_c); + + char *activation_s = option_find_str_quiet(options, "activation", "linear"); + ACTIVATION activation = get_activation(activation_s); + s.activation = activation; + return s; +} + + layer parse_activation(list *options, size_params params) { char *activation_s = option_find_str(options, "activation", "linear"); @@ -895,6 +916,10 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps) l = parse_shortcut(options, params, net); net.layers[count - 1].use_bin_output = 0; net.layers[l.index].use_bin_output = 0; + }else if (lt == SCALE_CHANNELS) { + l = parse_scale_channels(options, params, net); + net.layers[count - 1].use_bin_output = 0; + net.layers[l.index].use_bin_output = 0; }else if(lt == DROPOUT){ l = parse_dropout(options, params); l.output = net.layers[count-1].output; @@ -902,6 +927,19 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps) #ifdef GPU l.output_gpu = net.layers[count-1].output_gpu; l.delta_gpu = net.layers[count-1].delta_gpu; +#endif + } + else if (lt == EMPTY) { + layer empty_layer; + empty_layer.out_w = params.w; + empty_layer.out_h = params.h; + empty_layer.out_c = params.c; + l = empty_layer; + l.output = net.layers[count - 1].output; + l.delta = net.layers[count - 1].delta; +#ifdef GPU + l.output_gpu = net.layers[count - 1].output_gpu; + l.delta_gpu = net.layers[count - 1].delta_gpu; #endif }else{ fprintf(stderr, "Type not recognized: %s\n", s->type); diff --git a/src/scale_channels_layer.c b/src/scale_channels_layer.c new file mode 100644 index 
00000000..32d11014 --- /dev/null +++ b/src/scale_channels_layer.c @@ -0,0 +1,118 @@ +#include "scale_channels_layer.h" +#include "dark_cuda.h" +#include "blas.h" +#include <stdio.h> +#include <assert.h> + +layer make_scale_channels_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2) +{ + fprintf(stderr,"scale Layer: %d\n", index); + layer l = { (LAYER_TYPE)0 }; + l.type = SCALE_CHANNELS; + l.batch = batch; + l.w = w; + l.h = h; + l.c = c; + assert(w == 1 & h == 1); + + l.out_w = w2; + l.out_h = h2; + l.out_c = c2; + assert(l.out_c == l.c); + + l.outputs = l.out_w*l.out_h*l.out_c; + l.inputs = l.outputs; + l.index = index; + + l.delta = (float*)calloc(l.outputs * batch, sizeof(float)); + l.output = (float*)calloc(l.outputs * batch, sizeof(float)); + + l.forward = forward_scale_channels_layer; + l.backward = backward_scale_channels_layer; +#ifdef GPU + l.forward_gpu = forward_scale_channels_layer_gpu; + l.backward_gpu = backward_scale_channels_layer_gpu; + + l.delta_gpu = cuda_make_array(l.delta, l.outputs*batch); + l.output_gpu = cuda_make_array(l.output, l.outputs*batch); +#endif + return l; +} + +void resize_scale_channels_layer(layer *l, int w, int h) +{ + l->out_w = w; + l->out_h = h; + l->outputs = l->out_w*l->out_h*l->out_c; + l->inputs = l->outputs; + l->delta = (float*)realloc(l->delta, l->outputs * l->batch * sizeof(float)); + l->output = (float*)realloc(l->output, l->outputs * l->batch * sizeof(float)); + +#ifdef GPU + cuda_free(l->output_gpu); + cuda_free(l->delta_gpu); + l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch); + l->delta_gpu = cuda_make_array(l->delta, l->outputs*l->batch); +#endif + +} + +void forward_scale_channels_layer(const layer l, network_state state) +{ + int size = l.batch * l.out_c * l.out_w * l.out_h; + int channel_size = l.out_w * l.out_h; + float *from_output = state.net.layers[l.index].output; + + int i; + #pragma omp parallel for + for (i = 0; i < size; ++i) { + l.output[i] = state.input[i / channel_size] *
from_output[i]; + } + + activate_array(l.output, l.outputs*l.batch, l.activation); +} + +void backward_scale_channels_layer(const layer l, network_state state) +{ + gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta); + //axpy_cpu(l.outputs*l.batch, 1, l.delta, 1, state.delta, 1); + //scale_cpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta, l.w, l.h, l.c, state.net.layers[l.index].delta); + + int size = l.batch * l.out_c * l.out_w * l.out_h; + int channel_size = l.out_w * l.out_h; + float *from_output = state.net.layers[l.index].output; + float *from_delta = state.net.layers[l.index].delta; + + int i; + #pragma omp parallel for + for (i = 0; i < size; ++i) { + state.delta[i / channel_size] += l.delta[i] * from_output[i]; // l.delta * from (should be divided by channel_size?) + + from_delta[i] = state.input[i / channel_size] * l.delta[i]; // input * l.delta + } +} + +#ifdef GPU +void forward_scale_channels_layer_gpu(const layer l, network_state state) +{ + int size = l.batch * l.out_c * l.out_w * l.out_h; + int channel_size = l.out_w * l.out_h; + + scale_channels_gpu(state.net.layers[l.index].output_gpu, size, channel_size, state.input, l.output_gpu); + + activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); +} + +void backward_scale_channels_layer_gpu(const layer l, network_state state) +{ + gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); + + int size = l.batch * l.out_c * l.out_w * l.out_h; + int channel_size = l.out_w * l.out_h; + float *from_output = state.net.layers[l.index].output_gpu; + float *from_delta = state.net.layers[l.index].delta_gpu; + + + backward_scale_channels_gpu(l.delta_gpu, size, channel_size, state.input, from_delta, from_output, state.delta); +} +#endif diff --git a/src/scale_channels_layer.h b/src/scale_channels_layer.h new file mode 100644 index 00000000..a20c0703 --- /dev/null +++ b/src/scale_channels_layer.h @@ -0,0 +1,23 @@ +#ifndef SCALE_CHANNELS_LAYER_H +#define 
SCALE_CHANNELS_LAYER_H + +#include "layer.h" +#include "network.h" + +#ifdef __cplusplus +extern "C" { +#endif +layer make_scale_channels_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2); +void forward_scale_channels_layer(const layer l, network_state state); +void backward_scale_channels_layer(const layer l, network_state state); +void resize_scale_channels_layer(layer *l, int w, int h); + +#ifdef GPU +void forward_scale_channels_layer_gpu(const layer l, network_state state); +void backward_scale_channels_layer_gpu(const layer l, network_state state); +#endif + +#ifdef __cplusplus +} +#endif +#endif // SCALE_CHANNELS_LAYER_H diff --git a/src/shortcut_layer.c b/src/shortcut_layer.c index 06cd6056..070ae5d9 100644 --- a/src/shortcut_layer.c +++ b/src/shortcut_layer.c @@ -19,6 +19,8 @@ layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int l.outputs = w*h*c; l.inputs = l.outputs; + if(w != w2 || h != h2 || c != c2) fprintf(stderr, " w = %d, w2 = %d, h = %d, h2 = %d, c = %d, c2 = %d \n", w, w2, h, h2, c, c2); + l.index = index; l.delta = (float*)calloc(l.outputs * batch, sizeof(float));