From e455cdef789d90fc01f0c1cc42663a28f93ac4e0 Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Mon, 1 Jun 2020 05:13:21 +0300
Subject: [PATCH] Added [conv_lstm] lstm_activation=tanh shortcut=1

---
 include/darknet.h     |  1 +
 src/blas_kernels.cu   | 12 +++++++-----
 src/conv_lstm_layer.c | 47 ++++++++++++++++++++++++++-----------------
 src/parser.c          |  5 ++++-
 src/utils.c           |  4 ++--
 5 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/include/darknet.h b/include/darknet.h
index 6926b0d5..544003eb 100644
--- a/include/darknet.h
+++ b/include/darknet.h
@@ -205,6 +205,7 @@ typedef struct update_args {
 struct layer {
     LAYER_TYPE type;
     ACTIVATION activation;
+    ACTIVATION lstm_activation;
     COST_TYPE cost_type;
     void(*forward) (struct layer, struct network_state);
     void(*backward) (struct layer, struct network_state);
diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu
index de0a2cb4..c3d3fd06 100644
--- a/src/blas_kernels.cu
+++ b/src/blas_kernels.cu
@@ -1539,7 +1539,8 @@ __global__ void add_3_arrays_activate_kernel(float *a1, float *a2, float *a3, si
         if (a2) val += a2[index];
         if (a3) val += a3[index];
         if (a == LOGISTIC) val = 1.f / (1.f + expf(-val));
-        else if(a == TANH) val = (2 / (1 + expf(-2 * val)) - 1);
+        else if (a == TANH) val = (2 / (1 + expf(-2 * val)) - 1);
+        else if (a == LEAKY) val = (val < 0) ? val*0.1 : val;
         dst[index] = val;
     }
 }
@@ -1548,7 +1549,7 @@ extern "C" void add_3_arrays_activate(float *a1, float *a2, float *a3, size_t si
 {
     const int block_size = BLOCK;
     const int num_blocks = get_number_of_blocks(size, block_size);
-    if (a != LOGISTIC && a != TANH) {
-        printf(" add_3_arrays_activate() doesn't support activation %d, it supports only LOGISTIC and TANH \n", a);
+    if (!(a == LOGISTIC || a == TANH || a == LEAKY || a == LINEAR)) {
+        printf(" add_3_arrays_activate() doesn't support activation %d, it supports only LOGISTIC, TANH, LEAKY and LINEAR \n", a);
         exit(EXIT_FAILURE);
     }
@@ -1578,6 +1579,7 @@ __global__ void activate_and_mult_kernel(float *a1, float *a2, size_t size, ACTI
     if (index < size) {
         float val = a1[index];
         if (a == TANH) val = (2 / (1 + expf(-2 * val)) - 1);
+        else if (a == LEAKY) val = (val < 0) ? val*0.1 : val;
         dst[index] = val * a2[index];
     }
 }
@@ -1586,7 +1588,7 @@ extern "C" void activate_and_mult(float *a1, float *a2, size_t size, ACTIVATION
 {
     const int block_size = BLOCK;
     const int num_blocks = get_number_of_blocks(size, block_size);
-    if (a != TANH) {
-        printf(" activat_and_mult() doesn't support activation %d, it supports only TANH \n", a);
+    if (!(a == TANH || a == LEAKY || a == LINEAR)) {
+        printf(" activate_and_mult() doesn't support activation %d, it supports only TANH, LEAKY and LINEAR \n", a);
         exit(EXIT_FAILURE);
     }
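For reference, a standalone C sketch of the per-element activation math that add_3_arrays_activate_kernel() and activate_and_mult_kernel() now share (not part of the patch; the enum and function names here are illustrative, not darknet's). TANH uses the identity tanh(x) = 2/(1+exp(-2x)) - 1 exactly as in the CUDA code above, LEAKY uses a fixed 0.1 slope, and LINEAR passes the value through unchanged, which is why the support checks accept it without a dedicated branch in the kernels:

    #include <math.h>

    typedef enum { ACT_LOGISTIC, ACT_TANH, ACT_LEAKY, ACT_LINEAR } activation_t;  /* illustrative names */

    static float apply_activation(float val, activation_t a)
    {
        if (a == ACT_LOGISTIC) return 1.f / (1.f + expf(-val));              /* sigmoid */
        if (a == ACT_TANH)     return 2.f / (1.f + expf(-2.f * val)) - 1.f;  /* == tanhf(val) */
        if (a == ACT_LEAKY)    return (val < 0) ? 0.1f * val : val;          /* leaky ReLU, slope 0.1 */
        return val;                                                          /* LINEAR: identity */
    }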
diff --git a/src/conv_lstm_layer.c b/src/conv_lstm_layer.c
index 32967542..b7e274aa 100644
--- a/src/conv_lstm_layer.c
+++ b/src/conv_lstm_layer.c
@@ -889,7 +889,7 @@ void forward_conv_lstm_layer_gpu(layer l, network_state state)
             simple_copy_ongpu(l.outputs*l.batch, state.input, l.bottelneck_hi_gpu + l.outputs*l.batch);
             s.input = l.bottelneck_hi_gpu;
             forward_convolutional_layer_gpu(wf, s);   // 2x input channels
-            activate_array_ongpu(wf.output_gpu, l.outputs*l.batch, TANH);
+            activate_array_ongpu(wf.output_gpu, l.outputs*l.batch, l.lstm_activation);
             s.input = wf.output_gpu;
         }
         else {
@@ -928,7 +928,7 @@ void forward_conv_lstm_layer_gpu(layer l, network_state state)
         //activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);

         // g = wg + ug
-        add_3_arrays_activate((l.bottleneck)?NULL:wg.output_gpu, ug.output_gpu, NULL, l.outputs*l.batch, TANH, l.g_gpu);
+        add_3_arrays_activate((l.bottleneck)?NULL:wg.output_gpu, ug.output_gpu, NULL, l.outputs*l.batch, l.lstm_activation, l.g_gpu);
         //copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
         //axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
         //activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH);
@@ -952,18 +952,25 @@ void forward_conv_lstm_layer_gpu(layer l, network_state state)
         //activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);

         // h = o * tanh(c)
-        activate_and_mult(l.c_gpu, l.o_gpu, l.outputs*l.batch, TANH, l.h_gpu);
+        activate_and_mult(l.c_gpu, l.o_gpu, l.outputs*l.batch, l.lstm_activation, l.h_gpu);
         //simple_copy_ongpu(l.outputs*l.batch, l.c_gpu, l.h_gpu);
         //activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
         //mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.h_gpu, 1);

-        fix_nan_and_inf(l.c_gpu, l.outputs*l.batch);
-        fix_nan_and_inf(l.h_gpu, l.outputs*l.batch);
+        fix_nan_and_inf(l.c_gpu, l.outputs*l.batch);   // should stay fix_nan_and_inf() (the backward pass switches to reset_nan_and_inf())
+        fix_nan_and_inf(l.h_gpu, l.outputs*l.batch);   // should stay fix_nan_and_inf() (the backward pass switches to reset_nan_and_inf())
         if (l.state_constrain) constrain_ongpu(l.outputs*l.batch, l.state_constrain, l.c_gpu, 1);

         if(state.train) simple_copy_ongpu(l.outputs*l.batch, l.c_gpu, l.cell_gpu);
         simple_copy_ongpu(l.outputs*l.batch, l.h_gpu, l.output_gpu); // is required for both Detection and Training

+        if (l.shortcut) {
+            printf("\n shortcut \n");
+            // partial residual connection
+            if (l.bottleneck) axpy_ongpu(l.outputs*l.batch/2, 1, wf.output_gpu, 1, l.output_gpu, 1);
+            //else axpy_ongpu(l.outputs*l.batch, 1, l.f_gpu, 1, l.output_gpu, 1);
+        }
+
         state.input += l.inputs*l.batch;
         l.output_gpu += l.outputs*l.batch;
         l.cell_gpu += l.outputs*l.batch;
@@ -1066,10 +1073,10 @@ void backward_conv_lstm_layer_gpu(layer l, network_state state)
         //activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);

         // g = wg + ug
-        add_3_arrays_activate((l.bottleneck) ? NULL : wg.output_gpu, ug.output_gpu, NULL, l.outputs*l.batch, TANH, l.g_gpu);
+        add_3_arrays_activate((l.bottleneck) ? NULL : wg.output_gpu, ug.output_gpu, NULL, l.outputs*l.batch, l.lstm_activation, l.g_gpu);   // was TANH
         //copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
         //axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
-        //activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH);
+        //activate_array_ongpu(l.g_gpu, l.outputs*l.batch, l.lstm_activation);

         // o = wo + uo + vo
         add_3_arrays_activate((l.bottleneck) ? NULL : wo.output_gpu, uo.output_gpu, (l.peephole) ? vo.output_gpu : NULL, l.outputs*l.batch, LOGISTIC, l.o_gpu);
@@ -1082,12 +1089,12 @@ void backward_conv_lstm_layer_gpu(layer l, network_state state)
         simple_copy_ongpu(l.outputs*l.batch, l.delta_gpu, l.temp3_gpu);   // temp3 = delta

         simple_copy_ongpu(l.outputs*l.batch, l.c_gpu, l.temp_gpu);
-        activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH);   // temp = tanh(c)
+        activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, l.lstm_activation);   // temp = tanh(c)

         simple_copy_ongpu(l.outputs*l.batch, l.temp3_gpu, l.temp2_gpu);
         mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.temp2_gpu, 1);   // temp2 = delta * o

-        gradient_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH, l.temp2_gpu);   // temp2 = delta * o * grad_tanh(tanh(c))
+        gradient_array_ongpu(l.temp_gpu, l.outputs*l.batch, l.lstm_activation, l.temp2_gpu);   // temp2 = delta * o * grad_tanh(tanh(c))
         //??? axpy_ongpu(l.outputs*l.batch, 1, l.dc_gpu, 1, l.temp2_gpu, 1);   // temp2 = delta * o * grad_tanh(tanh(c)) + delta_c(???)

         // temp = tanh(c)
@@ -1095,7 +1102,7 @@ void backward_conv_lstm_layer_gpu(layer l, network_state state)
         // temp3 = delta

         simple_copy_ongpu(l.outputs*l.batch, l.c_gpu, l.temp_gpu);
-        activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH);   // temp = tanh(c)
+        activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, l.lstm_activation);   // temp = tanh(c)

         mul_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1);   // temp = delta * tanh(c)
         gradient_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);   // temp = delta * tanh(c) * grad_logistic(o)
@@ -1133,14 +1140,14 @@ void backward_conv_lstm_layer_gpu(layer l, network_state state)
         // g
         simple_copy_ongpu(l.outputs*l.batch, l.temp2_gpu, l.temp_gpu);
         mul_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
-        gradient_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH, l.temp_gpu);
+        gradient_array_ongpu(l.g_gpu, l.outputs*l.batch, l.lstm_activation, l.temp_gpu);
         // delta for c,f,i,g(w,u,v): temp = (delta * o * grad_tanh(tanh(c)) + delta_c(???)) * i * grad_tanh(g)

         if (!l.bottleneck) {
             simple_copy_ongpu(l.outputs*l.batch, l.temp_gpu, wg.delta_gpu);
             s.input = l.prev_state_gpu;
-            s.delta = l.dh_gpu;
-            backward_convolutional_layer_gpu(wg, s);
+            s.delta = l.dh_gpu;   // consider commenting this out
+            backward_convolutional_layer_gpu(wg, s);   // can lead to NaN
         }

         simple_copy_ongpu(l.outputs*l.batch, l.temp_gpu, ug.delta_gpu);
@@ -1170,8 +1177,8 @@ void backward_conv_lstm_layer_gpu(layer l, network_state state)
         if (!l.bottleneck) {
             simple_copy_ongpu(l.outputs*l.batch, l.temp_gpu, wi.delta_gpu);
             s.input = l.prev_state_gpu;
-            s.delta = l.dh_gpu;
-            backward_convolutional_layer_gpu(wi, s);
+            s.delta = l.dh_gpu;   // consider commenting this out
+            backward_convolutional_layer_gpu(wi, s);   // can lead to NaN (after ~1000 iterations)
         }

         simple_copy_ongpu(l.outputs*l.batch, l.temp_gpu, ui.delta_gpu);
@@ -1217,9 +1224,10 @@ void backward_conv_lstm_layer_gpu(layer l, network_state state)
             fill_ongpu(l.outputs * l.batch * 2, 0, l.bottelneck_delta_gpu, 1);
             s.input = l.bottelneck_hi_gpu;
             s.delta = l.bottelneck_delta_gpu;
-            gradient_array_ongpu(wf.output_gpu, l.outputs*l.batch, TANH, wf.delta_gpu);
+            if (l.shortcut) axpy_ongpu(l.outputs*l.batch/2, 1, l.delta_gpu, 1, wf.delta_gpu, 1);   // partial residual connection
+            gradient_array_ongpu(wf.output_gpu, l.outputs*l.batch, l.lstm_activation, wf.delta_gpu);

-            fix_nan_and_inf(wf.delta_gpu, l.outputs*l.batch);
+            reset_nan_and_inf(wf.delta_gpu, l.outputs*l.batch);
             constrain_ongpu(l.outputs*l.batch, 1, wf.delta_gpu, 1);
         }
         else {
@@ -1232,7 +1240,7 @@ void backward_conv_lstm_layer_gpu(layer l, network_state state)
         backward_convolutional_layer_gpu(wf, s);

         if (l.bottleneck) {
-            fix_nan_and_inf(l.bottelneck_delta_gpu, l.outputs*l.batch*2);
+            reset_nan_and_inf(l.bottelneck_delta_gpu, l.outputs*l.batch*2);
             //constrain_ongpu(l.outputs*l.batch*2, 1, l.bottelneck_delta_gpu, 1);
             if (l.dh_gpu) axpy_ongpu(l.outputs*l.batch, 1, l.bottelneck_delta_gpu, 1, l.dh_gpu, 1);
             axpy_ongpu(l.outputs*l.batch, 1, l.bottelneck_delta_gpu + l.outputs*l.batch, 1, state.delta, 1);   // lead to nan
@@ -1242,7 +1250,8 @@ void backward_conv_lstm_layer_gpu(layer l, network_state state)
         simple_copy_ongpu(l.outputs*l.batch, l.temp2_gpu, l.temp_gpu);
         mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.temp_gpu, 1);
         simple_copy_ongpu(l.outputs*l.batch, l.temp_gpu, l.dc_gpu);
-        fix_nan_and_inf(l.dc_gpu, l.outputs*l.batch);
+        reset_nan_and_inf(l.dc_gpu, l.outputs*l.batch);
+        if (i != 0) reset_nan_and_inf(l.dh_gpu, l.outputs*l.batch);
         // delta for c,f,i,g(w,u,v): delta_c = temp = (delta * o * grad_tanh(tanh(c)) + delta_c(???)) * f   // (grad_linear(c)==1)

         state.input -= l.inputs*l.batch;
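The shortcut=1 paths above form a partial residual connection in the bottleneck configuration: the forward pass adds the first half of wf.output_gpu onto l.output_gpu, and the backward pass adds the matching half of l.delta_gpu into wf.delta_gpu before the activation gradient is applied. A minimal CPU sketch of that pair, not part of the patch (function names hypothetical; n stands for l.outputs*l.batch):

    /* Forward: out[0 .. n/2) += wf_out[0 .. n/2)  -- one axpy over half the outputs */
    static void shortcut_forward(float *out, const float *wf_out, int n)
    {
        for (int i = 0; i < n / 2; ++i) out[i] += wf_out[i];
    }

    /* Backward: the same half of the output delta flows back into wf's delta */
    static void shortcut_backward(float *wf_delta, const float *delta, int n)
    {
        for (int i = 0; i < n / 2; ++i) wf_delta[i] += delta[i];
    }

Each loop corresponds to one axpy_ongpu(n/2, 1, x, 1, y, 1) call, which computes y += x over the first n/2 elements on the GPU.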
diff --git a/src/parser.c b/src/parser.c
index d2c02c25..5cf570fd 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -308,7 +308,7 @@ layer parse_conv_lstm(list *options, size_params params)
     int output_filters = option_find_int(options, "output", 1);
     int groups = option_find_int_quiet(options, "groups", 1);

-    char *activation_s = option_find_str(options, "activation", "LINEAR");
+    char *activation_s = option_find_str(options, "activation", "linear");
     ACTIVATION activation = get_activation(activation_s);
     int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
     int xnor = option_find_int_quiet(options, "xnor", 0);
@@ -320,6 +320,9 @@ layer parse_conv_lstm(list *options, size_params params)
     l.state_constrain = option_find_int_quiet(options, "state_constrain", params.time_steps * 32);
     l.shortcut = option_find_int_quiet(options, "shortcut", 0);

+    char *lstm_activation_s = option_find_str(options, "lstm_activation", "tanh");
+    l.lstm_activation = get_activation(lstm_activation_s);
+
     return l;
 }
diff --git a/src/utils.c b/src/utils.c
index 008c27f7..741ad4cb 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -645,8 +645,8 @@ void normalize_array(float *a, int n)
     for(i = 0; i < n; ++i){
        a[i] = (a[i] - mu)/sigma;
     }
-    mu = mean_array(a,n);
-    sigma = sqrt(variance_array(a,n));
+    //mu = mean_array(a,n);
+    //sigma = sqrt(variance_array(a,n));
 }

 void translate_array(float *a, int n, float s)
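With the parser change above, the gate/state nonlinearity that used to be hard-coded as TANH becomes configurable per [conv_lstm] section in the network .cfg; lstm_activation is the only new key (shortcut was already parsed, and this patch wires it into the GPU forward/backward pass). A hypothetical fragment using only keys visible in this patch, with illustrative values:

    [conv_lstm]
    output=128
    groups=1
    state_constrain=32
    activation=linear
    lstm_activation=leaky
    shortcut=1

When omitted, lstm_activation defaults to tanh and shortcut to 0, so existing configs keep their previous behavior.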