From a7e5976c1b8e0d826d064bebd58fce2a046f718c Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Thu, 9 May 2019 22:47:23 +0300 Subject: [PATCH] Added sequential subdivisions --- include/darknet.h | 2 + src/conv_lstm_layer.c | 1162 ++++++++++++++++++++++++++++++++++++++++ src/conv_lstm_layer.h | 33 ++ src/detector.c | 10 +- src/network.c | 34 ++ src/network.h | 3 + src/network_kernels.cu | 6 +- src/parser.c | 12 +- 8 files changed, 1257 insertions(+), 5 deletions(-) create mode 100644 src/conv_lstm_layer.c create mode 100644 src/conv_lstm_layer.h diff --git a/include/darknet.h b/include/darknet.h index 29a62924..fbd7e27d 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -559,6 +559,7 @@ typedef struct network { int time_steps; int step; int max_batches; + float *seq_scales; float *scales; int *steps; int num_steps; @@ -591,6 +592,7 @@ typedef struct network { int track; int augment_speed; int sequential_subdivisions; + int init_sequential_subdivisions; int current_subdivision; int try_fix_nan; diff --git a/src/conv_lstm_layer.c b/src/conv_lstm_layer.c new file mode 100644 index 00000000..7d395ad0 --- /dev/null +++ b/src/conv_lstm_layer.c @@ -0,0 +1,1162 @@ +// Page 4: https://arxiv.org/abs/1506.04214v2 +// Page 3: https://arxiv.org/pdf/1705.06368v3.pdf +// https://wikimedia.org/api/rest_v1/media/math/render/svg/1edbece2559479959fe829e9c6657efb380debe7 + +#include "conv_lstm_layer.h" +#include "connected_layer.h" +#include "convolutional_layer.h" +#include "utils.h" +#include "dark_cuda.h" +#include "blas.h" +#include "gemm.h" + +#include +#include +#include +#include + +static void increment_layer(layer *l, int steps) +{ + int num = l->outputs*l->batch*steps; + l->output += num; + l->delta += num; + l->x += num; + l->x_norm += num; + +#ifdef GPU + l->output_gpu += num; + l->delta_gpu += num; + l->x_gpu += num; + l->x_norm_gpu += num; +#endif +} + + +layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, int steps, int size, int stride, 
int pad, ACTIVATION activation, int batch_normalize, int peephole, int xnor) +{ + fprintf(stderr, "CONV_LSTM Layer: %d x %d x %d image, %d filters\n", h, w, c, output_filters); + /* + batch = batch / steps; + layer l = { (LAYER_TYPE)0 }; + l.batch = batch; + l.type = LSTM; + l.steps = steps; + l.inputs = inputs; + l.out_w = 1; + l.out_h = 1; + l.out_c = outputs; + */ + batch = batch / steps; + layer l = { (LAYER_TYPE)0 }; + l.batch = batch; + l.type = CONV_LSTM; + l.steps = steps; + l.size = size; + l.stride = stride; + l.pad = pad; + l.h = h; + l.w = w; + l.c = c; + l.out_c = output_filters; + l.inputs = h * w * c; + l.xnor = xnor; + l.peephole = peephole; + + // U + l.uf = (layer*)malloc(sizeof(layer)); + *(l.uf) = make_convolutional_layer(batch, steps, h, w, c, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + l.uf->batch = batch; + if (l.workspace_size < l.uf->workspace_size) l.workspace_size = l.uf->workspace_size; + + l.ui = (layer*)malloc(sizeof(layer)); + *(l.ui) = make_convolutional_layer(batch, steps, h, w, c, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + l.ui->batch = batch; + if (l.workspace_size < l.ui->workspace_size) l.workspace_size = l.ui->workspace_size; + + l.ug = (layer*)malloc(sizeof(layer)); + *(l.ug) = make_convolutional_layer(batch, steps, h, w, c, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + l.ug->batch = batch; + if (l.workspace_size < l.ug->workspace_size) l.workspace_size = l.ug->workspace_size; + + l.uo = (layer*)malloc(sizeof(layer)); + *(l.uo) = make_convolutional_layer(batch, steps, h, w, c, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + l.uo->batch = batch; + if (l.workspace_size < l.uo->workspace_size) l.workspace_size = l.uo->workspace_size; + + + // W + l.wf = (layer*)malloc(sizeof(layer)); + *(l.wf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, 
size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + l.wf->batch = batch; + if (l.workspace_size < l.wf->workspace_size) l.workspace_size = l.wf->workspace_size; + + l.wi = (layer*)malloc(sizeof(layer)); + *(l.wi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + l.wi->batch = batch; + if (l.workspace_size < l.wi->workspace_size) l.workspace_size = l.wi->workspace_size; + + l.wg = (layer*)malloc(sizeof(layer)); + *(l.wg) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + l.wg->batch = batch; + if (l.workspace_size < l.wg->workspace_size) l.workspace_size = l.wg->workspace_size; + + l.wo = (layer*)malloc(sizeof(layer)); + *(l.wo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + l.wo->batch = batch; + if (l.workspace_size < l.wo->workspace_size) l.workspace_size = l.wo->workspace_size; + + + // V + l.vf = (layer*)malloc(sizeof(layer)); + if (l.peephole) { + *(l.vf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + l.vf->batch = batch; + if (l.workspace_size < l.vf->workspace_size) l.workspace_size = l.vf->workspace_size; + } + + l.vi = (layer*)malloc(sizeof(layer)); + if (l.peephole) { + *(l.vi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + l.vi->batch = batch; + if (l.workspace_size < l.vi->workspace_size) l.workspace_size = l.vi->workspace_size; + } + + l.vo = (layer*)malloc(sizeof(layer)); + if (l.peephole) { + *(l.vo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + 
l.vo->batch = batch; + if (l.workspace_size < l.vo->workspace_size) l.workspace_size = l.vo->workspace_size; + } + + + l.batch_normalize = batch_normalize; + + l.out_h = l.wo->out_h; + l.out_w = l.wo->out_w; + l.outputs = l.wo->outputs; + int outputs = l.outputs; + l.inputs = w*h*c; + + assert(l.wo->outputs == l.uo->outputs); + + l.output = (float*)calloc(outputs * batch * steps, sizeof(float)); + //l.state = (float*)calloc(outputs * batch, sizeof(float)); + + l.forward = forward_conv_lstm_layer; + l.update = update_conv_lstm_layer; + l.backward = backward_conv_lstm_layer; + + l.prev_state_cpu = (float*)calloc(batch*outputs, sizeof(float)); + l.prev_cell_cpu = (float*)calloc(batch*outputs, sizeof(float)); + l.cell_cpu = (float*)calloc(batch*outputs*steps, sizeof(float)); + + l.f_cpu = (float*)calloc(batch*outputs, sizeof(float)); + l.i_cpu = (float*)calloc(batch*outputs, sizeof(float)); + l.g_cpu = (float*)calloc(batch*outputs, sizeof(float)); + l.o_cpu = (float*)calloc(batch*outputs, sizeof(float)); + l.c_cpu = (float*)calloc(batch*outputs, sizeof(float)); + l.stored_c_cpu = (float*)calloc(batch*outputs, sizeof(float)); + l.h_cpu = (float*)calloc(batch*outputs, sizeof(float)); + l.stored_h_cpu = (float*)calloc(batch*outputs, sizeof(float)); + l.temp_cpu = (float*)calloc(batch*outputs, sizeof(float)); + l.temp2_cpu = (float*)calloc(batch*outputs, sizeof(float)); + l.temp3_cpu = (float*)calloc(batch*outputs, sizeof(float)); + l.dc_cpu = (float*)calloc(batch*outputs, sizeof(float)); + l.dh_cpu = (float*)calloc(batch*outputs, sizeof(float)); + +#ifdef GPU + l.forward_gpu = forward_conv_lstm_layer_gpu; + l.backward_gpu = backward_conv_lstm_layer_gpu; + l.update_gpu = update_conv_lstm_layer_gpu; + + //l.state_gpu = cuda_make_array(l.state, batch*l.outputs); + + l.output_gpu = cuda_make_array(0, batch*outputs*steps); + l.delta_gpu = cuda_make_array(0, batch*l.outputs*steps); + + l.prev_state_gpu = cuda_make_array(0, batch*outputs); + l.prev_cell_gpu = cuda_make_array(0, 
batch*outputs); + l.cell_gpu = cuda_make_array(0, batch*outputs*steps); + + l.f_gpu = cuda_make_array(0, batch*outputs); + l.i_gpu = cuda_make_array(0, batch*outputs); + l.g_gpu = cuda_make_array(0, batch*outputs); + l.o_gpu = cuda_make_array(0, batch*outputs); + l.c_gpu = cuda_make_array(0, batch*outputs); + l.h_gpu = cuda_make_array(0, batch*outputs); + l.stored_c_gpu = cuda_make_array(0, batch*outputs); + l.stored_h_gpu = cuda_make_array(0, batch*outputs); + l.temp_gpu = cuda_make_array(0, batch*outputs); + l.temp2_gpu = cuda_make_array(0, batch*outputs); + l.temp3_gpu = cuda_make_array(0, batch*outputs); + l.dc_gpu = cuda_make_array(0, batch*outputs); + l.dh_gpu = cuda_make_array(0, batch*outputs); + l.last_prev_state_gpu = cuda_make_array(0, l.batch*l.outputs); + l.last_prev_cell_gpu = cuda_make_array(0, l.batch*l.outputs); + +#endif + + l.bflops = l.uf->bflops + l.ui->bflops + l.ug->bflops + l.uo->bflops + + l.wf->bflops + l.wi->bflops + l.wg->bflops + l.wo->bflops + + l.vf->bflops + l.vi->bflops + l.vo->bflops; + + return l; +} + +void update_conv_lstm_layer(layer l, int batch, float learning_rate, float momentum, float decay) +{ + if (l.peephole) { + update_convolutional_layer(*(l.vf), batch, learning_rate, momentum, decay); + update_convolutional_layer(*(l.vi), batch, learning_rate, momentum, decay); + update_convolutional_layer(*(l.vo), batch, learning_rate, momentum, decay); + } + update_convolutional_layer(*(l.wf), batch, learning_rate, momentum, decay); + update_convolutional_layer(*(l.wi), batch, learning_rate, momentum, decay); + update_convolutional_layer(*(l.wg), batch, learning_rate, momentum, decay); + update_convolutional_layer(*(l.wo), batch, learning_rate, momentum, decay); + update_convolutional_layer(*(l.uf), batch, learning_rate, momentum, decay); + update_convolutional_layer(*(l.ui), batch, learning_rate, momentum, decay); + update_convolutional_layer(*(l.ug), batch, learning_rate, momentum, decay); + update_convolutional_layer(*(l.uo), 
batch, learning_rate, momentum, decay); +} + +void resize_conv_lstm_layer(layer *l, int w, int h) +{ + if (l->peephole) { + resize_convolutional_layer(l->vf, w, h); + if (l->workspace_size < l->vf->workspace_size) l->workspace_size = l->vf->workspace_size; + + resize_convolutional_layer(l->vi, w, h); + if (l->workspace_size < l->vi->workspace_size) l->workspace_size = l->vi->workspace_size; + + resize_convolutional_layer(l->vo, w, h); + if (l->workspace_size < l->vo->workspace_size) l->workspace_size = l->vo->workspace_size; + } + + resize_convolutional_layer(l->wf, w, h); + if (l->workspace_size < l->wf->workspace_size) l->workspace_size = l->wf->workspace_size; + + resize_convolutional_layer(l->wi, w, h); + if (l->workspace_size < l->wi->workspace_size) l->workspace_size = l->wi->workspace_size; + + resize_convolutional_layer(l->wg, w, h); + if (l->workspace_size < l->wg->workspace_size) l->workspace_size = l->wg->workspace_size; + + resize_convolutional_layer(l->wo, w, h); + if (l->workspace_size < l->wo->workspace_size) l->workspace_size = l->wo->workspace_size; + + + resize_convolutional_layer(l->uf, w, h); + if (l->workspace_size < l->uf->workspace_size) l->workspace_size = l->uf->workspace_size; + + resize_convolutional_layer(l->ui, w, h); + if (l->workspace_size < l->ui->workspace_size) l->workspace_size = l->ui->workspace_size; + + resize_convolutional_layer(l->ug, w, h); + if (l->workspace_size < l->ug->workspace_size) l->workspace_size = l->ug->workspace_size; + + resize_convolutional_layer(l->uo, w, h); + if (l->workspace_size < l->uo->workspace_size) l->workspace_size = l->uo->workspace_size; + + l->w = w; + l->h = h; + l->out_h = l->wo->out_h; + l->out_w = l->wo->out_w; + l->outputs = l->wo->outputs; + int outputs = l->outputs; + l->inputs = w*h*l->c; + int steps = l->steps; + int batch = l->batch; + + assert(l->wo->outputs == l->uo->outputs); + + l->output = (float*)realloc(l->output, outputs * batch * steps * sizeof(float)); + //l->state = 
(float*)realloc(l->state, outputs * batch * sizeof(float)); + + l->prev_state_cpu = (float*)realloc(l->prev_state_cpu, batch*outputs * sizeof(float)); + l->prev_cell_cpu = (float*)realloc(l->prev_cell_cpu, batch*outputs * sizeof(float)); + l->cell_cpu = (float*)realloc(l->cell_cpu, batch*outputs*steps * sizeof(float)); + + l->f_cpu = (float*)realloc(l->f_cpu, batch*outputs * sizeof(float)); + l->i_cpu = (float*)realloc(l->i_cpu, batch*outputs * sizeof(float)); + l->g_cpu = (float*)realloc(l->g_cpu, batch*outputs * sizeof(float)); + l->o_cpu = (float*)realloc(l->o_cpu, batch*outputs * sizeof(float)); + l->c_cpu = (float*)realloc(l->c_cpu, batch*outputs * sizeof(float)); + l->h_cpu = (float*)realloc(l->h_cpu, batch*outputs * sizeof(float)); + l->temp_cpu = (float*)realloc(l->temp_cpu, batch*outputs * sizeof(float)); + l->temp2_cpu = (float*)realloc(l->temp2_cpu, batch*outputs * sizeof(float)); + l->temp3_cpu = (float*)realloc(l->temp3_cpu, batch*outputs * sizeof(float)); + l->dc_cpu = (float*)realloc(l->dc_cpu, batch*outputs * sizeof(float)); + l->dh_cpu = (float*)realloc(l->dh_cpu, batch*outputs * sizeof(float)); + l->stored_c_cpu = (float*)realloc(l->stored_c_cpu, batch*outputs * sizeof(float)); + l->stored_h_cpu = (float*)realloc(l->stored_h_cpu, batch*outputs * sizeof(float)); + +#ifdef GPU + //if (l->state_gpu) cudaFree(l->state_gpu); + //l->state_gpu = cuda_make_array(l->state, batch*l->outputs); + + if (l->output_gpu) cudaFree(l->output_gpu); + l->output_gpu = cuda_make_array(0, batch*outputs*steps); + + if (l->delta_gpu) cudaFree(l->delta_gpu); + l->delta_gpu = cuda_make_array(0, batch*outputs*steps); + + if (l->prev_state_gpu) cudaFree(l->prev_state_gpu); + l->prev_state_gpu = cuda_make_array(0, batch*outputs); + + if (l->prev_cell_gpu) cudaFree(l->prev_cell_gpu); + l->prev_cell_gpu = cuda_make_array(0, batch*outputs); + + if (l->cell_gpu) cudaFree(l->cell_gpu); + l->cell_gpu = cuda_make_array(0, batch*outputs*steps); + + if (l->f_gpu) cudaFree(l->f_gpu); + 
l->f_gpu = cuda_make_array(0, batch*outputs); + + if (l->i_gpu) cudaFree(l->i_gpu); + l->i_gpu = cuda_make_array(0, batch*outputs); + + if (l->g_gpu) cudaFree(l->g_gpu); + l->g_gpu = cuda_make_array(0, batch*outputs); + + if (l->o_gpu) cudaFree(l->o_gpu); + l->o_gpu = cuda_make_array(0, batch*outputs); + + if (l->c_gpu) cudaFree(l->c_gpu); + l->c_gpu = cuda_make_array(0, batch*outputs); + + if (l->h_gpu) cudaFree(l->h_gpu); + l->h_gpu = cuda_make_array(0, batch*outputs); + + if (l->temp_gpu) cudaFree(l->temp_gpu); + l->temp_gpu = cuda_make_array(0, batch*outputs); + + if (l->temp2_gpu) cudaFree(l->temp2_gpu); + l->temp2_gpu = cuda_make_array(0, batch*outputs); + + if (l->temp3_gpu) cudaFree(l->temp3_gpu); + l->temp3_gpu = cuda_make_array(0, batch*outputs); + + if (l->dc_gpu) cudaFree(l->dc_gpu); + l->dc_gpu = cuda_make_array(0, batch*outputs); + + if (l->dh_gpu) cudaFree(l->dh_gpu); + l->dh_gpu = cuda_make_array(0, batch*outputs); + + if (l->stored_c_gpu) cudaFree(l->stored_c_gpu); + l->stored_c_gpu = cuda_make_array(0, batch*outputs); + + if (l->stored_h_gpu) cudaFree(l->stored_h_gpu); + l->stored_h_gpu = cuda_make_array(0, batch*outputs); + + if (l->last_prev_state_gpu) cudaFree(l->last_prev_state_gpu); + l->last_prev_state_gpu = cuda_make_array(0, batch*outputs); + + if (l->last_prev_cell_gpu) cudaFree(l->last_prev_cell_gpu); + l->last_prev_cell_gpu = cuda_make_array(0, batch*outputs); +#endif +} + +void free_state_conv_lstm(layer l) +{ + int i; + for (i = 0; i < l.outputs * l.batch; ++i) l.h_cpu[i] = 0; + for (i = 0; i < l.outputs * l.batch; ++i) l.c_cpu[i] = 0; + +#ifdef GPU + cuda_push_array(l.h_gpu, l.h_cpu, l.outputs * l.batch); + cuda_push_array(l.c_gpu, l.c_cpu, l.outputs * l.batch); + + //fill_ongpu(l.outputs * l.batch, 0, l.dc_gpu, 1); // dont use + //fill_ongpu(l.outputs * l.batch, 0, l.dh_gpu, 1); // dont use +#endif // GPU +} + +void randomize_state_conv_lstm(layer l) +{ + int i; + for (i = 0; i < l.outputs * l.batch; ++i) l.h_cpu[i] = 
rand_uniform(-1, 1); + for (i = 0; i < l.outputs * l.batch; ++i) l.c_cpu[i] = rand_uniform(-1, 1); + +#ifdef GPU + cuda_push_array(l.h_gpu, l.h_cpu, l.outputs * l.batch); + cuda_push_array(l.c_gpu, l.c_cpu, l.outputs * l.batch); +#endif // GPU +} + + +void remember_state_conv_lstm(layer l) +{ + memcpy(l.stored_c_cpu, l.c_cpu, l.outputs * l.batch * sizeof(float)); + memcpy(l.stored_h_cpu, l.h_cpu, l.outputs * l.batch * sizeof(float)); + +#ifdef GPU + copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.stored_c_gpu, 1); + copy_ongpu(l.outputs*l.batch, l.h_gpu, 1, l.stored_h_gpu, 1); +#endif // GPU +} + +void restore_state_conv_lstm(layer l) +{ + memcpy(l.c_cpu, l.stored_c_cpu, l.outputs * l.batch * sizeof(float)); + memcpy(l.h_cpu, l.stored_h_cpu, l.outputs * l.batch * sizeof(float)); + +#ifdef GPU + copy_ongpu(l.outputs*l.batch, l.stored_c_gpu, 1, l.c_gpu, 1); + copy_ongpu(l.outputs*l.batch, l.stored_h_gpu, 1, l.h_gpu, 1); +#endif // GPU +} + +void forward_conv_lstm_layer(layer l, network_state state) +{ + network_state s = { 0 }; + s.train = state.train; + s.workspace = state.workspace; + s.net = state.net; + int i; + layer vf = *(l.vf); + layer vi = *(l.vi); + layer vo = *(l.vo); + + layer wf = *(l.wf); + layer wi = *(l.wi); + layer wg = *(l.wg); + layer wo = *(l.wo); + + layer uf = *(l.uf); + layer ui = *(l.ui); + layer ug = *(l.ug); + layer uo = *(l.uo); + + if (l.peephole) { + fill_cpu(l.outputs * l.batch * l.steps, 0, vf.delta, 1); + fill_cpu(l.outputs * l.batch * l.steps, 0, vi.delta, 1); + fill_cpu(l.outputs * l.batch * l.steps, 0, vo.delta, 1); + } + + fill_cpu(l.outputs * l.batch * l.steps, 0, wf.delta, 1); + fill_cpu(l.outputs * l.batch * l.steps, 0, wi.delta, 1); + fill_cpu(l.outputs * l.batch * l.steps, 0, wg.delta, 1); + fill_cpu(l.outputs * l.batch * l.steps, 0, wo.delta, 1); + + fill_cpu(l.outputs * l.batch * l.steps, 0, uf.delta, 1); + fill_cpu(l.outputs * l.batch * l.steps, 0, ui.delta, 1); + fill_cpu(l.outputs * l.batch * l.steps, 0, ug.delta, 1); + 
fill_cpu(l.outputs * l.batch * l.steps, 0, uo.delta, 1); + if (state.train) { + fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1); + } + + for (i = 0; i < l.steps; ++i) + { + if (l.peephole) { + assert(l.outputs == vf.out_w * vf.out_h * vf.out_c); + s.input = l.c_cpu; + forward_convolutional_layer(vf, s); + forward_convolutional_layer(vi, s); + // vo below + } + + assert(l.outputs == wf.out_w * wf.out_h * wf.out_c); + assert(wf.c == l.out_c && wi.c == l.out_c && wg.c == l.out_c && wo.c == l.out_c); + + s.input = l.h_cpu; + forward_convolutional_layer(wf, s); + forward_convolutional_layer(wi, s); + forward_convolutional_layer(wg, s); + forward_convolutional_layer(wo, s); + + s.input = state.input; + forward_convolutional_layer(uf, s); + forward_convolutional_layer(ui, s); + forward_convolutional_layer(ug, s); + forward_convolutional_layer(uo, s); + + // f = wf + uf + vf + copy_cpu(l.outputs*l.batch, wf.output, 1, l.f_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, uf.output, 1, l.f_cpu, 1); + if (l.peephole) axpy_cpu(l.outputs*l.batch, 1, vf.output, 1, l.f_cpu, 1); + + // i = wi + ui + vi + copy_cpu(l.outputs*l.batch, wi.output, 1, l.i_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, ui.output, 1, l.i_cpu, 1); + if (l.peephole) axpy_cpu(l.outputs*l.batch, 1, vi.output, 1, l.i_cpu, 1); + + // g = wg + ug + copy_cpu(l.outputs*l.batch, wg.output, 1, l.g_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, ug.output, 1, l.g_cpu, 1); + + activate_array(l.f_cpu, l.outputs*l.batch, LOGISTIC); + activate_array(l.i_cpu, l.outputs*l.batch, LOGISTIC); + activate_array(l.g_cpu, l.outputs*l.batch, TANH); + + // c = f*c + i*g + copy_cpu(l.outputs*l.batch, l.i_cpu, 1, l.temp_cpu, 1); + mul_cpu(l.outputs*l.batch, l.g_cpu, 1, l.temp_cpu, 1); + mul_cpu(l.outputs*l.batch, l.f_cpu, 1, l.c_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, l.temp_cpu, 1, l.c_cpu, 1); + + // o = wo + uo + vo(c_new) + s.input = l.c_cpu; + if (l.peephole) forward_convolutional_layer(vo, s); + copy_cpu(l.outputs*l.batch, wo.output, 1, 
l.o_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1); + if (l.peephole) axpy_cpu(l.outputs*l.batch, 1, vo.output, 1, l.o_cpu, 1); + activate_array(l.o_cpu, l.outputs*l.batch, LOGISTIC); + + // h = o * tanh(c) + copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.h_cpu, 1); + activate_array(l.h_cpu, l.outputs*l.batch, TANH); + mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.h_cpu, 1); + + copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.cell_cpu, 1); + copy_cpu(l.outputs*l.batch, l.h_cpu, 1, l.output, 1); + + state.input += l.inputs*l.batch; + l.output += l.outputs*l.batch; + l.cell_cpu += l.outputs*l.batch; + + if (l.peephole) { + increment_layer(&vf, 1); + increment_layer(&vi, 1); + increment_layer(&vo, 1); + } + + increment_layer(&wf, 1); + increment_layer(&wi, 1); + increment_layer(&wg, 1); + increment_layer(&wo, 1); + + increment_layer(&uf, 1); + increment_layer(&ui, 1); + increment_layer(&ug, 1); + increment_layer(&uo, 1); + } +} + +void backward_conv_lstm_layer(layer l, network_state state) +{ + network_state s = { 0 }; + s.train = state.train; + s.workspace = state.workspace; + int i; + layer vf = *(l.vf); + layer vi = *(l.vi); + layer vo = *(l.vo); + + layer wf = *(l.wf); + layer wi = *(l.wi); + layer wg = *(l.wg); + layer wo = *(l.wo); + + layer uf = *(l.uf); + layer ui = *(l.ui); + layer ug = *(l.ug); + layer uo = *(l.uo); + + if (l.peephole) { + increment_layer(&vf, l.steps - 1); + increment_layer(&vi, l.steps - 1); + increment_layer(&vo, l.steps - 1); + } + + increment_layer(&wf, l.steps - 1); + increment_layer(&wi, l.steps - 1); + increment_layer(&wg, l.steps - 1); + increment_layer(&wo, l.steps - 1); + + increment_layer(&uf, l.steps - 1); + increment_layer(&ui, l.steps - 1); + increment_layer(&ug, l.steps - 1); + increment_layer(&uo, l.steps - 1); + + state.input += l.inputs*l.batch*(l.steps - 1); + if (state.delta) state.delta += l.inputs*l.batch*(l.steps - 1); + + l.output += l.outputs*l.batch*(l.steps - 1); + l.cell_cpu += l.outputs*l.batch*(l.steps - 
1); + l.delta += l.outputs*l.batch*(l.steps - 1); + + for (i = l.steps - 1; i >= 0; --i) { + if (i != 0) copy_cpu(l.outputs*l.batch, l.cell_cpu - l.outputs*l.batch, 1, l.prev_cell_cpu, 1); + copy_cpu(l.outputs*l.batch, l.cell_cpu, 1, l.c_cpu, 1); + if (i != 0) copy_cpu(l.outputs*l.batch, l.output - l.outputs*l.batch, 1, l.prev_state_cpu, 1); + copy_cpu(l.outputs*l.batch, l.output, 1, l.h_cpu, 1); + + l.dh_cpu = (i == 0) ? 0 : l.delta - l.outputs*l.batch; + + // f = wf + uf + vf + copy_cpu(l.outputs*l.batch, wf.output, 1, l.f_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, uf.output, 1, l.f_cpu, 1); + if (l.peephole) axpy_cpu(l.outputs*l.batch, 1, vf.output, 1, l.f_cpu, 1); + + // i = wi + ui + vi + copy_cpu(l.outputs*l.batch, wi.output, 1, l.i_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, ui.output, 1, l.i_cpu, 1); + if (l.peephole) axpy_cpu(l.outputs*l.batch, 1, vi.output, 1, l.i_cpu, 1); + + // g = wg + ug + copy_cpu(l.outputs*l.batch, wg.output, 1, l.g_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, ug.output, 1, l.g_cpu, 1); + + // o = wo + uo + vo + copy_cpu(l.outputs*l.batch, wo.output, 1, l.o_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1); + if (l.peephole) axpy_cpu(l.outputs*l.batch, 1, vo.output, 1, l.o_cpu, 1); + + activate_array(l.f_cpu, l.outputs*l.batch, LOGISTIC); + activate_array(l.i_cpu, l.outputs*l.batch, LOGISTIC); + activate_array(l.g_cpu, l.outputs*l.batch, TANH); + activate_array(l.o_cpu, l.outputs*l.batch, LOGISTIC); + + copy_cpu(l.outputs*l.batch, l.delta, 1, l.temp3_cpu, 1); + + copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1); + activate_array(l.temp_cpu, l.outputs*l.batch, TANH); + + copy_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp2_cpu, 1); + mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.temp2_cpu, 1); + + gradient_array(l.temp_cpu, l.outputs*l.batch, TANH, l.temp2_cpu); + axpy_cpu(l.outputs*l.batch, 1, l.dc_cpu, 1, l.temp2_cpu, 1); + // temp = tanh(c) + // temp2 = delta * o * grad_tanh(tanh(c)) + dc_cpu (dc_cpu is accumulated into temp2 by the axpy_cpu call just above) + // temp3 = delta + +
copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1); + activate_array(l.temp_cpu, l.outputs*l.batch, TANH); + mul_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp_cpu, 1); + gradient_array(l.o_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu); + // delta for o(w,u,v): temp = delta * tanh(c) * grad_logistic(o) + // delta for c,f,i,g(w,u,v): temp2 = delta * o * grad_tanh(tanh(c)) + dc_cpu + // delta for output: temp3 = delta + + // o + // delta for O(w,u,v): temp = delta * tanh(c) * grad_logistic(o) + if (l.peephole) { + copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, vo.delta, 1); + s.input = l.cell_cpu; + //s.delta = l.dc_cpu; + backward_convolutional_layer(vo, s); + } + + copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wo.delta, 1); + s.input = l.prev_state_cpu; + //s.delta = l.dh_cpu; + backward_convolutional_layer(wo, s); + + copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, uo.delta, 1); + s.input = state.input; + s.delta = state.delta; + backward_convolutional_layer(uo, s); + + // g + copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1); + mul_cpu(l.outputs*l.batch, l.i_cpu, 1, l.temp_cpu, 1); + gradient_array(l.g_cpu, l.outputs*l.batch, TANH, l.temp_cpu); + // delta for g(w,u): temp = (delta * o * grad_tanh(tanh(c)) + dc_cpu) * i * grad_tanh(g) + + copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wg.delta, 1); + s.input = l.prev_state_cpu; + //s.delta = l.dh_cpu; + backward_convolutional_layer(wg, s); + + copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, ug.delta, 1); + s.input = state.input; + s.delta = state.delta; + backward_convolutional_layer(ug, s); + + // i + copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1); + mul_cpu(l.outputs*l.batch, l.g_cpu, 1, l.temp_cpu, 1); + gradient_array(l.i_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu); + // delta for i(w,u,v): temp = (delta * o * grad_tanh(tanh(c)) + dc_cpu) * g * grad_logistic(i) + + if (l.peephole) { + copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, vi.delta, 1); + s.input =
l.prev_cell_cpu; + //s.delta = l.dc_cpu; + backward_convolutional_layer(vi, s); + } + + copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wi.delta, 1); + s.input = l.prev_state_cpu; + //s.delta = l.dh_cpu; + backward_convolutional_layer(wi, s); + + copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, ui.delta, 1); + s.input = state.input; + s.delta = state.delta; + backward_convolutional_layer(ui, s); + + // f + copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1); + mul_cpu(l.outputs*l.batch, l.prev_cell_cpu, 1, l.temp_cpu, 1); + gradient_array(l.f_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu); + // delta for c,f,i,g(w,u,v): temp2 = (delta * o * grad_tanh(tahc(c)) + delta_c(???)) * c * grad_logistic(f) + + if (l.peephole) { + copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, vf.delta, 1); + s.input = l.prev_cell_cpu; + //s.delta = l.dc_cpu; + backward_convolutional_layer(vf, s); + } + + copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wf.delta, 1); + s.input = l.prev_state_cpu; + //s.delta = l.dh_cpu; + backward_convolutional_layer(wf, s); + + copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, uf.delta, 1); + s.input = state.input; + s.delta = state.delta; + backward_convolutional_layer(uf, s); + + copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1); + mul_cpu(l.outputs*l.batch, l.f_cpu, 1, l.temp_cpu, 1); + copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, l.dc_cpu, 1); + + state.input -= l.inputs*l.batch; + if (state.delta) state.delta -= l.inputs*l.batch; + l.output -= l.outputs*l.batch; + l.cell_cpu -= l.outputs*l.batch; + l.delta -= l.outputs*l.batch; + + if (l.peephole) { + increment_layer(&vf, -1); + increment_layer(&vi, -1); + increment_layer(&vo, -1); + } + + increment_layer(&wf, -1); + increment_layer(&wi, -1); + increment_layer(&wg, -1); + increment_layer(&wo, -1); + + increment_layer(&uf, -1); + increment_layer(&ui, -1); + increment_layer(&ug, -1); + increment_layer(&uo, -1); + } +} + +#ifdef GPU +void pull_conv_lstm_layer(layer l) +{ + if (l.peephole) { + 
pull_convolutional_layer(*(l.vf)); + pull_convolutional_layer(*(l.vi)); + pull_convolutional_layer(*(l.vo)); + } + pull_convolutional_layer(*(l.wf)); + pull_convolutional_layer(*(l.wi)); + pull_convolutional_layer(*(l.wg)); + pull_convolutional_layer(*(l.wo)); + pull_convolutional_layer(*(l.uf)); + pull_convolutional_layer(*(l.ui)); + pull_convolutional_layer(*(l.ug)); + pull_convolutional_layer(*(l.uo)); +} + +void push_conv_lstm_layer(layer l) +{ + if (l.peephole) { + push_convolutional_layer(*(l.vf)); + push_convolutional_layer(*(l.vi)); + push_convolutional_layer(*(l.vo)); + } + push_convolutional_layer(*(l.wf)); + push_convolutional_layer(*(l.wi)); + push_convolutional_layer(*(l.wg)); + push_convolutional_layer(*(l.wo)); + push_convolutional_layer(*(l.uf)); + push_convolutional_layer(*(l.ui)); + push_convolutional_layer(*(l.ug)); + push_convolutional_layer(*(l.uo)); +} + +void update_conv_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) +{ + if (l.peephole) { + update_convolutional_layer_gpu(*(l.vf), batch, learning_rate, momentum, decay); + update_convolutional_layer_gpu(*(l.vi), batch, learning_rate, momentum, decay); + update_convolutional_layer_gpu(*(l.vo), batch, learning_rate, momentum, decay); + } + update_convolutional_layer_gpu(*(l.wf), batch, learning_rate, momentum, decay); + update_convolutional_layer_gpu(*(l.wi), batch, learning_rate, momentum, decay); + update_convolutional_layer_gpu(*(l.wg), batch, learning_rate, momentum, decay); + update_convolutional_layer_gpu(*(l.wo), batch, learning_rate, momentum, decay); + update_convolutional_layer_gpu(*(l.uf), batch, learning_rate, momentum, decay); + update_convolutional_layer_gpu(*(l.ui), batch, learning_rate, momentum, decay); + update_convolutional_layer_gpu(*(l.ug), batch, learning_rate, momentum, decay); + update_convolutional_layer_gpu(*(l.uo), batch, learning_rate, momentum, decay); +} + +void forward_conv_lstm_layer_gpu(layer l, network_state state) +{ + 
network_state s = { 0 }; + s.train = state.train; + s.workspace = state.workspace; + s.net = state.net; + //if (!state.train) s.index = state.index; // don't use TC for training (especially without cuda_convert_f32_to_f16() ) + int i; + layer vf = *(l.vf); + layer vi = *(l.vi); + layer vo = *(l.vo); + + layer wf = *(l.wf); + layer wi = *(l.wi); + layer wg = *(l.wg); + layer wo = *(l.wo); + + layer uf = *(l.uf); + layer ui = *(l.ui); + layer ug = *(l.ug); + layer uo = *(l.uo); + + if (state.train) { + if (l.peephole) { + fill_ongpu(l.outputs * l.batch * l.steps, 0, vf.delta_gpu, 1); + fill_ongpu(l.outputs * l.batch * l.steps, 0, vi.delta_gpu, 1); + fill_ongpu(l.outputs * l.batch * l.steps, 0, vo.delta_gpu, 1); + } + + fill_ongpu(l.outputs * l.batch * l.steps, 0, wf.delta_gpu, 1); + fill_ongpu(l.outputs * l.batch * l.steps, 0, wi.delta_gpu, 1); + fill_ongpu(l.outputs * l.batch * l.steps, 0, wg.delta_gpu, 1); + fill_ongpu(l.outputs * l.batch * l.steps, 0, wo.delta_gpu, 1); + + fill_ongpu(l.outputs * l.batch * l.steps, 0, uf.delta_gpu, 1); + fill_ongpu(l.outputs * l.batch * l.steps, 0, ui.delta_gpu, 1); + fill_ongpu(l.outputs * l.batch * l.steps, 0, ug.delta_gpu, 1); + fill_ongpu(l.outputs * l.batch * l.steps, 0, uo.delta_gpu, 1); + + fill_ongpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1); + } + + for (i = 0; i < l.steps; ++i) + { + if (l.peephole) { + assert(l.outputs == vf.out_w * vf.out_h * vf.out_c); + s.input = l.c_gpu; + forward_convolutional_layer_gpu(vf, s); + forward_convolutional_layer_gpu(vi, s); + // vo below + } + + assert(l.outputs == wf.out_w * wf.out_h * wf.out_c); + assert(wf.c == l.out_c && wi.c == l.out_c && wg.c == l.out_c && wo.c == l.out_c); + + s.input = l.h_gpu; + forward_convolutional_layer_gpu(wf, s); + forward_convolutional_layer_gpu(wi, s); + forward_convolutional_layer_gpu(wg, s); + forward_convolutional_layer_gpu(wo, s); + + assert(l.inputs == uf.w * uf.h * uf.c); + assert(uf.c == l.c && ui.c == l.c && ug.c == l.c && uo.c == l.c); + 
+ s.input = state.input; + forward_convolutional_layer_gpu(uf, s); + forward_convolutional_layer_gpu(ui, s); + forward_convolutional_layer_gpu(ug, s); + forward_convolutional_layer_gpu(uo, s); + + // f = wf + uf + vf + copy_ongpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1); + axpy_ongpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1); + if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vf.output_gpu, 1, l.f_gpu, 1); + + // i = wi + ui + vi + copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1); + axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1); + if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vi.output_gpu, 1, l.i_gpu, 1); + + // g = wg + ug + copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1); + axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1); + + activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC); + activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC); + activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH); + + // c = f*c + i*g + copy_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1); + mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1); + mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.c_gpu, 1); + axpy_ongpu(l.outputs*l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1); + + // o = wo + uo + vo(c_new) + if (l.peephole) { + s.input = l.c_gpu; + forward_convolutional_layer_gpu(vo, s); + } + copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1); + axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1); + if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vo.output_gpu, 1, l.o_gpu, 1); + activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC); + + // h = o * tanh(c) + copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.h_gpu, 1); + activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH); + mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.h_gpu, 1); + + //constrain_ongpu(l.outputs*l.batch, 1, l.c_gpu, 1); + //constrain_ongpu(l.outputs*l.batch, 1, l.h_gpu, 1); + fix_nan_and_inf(l.c_gpu, l.outputs*l.batch); + 
fix_nan_and_inf(l.h_gpu, l.outputs*l.batch);
+
+        copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.cell_gpu, 1);
+        copy_ongpu(l.outputs*l.batch, l.h_gpu, 1, l.output_gpu, 1);
+
+        state.input += l.inputs*l.batch;
+        l.output_gpu += l.outputs*l.batch;
+        l.cell_gpu += l.outputs*l.batch;
+
+        if (l.peephole) {
+            increment_layer(&vf, 1);
+            increment_layer(&vi, 1);
+            increment_layer(&vo, 1);
+        }
+
+        increment_layer(&wf, 1);
+        increment_layer(&wi, 1);
+        increment_layer(&wg, 1);
+        increment_layer(&wo, 1);
+
+        increment_layer(&uf, 1);
+        increment_layer(&ui, 1);
+        increment_layer(&ug, 1);
+        increment_layer(&uo, 1);
+    }
+}
+// GPU backward pass: walks the l.steps time steps from last to first, rebuilds the gates from the stored w*/u*/v* outputs and back-propagates through each sub-convolution.
+void backward_conv_lstm_layer_gpu(layer l, network_state state)
+{
+    float *last_output = l.output_gpu + l.outputs*l.batch*(l.steps - 1); // last time-step slice, captured before l.output_gpu is moved below
+    float *last_cell = l.cell_gpu + l.outputs*l.batch*(l.steps - 1);     // same for the cell buffer
+    // s is the state handed to every sub-layer backward call; s.input / s.delta are set per call below.
+    network_state s = { 0 };
+    s.train = state.train;
+    s.workspace = state.workspace;
+    s.net = state.net;
+    int i;
+    layer vf = *(l.vf); // local copies: increment_layer() below moves only these copies' buffer pointers
+    layer vi = *(l.vi);
+    layer vo = *(l.vo);
+
+    layer wf = *(l.wf);
+    layer wi = *(l.wi);
+    layer wg = *(l.wg);
+    layer wo = *(l.wo);
+
+    layer uf = *(l.uf);
+    layer ui = *(l.ui);
+    layer ug = *(l.ug);
+    layer uo = *(l.uo);
+    // Position every sub-layer copy, the input/delta pointers and this layer's own buffers on the last time step.
+    if (l.peephole) {
+        increment_layer(&vf, l.steps - 1);
+        increment_layer(&vi, l.steps - 1);
+        increment_layer(&vo, l.steps - 1);
+    }
+
+    increment_layer(&wf, l.steps - 1);
+    increment_layer(&wi, l.steps - 1);
+    increment_layer(&wg, l.steps - 1);
+    increment_layer(&wo, l.steps - 1);
+
+    increment_layer(&uf, l.steps - 1);
+    increment_layer(&ui, l.steps - 1);
+    increment_layer(&ug, l.steps - 1);
+    increment_layer(&uo, l.steps - 1);
+
+    state.input += l.inputs*l.batch*(l.steps - 1);
+    if (state.delta) state.delta += l.inputs*l.batch*(l.steps - 1);
+
+    l.output_gpu += l.outputs*l.batch*(l.steps - 1);
+    l.cell_gpu += l.outputs*l.batch*(l.steps - 1);
+    l.delta_gpu += l.outputs*l.batch*(l.steps - 1);
+
+    fix_nan_and_inf(l.dc_gpu, l.outputs*l.batch);
+    //fill_ongpu(l.outputs * l.batch, 0, l.dc_gpu, 1); // don't use (kept disabled: dc_gpu is not zeroed here)
+    const int sequence = get_sequence_value(state.net);
+
+    for (i = l.steps - 1; i >= 0; --i) {
+        if (i != 0) copy_ongpu(l.outputs*l.batch, l.cell_gpu - l.outputs*l.batch, 1, l.prev_cell_gpu, 1); // prev_cell = cell of step i-1
+        //else fill_ongpu(l.outputs * l.batch, 0, l.prev_cell_gpu, 1); // don't use
+        else if (state.net.current_subdivision % sequence != 0) copy_ongpu(l.outputs*l.batch, l.last_prev_cell_gpu, 1, l.prev_cell_gpu, 1); // step 0: reuse the cell saved by the previous call; NOTE(review): otherwise prev_cell_gpu keeps its old contents
+
+        copy_ongpu(l.outputs*l.batch, l.cell_gpu, 1, l.c_gpu, 1);
+
+        if (i != 0) copy_ongpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1); // prev_state = output of step i-1
+        //else fill_ongpu(l.outputs * l.batch, 0, l.prev_state_gpu, 1); // don't use
+        else if(state.net.current_subdivision % sequence != 0) copy_ongpu(l.outputs*l.batch, l.last_prev_state_gpu, 1, l.prev_state_gpu, 1); // step 0: same handling as prev_cell above
+
+        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.h_gpu, 1);
+
+        l.dh_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch; // NOTE(review): dh_gpu is not read again in this function (every "s.delta = l.dh_gpu" below is commented out)
+
+        // f = wf + uf + vf   (vf only when l.peephole is set)
+        copy_ongpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
+        if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vf.output_gpu, 1, l.f_gpu, 1);
+
+        // i = wi + ui + vi   (vi only when l.peephole is set)
+        copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);
+        if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vi.output_gpu, 1, l.i_gpu, 1);
+
+        // g = wg + ug
+        copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
+
+        // o = wo + uo + vo   (vo only when l.peephole is set)
+        copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);
+        axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);
+        if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vo.output_gpu, 1, l.o_gpu, 1);
+
+        activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);
+        activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);
+        activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH);
+        activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);
+
+
+        copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, l.temp3_gpu, 1);   // temp3 = delta
+
+        copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);
+        activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH);       // temp = tanh(c)
+
+        copy_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp2_gpu, 1);
+        mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.temp2_gpu, 1);        // temp2 = delta * o
+
+        gradient_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH, l.temp2_gpu);  // temp2 = delta * o * grad_tanh(tanh(c))
+        // dc_gpu is rewritten at the end of every iteration, so here it holds the value left by step i+1 (or by the caller on the first iteration).
+        axpy_ongpu(l.outputs*l.batch, 1, l.dc_gpu, 1, l.temp2_gpu, 1);   // temp2 = delta * o * grad_tanh(tanh(c)) + dc
+        // temp  = tanh(c)
+        // temp2 = delta * o * grad_tanh(tanh(c)) + dc
+        // temp3 = delta
+
+        copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);
+        activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH);       // temp = tanh(c)
+
+        mul_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1);     // temp = delta * tanh(c)
+        gradient_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);  // temp = delta * tanh(c) * grad_logistic(o)
+        // delta for o(w,u,v):       temp  = delta * tanh(c) * grad_logistic(o)
+        // delta for c,f,i,g(w,u,v): temp2 = delta * o * grad_tanh(tanh(c)) + dc
+        // delta for output:         temp3 = delta
+
+        // o
+        // delta for o(w,u,v): temp = delta * tanh(c) * grad_logistic(o)
+        if (l.peephole) {
+            copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, vo.delta_gpu, 1);
+            s.input = l.cell_gpu;
+            //s.delta = l.dc_gpu; // NOTE(review): disabled, so s.delta keeps whatever the last u* call stored (0 only before the first uo call) - confirm this is intended
+            backward_convolutional_layer_gpu(vo, s);
+        }
+
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wo.delta_gpu, 1);
+        s.input = l.prev_state_gpu;
+        //s.delta = l.dh_gpu; // disabled: s.delta is left unchanged for this call
+        backward_convolutional_layer_gpu(wo, s);
+
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, uo.delta_gpu, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_convolutional_layer_gpu(uo, s);
+
+        // g
+        copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
+        mul_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
+        gradient_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH, l.temp_gpu);
+        // delta for g(w,u): temp = (delta * o * grad_tanh(tanh(c)) + dc) * i * grad_tanh(g)
+
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wg.delta_gpu, 1);
+        s.input = l.prev_state_gpu;
+        //s.delta = l.dh_gpu; // disabled: s.delta still equals state.delta from the uo call above
+        backward_convolutional_layer_gpu(wg, s);
+
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, ug.delta_gpu, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_convolutional_layer_gpu(ug, s);
+
+        // i
+        copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
+        mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
+        gradient_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
+        // delta for i(w,u,v): temp = (delta * o * grad_tanh(tanh(c)) + dc) * g * grad_logistic(i)
+
+        if (l.peephole) {
+            copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, vi.delta_gpu, 1);
+            s.input = l.prev_cell_gpu;
+            //s.delta = l.dc_gpu; // disabled: s.delta is left unchanged for this call
+            backward_convolutional_layer_gpu(vi, s);
+        }
+
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wi.delta_gpu, 1);
+        s.input = l.prev_state_gpu;
+        //s.delta = l.dh_gpu; // disabled: s.delta is left unchanged for this call
+        backward_convolutional_layer_gpu(wi, s);
+
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, ui.delta_gpu, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_convolutional_layer_gpu(ui, s);
+
+        // f
+        copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
+        mul_ongpu(l.outputs*l.batch, l.prev_cell_gpu, 1, l.temp_gpu, 1);
+        gradient_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
+        // delta for f(w,u,v): temp = (delta * o * grad_tanh(tanh(c)) + dc) * prev_cell * grad_logistic(f)
+
+        if (l.peephole) {
+            copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, vf.delta_gpu, 1);
+            s.input = l.prev_cell_gpu;
+            //s.delta = l.dc_gpu; // disabled: s.delta is left unchanged for this call
+            backward_convolutional_layer_gpu(vf, s);
+        }
+
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wf.delta_gpu, 1);
+        s.input = l.prev_state_gpu;
+        //s.delta = l.dh_gpu; // disabled: s.delta is left unchanged for this call
+        backward_convolutional_layer_gpu(wf, s);
+
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, uf.delta_gpu, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_convolutional_layer_gpu(uf, s);
+
+        // c
+        copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
+        mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.temp_gpu, 1);
+        copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, l.dc_gpu, 1);
+        // dc carried to the next (earlier) step: dc = (delta * o * grad_tanh(tanh(c)) + dc) * f   // (grad_linear(c)==1)
+
+        state.input -= l.inputs*l.batch;
+        if (state.delta) state.delta -= l.inputs*l.batch;  // new delta: state.delta = prev_layer.delta_gpu;
+        l.output_gpu -= l.outputs*l.batch;
+        l.cell_gpu -= l.outputs*l.batch;
+        l.delta_gpu -= l.outputs*l.batch;
+
+        if (l.peephole) {
+            increment_layer(&vf, -1);
+            increment_layer(&vi, -1);
+            increment_layer(&vo, -1);
+        }
+
+        increment_layer(&wf, -1);
+        increment_layer(&wi, -1);
+        increment_layer(&wg, -1);
+        increment_layer(&wo, -1);
+
+        increment_layer(&uf, -1);
+        increment_layer(&ui, -1);
+        increment_layer(&ug, -1);
+        increment_layer(&uo, -1);
+    }
+    // Save the last time step's output and cell; the i == 0 branches above read them back from last_prev_*_gpu.
+    copy_ongpu(l.outputs*l.batch, last_output, 1, l.last_prev_state_gpu, 1);
+    copy_ongpu(l.outputs*l.batch, last_cell, 1, l.last_prev_cell_gpu, 1);
+}
+#endif
diff --git
a/src/conv_lstm_layer.h b/src/conv_lstm_layer.h
new file mode 100644
index 00000000..3bd9b4ef
--- /dev/null
+++ b/src/conv_lstm_layer.h
@@ -0,0 +1,33 @@
+#ifndef CONV_LSTM_LAYER_H
+#define CONV_LSTM_LAYER_H
+// Declarations for the convolutional LSTM layer (layer type CONV_LSTM).
+#include "activations.h"
+#include "layer.h"
+#include "network.h"
+#define USET // NOTE(review): not referenced anywhere in this header - confirm it is still needed
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, int steps, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int peephole, int xnor);
+void resize_conv_lstm_layer(layer *l, int w, int h);
+void free_state_conv_lstm(layer l);
+void randomize_state_conv_lstm(layer l);
+void remember_state_conv_lstm(layer l);
+void restore_state_conv_lstm(layer l);
+// Entry points that are declared unconditionally.
+void forward_conv_lstm_layer(layer l, network_state state);
+void backward_conv_lstm_layer(layer l, network_state state);
+void update_conv_lstm_layer(layer l, int batch, float learning_rate, float momentum, float decay);
+
+#ifdef GPU // the *_gpu variants exist only in GPU builds
+void forward_conv_lstm_layer_gpu(layer l, network_state state);
+void backward_conv_lstm_layer_gpu(layer l, network_state state);
+void update_conv_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // CONV_LSTM_LAYER_H
diff --git a/src/detector.c b/src/detector.c
index f535a032..a34354f1 100644
--- a/src/detector.c
+++ b/src/detector.c
@@ -191,6 +191,11 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
         time = what_time_is_it_now();
         pthread_join(load_thread, 0);
         train = buffer;
+        if (net.track) {
+            net.sequential_subdivisions = get_current_seq_subdivisions(net);
+            args.threads = net.sequential_subdivisions * ngpus;
+            printf(" sequential_subdivisions = %d \n", net.sequential_subdivisions);
+        }
         load_thread = load_data(args);
 
         /*
@@ -662,7 +667,7 @@ float validate_detector_map(char *datacfg, char *cfgfile, char *weightfile, floa
         char *train_images = option_find_str(options, "train", "data/train.txt");
         valid_images = option_find_str(options, "valid", train_images);
         net = *existing_net;
-        remember_network_recurrent_state(*existing_net);
+        //remember_network_recurrent_state(*existing_net);
         free_network_recurrent_state(*existing_net);
     }
     else {
@@ -1073,7 +1078,8 @@ float validate_detector_map(char *datacfg, char *cfgfile, char *weightfile, floa
     if (existing_net) {
         //set_batch_network(&net, initial_batch);
         //free_network_recurrent_state(*existing_net);
-        restore_network_recurrent_state(*existing_net);
+        //restore_network_recurrent_state(*existing_net);
+        randomize_network_recurrent_state(*existing_net);
     }
     else {
         free_network(net);
diff --git a/src/network.c b/src/network.c
index 98d1446b..b25aa639 100644
--- a/src/network.c
+++ b/src/network.c
@@ -91,6 +91,38 @@ void reset_rnn(network *net)
     reset_network_state(net, 0);
 }
 
+// Returns the sequential-subdivision count in effect for the current batch:
+// starts from net.init_sequential_subdivisions and, when a policy is set,
+// multiplies in seq_scales[i] for every steps[i] that has been reached.
+// The running value is an int, so each multiplication truncates toward zero.
+// The result is clamped to >= 1 because train_detector() multiplies it into args.threads.
+float get_current_seq_subdivisions(network net)
+{
+    int sequence_subdivisions = net.init_sequential_subdivisions;
+
+    if (net.policy && net.steps && net.seq_scales)  // seq_scales is only allocated for the STEPS policy
+    {
+        int batch_num = get_current_batch(net);
+        int i;
+        for (i = 0; i < net.num_steps; ++i) {
+            if (net.steps[i] > batch_num) break;
+            sequence_subdivisions *= net.seq_scales[i];
+        }
+    }
+    if (sequence_subdivisions < 1) sequence_subdivisions = 1;  // 0 or negative would request no loader threads
+    return sequence_subdivisions;
+}
+
+// Returns net.subdivisions / net.sequential_subdivisions (integer division),
+// or 1 when sequential_subdivisions is 0 or the quotient would be below 1.
+int get_sequence_value(network net)
+{
+    int sequence = 1;
+    if (net.sequential_subdivisions != 0) sequence = net.subdivisions / net.sequential_subdivisions;
+    if (sequence < 1) sequence = 1;
+    return sequence;
+}
+
 float get_current_rate(network net)
 {
     int batch_num = get_current_batch(net);
@@ -928,6 +960,7 @@ void free_network(network net)
     }
 
     free(net.layers);
+    free(net.seq_scales);
     free(net.scales);
     free(net.steps);
     free(net.seen);
@@ -1124,6 +1157,17 @@ void free_network_recurrent_state(network net)
     }
 }
 
+// Resets the recurrent state of every layer: CONV_LSTM layers get randomize_state_conv_lstm(),
+// CRNN layers get free_state_crnn() (there is no randomize call for CRNN here).
+void randomize_network_recurrent_state(network net)
+{
+    int k;
+    for (k = 0; k < net.n; ++k) {
+        if (net.layers[k].type == CONV_LSTM) randomize_state_conv_lstm(net.layers[k]);
+        if (net.layers[k].type == CRNN) free_state_crnn(net.layers[k]);
+    }
+}
+
 void 
remember_network_recurrent_state(network net) { diff --git a/src/network.h b/src/network.h index 1b272ac1..186fe3d8 100644 --- a/src/network.h +++ b/src/network.h @@ -104,6 +104,8 @@ void backward_network_gpu(network net, network_state state); void update_network_gpu(network net); #endif +float get_current_seq_subdivisions(network net); +int get_sequence_value(network net); float get_current_rate(network net); int get_current_batch(network net); void free_network(network net); @@ -164,6 +166,7 @@ int get_network_background(network net); network combine_train_valid_networks(network net_train, network net_map); void copy_weights_net(network net_train, network *net_map); void free_network_recurrent_state(network net); +void randomize_network_recurrent_state(network net); void remember_network_recurrent_state(network net); void restore_network_recurrent_state(network net); diff --git a/src/network_kernels.cu b/src/network_kernels.cu index b66f4959..6e50df0b 100644 --- a/src/network_kernels.cu +++ b/src/network_kernels.cu @@ -126,7 +126,7 @@ void update_network_gpu(network net) { cuda_set_device(net.gpu_index); int i; - int update_batch = net.batch*net.subdivisions; + int update_batch = net.batch*net.subdivisions * get_sequence_value(net); float rate = get_current_rate(net); for(i = 0; i < net.n; ++i){ layer l = net.layers[i]; @@ -200,7 +200,9 @@ float train_network_datum_gpu(network net, float *x, float *y) *net.seen += net.batch; forward_backward_network_gpu(net, x, y); float error = get_network_cost(net); - if (((*net.seen) / net.batch) % net.subdivisions == 0) update_network_gpu(net); + //if (((*net.seen) / net.batch) % net.subdivisions == 0) update_network_gpu(net); + const int sequence = get_sequence_value(net); + if (((*net.seen) / net.batch) % (net.subdivisions*sequence) == 0) update_network_gpu(net); return error; } diff --git a/src/parser.c b/src/parser.c index 64dbac2d..9ca3d5e2 100644 --- a/src/parser.c +++ b/src/parser.c @@ -672,7 +672,7 @@ void 
parse_net_options(list *options, network *net)
     net->time_steps = option_find_int_quiet(options, "time_steps",1);
     net->track = option_find_int_quiet(options, "track", 0);
     net->augment_speed = option_find_int_quiet(options, "augment_speed", 2);
-    net->sequential_subdivisions = option_find_int_quiet(options, "sequential_subdivisions", 0);
+    net->init_sequential_subdivisions = net->sequential_subdivisions = option_find_int_quiet(options, "sequential_subdivisions", subdivs);
     net->try_fix_nan = option_find_int_quiet(options, "try_fix_nan", 0);
     net->batch /= subdivs;
     net->batch *= net->time_steps;
@@ -721,6 +721,7 @@ void parse_net_options(list *options, network *net)
     } else if (net->policy == STEPS){
         char *l = option_find(options, "steps");
         char *p = option_find(options, "scales");
+        char *s = option_find(options, "seq_scales");
         if(!l || !p) error("STEPS policy must have steps and scales in cfg file");
 
         int len = strlen(l);
@@ -731,6 +732,14 @@ void parse_net_options(list *options, network *net)
         }
         int* steps = (int*)calloc(n, sizeof(int));
         float* scales = (float*)calloc(n, sizeof(float));
+        float* seq_scales = (float*)calloc(n, sizeof(float));
+        // seq_scales is optional and may list fewer values than steps; missing entries default to 1.
+        for (i = 0; i < n; ++i) {
+            seq_scales[i] = s ? atof(s) : 1;
+            // strchr() returns NULL after the last value: stop parsing instead of forming NULL + 1.
+            if (s) s = strchr(s, ',');
+            if (s) ++s;
+        }
         for(i = 0; i < n; ++i){
             int step = atoi(l);
             float scale = atof(p);
@@ -741,6 +750,7 @@ void parse_net_options(list *options, network *net)
         }
         net->scales = scales;
         net->steps = steps;
+        net->seq_scales = seq_scales;
         net->num_steps = n;
     } else if (net->policy == EXP){
         net->gamma = option_find_float(options, "gamma", 1);