From cc41339805d8b48edef67d0dcf4925b45c212277 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Wed, 19 Jun 2019 23:36:02 +0300 Subject: [PATCH] Added [scale_channels] layer for squeeze-and-excitation blocks --- Makefile | 2 +- build/darknet/darknet.vcxproj | 2 + include/darknet.h | 2 + src/blas.h | 5 ++ src/blas_kernels.cu | 46 +++++++++++++ src/classifier.c | 4 +- src/convolutional_kernels.cu | 7 +- src/dropout_layer_kernels.cu | 7 ++ src/network.c | 1 + src/network_kernels.cu | 1 + src/parser.c | 38 +++++++++++ src/scale_channels_layer.c | 118 ++++++++++++++++++++++++++++++++++ src/scale_channels_layer.h | 23 +++++++ src/shortcut_layer.c | 2 + 14 files changed, 253 insertions(+), 5 deletions(-) create mode 100644 src/scale_channels_layer.c create mode 100644 src/scale_channels_layer.h diff --git a/Makefile b/Makefile index 7f0547d3..dc880be9 100644 --- a/Makefile +++ b/Makefile @@ -118,7 +118,7 @@ LDFLAGS+= -L/usr/local/zed/lib -lsl_core -lsl_input -lsl_zed #-lstdc++ -D_GLIBCXX_USE_CXX11_ABI=0 endif -OBJ=image_opencv.o http_stream.o gemm.o utils.o dark_cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o reorg_old_layer.o super.o voxel.o tree.o yolo_layer.o upsample_layer.o lstm_layer.o conv_lstm_layer.o +OBJ=image_opencv.o http_stream.o gemm.o utils.o dark_cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o 
connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o reorg_old_layer.o super.o voxel.o tree.o yolo_layer.o upsample_layer.o lstm_layer.o conv_lstm_layer.o scale_channels_layer.o ifeq ($(GPU), 1) LDFLAGS+= -lstdc++ OBJ+=convolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o network_kernels.o avgpool_layer_kernels.o diff --git a/build/darknet/darknet.vcxproj b/build/darknet/darknet.vcxproj index 5a78ac60..019e02c6 100644 --- a/build/darknet/darknet.vcxproj +++ b/build/darknet/darknet.vcxproj @@ -226,6 +226,7 @@ + @@ -284,6 +285,7 @@ + diff --git a/include/darknet.h b/include/darknet.h index e807ff9b..25e702b2 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -136,6 +136,7 @@ typedef enum { AVGPOOL, LOCAL, SHORTCUT, + SCALE_CHANNELS, ACTIVE, RNN, GRU, @@ -153,6 +154,7 @@ typedef enum { UPSAMPLE, LOGXENT, L2NORM, + EMPTY, BLANK } LAYER_TYPE; diff --git a/src/blas.h b/src/blas.h index ca9cd588..2b46eba3 100644 --- a/src/blas.h +++ b/src/blas.h @@ -117,6 +117,11 @@ void add_3_arrays_activate(float *a1, float *a2, float *a3, size_t size, ACTIVAT void sum_of_mults(float *a1, float *a2, float *b1, float *b2, size_t size, float *dst); void activate_and_mult(float *a1, float *a2, size_t size, ACTIVATION a, float *dst); +void scale_channels_gpu(float *in_w_h_c, int size, int channel_size, float *scales_c, float *out); +void backward_scale_channels_gpu(float *in_w_h_c_delta, int size, int channel_size, + float *in_scales_c, float *out_from_delta, + float *in_from_output, float *out_state_delta); + 
#endif #ifdef __cplusplus } diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu index ed418c10..35f9a1ad 100644 --- a/src/blas_kernels.cu +++ b/src/blas_kernels.cu @@ -1099,3 +1099,49 @@ extern "C" void activate_and_mult(float *a1, float *a2, size_t size, ACTIVATION } activate_and_mult_kernel << <num_blocks, block_size, 0, get_cuda_stream() >> >(a1, a2, size, a, dst); } + + + +__global__ void scale_channels_kernel(float *in_w_h_c, int size, int channel_size, float *scales_c, float *out) +{ + const int index = blockIdx.x*blockDim.x + threadIdx.x; + if (index < size) { + out[index] = in_w_h_c[index] * scales_c[index / channel_size]; + } +} + +extern "C" void scale_channels_gpu(float *in_w_h_c, int size, int channel_size, float *scales_c, float *out) +{ + const int block_size = BLOCK; + const int num_blocks = get_number_of_blocks(size, block_size); + scale_channels_kernel << <num_blocks, block_size, 0, get_cuda_stream() >> >(in_w_h_c, size, channel_size, scales_c, out); + CHECK_CUDA(cudaPeekAtLastError()); +} + + +__global__ void backward_scale_channels_kernel(float *in_w_h_c_delta, int size, int channel_size, + float *in_scales_c, float *out_from_delta, + float *in_from_output, float *out_state_delta) +{ + const int index = blockIdx.x*blockDim.x + threadIdx.x; + if (index < size) { + out_state_delta[index / channel_size] += in_w_h_c_delta[index] * in_from_output[index]; // l.delta * from (should be divided by channel_size?)
+ out_from_delta[index] += in_scales_c[index / channel_size] * in_w_h_c_delta[index]; // input * l.delta + + //out_state_delta[index / channel_size] += in_w_h_c_delta[index] / channel_size; + //out_from_delta[index] = in_w_h_c_delta[index]; + } +} + +extern "C" void backward_scale_channels_gpu(float *in_w_h_c_delta, int size, int channel_size, + float *in_scales_c, float *out_from_delta, + float *in_from_output, float *out_state_delta) +{ + const int block_size = BLOCK; + const int num_blocks = get_number_of_blocks(size, block_size); + backward_scale_channels_kernel << <num_blocks, block_size, 0, get_cuda_stream() >> > (in_w_h_c_delta, size, channel_size, + in_scales_c, out_from_delta, + in_from_output, out_state_delta); + + CHECK_CUDA(cudaPeekAtLastError()); +} diff --git a/src/classifier.c b/src/classifier.c index f9708e44..f5fbfaeb 100644 --- a/src/classifier.c +++ b/src/classifier.c @@ -134,14 +134,14 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus, #else loss = train_network(net, train); #endif - if(avg_loss == -1) avg_loss = loss; + if(avg_loss == -1 || isnan(avg_loss) || isinf(avg_loss)) avg_loss = loss; avg_loss = avg_loss*.9 + loss*.1; i = get_current_batch(net); int calc_topk_for_each = iter_topk + 2 * train_images_num / (net.batch * net.subdivisions); // calculate TOPk for each 2 Epochs calc_topk_for_each = fmax(calc_topk_for_each, net.burn_in); - calc_topk_for_each = fmax(calc_topk_for_each, 1000); + calc_topk_for_each = fmax(calc_topk_for_each, 100); if (i % 10 == 0) { if (calc_topk) { fprintf(stderr, "\n (next TOP5 calculation at %d iterations) ", calc_topk_for_each); diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index ab8d7fa8..eaa6e803 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -415,7 +415,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) //if (state.use_mixed_precision) { int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); if
(state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in) && - l.c % 8 == 0 && l.n % 8 == 0) + (l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && !state.train) { //printf("\n CUDNN_HALF!!! state.index = %d \n", state.index); @@ -629,7 +629,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state //#ifdef CUDNN_HALF int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in) && - l.c % 8 == 0 && l.n % 8 == 0) + (l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && !state.train) { const size_t input16_size = l.batch*l.c*l.w*l.h; const size_t delta16_size = l.batch*l.n*l.out_w*l.out_h; @@ -910,6 +910,9 @@ void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init //float decay = a.decay; //int batch = a.batch; + fix_nan_and_inf(l.weight_updates_gpu, l.nweights); + fix_nan_and_inf(l.weights_gpu, l.nweights); + if (l.adam) { //adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t); adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.nweights, batch, l.t); diff --git a/src/dropout_layer_kernels.cu b/src/dropout_layer_kernels.cu index d3af0ea6..dadaa9fe 100644 --- a/src/dropout_layer_kernels.cu +++ b/src/dropout_layer_kernels.cu @@ -15,6 +15,10 @@ __global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand void forward_dropout_layer_gpu(dropout_layer layer, network_state state) { if (!state.train) return; + int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); + //if (iteration_num < state.net.burn_in) return; + + int size = layer.inputs*layer.batch; cuda_random(layer.rand_gpu, size); /* @@ -32,6 +36,9 @@ void forward_dropout_layer_gpu(dropout_layer layer, 
network_state state) void backward_dropout_layer_gpu(dropout_layer layer, network_state state) { if(!state.delta) return; + int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); + //if (iteration_num < state.net.burn_in) return; + int size = layer.inputs*layer.batch; yoloswag420blazeit360noscope<<>>(state.delta, size, layer.rand_gpu, layer.probability, layer.scale); diff --git a/src/network.c b/src/network.c index d6b8f5c7..0e655c78 100644 --- a/src/network.c +++ b/src/network.c @@ -318,6 +318,7 @@ void backward_network(network net, network_state state) } layer l = net.layers[i]; if (l.stopbackward) break; + if (l.onlyforward) continue; l.backward(l, state); } } diff --git a/src/network_kernels.cu b/src/network_kernels.cu index edaefc34..3ebf2900 100644 --- a/src/network_kernels.cu +++ b/src/network_kernels.cu @@ -106,6 +106,7 @@ void backward_network_gpu(network net, network_state state) state.input = prev.output_gpu; state.delta = prev.delta_gpu; } + if (l.onlyforward) continue; l.backward_gpu(l, state); /* diff --git a/src/parser.c b/src/parser.c index f3c02198..d45f625c 100644 --- a/src/parser.c +++ b/src/parser.c @@ -31,6 +31,7 @@ #include "rnn_layer.h" #include "route_layer.h" #include "shortcut_layer.h" +#include "scale_channels_layer.h" #include "softmax_layer.h" #include "utils.h" #include "upsample_layer.h" @@ -48,6 +49,7 @@ LAYER_TYPE string_to_layer_type(char * type) { if (strcmp(type, "[shortcut]")==0) return SHORTCUT; + if (strcmp(type, "[scale_channels]") == 0) return SCALE_CHANNELS; if (strcmp(type, "[crop]")==0) return CROP; if (strcmp(type, "[cost]")==0) return COST; if (strcmp(type, "[detection]")==0) return DETECTION; @@ -80,6 +82,7 @@ LAYER_TYPE string_to_layer_type(char * type) || strcmp(type, "[softmax]")==0) return SOFTMAX; if (strcmp(type, "[route]")==0) return ROUTE; if (strcmp(type, "[upsample]") == 0) return UPSAMPLE; + if (strcmp(type, "[empty]") == 0) return EMPTY; return BLANK; } @@ -600,6 +603,24 @@ layer 
parse_shortcut(list *options, size_params params, network net) } +layer parse_scale_channels(list *options, size_params params, network net) +{ + char *l = option_find(options, "from"); + int index = atoi(l); + if (index < 0) index = params.index + index; + + int batch = params.batch; + layer from = net.layers[index]; + + layer s = make_scale_channels_layer(batch, index, params.w, params.h, params.c, from.out_w, from.out_h, from.out_c); + + char *activation_s = option_find_str_quiet(options, "activation", "linear"); + ACTIVATION activation = get_activation(activation_s); + s.activation = activation; + return s; +} + + layer parse_activation(list *options, size_params params) { char *activation_s = option_find_str(options, "activation", "linear"); @@ -895,6 +916,10 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps) l = parse_shortcut(options, params, net); net.layers[count - 1].use_bin_output = 0; net.layers[l.index].use_bin_output = 0; + }else if (lt == SCALE_CHANNELS) { + l = parse_scale_channels(options, params, net); + net.layers[count - 1].use_bin_output = 0; + net.layers[l.index].use_bin_output = 0; }else if(lt == DROPOUT){ l = parse_dropout(options, params); l.output = net.layers[count-1].output; @@ -902,6 +927,19 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps) #ifdef GPU l.output_gpu = net.layers[count-1].output_gpu; l.delta_gpu = net.layers[count-1].delta_gpu; +#endif + } + else if (lt == EMPTY) { + layer empty_layer; + empty_layer.out_w = params.w; + empty_layer.out_h = params.h; + empty_layer.out_c = params.c; + l = empty_layer; + l.output = net.layers[count - 1].output; + l.delta = net.layers[count - 1].delta; +#ifdef GPU + l.output_gpu = net.layers[count - 1].output_gpu; + l.delta_gpu = net.layers[count - 1].delta_gpu; #endif }else{ fprintf(stderr, "Type not recognized: %s\n", s->type); diff --git a/src/scale_channels_layer.c b/src/scale_channels_layer.c new file mode 100644 index 
00000000..32d11014 --- /dev/null +++ b/src/scale_channels_layer.c @@ -0,0 +1,118 @@ +#include "scale_channels_layer.h" +#include "dark_cuda.h" +#include "blas.h" +#include <stdio.h> +#include <assert.h> + +layer make_scale_channels_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2) +{ + fprintf(stderr,"scale Layer: %d\n", index); + layer l = { (LAYER_TYPE)0 }; + l.type = SCALE_CHANNELS; + l.batch = batch; + l.w = w; + l.h = h; + l.c = c; + assert(w == 1 & h == 1); + + l.out_w = w2; + l.out_h = h2; + l.out_c = c2; + assert(l.out_c == l.c); + + l.outputs = l.out_w*l.out_h*l.out_c; + l.inputs = l.outputs; + l.index = index; + + l.delta = (float*)calloc(l.outputs * batch, sizeof(float)); + l.output = (float*)calloc(l.outputs * batch, sizeof(float)); + + l.forward = forward_scale_channels_layer; + l.backward = backward_scale_channels_layer; +#ifdef GPU + l.forward_gpu = forward_scale_channels_layer_gpu; + l.backward_gpu = backward_scale_channels_layer_gpu; + + l.delta_gpu = cuda_make_array(l.delta, l.outputs*batch); + l.output_gpu = cuda_make_array(l.output, l.outputs*batch); +#endif + return l; +} + +void resize_scale_channels_layer(layer *l, int w, int h) +{ + l->out_w = w; + l->out_h = h; + l->outputs = l->out_w*l->out_h*l->out_c; + l->inputs = l->outputs; + l->delta = (float*)realloc(l->delta, l->outputs * l->batch * sizeof(float)); + l->output = (float*)realloc(l->output, l->outputs * l->batch * sizeof(float)); + +#ifdef GPU + cuda_free(l->output_gpu); + cuda_free(l->delta_gpu); + l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch); + l->delta_gpu = cuda_make_array(l->delta, l->outputs*l->batch); +#endif + +} + +void forward_scale_channels_layer(const layer l, network_state state) +{ + int size = l.batch * l.out_c * l.out_w * l.out_h; + int channel_size = l.out_w * l.out_h; + float *from_output = state.net.layers[l.index].output; + + int i; + #pragma omp parallel for + for (i = 0; i < size; ++i) { + l.output[i] = state.input[i / channel_size] *
from_output[i]; + } + + activate_array(l.output, l.outputs*l.batch, l.activation); +} + +void backward_scale_channels_layer(const layer l, network_state state) +{ + gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta); + //axpy_cpu(l.outputs*l.batch, 1, l.delta, 1, state.delta, 1); + //scale_cpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta, l.w, l.h, l.c, state.net.layers[l.index].delta); + + int size = l.batch * l.out_c * l.out_w * l.out_h; + int channel_size = l.out_w * l.out_h; + float *from_output = state.net.layers[l.index].output; + float *from_delta = state.net.layers[l.index].delta; + + int i; + #pragma omp parallel for + for (i = 0; i < size; ++i) { + state.delta[i / channel_size] += l.delta[i] * from_output[i]; // l.delta * from (should be divided by channel_size?) + + from_delta[i] = state.input[i / channel_size] * l.delta[i]; // input * l.delta + } +} + +#ifdef GPU +void forward_scale_channels_layer_gpu(const layer l, network_state state) +{ + int size = l.batch * l.out_c * l.out_w * l.out_h; + int channel_size = l.out_w * l.out_h; + + scale_channels_gpu(state.net.layers[l.index].output_gpu, size, channel_size, state.input, l.output_gpu); + + activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); +} + +void backward_scale_channels_layer_gpu(const layer l, network_state state) +{ + gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); + + int size = l.batch * l.out_c * l.out_w * l.out_h; + int channel_size = l.out_w * l.out_h; + float *from_output = state.net.layers[l.index].output_gpu; + float *from_delta = state.net.layers[l.index].delta_gpu; + + + backward_scale_channels_gpu(l.delta_gpu, size, channel_size, state.input, from_delta, from_output, state.delta); +} +#endif diff --git a/src/scale_channels_layer.h b/src/scale_channels_layer.h new file mode 100644 index 00000000..a20c0703 --- /dev/null +++ b/src/scale_channels_layer.h @@ -0,0 +1,23 @@ +#ifndef SCALE_CHANNELS_LAYER_H +#define 
SCALE_CHANNELS_LAYER_H + +#include "layer.h" +#include "network.h" + +#ifdef __cplusplus +extern "C" { +#endif +layer make_scale_channels_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2); +void forward_scale_channels_layer(const layer l, network_state state); +void backward_scale_channels_layer(const layer l, network_state state); +void resize_scale_channels_layer(layer *l, int w, int h); + +#ifdef GPU +void forward_scale_channels_layer_gpu(const layer l, network_state state); +void backward_scale_channels_layer_gpu(const layer l, network_state state); +#endif + +#ifdef __cplusplus +} +#endif +#endif // SCALE_CHANNELS_LAYER_H diff --git a/src/shortcut_layer.c b/src/shortcut_layer.c index 06cd6056..070ae5d9 100644 --- a/src/shortcut_layer.c +++ b/src/shortcut_layer.c @@ -19,6 +19,8 @@ layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int l.outputs = w*h*c; l.inputs = l.outputs; + if(w != w2 || h != h2 || c != c2) fprintf(stderr, " w = %d, w2 = %d, h = %d, h2 = %d, c = %d, c2 = %d \n", w, w2, h, h2, c, c2); + l.index = index; l.delta = (float*)calloc(l.outputs * batch, sizeof(float));