Minor speedup of Mish/Swish (if train==0, don't write the activation input to memory)

pull/5620/head
AlexeyAB 5 years ago
parent 9cd44eea95
commit 0acd6990d8
  1. 28
      src/activation_kernels.cu
  2. 2
      src/convolutional_layer.c
  3. 13
      src/demo.c

@ -217,11 +217,33 @@ __global__ void activate_array_swish_kernel(float *x, int n, float *output_sigmo
if (i < n) {
float x_val = x[i];
float sigmoid = logistic_activate_kernel(x_val);
output_sigmoid_gpu[i] = sigmoid;
if (output_sigmoid_gpu) output_sigmoid_gpu[i] = sigmoid;
output_gpu[i] = x_val * sigmoid;
}
}
// Mish activation in a rational reformulation (attributed to njuffa):
// mish(x) = x * tanh(softplus(x)) = x - 2x / (e^2 + 2e + 2), with e = exp(x).
// Costs one expf plus fused multiply-adds; currently unused — the live kernel
// calls tanh_activate_kernel(softplus_kernel(...)) and keeps this as an
// alternative (see the commented-out call in activate_array_mish_kernel).
__device__ float mish_njuffa(float x)
{
float r;
float e = expf(x);
// fmaf(fmaf(-0.5f, e, -1.0f), e, -1.0f) = -0.5*e^2 - e - 1,
// so r = 1/that = -2 / (e^2 + 2e + 2)
r = 1.0f / fmaf(fmaf(-0.5f, e, -1.0f), e, -1.0f);
// fmaf(r, x, x) = x + r*x = x - 2x/(e^2 + 2e + 2) = mish(x)
r = fmaf(r, x, x);
return r;
}
// Mish activation in a piecewise form (attributed to Yashas Samaga) built on
// fast-math intrinsics (__expf, __fdividef trade a few ulps for speed).
// With e = exp(x) and n = e^2 + 2e:  mish(x) = x * n / (n + 2).
// The branches select algebraically equivalent forms that stay numerically
// stable in each input range; currently unused — kept as an alternative
// (see the commented-out call in activate_array_mish_kernel).
__device__ float mish_yashas(float x)
{
auto e = __expf(x);
// very negative x: n ~ 2e is tiny, n/(n+2) ~ e, so mish(x) ~ x*e
if (x <= -18.0f)
return x * e;
auto n = e * e + 2 * e;
// moderately negative x: direct ratio x * n/(n+2) avoids cancellation in x - 2x/(n+2)
if (x <= -5.0f)
return x * __fdividef(n, n + 2);
// general case: x - 2x/(n+2) == x*n/(n+2), cheaper/stabler when n is large
return x - 2 * __fdividef(x, n + 2);
}
// https://github.com/digantamisra98/Mish
__global__ void activate_array_mish_kernel(float *x, int n, float *activation_input, float *output_gpu)
{
@ -229,13 +251,15 @@ __global__ void activate_array_mish_kernel(float *x, int n, float *activation_in
if (i < n) {
const float MISH_THRESHOLD = 20;
float x_val = x[i];
activation_input[i] = x_val; // store value before activation
if (activation_input) activation_input[i] = x_val; // store value before activation
//output_gpu[i] = x_val * tanh_activate_kernel(logf(1 + expf(x_val)));
// Pytorch: https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L17-L20
// TF: https://github.com/tensorflow/addons/blob/093cdfa85d334cbe19a37624c33198f3140109ed/tensorflow_addons/custom_ops/activations/cc/kernels/mish_op.h#L40-L49
// log1p(x) == log(x + 1)
output_gpu[i] = x_val * tanh_activate_kernel( softplus_kernel(x_val, MISH_THRESHOLD) );
//output_gpu[i] = mish_yashas(x_val);
//output_gpu[i] = mish_njuffa(x_val);
}
}

@ -547,7 +547,7 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w,
if(gpu_index >= 0){
if (l.activation == SWISH || l.activation == MISH) {
if (train && (l.activation == SWISH || l.activation == MISH)) {
l.activation_input_gpu = cuda_make_array(l.activation_input, total_batch*l.outputs);
}

@ -101,12 +101,6 @@ void *detect_in_thread(void *ptr)
float *X = det_s.data;
float *prediction = network_predict(net, X);
int i;
for (i = 0; i < net.n; ++i) {
layer l = net.layers[i];
if (l.type == YOLO) l.mean_alpha = 1.0 / NFRAMES;
}
cv_images[demo_index] = det_img;
det_img = cv_images[(demo_index + NFRAMES / 2 + 1) % NFRAMES];
demo_index = (demo_index + 1) % NFRAMES;
@ -163,6 +157,12 @@ void demo(char *cfgfile, char *weightfile, float thresh, float hier_thresh, int
calculate_binary_weights(net);
srand(2222222);
int i;
for (i = 0; i < net.n; ++i) {
layer l = net.layers[i];
if (l.type == YOLO) l.mean_alpha = 1.0 / NFRAMES;
}
if(filename){
printf("video file: %s\n", filename);
cap = get_capture_video_stream(filename);
@ -398,7 +398,6 @@ void demo(char *cfgfile, char *weightfile, float thresh, float hier_thresh, int
free_ptrs((void **)names, net.layers[net.n - 1].classes);
int i;
const int nsize = 8;
for (j = 0; j < nsize; ++j) {
for (i = 32; i < 127; ++i) {

Loading…
Cancel
Save