From ff0733ed4096c3e12d88380ca6739dd7ab545a35 Mon Sep 17 00:00:00 2001
From: AlexeyAB <alexeyab84@gmail.com>
Date: Sat, 2 Feb 2019 15:16:25 +0300
Subject: [PATCH] Speedup repack_input_kernel_bin()

---
 src/im2col_kernels.cu | 113 ++++++++++++++++++------------------------
 1 file changed, 48 insertions(+), 65 deletions(-)

diff --git a/src/im2col_kernels.cu b/src/im2col_kernels.cu
index 2bdca30b..56ed19de 100644
--- a/src/im2col_kernels.cu
+++ b/src/im2col_kernels.cu
@@ -824,7 +824,54 @@ void repack_input_gpu_2(float *input, float *re_packed_input, int w, int h, int
 // --------------------------------
 
 
+// 32 channels -> 1 channel (with 32 floats)
+// 256 channels -> 8 channels (with 32 floats)
+__global__ void repack_input_kernel_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
+{
+    //__shared__ uint32_t tmp[32];
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+
+    const int global_warp_id = index / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+
+    const int items_per_channel = w * h;
+    const int items_per_channel_aligned = items_per_channel + WARP_SIZE - (items_per_channel % WARP_SIZE);
+
+    int i = 32 * (global_warp_id % (items_per_channel_aligned / WARP_SIZE));
+    int chan = 32 * (global_warp_id / (items_per_channel_aligned / WARP_SIZE));
+
+    if (chan < c)
+    {
+        uint32_t result_bits = 0;
+
+        for (int c_pack = 0; c_pack < 32; ++c_pack)
+        {
+            float src = 0;
+            if ((i + lane_id) < items_per_channel) {
+                src = input[(chan + c_pack)*items_per_channel + (i + lane_id)];
+            }
+            uint32_t bit_mask = __ballot_custom(src > 0);
 
+            uint32_t cur_bit = (bit_mask >> lane_id) & uint32_t(1);
+
+            result_bits |= (cur_bit << c_pack);
+        }
+        if ((i + lane_id) < items_per_channel) {
+            re_packed_input_bin[chan*items_per_channel / 32 + (i + lane_id)] = result_bits;
+        }
+    }
+}
+
+void repack_input_gpu_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
+{
+    int size = (w * h * c) / 32 + 1;
+    const int block_size = BLOCK;
+    const int num_blocks = get_number_of_blocks(size, block_size);
+    //printf("\n num_blocks = %d, num_blocks/32 = %d,  block_size = %d \n", num_blocks, num_blocks / 32, block_size);
+    repack_input_kernel_bin << <num_blocks, block_size, 0, get_cuda_stream() >> >(input, re_packed_input_bin, w, h, c);
+}
+
+/*
 // 32 channels -> 1 channel (with 32 floats)
 // 256 channels -> 8 channels (with 32 floats)
 __global__ void repack_input_kernel_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
@@ -870,74 +917,10 @@ void repack_input_gpu_bin(float *input, uint32_t *re_packed_input_bin, int w, in
     int size = w * h * c;
     const int block_size = 256;// 128;
     const int num_blocks = get_number_of_blocks(size, block_size);
-    //printf("\n num_blocks = %d, num_blocks/32 = %d,  block_size = %d \n", num_blocks, num_blocks/32, block_size);
+    printf("\n num_blocks = %d, num_blocks/32 = %d,  block_size = %d \n", num_blocks, num_blocks/32, block_size);
     repack_input_kernel_bin << <num_blocks, block_size, 0, get_cuda_stream() >> >(input, re_packed_input_bin, w, h, c);
 }
-
-
-/*
-// 32 channels -> 1 channel (with 32 floats)
-// 256 channels -> 8 channels (with 32 floats)
-__global__ void repack_input_kernel_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c, int items_per_channel_align)
-{
-    __shared__ float tmp[33*32];    // misalgined array 32x32
-    //const int index = blockIdx.x*blockDim.x + threadIdx.x;
-
-    const int num_of_warps = blockDim.x / WARP_SIZE;
-    const int warp_id = threadIdx.x / WARP_SIZE;
-    const int lane_id = threadIdx.x % WARP_SIZE;
-
-    const int items_per_channel = w * h;
-    //const int items_per_channel_align = items_per_channel + (32 - items_per_channel % 32);
-    const int blocks_per_wh = items_per_channel_align / 32;
-    //const int blocks_per_c = c / 32;
-
-    // input[C x H x W] = input[C x ITEMS]
-    // BLOCK per C x ITEMS = 32x32
-
-    const int block_item_id = blockIdx.x % blocks_per_wh;
-    const int block_channel_id = blockIdx.x / blocks_per_wh;
-
-    const int block_item = block_item_id * 32;
-    const int block_channel = block_channel_id * 32;
-
-    const int lane_item = block_item + lane_id;
-    const int warp_channel = block_channel + warp_id;
-
-    if (warp_channel < c)
-    {
-        float src = 0;
-
-        if (lane_item < items_per_channel)
-            src = input[warp_channel*items_per_channel + lane_item];
-
-        tmp[warp_id * 33 + lane_id] = src;
-        __syncthreads();
-        src = tmp[lane_id * 33 + warp_id];
-
-        uint32_t bit_mask = __ballot_custom(src > 0);
-
-        const int warp_item = block_item + warp_id;
-
-        if (lane_id == 0 && warp_item < items_per_channel)
-            re_packed_input_bin[block_channel_id*items_per_channel + warp_item] = bit_mask;
-    }
-}
-
-#define BLOCK_REPACK 1024
-void repack_input_gpu_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
-{
-    int items_per_channel = w*h;
-    int items_per_channel_align = items_per_channel + (32 - items_per_channel % 32);
-    int channel_align = c + (32 - c % 32);
-
-    //int size = w * h * c;
-    int size = items_per_channel_align * channel_align;
-    const int num_blocks = get_number_of_blocks(size, BLOCK_REPACK);
-    repack_input_kernel_bin << <num_blocks, BLOCK_REPACK, 0, get_cuda_stream() >> >(input, re_packed_input_bin, w, h, c, items_per_channel_align);
-}
 */
-// --------------------------------