Speedup repack_input_kernel_bin()

pull/2352/head
AlexeyAB 6 years ago
parent 2d747cab2b
commit ff0733ed40
1 changed file: src/im2col_kernels.cu (113 changed lines)

@@ -824,7 +824,54 @@ void repack_input_gpu_2(float *input, float *re_packed_input, int w, int h, int
// --------------------------------
// 32 channels -> 1 channel (with 32 floats)
// 256 channels -> 8 channels (with 32 floats)
__global__ void repack_input_kernel_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
{
    //__shared__ uint32_t tmp[32];
    const int index = blockIdx.x*blockDim.x + threadIdx.x;

    const int global_warp_id = index / WARP_SIZE;
    const int lane_id = threadIdx.x % WARP_SIZE;

    const int items_per_channel = w * h;
    // spatial size padded up to warp granularity (adds a full warp if already aligned)
    const int items_per_channel_aligned = items_per_channel + WARP_SIZE - (items_per_channel % WARP_SIZE);

    // each warp covers 32 consecutive spatial positions (one per lane) of one 32-channel group
    int i = 32 * (global_warp_id % (items_per_channel_aligned / WARP_SIZE));
    int chan = 32 * (global_warp_id / (items_per_channel_aligned / WARP_SIZE));

    if (chan < c)
    {
        uint32_t result_bits = 0;

        for (int c_pack = 0; c_pack < 32; ++c_pack)
        {
            float src = 0;
            if ((i + lane_id) < items_per_channel) {
                src = input[(chan + c_pack)*items_per_channel + (i + lane_id)];
            }

            // bit k of bit_mask is set if lane k sees a positive activation
            uint32_t bit_mask = __ballot_custom(src > 0);
            // keep only this lane's bit and pack it at position c_pack
            uint32_t cur_bit = (bit_mask >> lane_id) & uint32_t(1);

            result_bits |= (cur_bit << c_pack);
        }
        if ((i + lane_id) < items_per_channel) {
            // one uint32 per spatial position per 32-channel group
            re_packed_input_bin[chan*items_per_channel / 32 + (i + lane_id)] = result_bits;
        }
    }
}

void repack_input_gpu_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
{
    int size = (w * h * c) / 32 + 1;
    const int block_size = BLOCK;
    const int num_blocks = get_number_of_blocks(size, block_size);
    //printf("\n num_blocks = %d, num_blocks/32 = %d, block_size = %d \n", num_blocks, num_blocks / 32, block_size);
    repack_input_kernel_bin << <num_blocks, block_size, 0, get_cuda_stream() >> >(input, re_packed_input_bin, w, h, c);
}
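As an aside (not part of this commit): the layout the new kernel produces can be checked against a plain host-side reference. Each group of 32 channels collapses into one plane of w*h uint32 words, where bit k of a word is set when channel (group*32 + k) is positive at that pixel. The sketch below is illustrative only; it assumes c is a multiple of 32, and repack_input_cpu_ref is a hypothetical helper that does not exist in darknet.

#include <cstdint>

// Illustrative CPU reference of the packing produced by repack_input_kernel_bin.
// Hypothetical helper (not in darknet); assumes c is a multiple of 32.
static void repack_input_cpu_ref(const float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
{
    const int items_per_channel = w * h;
    for (int chan = 0; chan < c; chan += 32) {                 // one 32-channel group per packed plane
        for (int item = 0; item < items_per_channel; ++item) {
            uint32_t result_bits = 0;
            for (int c_pack = 0; c_pack < 32; ++c_pack) {      // bit c_pack = sign of channel chan + c_pack
                if (input[(chan + c_pack) * items_per_channel + item] > 0)
                    result_bits |= (uint32_t(1) << c_pack);
            }
            // same index as the kernel: chan*items_per_channel/32 + item
            re_packed_input_bin[(chan / 32) * items_per_channel + item] = result_bits;
        }
    }
}

Comparing this output against the GPU buffer after a cudaMemcpy is enough to validate the warp-ballot path for a given w, h, c.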
/*
// 32 channels -> 1 channel (with 32 floats)
// 256 channels -> 8 channels (with 32 floats)
__global__ void repack_input_kernel_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
@@ -870,74 +917,10 @@ void repack_input_gpu_bin(float *input, uint32_t *re_packed_input_bin, int w, in
    int size = w * h * c;
    const int block_size = 256;// 128;
    const int num_blocks = get_number_of_blocks(size, block_size);
    printf("\n num_blocks = %d, num_blocks/32 = %d, block_size = %d \n", num_blocks, num_blocks/32, block_size);
    repack_input_kernel_bin << <num_blocks, block_size, 0, get_cuda_stream() >> >(input, re_packed_input_bin, w, h, c);
}
/*
// 32 channels -> 1 channel (with 32 floats)
// 256 channels -> 8 channels (with 32 floats)
__global__ void repack_input_kernel_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c, int items_per_channel_align)
{
    __shared__ float tmp[33*32];  // 32x32 tile stored with row stride 33 (misaligned) to avoid bank conflicts
    //const int index = blockIdx.x*blockDim.x + threadIdx.x;
    const int num_of_warps = blockDim.x / WARP_SIZE;
    const int warp_id = threadIdx.x / WARP_SIZE;
    const int lane_id = threadIdx.x % WARP_SIZE;

    const int items_per_channel = w * h;
    //const int items_per_channel_align = items_per_channel + (32 - items_per_channel % 32);
    const int blocks_per_wh = items_per_channel_align / 32;
    //const int blocks_per_c = c / 32;

    // input[C x H x W] = input[C x ITEMS]
    // BLOCK per C x ITEMS = 32x32
    const int block_item_id = blockIdx.x % blocks_per_wh;
    const int block_channel_id = blockIdx.x / blocks_per_wh;

    const int block_item = block_item_id * 32;
    const int block_channel = block_channel_id * 32;

    const int lane_item = block_item + lane_id;
    const int warp_channel = block_channel + warp_id;

    if (warp_channel < c)
    {
        float src = 0;
        if (lane_item < items_per_channel)
            src = input[warp_channel*items_per_channel + lane_item];

        tmp[warp_id * 33 + lane_id] = src;

        __syncthreads();
        src = tmp[lane_id * 33 + warp_id];

        uint32_t bit_mask = __ballot_custom(src > 0);

        const int warp_item = block_item + warp_id;
        if (lane_id == 0 && warp_item < items_per_channel)
            re_packed_input_bin[block_channel_id*items_per_channel + warp_item] = bit_mask;
    }
}

#define BLOCK_REPACK 1024

void repack_input_gpu_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
{
    int items_per_channel = w*h;
    int items_per_channel_align = items_per_channel + (32 - items_per_channel % 32);
    int channel_align = c + (32 - c % 32);
    //int size = w * h * c;
    int size = items_per_channel_align * channel_align;

    const int num_blocks = get_number_of_blocks(size, BLOCK_REPACK);
    repack_input_kernel_bin << <num_blocks, BLOCK_REPACK, 0, get_cuda_stream() >> >(input, re_packed_input_bin, w, h, c, items_per_channel_align);
}
*/
// --------------------------------
