From ff0733ed4096c3e12d88380ca6739dd7ab545a35 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sat, 2 Feb 2019 15:16:25 +0300 Subject: [PATCH] Speedup repack_input_kernel_bin() --- src/im2col_kernels.cu | 113 ++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 65 deletions(-) diff --git a/src/im2col_kernels.cu b/src/im2col_kernels.cu index 2bdca30b..56ed19de 100644 --- a/src/im2col_kernels.cu +++ b/src/im2col_kernels.cu @@ -824,7 +824,54 @@ void repack_input_gpu_2(float *input, float *re_packed_input, int w, int h, int // -------------------------------- +// 32 channels -> 1 channel (with 32 floats) +// 256 channels -> 8 channels (with 32 floats) +__global__ void repack_input_kernel_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c) +{ + //__shared__ uint32_t tmp[32]; + const int index = blockIdx.x*blockDim.x + threadIdx.x; + + const int global_warp_id = index / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + const int items_per_channel = w * h; + const int items_per_channel_aligned = items_per_channel + WARP_SIZE - (items_per_channel % WARP_SIZE); + + int i = 32 * (global_warp_id % (items_per_channel_aligned / WARP_SIZE)); + int chan = 32 * (global_warp_id / (items_per_channel_aligned / WARP_SIZE)); + + if (chan < c) + { + uint32_t result_bits = 0; + + for (int c_pack = 0; c_pack < 32; ++c_pack) + { + float src = 0; + if ((i + lane_id) < items_per_channel) { + src = input[(chan + c_pack)*items_per_channel + (i + lane_id)]; + } + uint32_t bit_mask = __ballot_custom(src > 0); + uint32_t cur_bit = (bit_mask >> lane_id) & uint32_t(1); + + result_bits |= (cur_bit << c_pack); + } + if ((i + lane_id) < items_per_channel) { + re_packed_input_bin[chan*items_per_channel / 32 + (i + lane_id)] = result_bits; + } + } +} + +void repack_input_gpu_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c) +{ + int size = (w * h * c) / 32 + 1; + const int block_size = BLOCK; + const int num_blocks = get_number_of_blocks(size, block_size); + //printf("\n num_blocks = %d, num_blocks/32 = %d, block_size = %d \n", num_blocks, num_blocks / 32, block_size); + repack_input_kernel_bin << > >(input, re_packed_input_bin, w, h, c); +} + +/* // 32 channels -> 1 channel (with 32 floats) // 256 channels -> 8 channels (with 32 floats) __global__ void repack_input_kernel_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c) @@ -870,74 +917,10 @@ void repack_input_gpu_bin(float *input, uint32_t *re_packed_input_bin, int w, in int size = w * h * c; const int block_size = 256;// 128; const int num_blocks = get_number_of_blocks(size, block_size); - //printf("\n num_blocks = %d, num_blocks/32 = %d, block_size = %d \n", num_blocks, num_blocks/32, block_size); + printf("\n num_blocks = %d, num_blocks/32 = %d, block_size = %d \n", num_blocks, num_blocks/32, block_size); repack_input_kernel_bin << > >(input, re_packed_input_bin, w, h, c); } - - -/* -// 32 channels -> 1 channel (with 32 floats) -// 256 channels -> 8 channels (with 32 floats) -__global__ void repack_input_kernel_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c, int items_per_channel_align) -{ - __shared__ float tmp[33*32]; // misalgined array 32x32 - //const int index = blockIdx.x*blockDim.x + threadIdx.x; - - const int num_of_warps = blockDim.x / WARP_SIZE; - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane_id = threadIdx.x % WARP_SIZE; - - const int items_per_channel = w * h; - //const int items_per_channel_align = items_per_channel + (32 - items_per_channel % 32); - const int blocks_per_wh = items_per_channel_align / 32; - //const int blocks_per_c = c / 32; - - // input[C x H x W] = input[C x ITEMS] - // BLOCK per C x ITEMS = 32x32 - - const int block_item_id = blockIdx.x % blocks_per_wh; - const int block_channel_id = blockIdx.x / blocks_per_wh; - - const int block_item = block_item_id * 32; - const int block_channel = block_channel_id * 32; - - const int lane_item = block_item + lane_id; - const int warp_channel = block_channel + warp_id; - - if (warp_channel < c) - { - float src = 0; - - if (lane_item < items_per_channel) - src = input[warp_channel*items_per_channel + lane_item]; - - tmp[warp_id * 33 + lane_id] = src; - __syncthreads(); - src = tmp[lane_id * 33 + warp_id]; - - uint32_t bit_mask = __ballot_custom(src > 0); - - const int warp_item = block_item + warp_id; - - if (lane_id == 0 && warp_item < items_per_channel) - re_packed_input_bin[block_channel_id*items_per_channel + warp_item] = bit_mask; - } -} - -#define BLOCK_REPACK 1024 -void repack_input_gpu_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c) -{ - int items_per_channel = w*h; - int items_per_channel_align = items_per_channel + (32 - items_per_channel % 32); - int channel_align = c + (32 - c % 32); - - //int size = w * h * c; - int size = items_per_channel_align * channel_align; - const int num_blocks = get_number_of_blocks(size, BLOCK_REPACK); - repack_input_kernel_bin << > >(input, re_packed_input_bin, w, h, c, items_per_channel_align); -} */ -// --------------------------------