Speedup repack_input_kernel_bin()

pull/2352/head
AlexeyAB 6 years ago
parent 2d747cab2b
commit ff0733ed40
1 changed file: src/im2col_kernels.cu (113 changed lines)

@@ -824,7 +824,54 @@ void repack_input_gpu_2(float *input, float *re_packed_input, int w, int h, int
// --------------------------------
// 32 channels -> 1 channel (with 32 floats)
// 256 channels -> 8 channels (with 32 floats)
__global__ void repack_input_kernel_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
{
    //__shared__ uint32_t tmp[32];
    const int index = blockIdx.x*blockDim.x + threadIdx.x;

    const int global_warp_id = index / WARP_SIZE;
    const int lane_id = threadIdx.x % WARP_SIZE;

    const int items_per_channel = w * h;
    // spatial size padded up to warp granularity (adds a full warp if already aligned)
    const int items_per_channel_aligned = items_per_channel + WARP_SIZE - (items_per_channel % WARP_SIZE);

    // each warp covers 32 consecutive spatial positions (one per lane) of one 32-channel group
    int i = 32 * (global_warp_id % (items_per_channel_aligned / WARP_SIZE));
    int chan = 32 * (global_warp_id / (items_per_channel_aligned / WARP_SIZE));

    if (chan < c)
    {
        uint32_t result_bits = 0;

        for (int c_pack = 0; c_pack < 32; ++c_pack)
        {
            float src = 0;
            if ((i + lane_id) < items_per_channel) {
                src = input[(chan + c_pack)*items_per_channel + (i + lane_id)];
            }

            // bit k of bit_mask is set if lane k sees a positive activation
            uint32_t bit_mask = __ballot_custom(src > 0);
            // keep only this lane's bit and pack it at position c_pack
            uint32_t cur_bit = (bit_mask >> lane_id) & uint32_t(1);

            result_bits |= (cur_bit << c_pack);
        }
        if ((i + lane_id) < items_per_channel) {
            // one uint32 per spatial position per 32-channel group
            re_packed_input_bin[chan*items_per_channel / 32 + (i + lane_id)] = result_bits;
        }
    }
}

void repack_input_gpu_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
{
    int size = (w * h * c) / 32 + 1;
    const int block_size = BLOCK;
    const int num_blocks = get_number_of_blocks(size, block_size);
    //printf("\n num_blocks = %d, num_blocks/32 = %d, block_size = %d \n", num_blocks, num_blocks / 32, block_size);
    repack_input_kernel_bin << <num_blocks, block_size, 0, get_cuda_stream() >> >(input, re_packed_input_bin, w, h, c);
}
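As an aside (not part of this commit): the layout the new kernel produces can be checked against a plain host-side reference. Each group of 32 channels collapses into one plane of w*h uint32 words, where bit k of a word is set when channel (group*32 + k) is positive at that pixel. The sketch below is illustrative only; it assumes c is a multiple of 32, and repack_input_cpu_ref is a hypothetical helper that does not exist in darknet.

#include <cstdint>

// Illustrative CPU reference of the packing produced by repack_input_kernel_bin.
// Hypothetical helper (not in darknet); assumes c is a multiple of 32.
static void repack_input_cpu_ref(const float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
{
    const int items_per_channel = w * h;
    for (int chan = 0; chan < c; chan += 32) {                 // one 32-channel group per packed plane
        for (int item = 0; item < items_per_channel; ++item) {
            uint32_t result_bits = 0;
            for (int c_pack = 0; c_pack < 32; ++c_pack) {      // bit c_pack = sign of channel chan + c_pack
                if (input[(chan + c_pack) * items_per_channel + item] > 0)
                    result_bits |= (uint32_t(1) << c_pack);
            }
            // same index as the kernel: chan*items_per_channel/32 + item
            re_packed_input_bin[(chan / 32) * items_per_channel + item] = result_bits;
        }
    }
}

Comparing this output against the GPU buffer after a cudaMemcpy is enough to validate the warp-ballot path for a given w, h, c.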
/*
// 32 channels -> 1 channel (with 32 floats)
// 256 channels -> 8 channels (with 32 floats)
__global__ void repack_input_kernel_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
@@ -870,74 +917,10 @@ void repack_input_gpu_bin(float *input, uint32_t *re_packed_input_bin, int w, in
    int size = w * h * c;
    const int block_size = 256;// 128;
    const int num_blocks = get_number_of_blocks(size, block_size);
    printf("\n num_blocks = %d, num_blocks/32 = %d, block_size = %d \n", num_blocks, num_blocks/32, block_size);
    repack_input_kernel_bin << <num_blocks, block_size, 0, get_cuda_stream() >> >(input, re_packed_input_bin, w, h, c);
}
/*
// 32 channels -> 1 channel (with 32 floats)
// 256 channels -> 8 channels (with 32 floats)
__global__ void repack_input_kernel_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c, int items_per_channel_align)
{
    __shared__ float tmp[33*32];  // 32x32 tile stored with row stride 33 (misaligned) to avoid bank conflicts
    //const int index = blockIdx.x*blockDim.x + threadIdx.x;
    const int num_of_warps = blockDim.x / WARP_SIZE;
    const int warp_id = threadIdx.x / WARP_SIZE;
    const int lane_id = threadIdx.x % WARP_SIZE;

    const int items_per_channel = w * h;
    //const int items_per_channel_align = items_per_channel + (32 - items_per_channel % 32);
    const int blocks_per_wh = items_per_channel_align / 32;
    //const int blocks_per_c = c / 32;

    // input[C x H x W] = input[C x ITEMS]
    // BLOCK per C x ITEMS = 32x32
    const int block_item_id = blockIdx.x % blocks_per_wh;
    const int block_channel_id = blockIdx.x / blocks_per_wh;

    const int block_item = block_item_id * 32;
    const int block_channel = block_channel_id * 32;

    const int lane_item = block_item + lane_id;
    const int warp_channel = block_channel + warp_id;

    if (warp_channel < c)
    {
        float src = 0;
        if (lane_item < items_per_channel)
            src = input[warp_channel*items_per_channel + lane_item];

        tmp[warp_id * 33 + lane_id] = src;

        __syncthreads();
        src = tmp[lane_id * 33 + warp_id];

        uint32_t bit_mask = __ballot_custom(src > 0);

        const int warp_item = block_item + warp_id;
        if (lane_id == 0 && warp_item < items_per_channel)
            re_packed_input_bin[block_channel_id*items_per_channel + warp_item] = bit_mask;
    }
}

#define BLOCK_REPACK 1024

void repack_input_gpu_bin(float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
{
    int items_per_channel = w*h;
    int items_per_channel_align = items_per_channel + (32 - items_per_channel % 32);
    int channel_align = c + (32 - c % 32);
    //int size = w * h * c;
    int size = items_per_channel_align * channel_align;

    const int num_blocks = get_number_of_blocks(size, BLOCK_REPACK);
    repack_input_kernel_bin << <num_blocks, BLOCK_REPACK, 0, get_cuda_stream() >> >(input, re_packed_input_bin, w, h, c, items_per_channel_align);
}
*/
// --------------------------------
