|
|
|
@@ -59,3 +59,196 @@ void im2col_ongpu(float *im,
            stride, height_col,
            width_col, data_col);
}
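
// im2col_align_gpu_kernel below mirrors the standard im2col_gpu_kernel above,
// but writes each row of the unrolled matrix with a pitch of bit_align floats
// instead of height_col*width_col, so row starts stay aligned for the
// bit-packing routines added further down in this file.
// Illustrative reading of the indexing (explanatory, not part of the patch):
// output element (channel_in*ksize*ksize + k, h_out*width_col + w_out) lands at
// data_col[(channel_in*ksize*ksize + k) * bit_align + h_out*width_col + w_out].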
__global__ void im2col_align_gpu_kernel(const int n, const float* data_im,
    const int height, const int width, const int ksize,
    const int pad,
    const int stride,
    const int height_col, const int width_col,
    float *data_col, const int bit_align)
{
    int index = blockIdx.x*blockDim.x + threadIdx.x;
    for (; index < n; index += blockDim.x*gridDim.x) {
        int w_out = index % width_col;
        int h_index = index / width_col;
        int h_out = h_index % height_col;
        int channel_in = h_index / height_col;
        int channel_out = channel_in * ksize * ksize;
        int h_in = h_out * stride - pad;
        int w_in = w_out * stride - pad;
        float* data_col_ptr = data_col;
        //data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
        data_col_ptr += channel_out * bit_align + h_out * width_col + w_out;
        const float* data_im_ptr = data_im;
        data_im_ptr += (channel_in * height + h_in) * width + w_in;
        for (int i = 0; i < ksize; ++i) {
            for (int j = 0; j < ksize; ++j) {
                int h = h_in + i;
                int w = w_in + j;

                *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
                    data_im_ptr[i * width + j] : 0;

                //data_col_ptr += height_col * width_col;
                data_col_ptr += bit_align;
            }
        }
    }
}

void im2col_align_ongpu(float *im,
    int channels, int height, int width,
    int ksize, int stride, int pad, float *data_col, int bit_align) {
    // We launch channels * height_col * width_col threads, each thread
    // copying one ksize x ksize patch of a single input channel.
    int height_col = (height + 2 * pad - ksize) / stride + 1;
    int width_col = (width + 2 * pad - ksize) / stride + 1;
    int num_kernels = channels * height_col * width_col;
    im2col_align_gpu_kernel << <(num_kernels + BLOCK - 1) / BLOCK,
        BLOCK, 0, get_cuda_stream() >> >(
            num_kernels, im, height, width, ksize, pad,
            stride, height_col,
            width_col, data_col, bit_align);
}
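
// Minimal usage sketch (illustrative only; buffer names and the bit_align
// choice are assumptions, not part of this patch). The caller allocates the
// destination with the aligned row pitch before unrolling:
//
//   int height_col = (height + 2*pad - ksize) / stride + 1;
//   int width_col  = (width  + 2*pad - ksize) / stride + 1;
//   int bit_align  = ((height_col*width_col + 31) / 32) * 32;   // assumed rounding rule
//   float *workspace_gpu;
//   cudaMalloc((void**)&workspace_gpu, channels*ksize*ksize*bit_align*sizeof(float));
//   im2col_align_ongpu(input_gpu, channels, height, width, ksize, stride, pad,
//                      workspace_gpu, bit_align);
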
// --------------------------------

#define WARP_SIZE 32

__global__ void float_to_bit_gpu_kernel(float *src, unsigned char *dst, size_t size)
{
    //size_t dst_size = size / 8 + 1;
    //memset(dst, 0, dst_size);
    //uint32_t bit_mask = __ballot_sync(FULL_MASK, src[i] > 0);
    const int size_aligned = size + (WARP_SIZE - size % WARP_SIZE);

    int index = blockIdx.x*blockDim.x + threadIdx.x;
    float src_val;

    for (; index < size_aligned; index += blockDim.x*gridDim.x)
    {
        if(index < size) src_val = src[index];
        else src_val = 0;
        unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0);
        if (threadIdx.x % WARP_SIZE == 0) ((unsigned int*)dst)[index / 32] = bit_mask;
    }
}
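
// Explanatory note (not part of the patch): every warp evaluates src_val > 0
// in lock-step, __ballot_sync(0xffffffff, ...) gathers the 32 predicates into
// one 32-bit mask (lane i -> bit i), and only lane 0 of the warp stores that
// word, so 32 floats collapse into 4 bytes of dst. The size_aligned padding
// keeps the last, partially filled warp converged so the ballot still sees
// all 32 lanes (the padding lanes contribute zero bits).
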
void float_to_bit_gpu(float *src, unsigned char *dst, size_t size)
{
    const int num_blocks = size / BLOCK + 1;
    float_to_bit_gpu_kernel<<<num_blocks, BLOCK, 0, get_cuda_stream()>>>(src, dst, size);
}
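
// Minimal usage sketch (illustrative only; names are assumptions). The kernel
// stores whole 32-bit words, so the destination should hold at least
// 4 * ceil(n / 32) bytes:
//
//   unsigned char *bits_gpu;
//   cudaMalloc((void**)&bits_gpu, ((n + 31) / 32) * 4);
//   float_to_bit_gpu(weights_gpu, bits_gpu, n);   // bit i == (weights[i] > 0)
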
// --------------------------------

__device__ __host__ static inline void set_bit(unsigned char *const dst, size_t index) {
    size_t dst_i = index / 8;
    int dst_shift = index % 8;
    dst[dst_i] |= 1 << dst_shift;
    //dst[dst_i] |= 1 << (8 - dst_shift);
}

__device__ __host__ static inline unsigned char get_bit(unsigned char const*const src, size_t index) {
    size_t src_i = index / 8;
    int src_shift = index % 8;
    unsigned char val = (src[src_i] & (1 << src_shift)) > 0;
    //unsigned char val = (src[src_i] & (1 << (8 - src_shift))) > 0;
    return val;
}
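
// Round-trip example (explanatory note, not part of the patch): bits are
// numbered LSB-first within each byte, so for a zeroed buffer buf:
//
//   set_bit(buf, 10);    // sets bit 2 of buf[1], i.e. buf[1] == 0x04
//   get_bit(buf, 10);    // == 1
//   get_bit(buf, 11);    // == 0
//
// which matches the little-endian word layout written by float_to_bit_gpu_kernel.
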
// Intel CPUs and NVIDIA CUDA GPUs are little-endian
__device__ __host__ unsigned char reverse_byte(unsigned char a)
{
    return ((a & 0x1) << 7) | ((a & 0x2) << 5) |
        ((a & 0x4) << 3) | ((a & 0x8) << 1) |
        ((a & 0x10) >> 1) | ((a & 0x20) >> 3) |
        ((a & 0x40) >> 5) | ((a & 0x80) >> 7);
}

__device__ __host__ unsigned char reverse_byte_2(unsigned char a)
{
    return ((a * 0x0802LU & 0x22110LU) | (a * 0x8020LU & 0x88440LU)) * 0x10101LU >> 16;
}
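
// Explanatory note (not part of the patch): this is the classic multiply/mask
// byte reversal from the "Bit Twiddling Hacks" collection: the two multiplies
// fan copies of the 8 input bits across a wide intermediate, the masks keep
// exactly one copy of each bit at a mirrored position, and the final multiply
// by 0x10101 gathers the selected bits into bits 16..23 of the product, which
// the >> 16 extracts as the reversed byte.
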
__device__ __host__ void transpose8rS32_reversed_diagonale(unsigned char* A, int m, int n, unsigned char* B)
{
    unsigned x, y, t;

    // Load the array and pack it into x and y.
    x = (A[0] << 24) | (A[m] << 16) | (A[2 * m] << 8) | A[3 * m];
    y = (A[4 * m] << 24) | (A[5 * m] << 16) | (A[6 * m] << 8) | A[7 * m];

    t = (x ^ (x >> 7)) & 0x00AA00AA; x = x ^ t ^ (t << 7);
    t = (y ^ (y >> 7)) & 0x00AA00AA; y = y ^ t ^ (t << 7);

    t = (x ^ (x >> 14)) & 0x0000CCCC; x = x ^ t ^ (t << 14);
    t = (y ^ (y >> 14)) & 0x0000CCCC; y = y ^ t ^ (t << 14);

    t = (x & 0xF0F0F0F0) | ((y >> 4) & 0x0F0F0F0F);
    y = ((x << 4) & 0xF0F0F0F0) | (y & 0x0F0F0F0F);
    x = t;

    B[7 * n] = reverse_byte(x >> 24); B[6 * n] = reverse_byte(x >> 16); B[5 * n] = reverse_byte(x >> 8); B[4 * n] = reverse_byte(x);
    B[3 * n] = reverse_byte(y >> 24); B[2 * n] = reverse_byte(y >> 16); B[1 * n] = reverse_byte(y >> 8); B[0 * n] = reverse_byte(y);
}
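
// Explanatory note (not part of the patch): this is the transpose8rS32 routine
// from Hacker's Delight, which transposes an 8x8 bit matrix held in the two
// 32-bit words x and y using three swap stages (1-, 2- and 4-bit blocks).
// A is read with a stride of m bytes per row and B is written with a stride of
// n bytes per row (byte strides, i.e. lda/8 and ldb/8 at the call site). The
// "reversed diagonale" variant additionally reverses the output row order and
// bit-reverses each byte so the result matches the LSB-first bit numbering
// used by the get_bit/set_bit fallback path in the kernel below.
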
__global__ void transpose_bin_gpu_kernel(unsigned char *A, unsigned char *B, const int n, const int m,
    const int lda, const int ldb, const int block_size)
{
    int i;
    int index = blockIdx.x*blockDim.x + threadIdx.x;

    //for (i = 0; i < n; i += 8)
    {
        i = (index*8) % n;
        int j;
        //for (j = 0; j < m - 8; j += 8)
        {
            j = ((index * 8) / n) * 8;
            if (j < m - 8) {
                int a_index = i*lda + j;
                int b_index = j*ldb + i;
                //transpose_8x8_bits_my(&A[a_index/8], &B[b_index/8], lda/8, ldb/8);
                transpose8rS32_reversed_diagonale(&A[a_index / 8], lda / 8, ldb / 8, &B[b_index / 8]);
            }
            else if (j < m) {
                for (; j < m; ++j) {
                    if (get_bit(A, i*lda + j)) set_bit(B, j*ldb + i);
                }
            }
        }
    }
}
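
// Explanatory note (not part of the patch): each thread derives a source row i
// and an 8-aligned source column j from its linear index and transposes one
// 8x8 bit tile with the word-parallel routine above; columns past the last
// full 8-wide tile fall back to the bit-by-bit get_bit/set_bit path.
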
void transpose_bin_gpu(unsigned char *A, unsigned char *B, const int n, const int m,
    const int lda, const int ldb, const int block_size)
{
    size_t size = n*m/64 + 1;
    const int num_blocks = size / BLOCK + 1;
    transpose_bin_gpu_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(A, B, n, m, lda, ldb, block_size);
}
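
// Minimal usage sketch (illustrative only; buffer names are assumptions).
// For an n x m bit matrix packed by float_to_bit_gpu with a row pitch of lda
// bits, this produces the m x n transpose with a row pitch of ldb bits:
//
//   transpose_bin_gpu(A_bits_gpu, B_bits_gpu, n, m, lda, ldb, /*block_size*/ 8);
//
// lda and ldb appear to be expected to be multiples of 8, since they are
// divided by 8 inside the kernel to obtain byte strides; the block_size
// argument is accepted but not used by the kernel above.
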
// --------------------------------

__global__ void fill_int8_gpu_kernel(unsigned char *src, unsigned char val, size_t size) {
    int index = blockIdx.x*blockDim.x + threadIdx.x;
    if(index < size) src[index] = val;
}

void fill_int8_gpu(unsigned char *src, unsigned char val, size_t size) {
    const int num_blocks = size / BLOCK + 1;
    fill_int8_gpu_kernel<<<num_blocks, BLOCK, 0, get_cuda_stream()>>>(src, val, size);
}