XNOR: coalesced memory access and bank-conflict avoidance

commit 03e95320a1 (pull/1724/head)
AlexeyAB, 7 years ago
parent 8e0b265067
  1. src/convolutional_kernels.cu (32 changed lines)
  2. src/im2col.h (7 changed lines)
  3. src/im2col_kernels.cu (469 changed lines)
  4. src/layer.h (1 changed line)
  5. src/network.c (5 changed lines)
  6. src/parser.c (1 changed line)
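Background for the kernels changed below: in the XNOR path both the weights and the im2col'ed input are binarized and packed into bit words, so each K-element dot product becomes XNOR plus popcount, rescaled as (2*count - K) and multiplied by the per-filter mean. A minimal device-side sketch of the idea (illustrative only, not code from this commit; it assumes K is a multiple of 64):

    // XNOR dot product of two K-bit rows packed into 64-bit words.
    // count = positions where the +/-1 values agree; the +/-1 dot product is 2*count - K.
    __device__ float xnor_dot(const uint64_t *a_bits, const uint64_t *b_bits, int K, float mean)
    {
        int count = 0;
        for (int k = 0; k < K; k += 64) {
            uint64_t c = ~(a_bits[k / 64] ^ b_bits[k / 64]);   // XNOR: bit = 1 where inputs match
            count += __popcll(c);                              // count matching positions
        }
        return (2 * count - K) * mean;                         // rescale to the +/-1 dot product
    }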

@@ -117,13 +117,16 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
    }
    if(l.xnor){
        if (!l.align_bit_weights_gpu || state.train) {
            binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu);
+           swap_binary(&l);
+           binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
+           state.input = l.binary_input_gpu;
        }
-       swap_binary(&l);
-       binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
-       state.input = l.binary_input_gpu;
+       //swap_binary(&l);
+       //binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
+       //state.input = l.binary_input_gpu;
        //cudaDeviceSynchronize();
        if (l.align_bit_weights_gpu && !state.train)
@@ -141,6 +144,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
            size_t t_intput_size = new_ldb * n;
            size_t t_bit_input_size = t_intput_size / 8;// +1;
+           //if(0)
            {
                int i = 0;
                im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, l.align_workspace_gpu, l.bit_align);
@@ -156,10 +160,18 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
                //cudaDeviceSynchronize();
                // should be optimized
-               gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
-                   (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu, new_ldb, l.output_gpu, n, l.mean_arr_gpu);
+               //if(0) {//if (k > 1000) { // sequentially input-shared - BAD
+               //    gemm_nn_custom_bin_mean_transposed_sequentially_gpu(m, n, k,
+               //        (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu, new_ldb, l.output_gpu, n, l.mean_arr_gpu);
+               //}
+               //else { // coalescing & weights-shared-memory - GOOD
+               gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
+                   (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu,
+                   new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu);
+               //}
                //cudaDeviceSynchronize();
                //check_error(status);
+               //getchar();
            }
@@ -172,12 +184,14 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
            //cudaDeviceSynchronize();
            //check_error(status);
+           add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
        }
    */
-   add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
-   activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
-   if (l.binary || l.xnor) swap_binary(&l);
+   //add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
+   if(l.activation != LINEAR) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
+   //if (l.binary || l.xnor) swap_binary(&l);
    //cudaDeviceSynchronize();
    return;
}
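Two effects of the hunk above: the bias is now passed into gemm_nn_custom_bin_mean_transposed_gpu() and added inside the kernel (the standalone add_bias_gpu() call is commented out), and the activation is skipped entirely when it is LINEAR. A sketch of the fused epilogue (illustrative names, not a literal quote from the kernel):

    // bias is applied per output element inside the GEMM kernel,
    // instead of a separate add_bias_gpu() pass over l.output_gpu afterwards
    __device__ void store_output(float *C, int ldc, int i, int j,
                                 int count, int K, float mean, float bias)
    {
        C[i*ldc + j] = (2 * count - K) * mean + bias;
    }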

@@ -24,7 +24,14 @@ void transpose_bin_gpu(unsigned char *A, unsigned char *B, const int n, const in
void fill_int8_gpu(unsigned char *src, unsigned char val, size_t size);
+// shared_memory + partial coalescing = GOOD
void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
+    unsigned char *A, int lda,
+    unsigned char *B, int ldb,
+    float *C, int ldc, float *mean_arr, float *bias);
+// sequentially - BAD
+void gemm_nn_custom_bin_mean_transposed_sequentially_gpu(int M, int N, int K,
    unsigned char *A, int lda,
    unsigned char *B, int ldb,
    float *C, int ldc, float *mean_arr);
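These two prototypes correspond to the two kernels added in im2col_kernels.cu: the first ("GOOD") keeps the bit-packed weights in shared memory and has each warp read 32 consecutive 64-bit words per step, so global loads coalesce; the second ("BAD") dedicates a whole warp to every output element. A generic sketch of the warp-wide coalesced load used by the first kernel (illustrative; it uses the CUDA 9+ _sync intrinsic, the commit itself uses the older __shfl form):

    // Lane t's base byte offset is broadcast to the whole warp; every lane then reads the
    // next consecutive 64-bit word, i.e. one contiguous 256-byte transaction per warp step.
    __device__ uint64_t warp_coalesced_load(const unsigned char *buf, long long base, int t)
    {
        const int lane = threadIdx.x % 32;
        long long idx = __shfl_sync(0xffffffffu, base, t) + 8 * lane;
        return *(const uint64_t *)(buf + idx);
    }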

@@ -8,6 +8,10 @@ extern "C" {
#include "cuda.h"
}
+#include <stdio.h>
+#include <assert.h>
+#include <cuda.h>
// src: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu
// You may also want to read: https://github.com/BVLC/caffe/blob/master/LICENSE
@@ -105,6 +109,7 @@ __global__ void im2col_align_gpu_kernel(const int n, const float* data_im,
            //float src_val = (h >= 0 && w >= 0 && h < height && w < width) ? data_im_ptr[i * width + j] : 0;
            //unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0);
            //if (threadIdx.x % WARP_SIZE == 0) *((unsigned int*)data_col_ptr_32) = bit_mask;
+           // use atomicOr()  // *dst_ptr |= (mask << (col_index % 8));
            //data_col_ptr_32 += bit_align / 32;
            //data_col_ptr += height_col * width_col;
@@ -283,6 +288,10 @@ __device__ __host__ static inline uint8_t xnor_bit1(uint8_t a, uint8_t b) {
    return ~(a^b) & 0b1;
}
+__device__ __host__ static inline uint32_t xnor_int32(uint32_t a, uint32_t b) {
+    return ~(a^b);
+}
__device__ __host__ static inline uint64_t xnor_int64(uint64_t a, uint64_t b) {
    return ~(a^b);
}
@@ -356,7 +365,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
*/
+/*
// B (input) in the shared_memory
__global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
    unsigned char *A, int lda,
@@ -367,25 +376,27 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
    __shared__ uint64_t B_s[4096];  // 32 KB // [ldb x N`] // max = 262 144 bits
    int start_j = blockIdx.x*blockDim.x / M;
-   int end_j = (blockIdx.x*blockDim.x + blockDim.x) / M + 1;
+   {
+       int end_j = (blockIdx.x*blockDim.x + blockDim.x) / M + 1;
        size_t shared_size = ldb * (end_j - start_j);
        //float tmp_shared_size = ldb * (blockDim.x / M);
        //int passes = (4096 * 64) / tmp_shared_size - 1;
        //size_t shared_size = tmp_shared_size * passes;
        int k;
        for (int k = threadIdx.x * 256; k < shared_size; k += blockDim.x * 256) {
            int x = start_j*ldb + k;
            if (x < (N*ldb)) *((ulonglong4 *)(B_s + k / 8)) = *((ulonglong4 *)(B + x / 8));
        }
        ////if (j_cur < N && (index % M == 0 || threadIdx.x == 0)) {
        ////    for (int k = 0; k < K; k += 64) {   // l.size*l.size*l.c - one filter size [27 - 9216]
        ////        *((uint64_t *)(B_s + (local_j*ldb + k) / 8)) = *((uint64_t *)(B + (j_cur*ldb + k) / 8));    // input
+       ////    }
        ////}
-       ////}
+   }
    __syncthreads();
    int index = blockIdx.x*blockDim.x + threadIdx.x;
@@ -427,14 +438,19 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
                }
                int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
-               count = count - f1;    // remove extra bits (from empty space for align only)
+               //C[i*ldc + j] += 2 * count*mean_val;
+               //C[i*ldc + j] += -2 * f1*mean_val;
+               //C[i*ldc + j] += - K*mean_val;
+               count = count - f1;    // remove extra bits (from empty space for align only)
                C[i*ldc + j] = (2 * count - K) * mean_val;
                //B_s[0] = (2 * count - K) * mean_val;
            }
        }
    }
}
+*/
/*
// A (weights) in the shared_memory
@@ -497,13 +513,293 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
    }
}
*/
-#include <cstdio>
+__inline__ __device__
int warpAllReduceSum(int val) {
for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2)
val += __shfl_xor(val, mask);
return val;
}
// Coalescing
// A (weights) in the shared_memory - GOOD
__global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
unsigned char *A, int lda,
unsigned char *B, int ldb,
float *C, int ldc, float *mean_arr, float *bias_arr)
{
int index = blockIdx.x*blockDim.x + threadIdx.x;
__shared__ uint8_t A_s[6144*8/4];
//__shared__ uint64_t A_s[6144]; // 48 KB // [lda x M`]
//__shared__ uint8_t A_s[6144*8]; // 48 KB // [lda x M`]
int start_i = blockIdx.x*blockDim.x / N;
int end_i = (blockIdx.x*blockDim.x + blockDim.x) / N + 1;
size_t shared_size = lda * (end_i - start_i);
int i_cur = index / N;
int local_i = i_cur - start_i;
for (int k = threadIdx.x * 64; k < shared_size; k += blockDim.x * 64) {
int x = start_i*lda + k;
if (x < (M*lda)) *((uint64_t *)(A_s + k / 8)) = *((uint64_t *)(A + x / 8));
}
__syncthreads();
int i, j, k, h;
j = index % N;
{ // out_h*out_w - one channel output size [169 - 173056]
i = index / N;
//if (i < M) // l.n - filters [16 - 55 - 1024]
{
int count = 0;
k = 0;
//#ifdef NON_USED
// 32 thread X 64 bit = 2048 bit
for (; k < (K - 2048); k += 2048) { // l.size*l.size*l.c - one filter size [27 - 9216]
uint64_t c_bit64;
//int64_t A_cur_index = (i*lda + k) / 8;
int64_t A_cur_index = (local_i*lda + k) / 8;
int64_t B_cur_index = (j*ldb + k) / 8;
if (i >= M) A_cur_index = 0;
#pragma unroll WARP_SIZE
for (int t = 0; t < WARP_SIZE; ++t) {
const int lane_id = threadIdx.x % WARP_SIZE;
const int64_t A_i = __shfl(A_cur_index, t) + 8 * lane_id;
const int64_t B_i = __shfl(B_cur_index, t) + 8 * lane_id;
{
//uint64_t a_bit64 = *((uint64_t *)(A + A_i)); // weights
uint64_t a_bit64 = *((uint64_t *)(A_s + A_i)); // weights
uint64_t b_bit64 = *((uint64_t *)(B + B_i)); // input
c_bit64 = xnor_int64(a_bit64, b_bit64);
int tmp_count = __popcll(c_bit64);
int sum_count = warpAllReduceSum(tmp_count);
if (lane_id == t) count += sum_count;
}
}
}
//#endif
//#ifdef NON_USED
// 32 thread X 32 bit = 1024 bit
for (; k < (K - 1024); k += 1024) { // l.size*l.size*l.c - one filter size [27 - 9216]
//int64_t A_cur_index = (i*lda + k) / 8;
int64_t A_cur_index = (local_i*lda + k) / 8;
int64_t B_cur_index = (j*ldb + k) / 8;
if (i >= M) A_cur_index = 0;
#pragma unroll WARP_SIZE
for (int t = 0; t < WARP_SIZE; ++t) {
const int lane_id = threadIdx.x % WARP_SIZE;
const int64_t A_i = __shfl(A_cur_index, t) + 4 * lane_id;
const int64_t B_i = __shfl(B_cur_index, t) + 4 * lane_id;
{
//uint64_t a_bit64 = *((uint64_t *)(A + A_i)); // weights
uint32_t a_bit32 = *((uint32_t *)(A_s + A_i)); // weights
uint32_t b_bit32 = *((uint32_t *)(B + B_i)); // input
uint32_t c_bit32 = xnor_int32(a_bit32, b_bit32);
int tmp_count = __popc(c_bit32);
int sum_count = warpAllReduceSum(tmp_count);
if (lane_id == t) count += sum_count;
}
}
}
//#endif
if (i < M)
{
float mean_val = mean_arr[i];
float bias_val = bias_arr[i];
//#ifdef NON_USED
for (; k < K; k += 256) { // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
//ulonglong4 a_bit256 = *((ulonglong4 *)(A + (i*lda + k) / 8)); // weights
ulonglong4 a_bit256 = *((ulonglong4 *)(A_s + (local_i*lda + k) / 8)); // weights
ulonglong4 b_bit256 = *((ulonglong4 *)(B + (j*ldb + k) / 8)); // input
ulonglong4 c_bit256 = xnor_int256(a_bit256, b_bit256);
count += __popcll(c_bit256.w) + __popcll(c_bit256.x) +
__popcll(c_bit256.y) + __popcll(c_bit256.z);
}
//#endif
#ifdef NON_USED
for (; k < K; k += 64) { // l.size*l.size*l.c - one filter size [27 - 9216]
//uint64_t a_bit64 = *((uint64_t *)(A + (i*lda + k) / 8)); // weights
uint64_t a_bit64 = *((uint64_t *)(A_s + (local_i*lda + k) / 8)); // weights
uint64_t b_bit64 = *((uint64_t *)(B + (j*ldb + k) / 8)); // input
uint64_t c_bit64 = xnor_int64(a_bit64, b_bit64);
count += __popcll(c_bit64);
}
#endif
const int bit_step = 256;
int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
count = count - f1; // remove extra bits (from empty space for align only)
C[i*ldc + j] = (2 * count - K) *mean_val + bias_val;
}
}
}
}
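In the unrolled warp loops above, each lane popcounts its own 64-bit (or 32-bit) XNOR word and warpAllReduceSum() folds the 32 partial counts together with a butterfly of __shfl_xor exchanges (masks 16, 8, 4, 2, 1); afterwards every lane holds the warp-wide sum, and only lane t accumulates it into its own output. An equivalent standalone sketch using the CUDA 9+ _sync intrinsic (the commit uses the pre-9.0 __shfl_xor form):

    __inline__ __device__ int warp_all_reduce_sum_sync(int val)
    {
        // after log2(32) = 5 exchange steps every lane holds the sum over all 32 lanes
        for (int mask = 32 / 2; mask > 0; mask /= 2)
            val += __shfl_xor_sync(0xffffffffu, val, mask);
        return val;
    }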
/*
// Coalescing
// B (input) in the shared_memory - GOOD
__global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
unsigned char *A, int lda,
unsigned char *B, int ldb,
float *C, int ldc, float *mean_arr, float *bias_arr)
{
int index = blockIdx.x*blockDim.x + threadIdx.x;
__shared__ uint8_t B_s[4096*8]; // 32 KB // [ldb x N`] // max = 262 144 bits
//__shared__ uint64_t B_s[4096]; // 32 KB // [ldb x N`] // max = 262 144 bits
int start_j = blockIdx.x*blockDim.x / M;
int end_j = (blockIdx.x*blockDim.x + blockDim.x) / M + 1;
size_t shared_size = ldb * (end_j - start_j);
int j_cur = index / M;
int local_j = j_cur - start_j;
for (int k = threadIdx.x * 256; k < shared_size; k += blockDim.x * 256) {
int x = start_j*ldb + k;
if (x < (N*ldb)) *((ulonglong4 *)(B_s + k / 8)) = *((ulonglong4 *)(B + x / 8));
}
__syncthreads();
int i, j, k;
i = index % M; // l.n - filters [16 - 55 - 1024]
{
j = index / M; // out_h*out_w - one channel output size [169 - 173056]
if (j < N)
{
int count = 0;
k = 0;
//#ifdef NON_USED
// 32 thread X 64 bit = 2048 bit
for (; k < (K - 2048); k += 2048) { // l.size*l.size*l.c - one filter size [27 - 9216]
uint64_t c_bit64;
int64_t A_cur_index = (i*lda + k) / 8;
//int64_t B_cur_index = (j*ldb + k) / 8;
int64_t B_cur_index = (local_j*ldb + k) / 8;
if (i >= M) A_cur_index = 0;
#pragma unroll WARP_SIZE
for (int t = 0; t < WARP_SIZE; ++t) {
const int lane_id = threadIdx.x % WARP_SIZE;
const int64_t A_i = __shfl(A_cur_index, t) + 8 * lane_id;
const int64_t B_i = __shfl(B_cur_index, t) + 8 * lane_id;
{
uint64_t a_bit64 = *((uint64_t *)(A + A_i)); // weights
//uint64_t b_bit64 = *((uint64_t *)(B + B_i)); // input
uint64_t b_bit64 = *((uint64_t *)(B_s + B_i)); // input
c_bit64 = xnor_int64(a_bit64, b_bit64);
int tmp_count = __popcll(c_bit64);
int sum_count = warpAllReduceSum(tmp_count);
if (lane_id == t) count += sum_count;
}
}
}
//#endif
//#ifdef NON_USED
// 32 thread X 32 bit = 1024 bit
for (; k < (K - 1024); k += 1024) { // l.size*l.size*l.c - one filter size [27 - 9216]
int64_t A_cur_index = (i*lda + k) / 8;
//int64_t B_cur_index = (j*ldb + k) / 8;
int64_t B_cur_index = (local_j*ldb + k) / 8;
if (i >= M) A_cur_index = 0;
#pragma unroll WARP_SIZE
for (int t = 0; t < WARP_SIZE; ++t) {
const int lane_id = threadIdx.x % WARP_SIZE;
const int64_t A_i = __shfl(A_cur_index, t) + 4 * lane_id;
const int64_t B_i = __shfl(B_cur_index, t) + 4 * lane_id;
{
uint32_t a_bit32 = *((uint32_t *)(A + A_i)); // weights
//uint32_t b_bit32 = *((uint32_t *)(B + B_i)); // input
uint32_t b_bit32 = *((uint32_t *)(B_s + B_i)); // input
uint32_t c_bit32 = xnor_int32(a_bit32, b_bit32);
int tmp_count = __popc(c_bit32);
int sum_count = warpAllReduceSum(tmp_count);
if (lane_id == t) count += sum_count;
}
}
}
//#endif
if (i < M)
{
float mean_val = mean_arr[i];
float bias_val = bias_arr[i];
//#ifdef NON_USED
for (; k < K; k += 256) { // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
ulonglong4 a_bit256 = *((ulonglong4 *)(A + (i*lda + k) / 8)); // weights
//ulonglong4 b_bit256 = *((ulonglong4 *)(B + (j*ldb + k) / 8)); // input
ulonglong4 b_bit256 = *((ulonglong4 *)(B_s + (local_j*ldb + k) / 8)); // input
ulonglong4 c_bit256 = xnor_int256(a_bit256, b_bit256);
count += __popcll(c_bit256.w) + __popcll(c_bit256.x) +
__popcll(c_bit256.y) + __popcll(c_bit256.z);
}
//#endif
#ifdef NON_USED
for (; k < K; k += 64) { // l.size*l.size*l.c - one filter size [27 - 9216]
uint64_t a_bit64 = *((uint64_t *)(A + (i*lda + k) / 8)); // weights
//uint64_t b_bit64 = *((uint64_t *)(B + (j*ldb + k) / 8)); // input
uint64_t b_bit64 = *((uint64_t *)(B_s + (local_j*ldb + k) / 8)); // input
uint64_t c_bit64 = xnor_int64(a_bit64, b_bit64);
count += __popcll(c_bit64);
}
#endif
const int bit_step = 256;
int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
count = count - f1; // remove extra bits (from empty space for align only)
C[i*ldc + j] = (2 * count - K) * mean_val + bias_val;
}
}
}
}
*/
// GOOD
void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
    unsigned char *A, int lda,
    unsigned char *B, int ldb,
-   float *C, int ldc, float *mean_arr)
+   float *C, int ldc, float *mean_arr, float *bias)
{
    size_t size = M*N;
    const int num_blocks = size / BLOCK + 1;
@@ -516,6 +812,143 @@ void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
    //printf(" shared_memory: (w) lda*BLOCK/N = %d, (i) ldb*BLOCK/M = %d, \t lda = %d \n\n", lda*BLOCK / N, ldb*BLOCK / M, lda);
    gemm_nn_custom_bin_mean_transposed_gpu_kernel<<<num_blocks, BLOCK, 0, get_cuda_stream() >>>(
M, N, K,
A, lda,
B, ldb,
C, ldc,
mean_arr, bias);
}
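For reference, the call site added in convolutional_kernels.cu (first hunk of this commit) maps the GEMM dimensions roughly as annotated below; the lda/ldb pitch is K padded up to a multiple of 256 bits, which is also why the kernels subtract the f1 padding bits before rescaling:

    // m = l.n                  filters                -> rows of C
    // n = l.out_h*l.out_w      output pixels          -> columns of C
    // k = l.size*l.size*l.c    bits per dot product; new_ldb = k rounded up to 256 bits
    // gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
    //     (unsigned char *)l.align_bit_weights_gpu, new_ldb,
    //     (unsigned char *)l.transposed_align_workspace_gpu, new_ldb,
    //     l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu);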
// --------------------------------
// --------------------------------
// sequentially - B (input) in the shared_memory - BAD
// --------------------------------
__global__ void gemm_nn_custom_bin_mean_transposed_sequentially_gpu_kernel(int M, int N, int K,
unsigned char *A, int lda,
unsigned char *B, int ldb,
float *C, int ldc, float *mean_arr)
{
//__shared__ float mean_shared[32];
//__shared__ uint32_t B_s[8192]; // 32 KB // [ldb x N`] // max = 262 144 bits
//__shared__ uint32_t B_s[4096]; // 16 KB // [ldb x N`] // max = 131 072 bits
__shared__ uint8_t B_s[4096*4]; // 16 KB // [ldb x N`] // max = 131 072 bits
const int K_items = WARP_SIZE;
int start_j = blockIdx.x*blockDim.x / (K_items * M);
{
int end_j = (blockIdx.x*blockDim.x + blockDim.x) / (K_items * M) + 1;
if (end_j > N) end_j = N;
size_t shared_size = ldb * (end_j - start_j);
if (shared_size != 0) {
//if(threadIdx.x == 0) printf(" start_j = %d, end_j = %d, shared_size = %d \n", start_j, end_j, shared_size);
int k;
for (int k = threadIdx.x * 32; k < shared_size; k += blockDim.x * 32) {
int x = start_j*ldb + k;
if (x < (N*ldb)) *((uint32_t *)(B_s + k / 8)) = *((uint32_t *)(B + x / 8));
}
}
}
__syncthreads();
int index = blockIdx.x*blockDim.x + threadIdx.x;
{
int i; // l.n
int j; // out_h*out_w
int k; // l.size * l.size * l.c
const int index2 = index / K_items;
i = index2 % M; // max M
j = index2 / M; // max N
//j = index2 % N; // max N
//i = index2 / N; // max M
//int j_cur = index / M;
//int local_j = j_cur - start_j;
int local_j = j - start_j;
//if (i <= 1 && j <= 1 ) printf(" k = %d, K = %d, K_items = %d, i = %d, j = %d, lda = %d, ldb = %d, ldc = %d \n",
// k, K, K_items, i, j, lda, ldb, ldc);
{ // l.n - filters [16 - 55 - 1024]
// further improvements: for (l.n == 1024) iterate several (j)
if (j < N)
{ // out_h*out_w - one channel output size [169 - 173056]
int count = 0;
const int bit_step = 32;
for (k = (threadIdx.x % WARP_SIZE) * bit_step; k < K; k += bit_step*WARP_SIZE)
{ // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
uint32_t a_bit32 = *((uint32_t *)(A + (i*lda + k) / 8)); // weights
//uint32_t b_bit32 = *((uint32_t *)(B + (j*ldb + k) / 8)); // input
uint32_t b_bit32 = *((uint32_t *)(B_s + (local_j*ldb + k) / 8)); // input
uint32_t c_bit32 = xnor_int32(a_bit32, b_bit32);
count += __popc(c_bit32);
}
/*
const int bit_step = 64;
for (k = (threadIdx.x % WARP_SIZE) * bit_step; k < K; k += bit_step*WARP_SIZE)
{ // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
uint64_t a_bit64 = *((uint64_t *)(A + (i*lda + k) / 8)); // weights
//uint64_t b_bit64 = *((uint64_t *)(B + (j*ldb + k) / 8));
uint64_t b_bit64 = *((uint64_t *)(B_s + (local_j*ldb + k) / 8)); // input
uint64_t c_bit64 = xnor_int64(a_bit64, b_bit64);
count += __popcll(c_bit64);
}
*/
//atomicAdd(&C[i*ldc + j], (2 * count) * mean_val);
for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
count += __shfl_down(count, offset);
if (threadIdx.x % WARP_SIZE == 0) {
int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
count = count - f1;
float mean_val = mean_arr[i];
C[i*ldc + j] = (2 * count - K) * mean_val;
//B_s[threadIdx.x / WARP_SIZE] = (2 * count - K) * mean_val;
}
}
}
}
}
// sequentially - BAD
void gemm_nn_custom_bin_mean_transposed_sequentially_gpu(int M, int N, int K,
unsigned char *A, int lda,
unsigned char *B, int ldb,
float *C, int ldc, float *mean_arr)
{
//size_t size = M*N;
size_t size = M*N * 32;
const int num_blocks = size / BLOCK + 1;
//printf(" K = %d \n", K);
/*
printf("\n gemm_bin size = %d, num_blocks = %d, M*K = %d KB, N*K = %d KB \n (w) M*K/num_blocks = %d KB, (i) N*K/num_blocks = %d KB \n",
size, num_blocks, M*K / 1024, N*K / 1024, M*lda / num_blocks / 1024, N*ldb / num_blocks / 1024);
printf(" M / 512 = %d, N / 512 = %d, M*lda / 512 = %d, N*ldb / 512 = %d \n", M / 512, N / 512, M*lda/512, N*ldb/512);
*/
//printf(" shared_memory: (w) lda*BLOCK/N = %d, (i) ldb*BLOCK/M = %d, \t lda = %d \n\n", lda*BLOCK / N, ldb*BLOCK / M, lda);
gemm_nn_custom_bin_mean_transposed_sequentially_gpu_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(
        M, N, K,
        A, lda,
        B, ldb,
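The sequential wrapper launches M*N*32 threads because every output element is produced by a whole warp: K_items = WARP_SIZE lanes each scan a 32-bit slice of K and the partial popcounts are merged with __shfl_down. The index decomposition amounts to (illustrative helper, not part of the commit):

    // one warp per C element; the lane id picks the 32-bit slice of K this thread scans
    __device__ void sequential_map(int index, int M, int *i, int *j, int *lane)
    {
        *lane = index % 32;        // K-slice owned by this thread
        int index2 = index / 32;   // which output element
        *i = index2 % M;           // filter (row of C)
        *j = index2 / M;           // output pixel (column of C)
    }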

@@ -89,6 +89,7 @@ struct layer{
    int index;
    int binary;
    int xnor;
+   int use_bin_output;
    int steps;
    int hidden;
    float dot;

@@ -862,8 +862,13 @@ void calculate_binary_weights(network net)
        if (l->xnor) {
            //printf("\n %d \n", j);
            l->lda_align = 256; // 256bit for AVX2
+           //if (l->size*l->size*l->c >= 2048) l->lda_align = 512;
            binary_align_weights(l);
+           if(net.layers[j + 1].use_bin_output) {
+               l->activation = LINEAR;
+           }
        }
    }
}

@@ -167,6 +167,7 @@ convolutional_layer parse_convolutional(list *options, size_params params)
    convolutional_layer layer = make_convolutional_layer(batch,h,w,c,n,size,stride,padding,activation, batch_normalize, binary, xnor, params.net.adam);
    layer.flipped = option_find_int_quiet(options, "flipped", 0);
    layer.dot = option_find_float_quiet(options, "dot", 0);
+   layer.use_bin_output = option_find_int_quiet(options, "bin_output", 0);
    if(params.net.adam){
        layer.B1 = params.net.B1;
        layer.B2 = params.net.B2;
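The new bin_output key is read per [convolutional] section; together with the network.c change above, an xnor layer whose following layer sets bin_output=1 has its activation forced to LINEAR so its output can stay on the binarized path. A hypothetical cfg fragment (illustrative; only xnor and bin_output matter for this commit):

    [convolutional]
    filters=64
    size=3
    stride=1
    pad=1
    activation=leaky
    xnor=1
    bin_output=1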
