Try to fuse conv_xnor+shortcut -> conv_xnor

pull/2383/head
AlexeyAB 6 years ago
parent 9e138adf09
commit 5448e07445
  1. include/darknet.h (2 changed lines)
  2. src/convolutional_kernels.cu (6 changed lines)
  3. src/im2col.h (3 changed lines)
  4. src/im2col_kernels.cu (43 changed lines)
  5. src/network.c (15 changed lines)
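In short: when an XNOR convolutional layer is immediately followed by a SHORTCUT layer whose input and output dimensions match, the binary GEMM kernels now receive two extra buffers and write the shortcut sum (shortcut input + convolution result) directly in their epilogue, and the shortcut layer itself is turned into a no-op. The same patch also switches the non-tensor GEMM kernel from xnor_int* to xor_int* bit operations and changes the leaky-activation comparison from > 0 to >= 0.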

include/darknet.h
@@ -457,6 +457,8 @@ struct layer {
     float *binary_input_gpu;
     float *binary_weights_gpu;
+    float *bin_conv_shortcut_in_gpu;
+    float *bin_conv_shortcut_out_gpu;
     float * mean_gpu;
     float * variance_gpu;
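The two new pointers are only assigned by the fusion pass in network.c further down: bin_conv_shortcut_in_gpu points at the output of the layer the shortcut adds from, and bin_conv_shortcut_out_gpu at the (now bypassed) shortcut layer's own output buffer. For every other convolutional layer they presumably remain at their zero-initialized default of NULL, which disables the extra write in the kernels.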

src/convolutional_kernels.cu
@@ -300,7 +300,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
     //start_timer();
     gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
         (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu,
-        new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY);
+        new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY,
+        l.bin_conv_shortcut_in_gpu, l.bin_conv_shortcut_out_gpu);
     //cudaDeviceSynchronize();
     //stop_timer_and_show_name("gemm_nn_custom_bin_mean_transposed_gpu");

@@ -370,7 +371,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
     //start_timer();
     gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
         (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu,
-        new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY);
+        new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY,
+        l.bin_conv_shortcut_in_gpu, l.bin_conv_shortcut_out_gpu);
     //cudaDeviceSynchronize();
     //stop_timer_and_show_name("gemm_nn_custom_bin_mean_transposed_gpu");
     //}

src/im2col.h
@@ -44,7 +44,8 @@ void fill_int8_gpu(unsigned char *src, unsigned char val, size_t size);
 void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
     unsigned char *A, int lda,
     unsigned char *B, int ldb,
-    float *C, int ldc, float *mean_arr, float *bias, int leaky_activation);
+    float *C, int ldc, float *mean_arr, float *bias, int leaky_activation,
+    float *shortcut_in_gpu, float *shortcut_out_gpu);

 // sequentially - BAD
 void gemm_nn_custom_bin_mean_transposed_sequentially_gpu(int M, int N, int K,
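Callers that do not use the fusion keep the old behaviour by passing NULL for the two new pointers; both kernels only dereference them inside if (shortcut_out_gpu). A minimal sketch of such a call, written as a hypothetical wrapper that is not part of the patch (assumes it is built inside src/ next to im2col.h):

    #include "im2col.h"
    #include <stddef.h>

    /* Hypothetical convenience wrapper: forwards NULL for the two new
     * shortcut pointers, which keeps the pre-patch behaviour. */
    static void gemm_nn_custom_bin_mean_transposed_gpu_plain(int M, int N, int K,
        unsigned char *A, int lda, unsigned char *B, int ldb,
        float *C, int ldc, float *mean_arr, float *bias, int leaky_activation)
    {
        gemm_nn_custom_bin_mean_transposed_gpu(M, N, K, A, lda, B, ldb,
            C, ldc, mean_arr, bias, leaky_activation,
            NULL, NULL /* shortcut_in_gpu, shortcut_out_gpu: no fusion */);
    }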

src/im2col_kernels.cu
@@ -1247,7 +1247,8 @@ int warpAllReduceSum(int val) {
 __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, int K,
     unsigned char *A, int lda,
     unsigned char *B, int ldb,
-    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation)
+    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation,
+    float *shortcut_in_gpu, float *shortcut_out_gpu)
 {
     // total 57%
     int index = blockIdx.x*blockDim.x + threadIdx.x;

@@ -1411,9 +1412,14 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, int K,
             float bias_val = bias_arr[i + i_d];
             float dst_val = count *mean_val + bias_val;
             if (leaky_activation)
-                dst_val = (dst_val > 0) ? (dst_val) : (0.1f*dst_val); // Leaky activation
-            C[(i + i_d)*ldc + (c_x*8 + j + j_d)] = dst_val;
+                dst_val = (dst_val >= 0) ? (dst_val) : (0.1f*dst_val); // Leaky activation
+            size_t out_index = (i + i_d)*ldc + (c_x * 8 + j + j_d);
+            C[out_index] = dst_val;
+            if (shortcut_out_gpu) {
+                shortcut_out_gpu[out_index] = shortcut_in_gpu[out_index] + dst_val;
+            }
         }
     }
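For context, the operation being folded into the epilogue above is the elementwise residual add that a standalone SHORTCUT layer performs when its input and output shapes match. A minimal standalone CUDA sketch of just that add (hypothetical kernel name and sizes, not the layer's actual forward pass):

    #include <cuda_runtime.h>
    #include <stdio.h>

    // Sketch of the fused operation: an elementwise residual add over the
    // convolution output. In the patch this runs inside the GEMM epilogue
    // whenever shortcut_out_gpu != NULL, instead of as a separate layer.
    __global__ void shortcut_add_sketch(const float *shortcut_in, const float *conv_out,
                                        float *shortcut_out, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) shortcut_out[i] = shortcut_in[i] + conv_out[i];
    }

    int main(void)
    {
        const int n = 1 << 20;            // e.g. out_w * out_h * out_c
        float *in, *conv, *out;
        cudaMallocManaged(&in, n * sizeof(float));
        cudaMallocManaged(&conv, n * sizeof(float));
        cudaMallocManaged(&out, n * sizeof(float));
        for (int i = 0; i < n; ++i) { in[i] = 1.0f; conv[i] = 0.5f; }

        shortcut_add_sketch<<<(n + 255) / 256, 256>>>(in, conv, out, n);
        cudaDeviceSynchronize();
        printf("out[0] = %f\n", out[0]);  // expect 1.5

        cudaFree(in); cudaFree(conv); cudaFree(out);
        return 0;
    }

Producing this sum per output element inside the GEMM epilogue avoids launching a separate kernel and an extra pass over the output buffer.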
@@ -1574,7 +1580,8 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, int K,
 __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
     unsigned char *A, int lda,
     unsigned char *B, int ldb,
-    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation)
+    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation,
+    float *shortcut_in_gpu, float *shortcut_out_gpu)
 {
     // total 57%
     int index = blockIdx.x*blockDim.x + threadIdx.x;
@@ -1628,7 +1635,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
     //ulonglong4 a_bit256 = *((ulonglong4 *)(A + A_i)); // weights
     ulonglong4 a_bit256 = *((ulonglong4 *)(A_s + A_i)); // weights
     ulonglong4 b_bit256 = *((ulonglong4 *)(B + B_i)); // input
-    c_bit256 = xnor_int256(a_bit256, b_bit256);
+    c_bit256 = xor_int256(a_bit256, b_bit256);
     int tmp_count = __popcll(c_bit256.w) + __popcll(c_bit256.x) +
         __popcll(c_bit256.y) + __popcll(c_bit256.z);

@@ -1661,7 +1668,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
     //uint64_t a_bit64 = *((uint64_t *)(A + A_i)); // weights
     uint64_t a_bit64 = *((uint64_t *)(A_s + A_i)); // weights
     uint64_t b_bit64 = *((uint64_t *)(B + B_i)); // input
-    c_bit64 = xnor_int64(a_bit64, b_bit64);
+    c_bit64 = xor_int64(a_bit64, b_bit64);
     int tmp_count = __popcll(c_bit64);
     int sum_count = warpAllReduceSum(tmp_count);

@@ -1691,7 +1698,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
     //uint64_t a_bit64 = *((uint64_t *)(A + A_i)); // weights
     uint32_t a_bit32 = *((uint32_t *)(A_s + A_i)); // weights
     uint32_t b_bit32 = *((uint32_t *)(B + B_i)); // input
-    uint32_t c_bit32 = xnor_int32(a_bit32, b_bit32);
+    uint32_t c_bit32 = xor_int32(a_bit32, b_bit32);
     int tmp_count = __popc(c_bit32);
     int sum_count = warpAllReduceSum(tmp_count);

@@ -1712,7 +1719,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
     //ulonglong4 a_bit256 = *((ulonglong4 *)(A + (i*lda + k) / 8)); // weights
     ulonglong4 a_bit256 = *((ulonglong4 *)(A_s + (local_i*lda + k) / 8)); // weights
     ulonglong4 b_bit256 = *((ulonglong4 *)(B + (j*ldb + k) / 8)); // input
-    ulonglong4 c_bit256 = xnor_int256(a_bit256, b_bit256);
+    ulonglong4 c_bit256 = xor_int256(a_bit256, b_bit256);
     count += __popcll(c_bit256.w) + __popcll(c_bit256.x) +
         __popcll(c_bit256.y) + __popcll(c_bit256.z);

@@ -1724,7 +1731,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
     //uint64_t a_bit64 = *((uint64_t *)(A + (i*lda + k) / 8)); // weights
     uint64_t a_bit64 = *((uint64_t *)(A_s + (local_i*lda + k) / 8)); // weights
     uint64_t b_bit64 = *((uint64_t *)(B + (j*ldb + k) / 8)); // input
-    uint64_t c_bit64 = xnor_int64(a_bit64, b_bit64);
+    uint64_t c_bit64 = xor_int64(a_bit64, b_bit64);
     count += __popcll(c_bit64);
 }
@@ -1735,8 +1742,13 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
     count = count - f1;    // remove extra bits (from empty space for align only)
     float dst_val = (2 * count - K) *mean_val + bias_val;
     if(leaky_activation)
-        dst_val = (dst_val > 0) ? (dst_val) : (0.1f*dst_val); // Leaky activation
-    C[i*ldc + j] = dst_val;
+        dst_val = (dst_val >= 0) ? (dst_val) : (0.1f*dst_val); // Leaky activation
+    size_t out_index = i*ldc + j;
+    C[out_index] = dst_val;
+    if (shortcut_out_gpu) {
+        shortcut_out_gpu[out_index] = shortcut_in_gpu[out_index] + dst_val;
+    }
 }
 }
 }
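For reference, the arithmetic behind (2 * count - K) * mean_val: with weights and inputs packed as sign bits (bit 1 encoding +1, bit 0 encoding -1) over K positions, popcount(a XNOR b) counts matching positions, popcount(a XOR b) counts mismatches, and the two are related by matches = K - mismatches, so the ±1 dot product is 2*matches - K = K - 2*mismatches. A small host-side check of that identity (plain C sketch with hypothetical values; __builtin_popcountll is the GCC/Clang builtin):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t a = 0xF0F0F0F0F0F0F0F0ULL;  /* packed weights */
        uint64_t b = 0xFF00FF00FF00FF00ULL;  /* packed inputs  */
        const int K = 64;                    /* valid bit count */

        int mismatches = __builtin_popcountll(a ^ b);   /* popcount(XOR)  */
        int matches    = K - mismatches;                /* popcount(XNOR) */

        /* Both forms give the same +/-1 dot product. */
        printf("%d == %d\n", 2 * matches - K, K - 2 * mismatches);
        return 0;
    }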
@@ -1762,7 +1774,8 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
 void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
     unsigned char *A, int lda,
     unsigned char *B, int ldb,
-    float *C, int ldc, float *mean_arr, float *bias, int leaky_activation)
+    float *C, int ldc, float *mean_arr, float *bias, int leaky_activation,
+    float *shortcut_in_gpu, float *shortcut_out_gpu)
 {
     int size = M*N;
     const int num_blocks = get_number_of_blocks(size, BLOCK);

@@ -1794,7 +1807,8 @@ void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
         A, lda,
         B, ldb,
         C, ldc,
-        mean_arr, bias, leaky_activation);
+        mean_arr, bias, leaky_activation,
+        shortcut_in_gpu, shortcut_out_gpu);
     //cudaDeviceSynchronize();
     //getchar();

@@ -1807,7 +1821,8 @@ void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
         A, lda,
         B, ldb,
         C, ldc,
-        mean_arr, bias, leaky_activation);
+        mean_arr, bias, leaky_activation,
+        shortcut_in_gpu, shortcut_out_gpu);
     }
     CHECK_CUDA(cudaPeekAtLastError());
 }

src/network.c
@@ -975,7 +975,7 @@ void fuse_conv_batchnorm(network net)
     }
 }

+void forward_blank_layer(layer l, network_state state) {}

 void calculate_binary_weights(network net)
 {

@@ -996,6 +996,19 @@ void calculate_binary_weights(network net)
                 if (net.layers[j].use_bin_output) {
                     l->activation = LINEAR;
                 }
+
+                // fuse conv_xnor + shortcut -> conv_xnor
+                if ((j + 1) < net.n && net.layers[j].type == CONVOLUTIONAL) {
+                    layer *sc = &net.layers[j + 1];
+                    if (sc->type == SHORTCUT && sc->w == sc->out_w && sc->h == sc->out_h && sc->c == sc->out_c)
+                    {
+                        l->bin_conv_shortcut_in_gpu = net.layers[net.layers[j + 1].index].output_gpu;
+                        l->bin_conv_shortcut_out_gpu = net.layers[j + 1].output_gpu;
+
+                        net.layers[j + 1].type = BLANK;
+                        net.layers[j + 1].forward_gpu = forward_blank_layer;
+                    }
+                }
             }
         }
     }
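Wiring notes on the hunk above: net.layers[j + 1].index is the layer the shortcut adds from, so its output_gpu becomes the fused kernel's shortcut_in_gpu, while the shortcut layer's own output_gpu becomes shortcut_out_gpu; because the fused kernel still writes into that same buffer, the layer that follows the shortcut keeps reading its usual input. The dimension check (w/h/c equal to out_w/out_h/out_c) matters because the fused epilogue indexes the shortcut buffers with the convolution's own output index, which is only valid for a pure elementwise add. Setting the type to BLANK and the forward to forward_blank_layer leaves the layer in place, so layer indices elsewhere in the network are unchanged, but makes its GPU forward pass a no-op.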
