diff --git a/include/darknet.h b/include/darknet.h
index 6675599c..1478b247 100644
--- a/include/darknet.h
+++ b/include/darknet.h
@@ -457,6 +457,8 @@ struct layer {
     float *binary_input_gpu;
     float *binary_weights_gpu;
+    float *bin_conv_shortcut_in_gpu;
+    float *bin_conv_shortcut_out_gpu;
 
     float * mean_gpu;
     float * variance_gpu;
diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu
index be90728f..7c26f42e 100644
--- a/src/convolutional_kernels.cu
+++ b/src/convolutional_kernels.cu
@@ -300,7 +300,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
             //start_timer();
             gemm_nn_custom_bin_mean_transposed_gpu(m, n, k, (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu,
-                new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY);
+                new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY,
+                l.bin_conv_shortcut_in_gpu, l.bin_conv_shortcut_out_gpu);
             //cudaDeviceSynchronize();
             //stop_timer_and_show_name("gemm_nn_custom_bin_mean_transposed_gpu");
@@ -370,7 +371,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
             //start_timer();
             gemm_nn_custom_bin_mean_transposed_gpu(m, n, k, (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu,
-                new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY);
+                new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY,
+                l.bin_conv_shortcut_in_gpu, l.bin_conv_shortcut_out_gpu);
             //cudaDeviceSynchronize();
             //stop_timer_and_show_name("gemm_nn_custom_bin_mean_transposed_gpu");
             //}
diff --git a/src/im2col.h b/src/im2col.h
index 7957b8eb..ab4f3483 100644
--- a/src/im2col.h
+++ b/src/im2col.h
@@ -44,7 +44,8 @@ void fill_int8_gpu(unsigned char *src, unsigned char val, size_t size);
 void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
     unsigned char *A, int lda,
     unsigned char *B, int ldb,
-    float *C, int ldc, float *mean_arr, float *bias, int leaky_activation);
+    float *C, int ldc, float *mean_arr, float *bias, int leaky_activation,
+    float *shortcut_in_gpu, float *shortcut_out_gpu);
 
 // sequentially - BAD
 void gemm_nn_custom_bin_mean_transposed_sequentially_gpu(int M, int N, int K,
diff --git a/src/im2col_kernels.cu b/src/im2col_kernels.cu
index 4d22beda..1cb4cf34 100644
--- a/src/im2col_kernels.cu
+++ b/src/im2col_kernels.cu
@@ -1247,7 +1247,8 @@ int warpAllReduceSum(int val) {
 __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, int K,
     unsigned char *A, int lda,
     unsigned char *B, int ldb,
-    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation)
+    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation,
+    float *shortcut_in_gpu, float *shortcut_out_gpu)
 {
     // total 57%
     int index = blockIdx.x*blockDim.x + threadIdx.x;
@@ -1411,9 +1412,14 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
                         float bias_val = bias_arr[i + i_d];
                         float dst_val = count *mean_val + bias_val;
                         if (leaky_activation)
-                            dst_val = (dst_val > 0) ? (dst_val) : (0.1f*dst_val); // Leaky activation
+                            dst_val = (dst_val >= 0) ? (dst_val) : (0.1f*dst_val); // Leaky activation
+
+                        size_t out_index = (i + i_d)*ldc + (c_x * 8 + j + j_d);
+                        C[out_index] = dst_val;
-                        C[(i + i_d)*ldc + (c_x*8 + j + j_d)] = dst_val;
+                        if (shortcut_out_gpu) {
+                            shortcut_out_gpu[out_index] = shortcut_in_gpu[out_index] + dst_val;
+                        }
                     }
                 }
@@ -1574,7 +1580,8 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
 __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
     unsigned char *A, int lda,
     unsigned char *B, int ldb,
-    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation)
+    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation,
+    float *shortcut_in_gpu, float *shortcut_out_gpu)
 {
     // total 57%
     int index = blockIdx.x*blockDim.x + threadIdx.x;
@@ -1628,7 +1635,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
                 //ulonglong4 a_bit256 = *((ulonglong4 *)(A + A_i));    // weights
                 ulonglong4 a_bit256 = *((ulonglong4 *)(A_s + A_i));    // weights
                 ulonglong4 b_bit256 = *((ulonglong4 *)(B + B_i));    // input
-                c_bit256 = xnor_int256(a_bit256, b_bit256);
+                c_bit256 = xor_int256(a_bit256, b_bit256);
                 int tmp_count = __popcll(c_bit256.w) + __popcll(c_bit256.x) + __popcll(c_bit256.y) + __popcll(c_bit256.z);
@@ -1661,7 +1668,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
                 //uint64_t a_bit64 = *((uint64_t *)(A + A_i));    // weights
                 uint64_t a_bit64 = *((uint64_t *)(A_s + A_i));    // weights
                 uint64_t b_bit64 = *((uint64_t *)(B + B_i));    // input
-                c_bit64 = xnor_int64(a_bit64, b_bit64);
+                c_bit64 = xor_int64(a_bit64, b_bit64);
                 int tmp_count = __popcll(c_bit64);
                 int sum_count = warpAllReduceSum(tmp_count);
@@ -1691,7 +1698,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
                 //uint64_t a_bit64 = *((uint64_t *)(A + A_i));    // weights
                 uint32_t a_bit32 = *((uint32_t *)(A_s + A_i));    // weights
                 uint32_t b_bit32 = *((uint32_t *)(B + B_i));    // input
-                uint32_t c_bit32 = xnor_int32(a_bit32, b_bit32);
+                uint32_t c_bit32 = xor_int32(a_bit32, b_bit32);
                 int tmp_count = __popc(c_bit32);
                 int sum_count = warpAllReduceSum(tmp_count);
@@ -1712,7 +1719,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
                 //ulonglong4 a_bit256 = *((ulonglong4 *)(A + (i*lda + k) / 8));    // weights
                 ulonglong4 a_bit256 = *((ulonglong4 *)(A_s + (local_i*lda + k) / 8));    // weights
                 ulonglong4 b_bit256 = *((ulonglong4 *)(B + (j*ldb + k) / 8));    // input
-                ulonglong4 c_bit256 = xnor_int256(a_bit256, b_bit256);
+                ulonglong4 c_bit256 = xor_int256(a_bit256, b_bit256);
                 count += __popcll(c_bit256.w) + __popcll(c_bit256.x) + __popcll(c_bit256.y) + __popcll(c_bit256.z);
@@ -1724,7 +1731,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
                 //uint64_t a_bit64 = *((uint64_t *)(A + (i*lda + k) / 8));    // weights
                 uint64_t a_bit64 = *((uint64_t *)(A_s + (local_i*lda + k) / 8));    // weights
                 uint64_t b_bit64 = *((uint64_t *)(B + (j*ldb + k) / 8));    // input
-                uint64_t c_bit64 = xnor_int64(a_bit64, b_bit64);
+                uint64_t c_bit64 = xor_int64(a_bit64, b_bit64);
                 count += __popcll(c_bit64);
             }
@@ -1735,8 +1742,13 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
             count = count - f1;    // remove extra bits (from empty space for align only)
             float dst_val = (2 * count - K) *mean_val + bias_val;
             if(leaky_activation)
-                dst_val = (dst_val > 0) ? (dst_val) : (0.1f*dst_val); // Leaky activation
-            C[i*ldc + j] = dst_val;
+                dst_val = (dst_val >= 0) ? (dst_val) : (0.1f*dst_val); // Leaky activation
+            size_t out_index = i*ldc + j;
+            C[out_index] = dst_val;
+
+            if (shortcut_out_gpu) {
+                shortcut_out_gpu[out_index] = shortcut_in_gpu[out_index] + dst_val;
+            }
         }
     }
 }
@@ -1762,7 +1774,8 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
 void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
     unsigned char *A, int lda,
     unsigned char *B, int ldb,
-    float *C, int ldc, float *mean_arr, float *bias, int leaky_activation)
+    float *C, int ldc, float *mean_arr, float *bias, int leaky_activation,
+    float *shortcut_in_gpu, float *shortcut_out_gpu)
 {
     int size = M*N;
     const int num_blocks = get_number_of_blocks(size, BLOCK);
@@ -1794,7 +1807,8 @@ void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
             A, lda,
             B, ldb,
             C, ldc,
-            mean_arr, bias, leaky_activation);
+            mean_arr, bias, leaky_activation,
+            shortcut_in_gpu, shortcut_out_gpu);
         //cudaDeviceSynchronize();
         //getchar();
@@ -1807,7 +1821,8 @@ void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
             A, lda,
             B, ldb,
             C, ldc,
-            mean_arr, bias, leaky_activation);
+            mean_arr, bias, leaky_activation,
+            shortcut_in_gpu, shortcut_out_gpu);
     }
     CHECK_CUDA(cudaPeekAtLastError());
 }
diff --git a/src/network.c b/src/network.c
index 115edeb0..483219b4 100644
--- a/src/network.c
+++ b/src/network.c
@@ -975,7 +975,7 @@ void fuse_conv_batchnorm(network net)
     }
 }
 
-
+void forward_blank_layer(layer l, network_state state) {}
 
 void calculate_binary_weights(network net)
 {
@@ -996,6 +996,19 @@ void calculate_binary_weights(network net)
                 if (net.layers[j].use_bin_output) {
                     l->activation = LINEAR;
                 }
+
+                // fuse conv_xnor + shortcut -> conv_xnor
+                if ((j + 1) < net.n && net.layers[j].type == CONVOLUTIONAL) {
+                    layer *sc = &net.layers[j + 1];
+                    if (sc->type == SHORTCUT && sc->w == sc->out_w && sc->h == sc->out_h && sc->c == sc->out_c)
+                    {
+                        l->bin_conv_shortcut_in_gpu = net.layers[net.layers[j + 1].index].output_gpu;
+                        l->bin_conv_shortcut_out_gpu = net.layers[j + 1].output_gpu;
+
+                        net.layers[j + 1].type = BLANK;
+                        net.layers[j + 1].forward_gpu = forward_blank_layer;
+                    }
+                }
             }
         }
     }
 }
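
Note for reviewers: the patch threads two optional pointers (shortcut_in_gpu / shortcut_out_gpu) through the binary GEMM so that, when a CONVOLUTIONAL XNOR layer is immediately followed by a SHORTCUT layer of matching shape, the residual addition is done in the GEMM epilogue and the shortcut layer itself is downgraded to a BLANK no-op (forward_blank_layer). The stand-alone sketch below isolates just that epilogue on a plain float buffer so the fusion logic can be eyeballed and tested outside darknet. fused_epilogue_kernel and the host driver are hypothetical names, not part of the patch; in the real kernels dst_val comes out of the bit-packed popcount GEMM rather than being read from memory.

    // Hypothetical stand-alone sketch of the fused epilogue: apply the optional leaky
    // activation, write the conv output, and (if requested) also write conv + residual.
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void fused_epilogue_kernel(const float *conv_out, const float *shortcut_in,
                                          float *out, float *shortcut_out,
                                          int n, int leaky_activation)
    {
        int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i >= n) return;

        float dst_val = conv_out[i];
        if (leaky_activation)
            dst_val = (dst_val >= 0) ? (dst_val) : (0.1f*dst_val); // Leaky activation

        out[i] = dst_val;                 // ordinary conv output (C[out_index] in the patch)
        if (shortcut_out)                 // fused shortcut: residual input + activated conv output
            shortcut_out[i] = shortcut_in[i] + dst_val;
    }

    int main()
    {
        const int n = 8;
        float h_conv[n] = { -2.f, -1.f, 0.f, 1.f, 2.f, 3.f, -4.f, 5.f };
        float h_res[n]  = {  1.f,  1.f, 1.f, 1.f, 1.f, 1.f,  1.f, 1.f };

        float *d_conv, *d_res, *d_out, *d_sc;
        cudaMalloc(&d_conv, n*sizeof(float));
        cudaMalloc(&d_res,  n*sizeof(float));
        cudaMalloc(&d_out,  n*sizeof(float));
        cudaMalloc(&d_sc,   n*sizeof(float));
        cudaMemcpy(d_conv, h_conv, n*sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_res,  h_res,  n*sizeof(float), cudaMemcpyHostToDevice);

        // Passing d_sc enables the fused path; passing NULL gives the plain conv output only.
        fused_epilogue_kernel<<<1, 32>>>(d_conv, d_res, d_out, d_sc, n, 1);
        cudaDeviceSynchronize();

        float h_out[n], h_sc[n];
        cudaMemcpy(h_out, d_out, n*sizeof(float), cudaMemcpyDeviceToHost);
        cudaMemcpy(h_sc,  d_sc,  n*sizeof(float), cudaMemcpyDeviceToHost);
        for (int i = 0; i < n; ++i)
            printf("conv = %5.1f   leaky = %5.1f   leaky + shortcut = %5.1f\n",
                   h_conv[i], h_out[i], h_sc[i]);

        cudaFree(d_conv); cudaFree(d_res); cudaFree(d_out); cudaFree(d_sc);
        return 0;
    }

The NULL check mirrors the patched kernels: layers without a following shortcut leave l.bin_conv_shortcut_out_gpu unset, so the if (shortcut_out_gpu) guard keeps the un-fused path unchanged.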