diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 4794364e..f5050521 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -681,6 +681,102 @@ void bit_to_float(unsigned char *src, float *dst, size_t size, size_t filters, f } } +void binary_align_weights(convolutional_layer *l) +{ + int m = l->n; + int k = l->size*l->size*l->c; + size_t new_lda = k + (l->lda_align - k % l->lda_align); // (k / 8 + 1) * 8; + l->new_lda = new_lda; + + binarize_weights(l->weights, m, k, l->binary_weights); + + size_t align_weights_size = new_lda * m; + l->align_bit_weights_size = align_weights_size / 8 + 1; + float *align_weights = calloc(align_weights_size, sizeof(float)); + l->align_bit_weights = calloc(l->align_bit_weights_size, sizeof(char)); + + size_t i, j; + // align A without transpose + for (i = 0; i < m; ++i) { + for (j = 0; j < k; ++j) { + align_weights[i*new_lda + j] = l->binary_weights[i*k + j]; + } + } + + + //if (l->c % 32 == 0) + if(gpu_index < 0 && l->stride == 1 && l->pad == 1 && l->c % 32 == 0) + { + int fil, chan; + const int items_per_filter = l->c * l->size * l->size; + //const int dst_items_per_filter = new_lda; + for (fil = 0; fil < l->n; ++fil) + { + for (chan = 0; chan < l->c; chan += 32) + { + const int items_per_channel = l->size*l->size; + for (i = 0; i < items_per_channel; ++i) + { + uint32_t val = 0; + int c_pack; + for (c_pack = 0; c_pack < 32; ++c_pack) { + float src = l->binary_weights[fil*items_per_filter + (chan + c_pack)*items_per_channel + i]; + + //align_weights[fil*items_per_filter + chan*items_per_channel + i * 32 + c_pack] = src; + + align_weights[fil*new_lda + chan*items_per_channel + i*32 + c_pack] = src; + //val |= (src << c); + } + + } + } + } + + //printf("\n l.index = %d \t aw[0] = %f, aw[1] = %f, aw[2] = %f, aw[3] = %f \n", l->index, align_weights[0], align_weights[1], align_weights[2], align_weights[3]); + //memcpy(l->binary_weights, align_weights, (l->size * l->size * l->c * l->n) * sizeof(float)); + + float_to_bit(align_weights, l->align_bit_weights, align_weights_size); + + get_mean_array(l->binary_weights, m*k, l->n, l->mean_arr); + //get_mean_array(l->binary_weights, m*new_lda, l->n, l->mean_arr); + } + else { + float_to_bit(align_weights, l->align_bit_weights, align_weights_size); + + get_mean_array(l->binary_weights, m*k, l->n, l->mean_arr); + } + + //l->mean_arr = calloc(l->n, sizeof(float)); + + //get_mean_array(align_weights, align_weights_size, l->n, l->mean_arr); + + + + +#ifdef GPU + cudaError_t status; + l->align_workspace_size = l->bit_align * l->size * l->size * l->c; + status = cudaMalloc((void **)&l->align_workspace_gpu, l->align_workspace_size * sizeof(float)); + status = cudaMalloc((void **)&l->transposed_align_workspace_gpu, l->align_workspace_size * sizeof(float)); + check_error(status); + + //l->align_bit_weights_gpu = cuda_make_array(l->align_bit_weights, l->align_bit_weights_size * sizeof(char)/sizeof(float)); + status = cudaMalloc((void **)&l->align_bit_weights_gpu, l->align_bit_weights_size); + check_error(status); + status = cudaMemcpy(l->align_bit_weights_gpu, l->align_bit_weights, l->align_bit_weights_size, cudaMemcpyHostToDevice); + check_error(status); + status = cudaMemcpy(l->binary_weights_gpu, l->binary_weights, m*k * sizeof(float), cudaMemcpyHostToDevice); + check_error(status); + + //l->mean_arr_gpu = cuda_make_array(l->mean_arr, l->n); + cuda_push_array(l->mean_arr_gpu, l->mean_arr, l->n); + cudaDeviceSynchronize(); +#endif // GPU + + free(align_weights); +} + +/* void binary_align_weights(convolutional_layer *l) { int m = l->n; @@ -729,6 +825,7 @@ void binary_align_weights(convolutional_layer *l) free(align_weights); } +*/ // binary transpose size_t binary_transpose_align_input(int k, int n, float *b, char **t_bit_input, size_t ldb_align, int bit_align) @@ -782,117 +879,98 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) u++; for(i = 0; i < l.batch; ++i){ - //im2col_cpu(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b); - - //float *t_input = NULL; - //if (l.xnor) { - // size_t new_ldb = k + (l.lda_align - k%l.lda_align); - // size_t t_intput_size = new_ldb * n; - // t_input = calloc(t_intput_size, sizeof(float)); - // im2col_cpu_custom_transpose(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, t_input, new_ldb); - //} - //if (l.xnor && l.size == 3 && l.stride == 1 && l.pad == 1) {} - //else - // further optimizations: im2col_bin() for XNOR, and then transpose_aling_bin() - //im2col_cpu_custom(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b); - //gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); //gemm_nn_custom(m, n, k, 1, a, k, b, n, c, n); - if (l.xnor && l.align_bit_weights && !state.train && (l.stride == 1 && l.pad == 1)) { + if (l.xnor && l.align_bit_weights && !state.train && (l.stride == 1 && l.pad == 1)) + { memset(b, 0, l.bit_align*l.size*l.size*l.c * sizeof(float)); - //im2col_cpu_custom_align(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b, l.bit_align); - im2col_cpu_custom_bin(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b, l.bit_align); - - size_t output_size = l.outputs; - //float *count_output = calloc(output_size, sizeof(float)); - //size_t bit_output_size = output_size / 8 + 1; - //char *bit_output = calloc(bit_output_size, sizeof(char)); - - size_t intput_size = n * k; // (out_h*out_w) X (l.size*l.size*l.c) : after im2col() - size_t bit_input_size = intput_size / 8 + 1; - //char *bit_input = calloc(bit_input_size, sizeof(char)); - - size_t weights_size = k * m; //l.size*l.size*l.c*l.n; - size_t bit_weights_size = weights_size / 8 + 1; - //char *bit_weights = calloc(bit_weights_size, sizeof(char)); - //float *mean_arr = calloc(l.n, sizeof(float)); - - // test: float->bit->float - //get_mean_array(l.weights, weights_size, l.n, mean_arr); - //float_to_bit(l.weights, bit_weights, weights_size); - //memset(l.weights, 0, weights_size * sizeof(float)); - //bit_to_float(bit_weights, l.weights, weights_size, l.n, mean_arr); // just for test float->bit->float - - //float_to_bit(b, bit_input, intput_size); - //memset(b, 0, intput_size * sizeof(float)); - //bit_to_float(bit_input, b, intput_size, 1, NULL); // just for test float->bit->float - - // transpose B from NxK to KxN (x-axis (ldb = l.size*l.size*l.c) - should be multiple of 8 bits) - { - /* - size_t ldb_align = 256;// 8; + if(l.c % 32 == 0) + { + int ldb_align = l.lda_align; size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8; - size_t t_intput_size = new_ldb * n; + size_t t_intput_size = new_ldb * l.bit_align;// n; size_t t_bit_input_size = t_intput_size / 8;// +1; - float *t_input = calloc(t_intput_size, sizeof(float)); - char *t_bit_input = calloc(t_bit_input_size, sizeof(char)); - //printf("\n bit_input_size = %d, n = %d, k = %d, ldb = %d \n", bit_input_size, n, k, n); - //printf("\n t_bit_input_size = %d, k = %d, n = %d, new_ldb = %d \n", t_bit_input_size, k, n, new_ldb); + const int new_c = l.c / 32; + float *re_packed_input = calloc(l.c * l.w * l.h, sizeof(float)); + uint32_t *bin_re_packed_input = calloc(new_c * l.w * l.h + 1, sizeof(uint32_t)); - //printf("\n align_weights_size = %d, k = %d, m = %d, lda = %d \n", align_weights_size, k, m, k); - //printf("\n align_bit_weights_size = %d, k = %d, m = %d, new_lda = %d \n", align_bit_weights_size, k, m, new_ldb); + // float32x4 by channel (as in cuDNN) + repack_input(state.input, re_packed_input, l.w, l.h, l.c); + // 32 x floats -> 1 x uint32_t + float_to_bit(re_packed_input, (char *)bin_re_packed_input, l.c * l.w * l.h); - // transpose and align B - int i, j; - for (i = 0; i < n; ++i) { - for (j = 0; j < k; ++j) { - t_input[i*new_ldb + j] = b[j*n + i]; - } - } - float_to_bit(t_input, t_bit_input, t_intput_size); + free(re_packed_input); + // convolution the packed inputs and weights: float x 32 by channel (as in cuDNN) + //convolution_repacked((uint32_t *)bin_re_packed_input, (uint32_t *)l.align_bit_weights, l.output, + // l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr); + // // then exit from if() - if (!l.align_bit_weights) - { - size_t align_weights_size = new_ldb * m; - size_t align_bit_weights_size = align_weights_size / 8;// +1; - float *align_weights = calloc(align_weights_size, sizeof(float)); - l.align_bit_weights = calloc(align_bit_weights_size, sizeof(char)); - - // align A without transpose - for (i = 0; i < m; ++i) { - for (j = 0; j < k; ++j) { - align_weights[i*new_ldb + j] = a[i*k + j]; - } - } - float_to_bit(align_weights, l.align_bit_weights, align_weights_size); - l.mean_arr = calloc(l.n, sizeof(float)); - get_mean_array(align_weights, align_weights_size, l.n, l.mean_arr); + im2col_cpu_custom((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b); + //im2col_cpu((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b); - free(align_weights); - } - */ + free(bin_re_packed_input); - /* - if (l.size == 3 && l.stride == 1 && l.pad == 1) - { - //binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights); - //printf("\n mean = %f \n", l.mean_arr[0]); + int new_k = l.size*l.size*l.c / 32; - convolution_2d(l.w, l.h, l.size, l.n, l.c, l.pad, l.stride, - //l.weights, state.input, l.output, l.mean_arr); - l.binary_weights, state.input, l.output, l.mean_arr); - } - else { - */ +// gemm_nn_bin_32bit_packed(m, n, new_k, 1, +// l.align_bit_weights, l.new_lda/32, +// b, n, +// c, n, l.mean_arr); + +// // then exit from if() + + + //size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8; + //size_t t_intput_size = new_ldb * l.bit_align;// n; + //size_t t_bit_input_size = t_intput_size / 8;// +1; + + char *t_bit_input = calloc(t_bit_input_size, sizeof(char)); + + transpose_uint32((uint32_t *)b, t_bit_input, new_k, n, n, new_ldb); + + // the main GEMM function + gemm_nn_custom_bin_mean_transposed(m, n, k, 1, l.align_bit_weights, new_ldb, t_bit_input, new_ldb, c, n, l.mean_arr); + + // // alternative GEMM + //gemm_nn_bin_transposed_32bit_packed(m, n, new_k, 1, + // l.align_bit_weights, l.new_lda/32, + // t_bit_input, new_ldb / 32, + // c, n, l.mean_arr); + + free(t_bit_input); + + } + else { // else (l.c % 32 != 0) + + //-------------------------------------------------------- + + //im2col_cpu_custom_align(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b, l.bit_align); + im2col_cpu_custom_bin(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b, l.bit_align); + + size_t output_size = l.outputs; + //float *count_output = calloc(output_size, sizeof(float)); + //size_t bit_output_size = output_size / 8 + 1; + //char *bit_output = calloc(bit_output_size, sizeof(char)); + size_t intput_size = n * k; // (out_h*out_w) X (l.size*l.size*l.c) : after im2col() + size_t bit_input_size = intput_size / 8 + 1; + //char *bit_input = calloc(bit_input_size, sizeof(char)); + + size_t weights_size = k * m; //l.size*l.size*l.c*l.n; + size_t bit_weights_size = weights_size / 8 + 1; + //char *bit_weights = calloc(bit_weights_size, sizeof(char)); + //float *mean_arr = calloc(l.n, sizeof(float)); + + // transpose B from NxK to KxN (x-axis (ldb = l.size*l.size*l.c) - should be multiple of 8 bits) + { //size_t ldb_align = 256; // 256 bit for AVX2 int ldb_align = l.lda_align; size_t new_ldb = k + (ldb_align - k%ldb_align); @@ -908,27 +986,11 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) //free(t_input); free(t_bit_input); - //} + //} + } } - // for bit_input: (k * n) - //if (u == 8) gemm_nn_custom_bin_mean(m, n, k, 1, bit_weights, k, bit_input, n, c, n, mean_arr); // last xnor layer - //else gemm_nn_custom_bin_mean(m, n, k, 1, bit_weights, k, bit_input, n, c, n, NULL); - - //gemm_nn_custom_bin_mean(m, n, k, 1, bit_weights, k, bit_input, n, c, n, mean_arr); - - //printf("\n u = %d \n", u); - - //gemm_nn_custom(m, n, k, 1, a, k, b, n, c, n); - - //int j; - //if (u != 8) for (j = 0; j < l.n; ++j) l.biases[j] = l.biases[j] / (mean_arr[j]*2); - - //free(count_output); - //free(bit_input); - //free(bit_weights); - //free(mean_arr); } else { im2col_cpu_custom(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b); diff --git a/src/gemm.c b/src/gemm.c index 6f021e73..bf52a118 100644 --- a/src/gemm.c +++ b/src/gemm.c @@ -487,6 +487,15 @@ void transpose_bin(uint32_t *A, uint32_t *B, const int n, const int m, } } } + +static inline int popcnt_32(uint32_t val32) { +#ifdef WIN32 // Windows + int tmp_count = __popcnt(val32); +#else // Linux + int tmp_count = __builtin_popcount(val32); +#endif + return tmp_count; +} //---------------------------- @@ -721,6 +730,91 @@ void gemm_nn(int M, int N, int K, float ALPHA, } + + +void gemm_nn_bin_32bit_packed(int M, int N, int K, float ALPHA, + uint32_t *A, int lda, + uint32_t *B, int ldb, + float *C, int ldc, float *mean_arr) +{ + int i; + #pragma omp parallel for + for (i = 0; i < M; ++i) { // l.n + int j, s; + float mean_val = mean_arr[i]; + //printf(" l.mean_arr[i] = %d \n ", l.mean_arr[i]); + for (s = 0; s < K; ++s) // l.size*l.size*l.c/32 or (l.size*l.size*l.c) + { + register uint32_t A_PART = A[i*lda + s]; + __m256i a256 = _mm256_set1_epi32(A_PART); + + for (j = 0; j < N - 8; j += 8) + { + __m256i b256 = *((__m256i*)&B[s*ldb + j]); + __m256i xor256 = _mm256_xor_si256(a256, b256); // xnor = xor(a,b) + __m256i all_1 = _mm256_set1_epi8(255); + __m256i xnor256 = _mm256_andnot_si256(xor256, all_1); // xnor = not(xor(a,b)) + + //_m256 count = _mm256_set_ps( + /* + __m256i count = _mm256_setr_epi32( + (int)popcnt_32(xnor256.m256i_u32[0]), + (int)popcnt_32(xnor256.m256i_u32[1]), + (int)popcnt_32(xnor256.m256i_u32[2]), + (int)popcnt_32(xnor256.m256i_u32[3]), + (int)popcnt_32(xnor256.m256i_u32[4]), + (int)popcnt_32(xnor256.m256i_u32[5]), + (int)popcnt_32(xnor256.m256i_u32[6]), + (int)popcnt_32(xnor256.m256i_u32[7])); + + __m256i val2 = _mm256_set1_epi32(2); + count = _mm256_mullo_epi32(count, val2); + + __m256i val32 = _mm256_set1_epi32(32); + count = _mm256_sub_epi32(count, val32); + + int z; + for (z = 0; z < 8; ++z) { + C[i*ldc + j + z] += count.m256i_i32[z] * mean_val; + } + */ + + __m256 count = _mm256_setr_ps( + popcnt_32(xnor256.m256i_u32[0]), + popcnt_32(xnor256.m256i_u32[1]), + popcnt_32(xnor256.m256i_u32[2]), + popcnt_32(xnor256.m256i_u32[3]), + popcnt_32(xnor256.m256i_u32[4]), + popcnt_32(xnor256.m256i_u32[5]), + popcnt_32(xnor256.m256i_u32[6]), + popcnt_32(xnor256.m256i_u32[7])); + + __m256 val2 = _mm256_set1_ps(2); + count = _mm256_mul_ps(count, val2); // count * 2 + + __m256 val32 = _mm256_set1_ps(32); + count = _mm256_sub_ps(count, val32); // count - 32 + + __m256 mean256 = _mm256_set1_ps(mean_val); + count = _mm256_mul_ps(count, mean256); // count * mean_val + + __m256 c256 = *((__m256*)&C[i*ldc + j]); + count = _mm256_add_ps(count, c256); // c = c + count + *((__m256*)&C[i*ldc + j]) = count; + } + + for (; j < N; ++j) // out_h*out_w; + { + register uint32_t B_PART = B[s*ldb + j]; + uint32_t xnor_result = ~(A_PART ^ B_PART); + int32_t count = popcnt_32(xnor_result); // must be Signed int + + C[i*ldc + j] += (2 * count - 32) * mean_val; + } + } + } +} + void convolution_2d_old(int w, int h, int ksize, int n, int c, int pad, int stride, float *weights, float *input, float *output) { @@ -1652,7 +1746,7 @@ void forward_maxpool_layer_avx(float *src, float *dst, int *indexes, int size, i } } -#else +#else // AVX void gemm_nn(int M, int N, int K, float ALPHA, float *A, int lda, @@ -1670,6 +1764,36 @@ void gemm_nn(int M, int N, int K, float ALPHA, } } +void gemm_nn_bin_32bit_packed(int M, int N, int K, float ALPHA, + uint32_t *A, int lda, + uint32_t *B, int ldb, + float *C, int ldc, float *mean_arr) +{ + int i; + #pragma omp parallel for + for (i = 0; i < M; ++i) { // l.n + int j, s; + float mean_val = mean_arr[i]; + //printf(" l.mean_arr[i] = %d \n ", l.mean_arr[i]); + for (s = 0; s < K; ++s) // l.size*l.size*l.c/32 or (l.size*l.size*l.c) + { + //register float A_PART = 1*a[i*k + s]; + register uint32_t A_PART = A[i*lda + s]; + for (j = 0; j < N; ++j) // out_h*out_w; + { + //c[i*n + j] += A_PART*b[s*n + j]; + register uint32_t B_PART = B[s*ldb + j]; + uint32_t xnor_result = ~(A_PART ^ B_PART); + //printf(" xnor_result = %d, ", xnor_result); + int32_t count = popcnt_32(xnor_result); // must be Signed int + + C[i*ldc + j] += (2 * count - 32) * mean_val; + //c[i*n + j] += count*mean; + } + } + } +} + void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride, float *weights, float *input, float *output, float *mean) @@ -2102,6 +2226,135 @@ void forward_maxpool_layer_avx(float *src, float *dst, int *indexes, int size, i #endif // AVX + +// 32 channels -> 1 channel (with 32 floats) +// 256 channels -> 8 channels (with 32 floats) +void repack_input(float *input, float *re_packed_input, int w, int h, int c) +{ + const int items_per_channel = w * h; + int chan, i; + for (chan = 0; chan < c; chan += 32) + { + for (i = 0; i < items_per_channel; ++i) + { + int c_pack; + for (c_pack = 0; c_pack < 32; ++c_pack) { + float src = input[(chan + c_pack)*items_per_channel + i]; + + re_packed_input[chan*items_per_channel + i * 32 + c_pack] = src; + } + } + } +} + +void transpose_uint32(uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align) +{ + //l.bit_align - algined (n) by 32 + //new_ldb - aligned (k) by 256 + + int i; + //#pragma omp parallel for + for (i = 0; i < src_h; i += 1) // l.size*l.size*l.c; + { + int j; + for (j = 0; j < src_w; j += 1) // out_h*out_w; + { + ((uint32_t *)dst)[j*dst_align / 32 + i] = ((uint32_t *)src)[i*src_align + j]; + } + } +} + +void gemm_nn_bin_transposed_32bit_packed(int M, int N, int K, float ALPHA, + uint32_t *A, int lda, + uint32_t *B, int ldb, + float *C, int ldc, float *mean_arr) +{ + int i; + #pragma omp parallel for + for (i = 0; i < M; ++i) { // l.n + int j, s; + float mean_val = mean_arr[i]; + for (s = 0; s < K; ++s) // l.size*l.size*l.c/32 or (l.size*l.size*l.c) + { + register uint32_t A_PART = ((uint32_t*)A)[i*lda + s]; + for (j = 0; j < N; ++j) // out_h*out_w; + { + register uint32_t B_PART = ((uint32_t*)B)[j*ldb + s]; + uint32_t xnor_result = ~(A_PART ^ B_PART); + int32_t count = popcnt_32(xnor_result); // must be Signed int + + C[i*ldc + j] += (2 * count - 32) * mean_val; + } + } + } +} + +void convolution_repacked(uint32_t *packed_input, uint32_t *packed_weights, float *output, + int w, int h, int c, int n, int size, int pad, int new_lda, float *mean_arr) +{ + int fil; + // filter index + #pragma omp parallel for + for (fil = 0; fil < n; ++fil) { + float mean_val = mean_arr[fil]; + int chan, c_pack, y, x, f_y, f_x; + // channel index + for (chan = 0; chan < c / 32; ++chan) + //for (chan = 0; chan < l.c; chan += 32) + //for (c_pack = 0; c_pack < 32; ++c_pack) + // input - y + for (y = 0; y < h; ++y) + // input - x + for (x = 0; x < w; ++x) + { + int const output_index = fil*w*h + y*w + x; + float sum = 0; + + // filter - y + for (f_y = 0; f_y < size; ++f_y) + { + int input_y = y + f_y - pad; + // filter - x + for (f_x = 0; f_x < size; ++f_x) + { + int input_x = x + f_x - pad; + if (input_y < 0 || input_x < 0 || input_y >= h || input_x >= w) continue; + + // normal + //float input = state.input[(chan + c_pack)*l.w*l.h + input_y*l.w + input_x]; + //float weight = l.weights[fil*l.c*l.size*l.size + (chan + c_pack)*l.size*l.size + f_y*l.size + f_x]; + + // packed + //float input = re_packed_input[chan*l.w*l.h + (input_y*l.w + input_x) * 32 + c_pack]; + //float weight = l.weights[fil*l.c*l.size*l.size + chan*l.size*l.size + (f_y*l.size + f_x) * 32 + c_pack]; + //sum += input * weight; + + //float input = re_packed_input[chan*l.w*l.h + (input_y*l.w + input_x) * 32 + c_pack]; + //float weight = l.weights[fil*l.c*l.size*l.size + chan*l.size*l.size + (f_y*l.size + f_x) * 32 + c_pack]; + //uint32_t bit1 = input > 0; + //uint32_t bit2 = weight > 0; + //uint32_t count = (~(bit1 ^ bit2)) & 1; + //float result = (2 * (float)count - 1) * mean_val; + //printf("\n mul = %f, bit1 = %d, bit2 = %d, count = %d, mean = %f, result = %f ", input*weight, bit1, bit2, count, mean_val, result); + //sum += result; + + uint32_t input = ((uint32_t *)packed_input)[chan*w*h + input_y*w + input_x]; + //uint32_t weight = ((uint32_t *)l.align_bit_weights)[fil*l.c*l.size*l.size/32 + chan*l.size*l.size + f_y*l.size + f_x]; + uint32_t weight = ((uint32_t *)packed_weights)[fil*new_lda / 32 + chan*size*size + f_y*size + f_x]; + + uint32_t xnor_result = ~(input ^ weight); + int32_t count = popcnt_32(xnor_result); // mandatory Signed int + sum += (2 * count - 32) * mean_val; + } + } + // l.output[filters][width][height] += + // state.input[channels][width][height] * + // l.weights[filters][channels][filter_width][filter_height]; + output[output_index] += sum; + } + } +} + void gemm_nt(int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, diff --git a/src/gemm.h b/src/gemm.h index e297537f..dd727830 100644 --- a/src/gemm.h +++ b/src/gemm.h @@ -59,6 +59,23 @@ void gemm_bin(int M, int N, int K, float ALPHA, float *B, int ldb, float *C, int ldc); +void repack_input(float *input, float *re_packed_input, int w, int h, int c); + +void convolution_repacked(uint32_t *packed_input, uint32_t *packed_weights, float *output, + int w, int h, int c, int n, int size, int pad, int new_lda, float *mean_arr); + +void gemm_nn_bin_32bit_packed(int M, int N, int K, float ALPHA, + uint32_t *A, int lda, + uint32_t *B, int ldb, + float *C, int ldc, float *mean_arr); + +void transpose_uint32(uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align); + +void gemm_nn_bin_transposed_32bit_packed(int M, int N, int K, float ALPHA, + uint32_t *A, int lda, + uint32_t *B, int ldb, + float *C, int ldc, float *mean_arr); + void forward_maxpool_layer_avx(float *src, float *dst, int *indexes, int size, int w, int h, int out_w, int out_h, int c, int pad, int stride, int batch); diff --git a/src/image.c b/src/image.c index cb96ef37..1fcdd5c0 100644 --- a/src/image.c +++ b/src/image.c @@ -325,9 +325,9 @@ void draw_detections_v3(image im, detection *dets, int num, float thresh, char * printf("%s: %.0f%%", names[best_class], selected_detections[i].det.prob[best_class] * 100); if (ext_output) printf("\t(left_x: %4.0f top_y: %4.0f width: %4.0f height: %4.0f)\n", - (selected_detections[i].det.bbox.x - selected_detections[i].det.bbox.w / 2)*im.w, - (selected_detections[i].det.bbox.y - selected_detections[i].det.bbox.h / 2)*im.h, - selected_detections[i].det.bbox.w*im.w, selected_detections[i].det.bbox.h*im.h); + round((selected_detections[i].det.bbox.x - selected_detections[i].det.bbox.w / 2)*im.w), + round((selected_detections[i].det.bbox.y - selected_detections[i].det.bbox.h / 2)*im.h), + round(selected_detections[i].det.bbox.w*im.w), round(selected_detections[i].det.bbox.h*im.h)); else printf("\n"); int j;