@@ -204,10 +204,11 @@ void gemm_nn_custom_bin_mean(int M, int N, int K, float ALPHA_UNUSED,
 {
     int *count_arr = calloc(M*N, sizeof(int));
 
-    int i, j, k, h;
+    int i;
 
     #pragma omp parallel for
     for (i = 0; i < M; ++i) {   // l.n - filters [16 - 55 - 1024]
+        int j, k, h;
         for (k = 0; k < K; ++k) {   // l.size*l.size*l.c - one filter size [27 - 9216]
             const char a_bit = get_bit(A, i*lda + k);
             uint64_t a_bit64 = fill_bit_int64(a_bit);
@@ -271,10 +272,11 @@ void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
     unsigned char *B, int ldb,
     float *C, int ldc, float *mean_arr)
 {
-    int i, j, k, h;
+    int i;
 
     #pragma omp parallel for
     for (i = 0; i < M; ++i) {   // l.n - filters [16 - 55 - 1024]
+        int j, k, h;
         float mean_val = mean_arr[i];
 
         for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
@@ -365,7 +367,7 @@ void transpose_bin(char *A, char *B, const int n, const int m,
     const int lda, const int ldb, const int block_size)
 {
     int i;
     #pragma omp parallel for
     for (i = 0; i < n; i += 8) {
         int j;
         for (j = 0; j < m - 8; j += 8) {
@@ -617,14 +619,14 @@ void gemm_nn(int M, int N, int K, float ALPHA,
 void convolution_2d_old(int w, int h, int ksize, int n, int c, int pad, int stride,
     float *weights, float *input, float *output)
 {
-    int out_h = (h + 2 * pad - ksize) / stride + 1;    // output_height=input_height for stride=1 and pad=1
-    int out_w = (w + 2 * pad - ksize) / stride + 1;    // output_width=input_width for stride=1 and pad=1
-    int i, f, j;
+    const int out_h = (h + 2 * pad - ksize) / stride + 1;    // output_height=input_height for stride=1 and pad=1
+    const int out_w = (w + 2 * pad - ksize) / stride + 1;    // output_width=input_width for stride=1 and pad=1
 
     int fil;
     // filter index
     #pragma omp parallel for      // "omp parallel for" - automatic parallelization of loop by using OpenMP
     for (fil = 0; fil < n; ++fil) {
+        //int i, f, j;
         int chan, y, x, f_y, f_x;
         // channel index
         for (chan = 0; chan < c; ++chan)
@@ -665,9 +667,9 @@ void convolution_2d_old(int w, int h, int ksize, int n, int c, int pad, int stri
 void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride,
     float *weights, float *input, float *output, float *mean)
 {
-    int out_h = (h + 2 * pad - ksize) / stride + 1;    // output_height=input_height for stride=1 and pad=1
-    int out_w = (w + 2 * pad - ksize) / stride + 1;    // output_width=input_width for stride=1 and pad=1
-    int i, f, j;
+    const int out_h = (h + 2 * pad - ksize) / stride + 1;    // output_height=input_height for stride=1 and pad=1
+    const int out_w = (w + 2 * pad - ksize) / stride + 1;    // output_width=input_width for stride=1 and pad=1
+    int i;
 
 #if defined(_OPENMP)
     static int max_num_threads = 0;
@@ -684,9 +686,9 @@ void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride,
         *((__m256*)&weights[i]) = _mm256_and_ps(*((__m256*)&weights[i]), _mm256_castsi256_ps(all256_sing1));
     }
 
-    for (i = 0; i < w*h*c; i += 8) {
+    //for (i = 0; i < w*h*c; i += 8) {
         //*((__m256*)&input[i]) = _mm256_and_ps(*((__m256*)&input[i]), _mm256_castsi256_ps(all256_sing1));
-    }
+    //}
 
 
     //__m256i all256_last_zero = _mm256_set1_epi32(0xFFFFFFFF);
@@ -704,7 +706,7 @@ void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride,
 
     int fil;
     // filter index
     #pragma omp parallel for      // "omp parallel for" - automatic parallelization of loop by using OpenMP
     for (fil = 0; fil < n; ++fil) {
         int chan, y, x, f_y, f_x;
         float cur_mean = fabs(mean[fil]);
@@ -914,16 +916,17 @@ void im2col_cpu_custom_transpose(float* data_im,
     int channels, int height, int width,
     int ksize, int stride, int pad, float* data_col, int ldb_align)
 {
-    int c, h, w;
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int channels_col = channels * ksize * ksize;
+    const int height_col = (height + 2 * pad - ksize) / stride + 1;
+    const int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
+    int c;
 
     // optimized version
     if (height_col == height && width_col == width && stride == 1 && pad == 1)
     {
         #pragma omp parallel for
         for (c = 0; c < channels_col; ++c) {
+            int h, w;
             int w_offset = c % ksize;
             int h_offset = (c / ksize) % ksize;
             int c_im = c / ksize / ksize;
@@ -1005,6 +1008,7 @@ void im2col_cpu_custom_transpose(float* data_im,
     else {
         #pragma omp parallel for
         for (c = 0; c < channels_col; ++c) {
+            int h, w;
             int w_offset = c % ksize;
             int h_offset = (c / ksize) % ksize;
             int c_im = c / ksize / ksize;
@@ -1029,17 +1033,17 @@ void im2col_cpu_custom(float* data_im,
     int channels, int height, int width,
     int ksize, int stride, int pad, float* data_col)
 {
-
-    int c, h, w;
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int channels_col = channels * ksize * ksize;
+    int c;
+    const int height_col = (height + 2 * pad - ksize) / stride + 1;
+    const int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
 
     // optimized version
     if (height_col == height && width_col == width && stride == 1 && pad == 1 && is_fma_avx2())
     {
         #pragma omp parallel for
         for (c = 0; c < channels_col; ++c) {
+            int h, w;
             int w_offset = c % ksize;
             int h_offset = (c / ksize) % ksize;
             int c_im = c / ksize / ksize;
@@ -1121,10 +1125,10 @@ void im2col_cpu_custom_align(float* data_im,
     int channels, int height, int width,
     int ksize, int stride, int pad, float* data_col, int bit_align)
 {
-    int c, h, w;
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int channels_col = channels * ksize * ksize;
+    int c;
+    const int height_col = (height + 2 * pad - ksize) / stride + 1;
+    const int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
 
     // optimized version
     if (height_col == height && width_col == width && stride == 1 && pad == 1 && is_fma_avx2())
@@ -1133,6 +1137,7 @@ void im2col_cpu_custom_align(float* data_im,
 
         #pragma omp parallel for
         for (c = 0; c < channels_col; ++c) {
+            int h, w;
             int w_offset = c % ksize;
             int h_offset = (c / ksize) % ksize;
             int c_im = c / ksize / ksize;
@@ -1218,10 +1223,10 @@ void im2col_cpu_custom_bin(float* data_im,
     int channels, int height, int width,
     int ksize, int stride, int pad, float* data_col, int bit_align)
 {
-    int c, h, w;
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int channels_col = channels * ksize * ksize;
+    int c;
+    const int height_col = (height + 2 * pad - ksize) / stride + 1;
+    const int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
 
     // optimized version
     if (height_col == height && width_col == width && stride == 1 && pad == 1 && is_fma_avx2())
@@ -1233,6 +1238,7 @@ void im2col_cpu_custom_bin(float* data_im,
 
         #pragma omp parallel for
         for (c = 0; c < channels_col; ++c) {
+            int h, w;
             int w_offset = c % ksize;
             int h_offset = (c / ksize) % ksize;
             int c_im = c / ksize / ksize;
@@ -1451,8 +1457,8 @@ void forward_maxpool_layer_avx(float *src, float *dst, int *indexes, int size, i
     int pad, int stride, int batch)
 {
 
-    int w_offset = -pad / 2;
-    int h_offset = -pad / 2;
+    const int w_offset = -pad / 2;
+    const int h_offset = -pad / 2;
     int b, k;
 
     for (b = 0; b < batch; ++b) {
@@ -1563,13 +1569,13 @@ void gemm_nn(int M, int N, int K, float ALPHA,
 void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride,
     float *weights, float *input, float *output, float *mean)
 {
-    int out_h = (h + 2 * pad - ksize) / stride + 1;    // output_height=input_height for stride=1 and pad=1
-    int out_w = (w + 2 * pad - ksize) / stride + 1;    // output_width=input_width for stride=1 and pad=1
-    int i, f, j;
+    const int out_h = (h + 2 * pad - ksize) / stride + 1;    // output_height=input_height for stride=1 and pad=1
+    const int out_w = (w + 2 * pad - ksize) / stride + 1;    // output_width=input_width for stride=1 and pad=1
+    //int i, f, j;
 
     int fil;
     // filter index
     #pragma omp parallel for      // "omp parallel for" - automatic parallelization of loop by using OpenMP
     for (fil = 0; fil < n; ++fil) {
         int chan, y, x, f_y, f_x;
         // channel index
@@ -1613,10 +1619,11 @@ void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
     unsigned char *B, int ldb,
     float *C, int ldc, float *mean_arr)
 {
-    int i, j, k, h;
+    int i;
 
     #pragma omp parallel for
     for (i = 0; i < M; ++i) {   // l.n - filters [16 - 55 - 1024]
+        int j, k;
         float mean_val = mean_arr[i];
 
         for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
@@ -1660,16 +1667,17 @@ void im2col_cpu_custom(float* data_im,
     im2col_cpu(data_im, channels, height, width, ksize, stride, pad, data_col);
     return;
 
-    int c, h, w;
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int channels_col = channels * ksize * ksize;
+    int c;
+    const int height_col = (height + 2 * pad - ksize) / stride + 1;
+    const int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
 
     // optimized version
     if (height_col == height && width_col == width && stride == 1 && pad == 1)
     {
         #pragma omp parallel for
         for (c = 0; c < channels_col; ++c) {
+            int h, w;
             int w_offset = c % ksize;
             int h_offset = (c / ksize) % ksize;
             int c_im = c / ksize / ksize;
@@ -1750,10 +1758,10 @@ void im2col_cpu_custom_bin(float* data_im,
     int channels, int height, int width,
     int ksize, int stride, int pad, float* data_col, int bit_align)
 {
-    int c, h, w;
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int channels_col = channels * ksize * ksize;
+    int c;
+    const int height_col = (height + 2 * pad - ksize) / stride + 1;
+    const int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
 
     // optimized version
     if (height_col == height && width_col == width && stride == 1 && pad == 1)
@@ -1762,6 +1770,7 @@ void im2col_cpu_custom_bin(float* data_im,
 
         #pragma omp parallel for
         for (c = 0; c < channels_col; ++c) {
+            int h, w;
             int w_offset = c % ksize;
             int h_offset = (c / ksize) % ksize;
             int c_im = c / ksize / ksize;
@@ -1906,9 +1915,10 @@ void float_to_bit(float *src, unsigned char *dst, size_t size)
 
 static inline void transpose_scalar_block(float *A, float *B, const int lda, const int ldb, const int block_size)
 {
-    int i, j;
+    int i;
     //#pragma omp parallel for
     for (i = 0; i<block_size; i++) {
+        int j;
         for (j = 0; j<block_size; j++) {
             B[j*ldb + i] = A[i*lda + j];
         }
@@ -1938,8 +1948,8 @@ void forward_maxpool_layer_avx(float *src, float *dst, int *indexes, int size, i
     int pad, int stride, int batch)
 {
     int b, k;
-    int w_offset = -pad / 2;
-    int h_offset = -pad / 2;
+    const int w_offset = -pad / 2;
+    const int h_offset = -pad / 2;
 
     for (b = 0; b < batch; ++b) {
         #pragma omp parallel for
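Note on the pattern (not part of the patch itself): the hunks consistently shrink the pre-loop declarations to the parallel index alone (`int i;`, `int c;`, `int fil;`), re-declare the remaining counters (j, k, h, w, ...) inside the `#pragma omp parallel for` bodies, and mark loop-invariant values (out_h, out_w, w_offset, ...) as const. Under OpenMP, the index of the worksharing loop is implicitly private, while variables declared before the parallel region are shared by default, so reusing them as inner-loop counters across threads would be a data race. A minimal sketch of the idiom, using a made-up sum_rows helper rather than anything from gemm.c:

    /* illustrative sketch only -- not from the patch */
    void sum_rows(const float *A, float *row_sum, int M, int K)
    {
        int i;                      // worksharing-loop index: implicitly private
        #pragma omp parallel for
        for (i = 0; i < M; ++i) {
            int k;                  // declared inside the loop body: private to each thread
            float s = 0.f;
            for (k = 0; k < K; ++k) s += A[i*K + k];
            row_sum[i] = s;
        }
    }

Compiled with -fopenmp the pragma parallelizes the outer loop; without it the pragma is ignored and the code remains correct single-threaded.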