diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu
index 556efda1..88f97624 100644
--- a/src/convolutional_kernels.cu
+++ b/src/convolutional_kernels.cu
@@ -141,70 +141,39 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
         size_t t_intput_size = new_ldb * n;
         size_t t_bit_input_size = t_intput_size / 8;// +1;
 
-        /*
-        int i = 0;
-        im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, l.align_workspace_gpu, l.bit_align);
-        //cudaDeviceSynchronize();
+        {
+            int i = 0;
+            im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, l.align_workspace_gpu, l.bit_align);
+            //cudaDeviceSynchronize();
 
-        // should be optimized
-        float_to_bit_gpu(l.align_workspace_gpu, (unsigned char *)state.workspace, l.align_workspace_size);
-        //cudaDeviceSynchronize();
+            // should be optimized
+            float_to_bit_gpu(l.align_workspace_gpu, (unsigned char *)state.workspace, l.align_workspace_size);
+            //cudaDeviceSynchronize();
 
-        //im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align);
+            //im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align);
 
-        transpose_bin_gpu((unsigned char *)state.workspace, (unsigned char *)l.transposed_align_workspace_gpu, k, n, l.bit_align, new_ldb, 8);
-        //cudaDeviceSynchronize();
+            transpose_bin_gpu((unsigned char *)state.workspace, (unsigned char *)l.transposed_align_workspace_gpu, k, n, l.bit_align, new_ldb, 8);
+            //cudaDeviceSynchronize();
 
-        // should be optimized
-        gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
-            (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu, new_ldb, l.output_gpu, n, l.mean_arr_gpu);
-        //cudaDeviceSynchronize();
-        //check_error(status);
-        */
+            // should be optimized
+            gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
+                (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu, new_ldb, l.output_gpu, n, l.mean_arr_gpu);
+            //cudaDeviceSynchronize();
+            //check_error(status);
+        }
 
-        {
-            //
-
-            /*
-            float *input_cpu = (float *)calloc(input_size, sizeof(float));
-            status = cudaMemcpy(input_cpu, state.input, input_size* sizeof(float), cudaMemcpyDeviceToHost);
-            check_error(status);
-
-            // swaped(binary_weights <-> l.weights)
-            convolve_cpu(input_cpu, l.weights, l.output, l.w, l.h, l.c, l.n, l.size, l.pad); // CPU
-            status = cudaMemcpy(l.output_gpu, l.output, l.outputs * sizeof(float), cudaMemcpyHostToDevice);
-            check_error(status);
-            free(input_cpu);
-            */
-
-            /*
-            float *input_cpu = (float *)calloc(input_size, sizeof(float));
-            float *input_bin_cpu = (float *)calloc(input_size, sizeof(char));
-            //float *weights_bin_cpu = (float *)calloc(l.n*l.c*l.size*l.size, sizeof(char));
-            status = cudaMemcpy(input_cpu, state.input, input_size * sizeof(float), cudaMemcpyDeviceToHost);
-            check_error(status);
-            float_to_bit(input_cpu, (unsigned char *)input_bin_cpu, input_size);
-            //float_to_bit(l.weights, (unsigned char *)weights_bin_cpu, l.n*l.c*l.size*l.size); // l.align_bit_weights
-
-            convolve_bin_cpu(input_bin_cpu, (float *)l.align_bit_weights, l.output, l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr); // CPU
-            status = cudaMemcpy(l.output_gpu, l.output, l.outputs * sizeof(float), cudaMemcpyHostToDevice);
-            check_error(status);
-            //free(weights_bin_cpu);
-            free(input_bin_cpu);
-            free(input_cpu);
-            */
+        /*
+        {
             float_to_bit_gpu(state.input, (unsigned char *)l.align_workspace_gpu, input_size);
             convolve_bin_gpu(l.align_workspace_gpu, (float *)l.align_bit_weights_gpu, l.output_gpu, l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr_gpu);
 
-            //convolve_gpu(state.input, l.weights_gpu, l.output_gpu, l.w, l.h, l.c, l.n, l.size, l.pad);
 
             //cudaDeviceSynchronize();
             //check_error(status);
-
-        }
+        */
 
         add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
         activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
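The block uncommented above is the binarized forward path: im2col_align_ongpu lays the input out as columns, float_to_bit_gpu packs the signs into bits, transpose_bin_gpu transposes the packed matrix, and gemm_nn_custom_bin_mean_transposed_gpu multiplies it against the pre-packed weights with XNOR plus popcount and rescales by the per-filter mean. A minimal host-side sketch of that dot product, assuming 64-bit packed rows (illustration only, not the GPU kernel itself):

    #include <stdint.h>

    /* Portable stand-in for __popcll / hardware popcount. */
    static int popcount64(uint64_t x) {
        int n = 0;
        while (x) { x &= x - 1; ++n; }
        return n;
    }

    /* a_row, b_col: K bits packed into 64-bit words (sign-binarized weights / activations).
       Returns the dot product in the +/-1 domain, scaled by the filter's mean |w|. */
    static float xnor_dot(const uint64_t *a_row, const uint64_t *b_col, int K, float mean_val)
    {
        int count = 0;
        for (int k = 0; k < K; k += 64) {
            uint64_t c = ~(a_row[k / 64] ^ b_col[k / 64]);   /* XNOR: 1 where the bits agree */
            int matches = popcount64(c);
            if (K - k < 64) matches -= 64 - (K - k);         /* drop padding bits in the last word */
            count += matches;
        }
        return (2 * count - K) * mean_val;                   /* #agree - #disagree, rescaled */
    }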
diff --git a/src/gemm.c b/src/gemm.c
index 123598a5..76315b02 100644
--- a/src/gemm.c
+++ b/src/gemm.c
@@ -204,10 +204,11 @@ void gemm_nn_custom_bin_mean(int M, int N, int K, float ALPHA_UNUSED,
 {
     int *count_arr = calloc(M*N, sizeof(int));
 
-    int i, j, k, h;
+    int i;
 
 #pragma omp parallel for
     for (i = 0; i < M; ++i) {   // l.n - filters [16 - 55 - 1024]
+        int j, k, h;
         for (k = 0; k < K; ++k) {  // l.size*l.size*l.c - one filter size [27 - 9216]
             const char a_bit = get_bit(A, i*lda + k);
             uint64_t a_bit64 = fill_bit_int64(a_bit);
@@ -271,10 +272,11 @@ void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
     unsigned char *B, int ldb,
     float *C, int ldc, float *mean_arr)
 {
-    int i, j, k, h;
+    int i;
 
 #pragma omp parallel for
     for (i = 0; i < M; ++i) {   // l.n - filters [16 - 55 - 1024]
+        int j, k, h;
         float mean_val = mean_arr[i];
 
         for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
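The recurring change in these gemm.c hunks is to move the inner loop indices from function scope into the body of the `#pragma omp parallel for` loop, so each OpenMP thread gets private copies instead of sharing (and racing on) one set of counters. A toy illustration of the same pattern (not darknet code):

    #include <omp.h>

    void scale_rows(float *m, int rows, int cols, float a)
    {
        int i;
        #pragma omp parallel for
        for (i = 0; i < rows; ++i) {
            int j;                          /* declared inside the loop body: private per thread */
            for (j = 0; j < cols; ++j) {
                m[i*cols + j] *= a;
            }
        }
        /* equivalent alternative: keep j at function scope and write
           #pragma omp parallel for private(j)                        */
    }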
@@ -365,7 +367,7 @@ void transpose_bin(char *A, char *B, const int n, const int m,
     const int lda, const int ldb, const int block_size)
 {
     int i;
-#pragma omp parallel for
+    #pragma omp parallel for
     for (i = 0; i < n; i += 8) {
         int j;
         for (j = 0; j < m - 8; j += 8) {
@@ -617,14 +619,14 @@ void gemm_nn(int M, int N, int K, float ALPHA,
 void convolution_2d_old(int w, int h, int ksize, int n, int c, int pad, int stride,
     float *weights, float *input, float *output)
 {
-    int out_h = (h + 2 * pad - ksize) / stride + 1;    // output_height=input_height for stride=1 and pad=1
-    int out_w = (w + 2 * pad - ksize) / stride + 1;    // output_width=input_width for stride=1 and pad=1
-    int i, f, j;
+    const int out_h = (h + 2 * pad - ksize) / stride + 1;    // output_height=input_height for stride=1 and pad=1
+    const int out_w = (w + 2 * pad - ksize) / stride + 1;    // output_width=input_width for stride=1 and pad=1
 
     int fil;
     // filter index
-#pragma omp parallel for      // "omp parallel for" - automatic parallelization of loop by using OpenMP
+    #pragma omp parallel for      // "omp parallel for" - automatic parallelization of loop by using OpenMP
     for (fil = 0; fil < n; ++fil) {
+        //int i, f, j;
         int chan, y, x, f_y, f_x;
         // channel index
         for (chan = 0; chan < c; ++chan)
@@ -665,9 +667,9 @@ void convolution_2d_old(int w, int h, int ksize, int n, int c, int pad, int stri
 
 void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride,
     float *weights, float *input, float *output, float *mean)
 {
-    int out_h = (h + 2 * pad - ksize) / stride + 1;    // output_height=input_height for stride=1 and pad=1
-    int out_w = (w + 2 * pad - ksize) / stride + 1;    // output_width=input_width for stride=1 and pad=1
-    int i, f, j;
+    const int out_h = (h + 2 * pad - ksize) / stride + 1;    // output_height=input_height for stride=1 and pad=1
+    const int out_w = (w + 2 * pad - ksize) / stride + 1;    // output_width=input_width for stride=1 and pad=1
+    int i;
 
 #if defined(_OPENMP)
     static int max_num_threads = 0;
@@ -684,9 +686,9 @@ void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride,
         *((__m256*)&weights[i]) = _mm256_and_ps(*((__m256*)&weights[i]), _mm256_castsi256_ps(all256_sing1));
     }
 
-    for (i = 0; i < w*h*c; i += 8) {
+    //for (i = 0; i < w*h*c; i += 8) {
         //*((__m256*)&input[i]) = _mm256_and_ps(*((__m256*)&input[i]), _mm256_castsi256_ps(all256_sing1));
-    }
+    //}
 
     //__m256i all256_last_zero = _mm256_set1_epi32(0xFFFFFFFF);
@@ -704,7 +706,7 @@ void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride,
 
     int fil;
     // filter index
-#pragma omp parallel for      // "omp parallel for" - automatic parallelization of loop by using OpenMP
+    #pragma omp parallel for      // "omp parallel for" - automatic parallelization of loop by using OpenMP
     for (fil = 0; fil < n; ++fil) {
         int chan, y, x, f_y, f_x;
         float cur_mean = fabs(mean[fil]);
@@ -914,16 +916,17 @@ void im2col_cpu_custom_transpose(float* data_im,
     int channels, int height, int width, int ksize, int stride, int pad,
     float* data_col, int ldb_align)
 {
-    int c, h, w;
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int channels_col = channels * ksize * ksize;
+    const int height_col = (height + 2 * pad - ksize) / stride + 1;
+    const int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
+    int c;
 
     // optimized version
     if (height_col == height && width_col == width && stride == 1 && pad == 1)
     {
-#pragma omp parallel for
+        #pragma omp parallel for
         for (c = 0; c < channels_col; ++c) {
+            int h, w;
             int w_offset = c % ksize;
             int h_offset = (c / ksize) % ksize;
             int c_im = c / ksize / ksize;
@@ -1005,6 +1008,7 @@ void im2col_cpu_custom_transpose(float* data_im,
     else {
         #pragma omp parallel for
         for (c = 0; c < channels_col; ++c) {
+            int h, w;
             int w_offset = c % ksize;
             int h_offset = (c / ksize) % ksize;
             int c_im = c / ksize / ksize;
@@ -1029,17 +1033,17 @@ void im2col_cpu_custom(float* data_im,
     int channels, int height, int width, int ksize, int stride, int pad,
     float* data_col)
 {
-
-    int c, h, w;
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int channels_col = channels * ksize * ksize;
+    int c;
+    const int height_col = (height + 2 * pad - ksize) / stride + 1;
+    const int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
 
     // optimized version
     if (height_col == height && width_col == width && stride == 1 && pad == 1 && is_fma_avx2())
     {
         #pragma omp parallel for
         for (c = 0; c < channels_col; ++c) {
+            int h, w;
             int w_offset = c % ksize;
             int h_offset = (c / ksize) % ksize;
             int c_im = c / ksize / ksize;
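All of these im2col_cpu_custom* fast paths reproduce the same index mapping as plain im2col: column index c encodes (w_offset, h_offset, c_im), and output pixel (h, w) reads input pixel (h*stride + h_offset - pad, w*stride + w_offset - pad), with zeros outside the image. A reference sketch of that mapping (illustration only; darknet's own im2col_cpu is the canonical version):

    void im2col_ref(const float *im, int channels, int height, int width,
                    int ksize, int stride, int pad, float *col)
    {
        const int height_col = (height + 2*pad - ksize) / stride + 1;
        const int width_col  = (width  + 2*pad - ksize) / stride + 1;
        const int channels_col = channels * ksize * ksize;

        for (int c = 0; c < channels_col; ++c) {
            int w_offset = c % ksize;
            int h_offset = (c / ksize) % ksize;
            int c_im = c / ksize / ksize;
            for (int h = 0; h < height_col; ++h) {
                for (int w = 0; w < width_col; ++w) {
                    int im_row = h * stride + h_offset - pad;
                    int im_col = w * stride + w_offset - pad;
                    int col_index = (c * height_col + h) * width_col + w;
                    col[col_index] =
                        (im_row < 0 || im_col < 0 || im_row >= height || im_col >= width)
                            ? 0 : im[(c_im * height + im_row) * width + im_col];
                }
            }
        }
    }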
@@ -1121,10 +1125,10 @@ void im2col_cpu_custom_align(float* data_im,
     int channels, int height, int width, int ksize, int stride, int pad,
     float* data_col, int bit_align)
 {
-    int c, h, w;
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int channels_col = channels * ksize * ksize;
+    int c;
+    const int height_col = (height + 2 * pad - ksize) / stride + 1;
+    const int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
 
     // optimized version
     if (height_col == height && width_col == width && stride == 1 && pad == 1 && is_fma_avx2())
@@ -1133,6 +1137,7 @@ void im2col_cpu_custom_align(float* data_im,
         #pragma omp parallel for
         for (c = 0; c < channels_col; ++c) {
+            int h, w;
             int w_offset = c % ksize;
             int h_offset = (c / ksize) % ksize;
             int c_im = c / ksize / ksize;
@@ -1218,10 +1223,10 @@ void im2col_cpu_custom_bin(float* data_im,
     int channels, int height, int width, int ksize, int stride, int pad,
     float* data_col, int bit_align)
 {
-    int c, h, w;
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int channels_col = channels * ksize * ksize;
+    int c;
+    const int height_col = (height + 2 * pad - ksize) / stride + 1;
+    const int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
 
     // optimized version
     if (height_col == height && width_col == width && stride == 1 && pad == 1 && is_fma_avx2())
@@ -1233,6 +1238,7 @@ void im2col_cpu_custom_bin(float* data_im,
         #pragma omp parallel for
         for (c = 0; c < channels_col; ++c) {
+            int h, w;
             int w_offset = c % ksize;
             int h_offset = (c / ksize) % ksize;
             int c_im = c / ksize / ksize;
@@ -1451,8 +1457,8 @@ void forward_maxpool_layer_avx(float *src, float *dst, int *indexes, int size, i
     int pad, int stride, int batch)
 {
-    int w_offset = -pad / 2;
-    int h_offset = -pad / 2;
+    const int w_offset = -pad / 2;
+    const int h_offset = -pad / 2;
 
     int b, k;
     for (b = 0; b < batch; ++b) {
@@ -1563,13 +1569,13 @@ void gemm_nn(int M, int N, int K, float ALPHA,
 void convolution_2d(int w, int h, int ksize, int n, int c, int pad, int stride,
     float *weights, float *input, float *output, float *mean)
 {
-    int out_h = (h + 2 * pad - ksize) / stride + 1;    // output_height=input_height for stride=1 and pad=1
-    int out_w = (w + 2 * pad - ksize) / stride + 1;    // output_width=input_width for stride=1 and pad=1
-    int i, f, j;
+    const int out_h = (h + 2 * pad - ksize) / stride + 1;    // output_height=input_height for stride=1 and pad=1
+    const int out_w = (w + 2 * pad - ksize) / stride + 1;    // output_width=input_width for stride=1 and pad=1
+    //int i, f, j;
 
     int fil;
     // filter index
-#pragma omp parallel for      // "omp parallel for" - automatic parallelization of loop by using OpenMP
+    #pragma omp parallel for      // "omp parallel for" - automatic parallelization of loop by using OpenMP
     for (fil = 0; fil < n; ++fil) {
         int chan, y, x, f_y, f_x;
         // channel index
@@ -1613,10 +1619,11 @@ void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
     unsigned char *B, int ldb,
     float *C, int ldc, float *mean_arr)
 {
-    int i, j, k, h;
+    int i;
 
-#pragma omp parallel for
+    #pragma omp parallel for
     for (i = 0; i < M; ++i) {   // l.n - filters [16 - 55 - 1024]
+        int j, k;
         float mean_val = mean_arr[i];
 
         for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
@@ -1660,16 +1667,17 @@ void im2col_cpu_custom(float* data_im,
     im2col_cpu(data_im, channels, height, width, ksize, stride, pad, data_col);
     return;
 
-    int c, h, w;
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int channels_col = channels * ksize * ksize;
+    int c;
+    const int height_col = (height + 2 * pad - ksize) / stride + 1;
+    const int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
 
     // optimized version
     if (height_col == height && width_col == width && stride == 1 && pad == 1)
     {
         #pragma omp parallel for
         for (c = 0; c < channels_col; ++c) {
+            int h, w;
             int w_offset = c % ksize;
             int h_offset = (c / ksize) % ksize;
             int c_im = c / ksize / ksize;
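im2col_cpu_custom_bin additionally packs the column matrix down to one bit per value, which is what float_to_bit and get_bit implement. A simplified scalar sketch of that packing, assuming the bit order used here (bit i%8 of byte i/8, set when the float is positive); the real functions use vectorized fast paths:

    #include <stdint.h>
    #include <string.h>

    void float_to_bit_ref(const float *src, unsigned char *dst, size_t size)
    {
        memset(dst, 0, (size + 7) / 8);
        for (size_t i = 0; i < size; ++i) {
            if (src[i] > 0) dst[i / 8] |= (unsigned char)(1u << (i % 8));   /* sign -> one bit */
        }
    }

    static unsigned char get_bit_ref(const unsigned char *src, size_t index)
    {
        return (src[index / 8] >> (index % 8)) & 1;                         /* read it back */
    }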
@@ -1750,10 +1758,10 @@ void im2col_cpu_custom_bin(float* data_im,
     int channels, int height, int width, int ksize, int stride, int pad,
     float* data_col, int bit_align)
 {
-    int c, h, w;
-    int height_col = (height + 2 * pad - ksize) / stride + 1;
-    int width_col = (width + 2 * pad - ksize) / stride + 1;
-    int channels_col = channels * ksize * ksize;
+    int c;
+    const int height_col = (height + 2 * pad - ksize) / stride + 1;
+    const int width_col = (width + 2 * pad - ksize) / stride + 1;
+    const int channels_col = channels * ksize * ksize;
 
     // optimized version
     if (height_col == height && width_col == width && stride == 1 && pad == 1)
@@ -1762,6 +1770,7 @@ void im2col_cpu_custom_bin(float* data_im,
         #pragma omp parallel for
         for (c = 0; c < channels_col; ++c) {
+            int h, w;
             int w_offset = c % ksize;
             int h_offset = (c / ksize) % ksize;
             int c_im = c / ksize / ksize;
@@ -1906,9 +1915,10 @@ void float_to_bit(float *src, unsigned char *dst, size_t size)
 static inline void transpose_scalar_block(float *A, float *B, const int lda, const int ldb, const int block_size)
 {
-    int i, j;
+    int i;
     //#pragma omp parallel for
     for (i = 0; i < block_size; i++) {
+        int j;
         for (j = 0; j < block_size; j++) {
             B[j*ldb + i] = A[i*lda + j];
         }
diff --git a/src/im2col_kernels.cu b/src/im2col_kernels.cu
--- a/src/im2col_kernels.cu
+++ b/src/im2col_kernels.cu
 extern "C" {
 #include "im2col.h"
@@ -70,6 +71,8 @@ __global__ void im2col_align_gpu_kernel(const int n, const float* data_im,
     const int height_col, const int width_col,
     float *data_col, const int bit_align)
 {
+    //__shared__ float tmp_s[1];
+
     int index = blockIdx.x*blockDim.x + threadIdx.x;
     for (; index < n; index += blockDim.x*gridDim.x) {
         int w_out = index % width_col;
@@ -90,9 +93,15 @@ __global__ void im2col_align_gpu_kernel(const int n, const float* data_im,
                 int h = h_in + i;
                 int w = w_in + j;
 
-                *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
+                float val = (h >= 0 && w >= 0 && h < height && w < width) ?
                     data_im_ptr[i * width + j] : 0;
 
+                *data_col_ptr = val;
+                //tmp_s[0] = val;
+
+                //*data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
+                //    data_im_ptr[i * width + j] : 0;
+
                 //float src_val = (h >= 0 && w >= 0 && h < height && w < width) ? data_im_ptr[i * width + j] : 0;
                 //unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0);
                 //if (threadIdx.x % WARP_SIZE == 0) *((unsigned int*)data_col_ptr_32) = bit_mask;
@@ -204,6 +213,10 @@ __device__ __host__ void transpose8rS32_reversed_diagonale(unsigned char* A, int
     B[7 * n] = reverse_byte(x >> 24);  B[6 * n] = reverse_byte(x >> 16);  B[5 * n] = reverse_byte(x >> 8);  B[4 * n] = reverse_byte(x);
     B[3 * n] = reverse_byte(y >> 24);  B[2 * n] = reverse_byte(y >> 16);  B[1 * n] = reverse_byte(y >> 8);  B[0 * n] = reverse_byte(y);
+
+    //__device__ unsigned int __brev(unsigned int x)
+    //Reverse the bit order of a 32 bit unsigned integer.
+    // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html
 }
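The comment added above points at the __brev() intrinsic as a possible replacement for the byte-wise reverse_byte() calls. A hedged sketch of what that could look like (not what the code currently does): __brev reverses all 32 bits of a word, so the mirrored low byte ends up in the top byte.

    __device__ static unsigned char reverse_byte_brev(unsigned char a)
    {
        /* bit-reverse the 32-bit word, then shift the reversed low byte back down */
        return (unsigned char)(__brev((unsigned int)a) >> 24);
    }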
@@ -257,10 +270,10 @@ void fill_int8_gpu(unsigned char *src, unsigned char val, size_t size) {
 }
 // --------------------------------
 
-typedef unsigned long long int uint64_t;
-typedef unsigned int uint32_t;
-typedef unsigned char uint8_t;
-typedef char int8_t;
+//typedef unsigned long long int uint64_t;
+//typedef unsigned int uint32_t;
+//typedef unsigned char uint8_t;
+//typedef char int8_t;
 
 __device__ __host__ static inline uint64_t broadcast_bit_1_to_64(uint8_t src) {
     return (src > 0) ? 0xFFFFFFFFFFFFFFFF : 0;
@@ -274,6 +287,29 @@ __device__ __host__ static inline uint64_t xnor_int64(uint64_t a, uint64_t b) {
     return ~(a^b);
 }
 
+__device__ __host__ static inline uint4 xnor_int128(uint4 a, uint4 b) {
+    uint4 res;
+    res.w = ~(a.w^b.w);
+    res.x = ~(a.x^b.x);
+    res.y = ~(a.y^b.y);
+    res.z = ~(a.z^b.z);
+    return res;
+}
+
+__device__ __host__ static inline ulonglong4 xnor_int256(ulonglong4 a, ulonglong4 b) {
+    ulonglong4 res;
+    res.w = ~(a.w^b.w);
+    res.x = ~(a.x^b.x);
+    res.y = ~(a.y^b.y);
+    res.z = ~(a.z^b.z);
+    return res;
+}
+
+
+__device__ static inline int popcnt_256(ulonglong4 a) {
+    return __popcll(a.w) + __popcll(a.x) + __popcll(a.y) + __popcll(a.z);
+}
+
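xnor_int256 and popcnt_256 let the rewritten kernel below process 256 binary multiply-accumulates per step: one ulonglong4 load per operand, a componentwise XNOR, and four __popcll calls. A small device-side sketch of that inner step using the helpers above (illustration only; the real kernel adds shared-memory staging and the correction for K not being a multiple of 256):

    __device__ static inline int xnor_count_256(const unsigned char *a_bits,
                                                const unsigned char *b_bits, int k)
    {
        ulonglong4 a = *((const ulonglong4 *)(a_bits + k / 8));   /* 256 weight bits */
        ulonglong4 b = *((const ulonglong4 *)(b_bits + k / 8));   /* 256 input bits  */
        return popcnt_256(xnor_int256(a, b));                     /* matching-bit count */
    }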
 /*
 __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
     unsigned char *A, int lda,
     unsigned char *B, int ldb,
     float *C, int ldc, float *mean_arr)
 {
@@ -320,75 +356,87 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
 */
 
-/*
+
 // B (input) in the shared_memory
 __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
     unsigned char *A, int lda,
     unsigned char *B, int ldb,
     float *C, int ldc, float *mean_arr)
 {
-    int index = blockIdx.x*blockDim.x + threadIdx.x;
-
-    __shared__ uint64_t B_s[4096];  // 32 KB // [ldb x N`]
+    __shared__ uint64_t B_s[4096];  // 32 KB // [ldb x N`] // max = 262 144 bits
 
     int start_j = blockIdx.x*blockDim.x / M;
     int end_j = (blockIdx.x*blockDim.x + blockDim.x) / M + 1;
 
     size_t shared_size = ldb * (end_j - start_j);
 
-    int j_cur = index / M;
-    int local_j = j_cur - start_j;
+    //float tmp_shared_size = ldb * (blockDim.x / M);
+    //int passes = (4096 * 64) / tmp_shared_size - 1;
+    //size_t shared_size = tmp_shared_size * passes;
 
-    for (int k = threadIdx.x * 64; k < shared_size; k += blockDim.x * 64) {
+    int k;
+    for (int k = threadIdx.x * 256; k < shared_size; k += blockDim.x * 256) {
         int x = start_j*ldb + k;
-        if (x < (N*ldb)) *((uint64_t *)(B_s + k / 8)) = *((uint64_t *)(B + x / 8));
+        if (x < (N*ldb)) *((ulonglong4 *)(B_s + k / 8)) = *((ulonglong4 *)(B + x / 8));
     }
 
-    //if (j_cur < N && (index % M == 0 || threadIdx.x == 0)) {
-    //    for (int k = 0; k < K; k += 64) {   // l.size*l.size*l.c - one filter size [27 - 9216]
-    //        *((uint64_t *)(B_s + (local_j*ldb + k) / 8)) = *((uint64_t *)(B + (j_cur*ldb + k) / 8));    // input
-    //    }
-    //}
+    ////if (j_cur < N && (index % M == 0 || threadIdx.x == 0)) {
+    ////    for (int k = 0; k < K; k += 64) {   // l.size*l.size*l.c - one filter size [27 - 9216]
+    ////        *((uint64_t *)(B_s + (local_j*ldb + k) / 8)) = *((uint64_t *)(B + (j_cur*ldb + k) / 8));    // input
+    ////    }
+    ////}
     __syncthreads();
 
+    int index = blockIdx.x*blockDim.x + threadIdx.x;
+
+    //if (index == 0)
     //for(int in_tmp = threadIdx.x; in_tmp < 1*blockDim.x; in_tmp += blockDim.x)
     {
         //int index = blockIdx.x*blockDim.x*1 + in_tmp;
 
-    int i, j, k, h;
+        int j_cur = index / M;
+        int local_j = j_cur - start_j;
+
+        int i, j, h;
     //#pragma omp parallel for
     //for (i = 0; i < M; ++i)
     i = index % M;
     //if(i < M)
     {   // l.n - filters [16 - 55 - 1024]
+        // further improvements: for (l.n == 1024) iterate several (j)
         float mean_val = mean_arr[i];
 
         //for (j = 0; j < N; ++j)
         j = index / M;
         if (j < N)
         { // out_h*out_w - one channel output size [169 - 173056]
+            const int bit_step = 256;
             int count = 0;
 
-            for (k = 0; k < K; k += 64) {   // l.size*l.size*l.c - one filter size [27 - 9216]
-                uint64_t a_bit64 = *((uint64_t *)(A + (i*lda + k) / 8));    // weights
-                //uint64_t b_bit64 = *((uint64_t *)(B + (j*ldb + k) / 8));
-                uint64_t b_bit64 = *((uint64_t *)(B_s + (local_j*ldb + k) / 8));    // input
-                uint64_t c_bit64 = xnor_int64(a_bit64, b_bit64);
-
-                int tmp_count = __popcll(c_bit64);
-
-                if (K - k < 64) tmp_count = tmp_count - (64 - (K - k));    // remove extra bits
-                count += tmp_count;
-                //binary_int64_printf(c_bit64);
-                //printf(", count = %d \n\n", tmp_count);
+            int k = 0;
+            for (k = 0; k < K; k += bit_step) {   // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
+                ulonglong4 a_bit256 = *((ulonglong4 *)(A + (i*lda + k) / 8));    // weights
+                //ulonglong4 b_bit256 = *((ulonglong4 *)(B + (j*ldb + k) / 8));
+                ulonglong4 b_bit256 = *((ulonglong4 *)(B_s + (local_j*ldb + k) / 8));    // input
+                ulonglong4 c_bit256 = xnor_int256(a_bit256, b_bit256);
+
+                count += __popcll(c_bit256.w) + __popcll(c_bit256.x) +
+                    __popcll(c_bit256.y) + __popcll(c_bit256.z);
             }
 
+            int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
+            count = count - f1;    // remove extra bits (from empty space for align only)
+
             C[i*ldc + j] = (2 * count - K) * mean_val;
+            //B_s[0] = (2 * count - K) * mean_val;
         }
     }
     }
 }
-*/
+/*
 // A (weights) in the shared_memory
 __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
     unsigned char *A, int lda,
     unsigned char *B, int ldb,
     float *C, int ldc, float *mean_arr)
 {
@@ -447,9 +495,11 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
         }
     }
 }
+*/
 
 #include
 
+
 void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
     unsigned char *A, int lda,
     unsigned char *B, int ldb,
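The uncommented "B (input) in the shared_memory" kernel above relies on a standard staging pattern: every thread of a block copies a strided slice of the tile it will need into __shared__ memory, then a single __syncthreads() publishes the tile to the whole block. A generic sketch of that pattern with an assumed tile size (the real kernel derives its slice from blockIdx.x, blockDim.x and M):

    #define TILE_BYTES (4096 * 8)   /* assumed: matches a 32 KB __shared__ buffer */

    __global__ void stage_tile_sketch(const unsigned char *B, unsigned char *out,
                                      size_t tile_offset, size_t tile_bytes)
    {
        __shared__ unsigned char B_s[TILE_BYTES];

        for (size_t s = threadIdx.x; s < tile_bytes && s < TILE_BYTES; s += blockDim.x) {
            B_s[s] = B[tile_offset + s];          /* cooperative copy, one stride per thread */
        }
        __syncthreads();                          /* tile now visible to every thread */

        if (threadIdx.x == 0) out[blockIdx.x] = B_s[0];   /* placeholder use of the tile */
    }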
@@ -742,7 +792,7 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out
     int index2 = index / in_w;
     y = index2 % in_h;
     fil = index2 / in_h;
-    if (fil < n)    // (1-6 for one BLOCK)
+    //if (fil < n)    // (1-6 for one BLOCK)
     {
         //float mean_val = mean_arr_gpu[fil];
         int const output_index = fil*in_w*in_h + y*in_w + x;
@@ -772,48 +822,76 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out
             int const weights_pre_index = fil*new_lda + chan*size*size;
             int const input_pre_index = chan*in_w*in_h;
 
-            __shared__ uint32_t input_shared[416*416/32];   // 21.2 KB bytes
-            const int input_shared_size = in_w*in_h / 32;
+            __shared__ uint32_t input_shared[416*416/32 + 1];   // 21.2 KB bytes (for input size 832x832)
+            const int input_shared_size = in_w*in_h / 32 + 1;
             const int add_input_index = input_pre_index % 32;
+            __syncthreads();    // why??? but is required
 
             for (int s = threadIdx.x; s < input_shared_size; s += blockDim.x) {
-                input_shared[s] = ((uint32_t *)weights)[input_pre_index / 32 + s];
+                input_shared[s] = ((uint32_t *)input)[input_pre_index / 32 + s];
             }
             __syncthreads();
 
-            // filter - y
-            for (f_y = 0; f_y < size; ++f_y)
-            {
-                int input_y = y + f_y - pad;
-                // filter - x
-                for (f_x = 0; f_x < size; ++f_x)
-                {
-                    int input_x = x + f_x - pad;
-                    if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue;
-
-                    int input_index = input_pre_index + input_y*in_w + input_x;
-                    int weights_index = weights_pre_index + f_y*size + f_x;
-                    //int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x;
-                    //int weights_index = fil*new_lda + chan*size*size + f_y*size + f_x;
-
-                    uint8_t in_bit = get_bit((uint8_t *)input, input_index);
-                    //uint8_t w_bit = get_bit((uint8_t *)weights, weights_index);
+            /*
+            __shared__ uint8_t input_shared[208 * 208 / 8 + 1];   // 5.4 KB bytes (for input size 416x416)
+            const int input_shared_size = in_w*in_h / 8 + 1;
+            const int add_input_index = input_pre_index % 8;
+            __syncthreads();
 
-                    //int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x;
-                    int weights_shared_index = (fil - min_fil)*new_lda + chan*size*size + f_y*size + f_x;
-                    //uint8_t in_bit = get_bit((uint8_t *)weights_shared, weights_shared_index);
-                    uint8_t w_bit = get_bit((uint8_t *)weights_shared, weights_shared_index);
+            for (int s = threadIdx.x; s < input_shared_size; s += blockDim.x) {
+                ((uint8_t *)input_shared)[s] = ((uint8_t *)input)[input_pre_index / 8 + s];
+            }
+            __syncthreads();
+            */
+            int src_index = -1;
+            uint32_t input_byte;
 
-                    //int input_index = input_pre_index + input_y*in_w + input_x;
-                    //int input_shared_index = /*input_pre_index +*/ input_y*in_w + input_x + add_input_index;
-                    //uint8_t in_bit = get_bit((uint8_t *)input_shared, input_shared_index);
+            if (fil < n)    // (1-6 for one BLOCK)
+            {
+                // filter - y
+                for (f_y = 0; f_y < size; ++f_y)
+                {
+                    int input_y = y + f_y - pad;
+                    // filter - x
+                    for (f_x = 0; f_x < size; ++f_x)
+                    {
+                        int input_x = x + f_x - pad;
+                        if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue;
+
+                        int input_index = input_pre_index + input_y*in_w + input_x;
+                        int weights_index = weights_pre_index + f_y*size + f_x;
+                        //int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x;
+                        //int weights_index = fil*new_lda + chan*size*size + f_y*size + f_x;
+
+                        //uint8_t in_bit = get_bit((uint8_t *)input, input_index);
+                        //uint8_t w_bit = get_bit((uint8_t *)weights, weights_index);
+
+                        //int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x;
+                        int weights_shared_index = (fil - min_fil)*new_lda + chan*size*size + f_y*size + f_x;
+                        //uint8_t in_bit = get_bit((uint8_t *)weights_shared, weights_shared_index);
+                        uint8_t w_bit = get_bit((uint8_t *)weights_shared, weights_shared_index);
+
+                        //int input_index = input_pre_index + input_y*in_w + input_x;
+                        int input_shared_index = /*input_pre_index +*/ input_y*in_w + input_x + add_input_index;
+                        uint8_t in_bit = get_bit((uint8_t *)input_shared, input_shared_index);
+                        /*
+                        int new_src_index = input_shared_index / 32;
+                        int src_shift = input_shared_index % 32;
+                        //if (new_src_index != src_index)
+                        {
+                            src_index = new_src_index;
+                            input_byte = ((uint32_t *)input_shared)[src_index];
+                        }
+                        uint8_t in_bit = (input_byte & (1 << src_shift)) >> src_shift;
+                        */
 
-                    int res = xnor_bit1(in_bit, w_bit);
-                    sum += res;
-                    good_val++;
+                        int res = xnor_bit1(in_bit, w_bit);
+                        sum += res;
+                        good_val++;
-                    //sum += input[input_index] *weights[weights_index];
+                        //sum += input[input_index] *weights[weights_index];
+                    }
                 }
             }
             // l.output[filters][width][height] +=
@@ -822,7 +900,8 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out
             //output[output_index] += sum;
         }
         sum = sum - (good_val - sum);
-        output[output_index] = sum * mean_arr_gpu[fil]; // atoimcAdd for inter-BLOCK sum
+        //output[output_index] = sum * mean_arr_gpu[fil]; // atoimcAdd for inter-BLOCK sum
+        atomicAdd(&output[output_index], sum * mean_arr_gpu[fil]);
     }
 }
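The last change replaces a direct store of the block's partial result with atomicAdd, so several blocks (or filter groups) can accumulate into the same output element without a race; this assumes the output buffer is zero-filled before the kernel launch. A minimal illustration of the same accumulation pattern (hypothetical kernel, not part of the patch):

    __global__ void accumulate_sketch(float *output, const float *partial, int n, int out_n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) atomicAdd(&output[i % out_n], partial[i]);   /* many threads may hit one slot safely */
    }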