diff --git a/Makefile b/Makefile index 3f10ed35..99781ea9 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ OPENMP=0 LIBSO=0 # set GPU=1 and CUDNN=1 to speedup on GPU -# set CUDNN_HALF=1 to further speedup 3 x times (Mixed-precision using Tensor Cores) on GPU Tesla V100, Titan V, DGX-2 +# set CUDNN_HALF=1 to further speed up ~3x (mixed precision on Tensor Cores) on GPUs: Volta, Xavier, Turing and higher # set AVX=1 and OPENMP=1 to speedup on CPU (if error occurs then set AVX=0) DEBUG=0 diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index 04d3820f..166d5c17 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -138,8 +138,8 @@ void fast_binarize_weights_gpu(float *weights, int n, int size, float *binary, f __global__ void cuda_f32_to_f16(float* input_f32, size_t size, half *output_f16) { int idx = blockIdx.x * blockDim.x + threadIdx.x; - //if (idx < size) output_f16[idx] = __float2half(input_f32[idx]); - if (idx < size) output_f16[idx] = __float2half_rn(input_f32[idx]); + if (idx < size) output_f16[idx] = __float2half(input_f32[idx]); + //if (idx < size) output_f16[idx] = __float2half_rn(input_f32[idx]); // can't be compiled on Linux without casting // __float2half_ru, __float2half_rd, __float2half_rz, __float2half_rn //if (idx < size) *((unsigned short *)output_f16 + idx) = __float2half(input_f32[idx]); }