Compile fix

7 years ago · 742bb7c7ce
parent 7c2f302321
commit 742bb7c7ce
2 changed files with 3 additions and 3 deletions
--- a/2
+++ b/2
@@ -7,7 +7,7 @@ OPENMP=0
 LIBSO=0

 # set GPU=1 and CUDNN=1 to speedup on GPU
-# set CUDNN_HALF=1 to further speedup 3 x times (Mixed-precision using Tensor Cores) on GPU Tesla V100, Titan V, DGX-2
+# set CUDNN_HALF=1 to further speedup 3 x times (Mixed-precision on Tensor Cores) GPU: Volta, Xavier, Turing and higher
 # set AVX=1 and OPENMP=1 to speedup on CPU (if error occurs then set AVX=0)

 DEBUG=0
--- a/src/convolutional_kernels.cu
+++ b/src/convolutional_kernels.cu
@@ -138,8 +138,8 @@ void fast_binarize_weights_gpu(float *weights, int n, int size, float *binary, f
 __global__ void cuda_f32_to_f16(float* input_f32, size_t size, half *output_f16)
 {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    //if (idx < size) output_f16[idx] = __float2half(input_f32[idx]);
-    if (idx < size) output_f16[idx] = __float2half_rn(input_f32[idx]);
+    if (idx < size) output_f16[idx] = __float2half(input_f32[idx]);
+    //if (idx < size) output_f16[idx] = __float2half_rn(input_f32[idx]); // can't be compiled on Linux without casting
    // __float2half_ru, __float2half_rd, __float2half_rz, __float2half_rn
    //if (idx < size) *((unsigned short *)output_f16 + idx) = __float2half(input_f32[idx]);
 }