diff --git a/Makefile b/Makefile index 3f10ed35..99781ea9 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ OPENMP=0 LIBSO=0 # set GPU=1 and CUDNN=1 to speedup on GPU -# set CUDNN_HALF=1 to further speedup 3 x times (Mixed-precision using Tensor Cores) on GPU Tesla V100, Titan V, DGX-2 +# set CUDNN_HALF=1 to further speed up ~3x (mixed precision on Tensor Cores) on GPUs: Volta, Xavier, Turing and higher # set AVX=1 and OPENMP=1 to speedup on CPU (if error occurs then set AVX=0) DEBUG=0 diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index 04d3820f..166d5c17 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -138,8 +138,8 @@ void fast_binarize_weights_gpu(float *weights, int n, int size, float *binary, f __global__ void cuda_f32_to_f16(float* input_f32, size_t size, half *output_f16) { int idx = blockIdx.x * blockDim.x + threadIdx.x; - //if (idx < size) output_f16[idx] = __float2half(input_f32[idx]); - if (idx < size) output_f16[idx] = __float2half_rn(input_f32[idx]); + if (idx < size) output_f16[idx] = __float2half(input_f32[idx]); + //if (idx < size) output_f16[idx] = __float2half_rn(input_f32[idx]); // can't be compiled on Linux without casting // __float2half_ru, __float2half_rd, __float2half_rz, __float2half_rn //if (idx < size) *((unsigned short *)output_f16 + idx) = __float2half(input_f32[idx]); }