diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index 7d4ef8f5..44f62ed3 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -295,8 +295,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) //#ifdef CUDNN_HALF //if (state.use_mixed_precision) { int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); - if (state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in)) - //if(state.index != 0) + if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in)) { //printf("\n CUDNN_HALF!!! state.index = %d \n", state.index); @@ -476,8 +475,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state //#ifdef CUDNN_HALF int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); - if (state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in)) - //if (state.index != 0) + if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in)) { const size_t input16_size = l.batch*l.c*l.w*l.h; diff --git a/src/parser.c b/src/parser.c index c1647666..b01c5458 100644 --- a/src/parser.c +++ b/src/parser.c @@ -732,6 +732,8 @@ network parse_network_cfg_custom(char *filename, int batch) float bflops = 0; size_t workspace_size = 0; + size_t max_inputs = 0; + size_t max_outputs = 0; n = n->next; int count = 0; free_section(s); @@ -806,6 +808,8 @@ network parse_network_cfg_custom(char *filename, int batch) option_unused(options); net.layers[count] = l; if (l.workspace_size > workspace_size) workspace_size = l.workspace_size; + if (l.inputs > max_inputs) max_inputs = l.inputs; + if (l.outputs > max_outputs) max_outputs = l.outputs; free_section(s); n = n->next; ++count; @@ -828,6 +832,14 @@ network parse_network_cfg_custom(char *filename, int batch) net.workspace = cuda_make_array(0, workspace_size/sizeof(float) + 1); int size = get_network_input_size(net) * net.batch; net.input_state_gpu = cuda_make_array(0, size); + + // pre-allocate memory for inference on Tensor Cores (fp16) + if (net.cudnn_half) { + *net.max_input16_size = max_inputs; + check_error(cudaMalloc((void **)net.input16_gpu, *net.max_input16_size * sizeof(short))); //sizeof(half) + *net.max_output16_size = max_outputs; + check_error(cudaMalloc((void **)net.output16_gpu, *net.max_output16_size * sizeof(short))); //sizeof(half) + } }else { net.workspace = calloc(1, workspace_size); }