diff --git a/src/network.c b/src/network.c
index b960f27f..65d7629f 100644
--- a/src/network.c
+++ b/src/network.c
@@ -459,12 +459,13 @@ int resize_network(network *net, int w, int h)
     const int size = get_network_input_size(*net) * net->batch;
 #ifdef GPU
     if(gpu_index >= 0){
-        printf(" try to allocate workspace = %zu * sizeof(float), ", workspace_size / sizeof(float) + 1);
+        printf(" try to allocate additional workspace_size = %1.2f MB \n", (float)workspace_size / 1000000);
         net->workspace = cuda_make_array(0, workspace_size/sizeof(float) + 1);
         net->input_state_gpu = cuda_make_array(0, size);
         if (cudaSuccess == cudaHostAlloc(&net->input_pinned_cpu, size * sizeof(float), cudaHostRegisterMapped)) net->input_pinned_cpu_flag = 1;
         else {
+            cudaGetLastError(); // reset CUDA-error
             net->input_pinned_cpu = calloc(size, sizeof(float));
             net->input_pinned_cpu_flag = 0;
         }
     }
diff --git a/src/parser.c b/src/parser.c
index 90594c92..9ca9a942 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -853,7 +853,10 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps)
     int size = get_network_input_size(net) * net.batch;
     net.input_state_gpu = cuda_make_array(0, size);
     if (cudaSuccess == cudaHostAlloc(&net.input_pinned_cpu, size * sizeof(float), cudaHostRegisterMapped)) net.input_pinned_cpu_flag = 1;
-    else net.input_pinned_cpu = calloc(size, sizeof(float));
+    else {
+        cudaGetLastError(); // reset CUDA-error
+        net.input_pinned_cpu = calloc(size, sizeof(float));
+    }
 
     // pre-allocate memory for inference on Tensor Cores (fp16)
     if (net.cudnn_half) {
@@ -863,7 +866,7 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps)
         check_error(cudaMalloc((void **)net.output16_gpu, *net.max_output16_size * sizeof(short))); //sizeof(half)
     }
     if (workspace_size) {
-        printf(" Allocate workspace_size = %zu \n", workspace_size);
+        printf(" Allocate additional workspace_size = %1.2f MB \n", (float)workspace_size / 1000000);
         net.workspace = cuda_make_array(0, workspace_size / sizeof(float) + 1);
     }
     else {
diff --git a/src/yolo_layer.c b/src/yolo_layer.c
index 74654af1..1268ff5e 100644
--- a/src/yolo_layer.c
+++ b/src/yolo_layer.c
@@ -56,11 +56,17 @@ layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int
 
     free(l.output);
     if (cudaSuccess == cudaHostAlloc(&l.output, batch*l.outputs*sizeof(float), cudaHostRegisterMapped)) l.output_pinned = 1;
-    else l.output = calloc(batch*l.outputs, sizeof(float));
+    else {
+        cudaGetLastError(); // reset CUDA-error
+        l.output = calloc(batch*l.outputs, sizeof(float));
+    }
 
     free(l.delta);
     if (cudaSuccess == cudaHostAlloc(&l.delta, batch*l.outputs*sizeof(float), cudaHostRegisterMapped)) l.delta_pinned = 1;
-    else l.delta = calloc(batch*l.outputs, sizeof(float));
+    else {
+        cudaGetLastError(); // reset CUDA-error
+        l.delta = calloc(batch*l.outputs, sizeof(float));
+    }
 #endif
 
     fprintf(stderr, "yolo\n");
@@ -84,6 +90,7 @@ void resize_yolo_layer(layer *l, int w, int h)
     if (l->output_pinned) {
         cudaFreeHost(l->output);
         if (cudaSuccess != cudaHostAlloc(&l->output, l->batch*l->outputs * sizeof(float), cudaHostRegisterMapped)) {
+            cudaGetLastError(); // reset CUDA-error
             l->output = realloc(l->output, l->batch*l->outputs * sizeof(float));
             l->output_pinned = 0;
         }
@@ -92,6 +99,7 @@ void resize_yolo_layer(layer *l, int w, int h)
     if (l->delta_pinned) {
         cudaFreeHost(l->delta);
         if (cudaSuccess != cudaHostAlloc(&l->delta, l->batch*l->outputs * sizeof(float), cudaHostRegisterMapped)) {
+            cudaGetLastError(); // reset CUDA-error
            l->delta = realloc(l->delta, l->batch*l->outputs * sizeof(float));
            l->delta_pinned = 0;
        }