Added cudaGetLastError() after a failed cudaHostAlloc() to reset the last CUDA error

6 years ago · c71354ab2e
parent 381f90ebb8
commit c71354ab2e
3 changed files with 17 additions and 5 deletions
--- a/src/network.c
+++ b/src/network.c
@@ -459,12 +459,13 @@ int resize_network(network *net, int w, int h)
    const int size = get_network_input_size(*net) * net->batch;
 #ifdef GPU
    if(gpu_index >= 0){
-        printf(" try to allocate workspace = %zu * sizeof(float), ", workspace_size / sizeof(float) + 1);
+        printf(" try to allocate additional workspace_size = %1.2f MB \n", (float)workspace_size / 1000000);
        net->workspace = cuda_make_array(0, workspace_size/sizeof(float) + 1);
        net->input_state_gpu = cuda_make_array(0, size);
        if (cudaSuccess == cudaHostAlloc(&net->input_pinned_cpu, size * sizeof(float), cudaHostRegisterMapped))
            net->input_pinned_cpu_flag = 1;
        else {
+            cudaGetLastError(); // reset CUDA-error
            net->input_pinned_cpu = calloc(size, sizeof(float));
            net->input_pinned_cpu_flag = 0;
        }
--- a/src/parser.c
+++ b/src/parser.c
@@ -853,7 +853,10 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps)
        int size = get_network_input_size(net) * net.batch;
        net.input_state_gpu = cuda_make_array(0, size);
        if (cudaSuccess == cudaHostAlloc(&net.input_pinned_cpu, size * sizeof(float), cudaHostRegisterMapped)) net.input_pinned_cpu_flag = 1;
-        else net.input_pinned_cpu = calloc(size, sizeof(float));
+        else {
+            cudaGetLastError(); // reset CUDA-error
+            net.input_pinned_cpu = calloc(size, sizeof(float));
+        }

        // pre-allocate memory for inference on Tensor Cores (fp16)
        if (net.cudnn_half) {
@@ -863,7 +866,7 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps)
            check_error(cudaMalloc((void **)net.output16_gpu, *net.max_output16_size * sizeof(short))); //sizeof(half)
        }
        if (workspace_size) {
-            printf(" Allocate workspace_size = %zu \n", workspace_size);
+            printf(" Allocate additional workspace_size = %1.2f MB \n", (float)workspace_size / 1000000);
            net.workspace = cuda_make_array(0, workspace_size / sizeof(float) + 1);
        }
        else {
--- a/src/yolo_layer.c
+++ b/src/yolo_layer.c
@@ -56,11 +56,17 @@ layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int

    free(l.output);
    if (cudaSuccess == cudaHostAlloc(&l.output, batch*l.outputs*sizeof(float), cudaHostRegisterMapped)) l.output_pinned = 1;
-    else l.output = calloc(batch*l.outputs, sizeof(float));
+    else {
+        cudaGetLastError(); // reset CUDA-error
+        l.output = calloc(batch*l.outputs, sizeof(float));
+    }

    free(l.delta);
    if (cudaSuccess == cudaHostAlloc(&l.delta, batch*l.outputs*sizeof(float), cudaHostRegisterMapped)) l.delta_pinned = 1;
-    else l.delta = calloc(batch*l.outputs, sizeof(float));
+    else {
+        cudaGetLastError(); // reset CUDA-error
+        l.delta = calloc(batch*l.outputs, sizeof(float));
+    }
 #endif

    fprintf(stderr, "yolo\n");
@@ -84,6 +90,7 @@ void resize_yolo_layer(layer *l, int w, int h)
    if (l->output_pinned) {
        cudaFreeHost(l->output);
        if (cudaSuccess != cudaHostAlloc(&l->output, l->batch*l->outputs * sizeof(float), cudaHostRegisterMapped)) {
+            cudaGetLastError(); // reset CUDA-error
            l->output = realloc(l->output, l->batch*l->outputs * sizeof(float));
            l->output_pinned = 0;
        }
@@ -92,6 +99,7 @@ void resize_yolo_layer(layer *l, int w, int h)
    if (l->delta_pinned) {
        cudaFreeHost(l->delta);
        if (cudaSuccess != cudaHostAlloc(&l->delta, l->batch*l->outputs * sizeof(float), cudaHostRegisterMapped)) {
+            cudaGetLastError(); // reset CUDA-error
            l->delta = realloc(l->delta, l->batch*l->outputs * sizeof(float));
            l->delta_pinned = 0;
        }