Optimized memory consumption during training with the -map flag

pull/2111/head
AlexeyAB 6 years ago
parent b87080882a
commit 1b20072096
1 changed file: src/detector.c (117 changed lines)

@@ -44,6 +44,36 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
char *valid_images = option_find_str(options, "valid", train_images);
char *backup_directory = option_find_str(options, "backup", "/backup/");
int train_images_num = 0;
network net_map;
if (calc_map) {
FILE* valid_file = fopen(valid_images, "r");
if (!valid_file) {
printf("\n Error: There is no %s file for mAP calculation!\n Don't use -map flag.\n Or set valid=%s in your %s file. \n", valid_images, train_images, datacfg);
getchar();
exit(-1);
}
else fclose(valid_file);
list *plist = get_paths(train_images);
train_images_num = plist->size;
free_list(plist);
cuda_set_device(gpus[0]);
printf(" Prepare additional network for mAP calculation...\n");
net_map = parse_network_cfg_custom(cfgfile, 1);
int k; // free the unneeded per-layer arrays right away
for (k = 0; k < net_map.n; ++k) {
free_layer(net_map.layers[k]);
}
#ifdef GPU
cuda_free(net_map.workspace);
if (*net_map.input16_gpu) cuda_free(*net_map.input16_gpu);
if (*net_map.output16_gpu) cuda_free(*net_map.output16_gpu);
#else
free(net_map.workspace);
#endif
}
srand(time(0));
char *base = basecfg(cfgfile);
printf("%s\n", base);
@@ -68,44 +98,6 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
srand(time(0));
network net = nets[0];
int train_images_num = 0;
network net_map;
if (calc_map) {
FILE* valid_file = fopen(valid_images, "r");
if (!valid_file) {
printf("\n Error: There is no %s file for mAP calculation!\n Don't use -map flag.\n Or set valid=%s in your %s file. \n", valid_images, train_images, datacfg);
getchar();
exit(-1);
}
else fclose(valid_file);
list *plist = get_paths(train_images);
train_images_num = plist->size;
free_list(plist);
cuda_set_device(net.gpu_index);
printf(" Prepare additional network for mAP calculation...\n");
net_map = parse_network_cfg_custom(cfgfile, 1);
int k;
for (k = 0; k < net.n; ++k) {
layer l = net.layers[k];
if (l.type == CONVOLUTIONAL) {
net_map.layers[k].biases = l.biases;
net_map.layers[k].scales = l.scales;
net_map.layers[k].rolling_mean = l.rolling_mean;
net_map.layers[k].rolling_variance = l.rolling_variance;
net_map.layers[k].weights = l.weights;
#ifdef GPU
net_map.layers[k].biases_gpu = l.biases_gpu;
net_map.layers[k].scales_gpu = l.scales_gpu;
net_map.layers[k].rolling_mean_gpu = l.rolling_mean_gpu;
net_map.layers[k].rolling_variance_gpu = l.rolling_variance_gpu;
net_map.layers[k].weights_gpu = l.weights_gpu;
net_map.layers[k].weights_gpu16 = l.weights_gpu16;
#endif // GPU
}
}
}
const int actual_batch_size = net.batch * net.subdivisions;
if (actual_batch_size == 1) {
printf("\n Error: You set incorrect value batch=1 for Training! You should set batch=64 subdivision=64 \n");
@@ -159,7 +151,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
#ifdef OPENCV
args.threads = 3 * ngpus; // Amazon EC2 Tesla V100: p3.2xlarge (8 logical cores) - p3.16xlarge
//args.threads = 12 * ngpus; // Ryzen 7 2700X (16 logical cores)
//args.threads = 12 * ngpus; // Ryzen 7 2700X (16 logical cores)
IplImage* img = NULL;
float max_img_loss = 5;
int number_of_lines = 100;
@@ -255,8 +247,50 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
int draw_precision = 0;
int calc_map_for_each = 4 * train_images_num / (net.batch * net.subdivisions);
if (calc_map && (i >= (iter_map + calc_map_for_each) || i == net.max_batches) && i >= net.burn_in && i >= 1000) {
if (l.random) {
printf("Resizing to initial size: %d x %d \n", init_w, init_h);
args.w = init_w;
args.h = init_h;
pthread_join(load_thread, 0);
train = buffer;
load_thread = load_data(args);
int k;
for (k = 0; k < ngpus; ++k) {
resize_network(nets + k, init_w, init_h);
}
net = nets[0];
}
// combine Training and Validation networks
network net_combined = make_network(net.n);
layer *old_layers = net_combined.layers;
net_combined = net;
net_combined.layers = old_layers;
net_combined.batch = 1;
int k;
for (k = 0; k < net.n; ++k) {
layer *l = &(net.layers[k]);
net_combined.layers[k] = net.layers[k];
net_combined.layers[k].batch = 1;
if (l->type == CONVOLUTIONAL) {
#ifdef CUDNN
net_combined.layers[k].normTensorDesc = net_map.layers[k].normTensorDesc;
net_combined.layers[k].normDstTensorDesc = net_map.layers[k].normDstTensorDesc;
net_combined.layers[k].normDstTensorDescF16 = net_map.layers[k].normDstTensorDescF16;
net_combined.layers[k].srcTensorDesc = net_map.layers[k].srcTensorDesc;
net_combined.layers[k].dstTensorDesc = net_map.layers[k].dstTensorDesc;
net_combined.layers[k].srcTensorDesc16 = net_map.layers[k].srcTensorDesc16;
net_combined.layers[k].dstTensorDesc16 = net_map.layers[k].dstTensorDesc16;
#endif // CUDNN
}
}
iter_map = i;
mean_average_precision = validate_detector_map(datacfg, cfgfile, weightfile, 0.25, 0.5, &net_map);
mean_average_precision = validate_detector_map(datacfg, cfgfile, weightfile, 0.25, 0.5, &net_combined);
printf("\n mean_average_precision = %f \n", mean_average_precision);
draw_precision = 1;
}
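Here the training network is turned into a batch-1 evaluation view without duplicating its parameters: if random resizing is active, the networks are first resized back to the initial dimensions, then net_combined copies each training layer by value, forces batch = 1, and (under CUDNN) substitutes the tensor descriptors that net_map prepared for batch 1, so validate_detector_map runs on the already-loaded weights instead of reading them from disk again. A simplified, self-contained sketch of the shallow-copy-and-override pattern follows; the types are illustrative, not darknet's real network/layer structs.

#include <stdlib.h>

typedef struct { int batch; void *desc; float *weights; } toy_layer;
typedef struct { int n; toy_layer *layers; } toy_net;

/* Build an evaluation view: copy the training layers by value (sharing their
 * weight pointers), then override only the batch size and the batch-1 descriptors. */
static toy_net make_eval_view(const toy_net *train, const toy_net *map_shell) {
    toy_net eval = { train->n, malloc(train->n * sizeof(toy_layer)) };
    for (int k = 0; k < train->n; ++k) {
        eval.layers[k]       = train->layers[k];
        eval.layers[k].batch = 1;
        eval.layers[k].desc  = map_shell->layers[k].desc;
    }
    return eval;
}

int main(void) {
    toy_layer train_layers[2] = { { 64, NULL, NULL }, { 64, NULL, NULL } };
    toy_layer map_layers[2]   = { { 1,  NULL, NULL }, { 1,  NULL, NULL } };
    toy_net train = { 2, train_layers };
    toy_net shell = { 2, map_layers };

    toy_net eval = make_eval_view(&train, &shell);
    /* ... run the batch-1 evaluation with eval here ... */
    free(eval.layers);   /* free only the copied layer array, never the shared weights */
    return 0;
}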
@@ -312,8 +346,9 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
free_list_contents_kvp(options);
free_list(options);
for (i = 0; i < ngpus; ++i) free_network(nets[i]);
free(nets);
free_network(net);
//free_network(net);
}
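The cleanup changes follow from the sharing above: free(nets) releases the array of per-GPU networks, and free_network(net) is commented out because net is a plain struct copy of nets[0], so its internal pointers alias buffers that free_network(nets[i]) has already released. A tiny illustration of the double free this avoids (stand-in types, not darknet's):

#include <stdlib.h>

typedef struct { float *buf; } toy_net;

static void free_toy_net(toy_net n) { free(n.buf); }

int main(void) {
    toy_net nets[1];
    nets[0].buf = malloc(16);
    toy_net net = nets[0];   /* struct copy: net.buf == nets[0].buf */

    free_toy_net(nets[0]);   /* releases the shared buffer once */
    /* free_toy_net(net); */ /* would free the same pointer a second time */
    return 0;
}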
