Optimized memory consumption during training with the -map flag

pull/2111/head
AlexeyAB 6 years ago
parent b87080882a
commit 1b20072096
1 changed file: src/detector.c (117 changed lines)

@@ -44,6 +44,36 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
char *valid_images = option_find_str(options, "valid", train_images);
char *backup_directory = option_find_str(options, "backup", "/backup/");
int train_images_num = 0;
network net_map;
if (calc_map) {
FILE* valid_file = fopen(valid_images, "r");
if (!valid_file) {
printf("\n Error: There is no %s file for mAP calculation!\n Don't use -map flag.\n Or set valid=%s in your %s file. \n", valid_images, train_images, datacfg);
getchar();
exit(-1);
}
else fclose(valid_file);
list *plist = get_paths(train_images);
train_images_num = plist->size;
free_list(plist);
cuda_set_device(gpus[0]);
printf(" Prepare additional network for mAP calculation...\n");
net_map = parse_network_cfg_custom(cfgfile, 1);
int k; // free the unneeded per-layer arrays right away
for (k = 0; k < net_map.n; ++k) {
free_layer(net_map.layers[k]);
}
#ifdef GPU
cuda_free(net_map.workspace);
if (*net_map.input16_gpu) cuda_free(*net_map.input16_gpu);
if (*net_map.output16_gpu) cuda_free(*net_map.output16_gpu);
#else
free(net_map.workspace);
#endif
}
srand(time(0));
char *base = basecfg(cfgfile);
printf("%s\n", base);
@@ -68,44 +98,6 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
srand(time(0));
network net = nets[0];
int train_images_num = 0;
network net_map;
if (calc_map) {
FILE* valid_file = fopen(valid_images, "r");
if (!valid_file) {
printf("\n Error: There is no %s file for mAP calculation!\n Don't use -map flag.\n Or set valid=%s in your %s file. \n", valid_images, train_images, datacfg);
getchar();
exit(-1);
}
else fclose(valid_file);
list *plist = get_paths(train_images);
train_images_num = plist->size;
free_list(plist);
cuda_set_device(net.gpu_index);
printf(" Prepare additional network for mAP calculation...\n");
net_map = parse_network_cfg_custom(cfgfile, 1);
int k;
for (k = 0; k < net.n; ++k) {
layer l = net.layers[k];
if (l.type == CONVOLUTIONAL) {
net_map.layers[k].biases = l.biases;
net_map.layers[k].scales = l.scales;
net_map.layers[k].rolling_mean = l.rolling_mean;
net_map.layers[k].rolling_variance = l.rolling_variance;
net_map.layers[k].weights = l.weights;
#ifdef GPU
net_map.layers[k].biases_gpu = l.biases_gpu;
net_map.layers[k].scales_gpu = l.scales_gpu;
net_map.layers[k].rolling_mean_gpu = l.rolling_mean_gpu;
net_map.layers[k].rolling_variance_gpu = l.rolling_variance_gpu;
net_map.layers[k].weights_gpu = l.weights_gpu;
net_map.layers[k].weights_gpu16 = l.weights_gpu16;
#endif // GPU
}
}
}
const int actual_batch_size = net.batch * net.subdivisions;
if (actual_batch_size == 1) {
printf("\n Error: You set incorrect value batch=1 for Training! You should set batch=64 subdivision=64 \n");
@@ -159,7 +151,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
#ifdef OPENCV
args.threads = 3 * ngpus; // Amazon EC2 Tesla V100: p3.2xlarge (8 logical cores) - p3.16xlarge
//args.threads = 12 * ngpus; // Ryzen 7 2700X (16 logical cores)
//args.threads = 12 * ngpus; // Ryzen 7 2700X (16 logical cores)
IplImage* img = NULL;
float max_img_loss = 5;
int number_of_lines = 100;
@@ -255,8 +247,50 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
int draw_precision = 0;
int calc_map_for_each = 4 * train_images_num / (net.batch * net.subdivisions);
if (calc_map && (i >= (iter_map + calc_map_for_each) || i == net.max_batches) && i >= net.burn_in && i >= 1000) {
if (l.random) {
printf("Resizing to initial size: %d x %d \n", init_w, init_h);
args.w = init_w;
args.h = init_h;
pthread_join(load_thread, 0);
train = buffer;
load_thread = load_data(args);
int k;
for (k = 0; k < ngpus; ++k) {
resize_network(nets + k, init_w, init_h);
}
net = nets[0];
}
// combine Training and Validation networks
network net_combined = make_network(net.n);
layer *old_layers = net_combined.layers;
net_combined = net;
net_combined.layers = old_layers;
net_combined.batch = 1;
int k;
for (k = 0; k < net.n; ++k) {
layer *l = &(net.layers[k]);
net_combined.layers[k] = net.layers[k];
net_combined.layers[k].batch = 1;
if (l->type == CONVOLUTIONAL) {
#ifdef CUDNN
net_combined.layers[k].normTensorDesc = net_map.layers[k].normTensorDesc;
net_combined.layers[k].normDstTensorDesc = net_map.layers[k].normDstTensorDesc;
net_combined.layers[k].normDstTensorDescF16 = net_map.layers[k].normDstTensorDescF16;
net_combined.layers[k].srcTensorDesc = net_map.layers[k].srcTensorDesc;
net_combined.layers[k].dstTensorDesc = net_map.layers[k].dstTensorDesc;
net_combined.layers[k].srcTensorDesc16 = net_map.layers[k].srcTensorDesc16;
net_combined.layers[k].dstTensorDesc16 = net_map.layers[k].dstTensorDesc16;
#endif // CUDNN
}
}
iter_map = i;
mean_average_precision = validate_detector_map(datacfg, cfgfile, weightfile, 0.25, 0.5, &net_map);
mean_average_precision = validate_detector_map(datacfg, cfgfile, weightfile, 0.25, 0.5, &net_combined);
printf("\n mean_average_precision = %f \n", mean_average_precision);
draw_precision = 1;
}
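Here the training network is turned into a batch-1 evaluation view without duplicating its parameters: if random resizing is active, the networks are first resized back to the initial dimensions, then net_combined copies each training layer by value, forces batch = 1, and (under CUDNN) substitutes the tensor descriptors that net_map prepared for batch 1, so validate_detector_map runs on the already-loaded weights instead of reading them from disk again. A simplified, self-contained sketch of the shallow-copy-and-override pattern follows; the types are illustrative, not darknet's real network/layer structs.

#include <stdlib.h>

typedef struct { int batch; void *desc; float *weights; } toy_layer;
typedef struct { int n; toy_layer *layers; } toy_net;

/* Build an evaluation view: copy the training layers by value (sharing their
 * weight pointers), then override only the batch size and the batch-1 descriptors. */
static toy_net make_eval_view(const toy_net *train, const toy_net *map_shell) {
    toy_net eval = { train->n, malloc(train->n * sizeof(toy_layer)) };
    for (int k = 0; k < train->n; ++k) {
        eval.layers[k]       = train->layers[k];
        eval.layers[k].batch = 1;
        eval.layers[k].desc  = map_shell->layers[k].desc;
    }
    return eval;
}

int main(void) {
    toy_layer train_layers[2] = { { 64, NULL, NULL }, { 64, NULL, NULL } };
    toy_layer map_layers[2]   = { { 1,  NULL, NULL }, { 1,  NULL, NULL } };
    toy_net train = { 2, train_layers };
    toy_net shell = { 2, map_layers };

    toy_net eval = make_eval_view(&train, &shell);
    /* ... run the batch-1 evaluation with eval here ... */
    free(eval.layers);   /* free only the copied layer array, never the shared weights */
    return 0;
}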
@@ -312,8 +346,9 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
free_list_contents_kvp(options);
free_list(options);
for (i = 0; i < ngpus; ++i) free_network(nets[i]);
free(nets);
free_network(net);
//free_network(net);
}
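The cleanup changes follow from the sharing above: free(nets) releases the array of per-GPU networks, and free_network(net) is commented out because net is a plain struct copy of nets[0], so its internal pointers alias buffers that free_network(nets[i]) has already released. A tiny illustration of the double free this avoids (stand-in types, not darknet's):

#include <stdlib.h>

typedef struct { float *buf; } toy_net;

static void free_toy_net(toy_net n) { free(n.buf); }

int main(void) {
    toy_net nets[1];
    nets[0].buf = malloc(16);
    toy_net net = nets[0];   /* struct copy: net.buf == nets[0].buf */

    free_toy_net(nets[0]);   /* releases the shared buffer once */
    /* free_toy_net(net); */ /* would free the same pointer a second time */
    return 0;
}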
