diff --git a/include/darknet.h b/include/darknet.h index 30fe8765..b6b21d85 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -271,6 +271,7 @@ struct layer { int stretch_sway; float angle; float jitter; + float resize; float saturation; float exposure; float shift; @@ -890,6 +891,7 @@ typedef struct load_args { int show_imgs; int dontuse_opencv; float jitter; + float resize; int flip; int gaussian_noise; int blur; diff --git a/src/data.c b/src/data.c index 613fcd85..91db51f8 100644 --- a/src/data.c +++ b/src/data.c @@ -998,7 +998,7 @@ void blend_truth_mosaic(float *new_truth, int boxes, float *old_truth, int w, in #include "http_stream.h" data load_data_detection(int n, char **paths, int m, int w, int h, int c, int boxes, int classes, int use_flip, int use_gaussian_noise, int use_blur, int use_mixup, - float jitter, float hue, float saturation, float exposure, int mini_batch, int track, int augment_speed, int letter_box, int show_imgs) + float jitter, float resize, float hue, float saturation, float exposure, int mini_batch, int track, int augment_speed, int letter_box, int show_imgs) { const int random_index = random_gen(); c = c ? c : 3; @@ -1036,6 +1036,7 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, int bo d.X.cols = h*w*c; float r1 = 0, r2 = 0, r3 = 0, r4 = 0, r_scale = 0; + float resize_r1 = 0, resize_r2 = 0; float dhue = 0, dsat = 0, dexp = 0, flip = 0, blur = 0; int augmentation_calculated = 0, gaussian_noise = 0; @@ -1070,9 +1071,22 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, int bo int dw = (ow*jitter); int dh = (oh*jitter); + float resize_down = resize, resize_up = resize; + if (resize_down > 1.0) resize_down = 1 / resize_down; + int min_rdw = ow*(1 - (1 / resize_down)) / 2; // < 0 + int min_rdh = oh*(1 - (1 / resize_down)) / 2; // < 0 + + if (resize_up < 1.0) resize_up = 1 / resize_up; + int max_rdw = ow*(1 - (1 / resize_up)) / 2; // > 0 + int max_rdh = oh*(1 - (1 / resize_up)) / 2; // > 0 + //printf(" down = %f, up = %f \n", (1 - (1 / resize_down)) / 2, (1 - (1 / resize_up)) / 2); + if (!augmentation_calculated || !track) { augmentation_calculated = 1; + resize_r1 = random_float(); + resize_r2 = random_float(); + r1 = random_float(); r2 = random_float(); r3 = random_float(); @@ -1101,6 +1115,21 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, int bo int pright = rand_precalc_random(-dw, dw, r2); int ptop = rand_precalc_random(-dh, dh, r3); int pbot = rand_precalc_random(-dh, dh, r4); + + if (resize < 1) { + // downsize only + pleft += rand_precalc_random(min_rdw, 0, resize_r1); + pright += rand_precalc_random(min_rdw, 0, resize_r2); + ptop += rand_precalc_random(min_rdh, 0, resize_r1); + pbot += rand_precalc_random(min_rdh, 0, resize_r2); + } + else { + pleft += rand_precalc_random(min_rdw, max_rdw, resize_r1); + pright += rand_precalc_random(min_rdw, max_rdw, resize_r2); + ptop += rand_precalc_random(min_rdh, max_rdh, resize_r1); + pbot += rand_precalc_random(min_rdh, max_rdh, resize_r2); + } + //printf("\n pleft = %d, pright = %d, ptop = %d, pbot = %d, ow = %d, oh = %d \n", pleft, pright, ptop, pbot, ow, oh); //float scale = rand_precalc_random(.25, 2, r_scale); // unused currently @@ -1127,7 +1156,27 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, int bo pright = pright - delta_w; //printf(" result_ar = %f, ow_tmp = %f, delta_w = %d, pleft = %f, pright = %f \n", result_ar, ow_tmp, delta_w, pleft, pright); } + + //printf("\n pleft = %d, pright = %d, ptop = %d, pbot = %d, ow = %d, oh = %d \n", pleft, pright, ptop, pbot, ow, oh); + } + + /* + // move each 2nd image to the corner - so that most of it was visible + if (use_mixup == 3 && random_gen() % 2 == 0) { + if (flip) { + if (i_mixup == 0) pleft += pright, pright = 0, pbot += ptop, ptop = 0; + if (i_mixup == 1) pright += pleft, pleft = 0, pbot += ptop, ptop = 0; + if (i_mixup == 2) pleft += pright, pright = 0, ptop += pbot, pbot = 0; + if (i_mixup == 3) pright += pleft, pleft = 0, ptop += pbot, pbot = 0; + } + else { + if (i_mixup == 0) pright += pleft, pleft = 0, pbot += ptop, ptop = 0; + if (i_mixup == 1) pleft += pright, pright = 0, pbot += ptop, ptop = 0; + if (i_mixup == 2) pright += pleft, pleft = 0, ptop += pbot, pbot = 0; + if (i_mixup == 3) pleft += pright, pright = 0, ptop += pbot, pbot = 0; + } } + */ int swidth = ow - pleft - pright; int sheight = oh - ptop - pbot; @@ -1264,8 +1313,8 @@ void blend_images(image new_img, float alpha, image old_img, float beta) new_img.data[i] = new_img.data[i] * alpha + old_img.data[i] * beta; } -data load_data_detection(int n, char **paths, int m, int w, int h, int c, int boxes, int classes, int use_flip, int gaussian_noise, int use_blur, int use_mixup, float jitter, - float hue, float saturation, float exposure, int mini_batch, int track, int augment_speed, int letter_box, int show_imgs) +data load_data_detection(int n, char **paths, int m, int w, int h, int c, int boxes, int classes, int use_flip, int gaussian_noise, int use_blur, int use_mixup, + float jitter, float resize, float hue, float saturation, float exposure, int mini_batch, int track, int augment_speed, int letter_box, int show_imgs) { const int random_index = random_gen(); c = c ? c : 3; @@ -1299,6 +1348,7 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, int bo d.X.cols = h*w*c; float r1 = 0, r2 = 0, r3 = 0, r4 = 0, r_scale; + float resize_r1 = 0, resize_r2 = 0; float dhue = 0, dsat = 0, dexp = 0, flip = 0; int augmentation_calculated = 0; @@ -1318,9 +1368,21 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, int bo int dw = (ow*jitter); int dh = (oh*jitter); + float resize_down = resize, resize_up = resize; + if (resize_down > 1.0) resize_down = 1 / resize_down; + int min_rdw = ow*(1 - (1 / resize_down)) / 2; + int min_rdh = oh*(1 - (1 / resize_down)) / 2; + + if (resize_up < 1.0) resize_up = 1 / resize_up; + int max_rdw = ow*(1 - (1 / resize_up)) / 2; + int max_rdh = oh*(1 - (1 / resize_up)) / 2; + if (!augmentation_calculated || !track) { augmentation_calculated = 1; + resize_r1 = random_float(); + resize_r2 = random_float(); + r1 = random_float(); r2 = random_float(); r3 = random_float(); @@ -1340,7 +1402,19 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, int bo int ptop = rand_precalc_random(-dh, dh, r3); int pbot = rand_precalc_random(-dh, dh, r4); - float scale = rand_precalc_random(.25, 2, r_scale); // unused currently + if (resize < 1) { + // downsize only + pleft += rand_precalc_random(min_rdw, 0, resize_r1); + pright += rand_precalc_random(min_rdw, 0, resize_r2); + ptop += rand_precalc_random(min_rdh, 0, resize_r1); + pbot += rand_precalc_random(min_rdh, 0, resize_r2); + } + else { + pleft += rand_precalc_random(min_rdw, max_rdw, resize_r1); + pright += rand_precalc_random(min_rdw, max_rdw, resize_r2); + ptop += rand_precalc_random(min_rdh, max_rdh, resize_r1); + pbot += rand_precalc_random(min_rdh, max_rdh, resize_r2); + } if (letter_box) { @@ -1454,7 +1528,7 @@ void *load_thread(void *ptr) } else if (a.type == REGION_DATA){ *a.d = load_data_region(a.n, a.paths, a.m, a.w, a.h, a.num_boxes, a.classes, a.jitter, a.hue, a.saturation, a.exposure); } else if (a.type == DETECTION_DATA){ - *a.d = load_data_detection(a.n, a.paths, a.m, a.w, a.h, a.c, a.num_boxes, a.classes, a.flip, a.gaussian_noise, a.blur, a.mixup, a.jitter, + *a.d = load_data_detection(a.n, a.paths, a.m, a.w, a.h, a.c, a.num_boxes, a.classes, a.flip, a.gaussian_noise, a.blur, a.mixup, a.jitter, a.resize, a.hue, a.saturation, a.exposure, a.mini_batch, a.track, a.augment_speed, a.letter_box, a.show_imgs); } else if (a.type == SWAG_DATA){ *a.d = load_data_swag(a.paths, a.n, a.classes, a.jitter); diff --git a/src/data.h b/src/data.h index b15a1ac9..c38eb38c 100644 --- a/src/data.h +++ b/src/data.h @@ -87,7 +87,7 @@ data load_data_captcha(char **paths, int n, int m, int k, int w, int h); data load_data_captcha_encode(char **paths, int n, int m, int w, int h); data load_data_old(char **paths, int n, int m, char **labels, int k, int w, int h); data load_data_detection(int n, char **paths, int m, int w, int h, int c, int boxes, int classes, int use_flip, int gaussian_noise, int use_blur, int use_mixup, - float jitter, float hue, float saturation, float exposure, int mini_batch, int track, int augment_speed, int letter_box, int show_imgs); + float jitter, float resize, float hue, float saturation, float exposure, int mini_batch, int track, int augment_speed, int letter_box, int show_imgs); data load_data_tag(char **paths, int n, int m, int k, int use_flip, int min, int max, int w, int h, float angle, float aspect, float hue, float saturation, float exposure); matrix load_image_augment_paths(char **paths, int n, int use_flip, int min, int max, int w, int h, float angle, float aspect, float hue, float saturation, float exposure, int dontuse_opencv); data load_data_super(char **paths, int n, int m, int w, int h, int scale); diff --git a/src/detector.c b/src/detector.c index 8eed0b7a..4df1d18e 100644 --- a/src/detector.c +++ b/src/detector.c @@ -104,7 +104,6 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i layer l = net.layers[net.n - 1]; int classes = l.classes; - float jitter = l.jitter; list *plist = get_paths(train_images); int train_images_num = plist->size; @@ -129,7 +128,8 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i args.m = plist->size; args.classes = classes; args.flip = net.flip; - args.jitter = jitter; + args.jitter = l.jitter; + args.resize = l.resize; args.num_boxes = l.max_boxes; net.num_boxes = args.num_boxes; net.train_images_num = train_images_num; diff --git a/src/parser.c b/src/parser.c index a562ab95..a77f6d09 100644 --- a/src/parser.c +++ b/src/parser.c @@ -449,6 +449,7 @@ layer parse_yolo(list *options, size_params params) } l.jitter = option_find_float(options, "jitter", .2); + l.resize = option_find_float_quiet(options, "resize", 1.0); l.focal_loss = option_find_int_quiet(options, "focal_loss", 0); l.ignore_thresh = option_find_float(options, "ignore_thresh", .5); @@ -564,6 +565,7 @@ layer parse_gaussian_yolo(list *options, size_params params) // Gaussian_YOLOv3 iou_loss, l.iou_loss, l.iou_normalizer, l.cls_normalizer, l.scale_x_y, l.yolo_point); l.jitter = option_find_float(options, "jitter", .2); + l.resize = option_find_float_quiet(options, "resize", 1.0); l.ignore_thresh = option_find_float(options, "ignore_thresh", .5); l.truth_thresh = option_find_float(options, "truth_thresh", 1); @@ -612,6 +614,7 @@ layer parse_region(list *options, size_params params) l.focal_loss = option_find_int_quiet(options, "focal_loss", 0); //l.max_boxes = option_find_int_quiet(options, "max",30); l.jitter = option_find_float(options, "jitter", .2); + l.resize = option_find_float_quiet(options, "resize", 1.0); l.rescore = option_find_int_quiet(options, "rescore",0); l.thresh = option_find_float(options, "thresh", .5); @@ -666,6 +669,7 @@ detection_layer parse_detection(list *options, size_params params) layer.noobject_scale = option_find_float(options, "noobject_scale", 1); layer.class_scale = option_find_float(options, "class_scale", 1); layer.jitter = option_find_float(options, "jitter", .2); + layer.resize = option_find_float_quiet(options, "resize", 1.0); layer.random = option_find_float_quiet(options, "random", 0); layer.reorg = option_find_int_quiet(options, "reorg", 0); return layer;