@ -1402,6 +1402,7 @@ __global__ void stretch_weights_kernel(const float *src_weight_gpu, float *weig
//const float coef = (kernel_size*kernel_size) / (kernel_size*kernel_size - dropout_sum);
//const float coef = (kernel_size*kernel_size) / (kernel_size*kernel_size - dropout_sum);
for (int y = 0; y < kernel_size; ++y) {
for (int y = 0; y < kernel_size; ++y) {
for (int x = 0; x < kernel_size; ++x) {
for (int x = 0; x < kernel_size; ++x) {
//if (scale < 1) weight_deform_gpu[x + y*kernel_size + i] /= scale;// *= coef;
weight_deform_gpu[x + y*kernel_size + i] /= scale;// *= coef;
weight_deform_gpu[x + y*kernel_size + i] /= scale;// *= coef;
}
}
}
}
@ -1422,11 +1423,6 @@ extern "C" void stretch_weights_gpu(const float *src_weight_gpu, float *weight_d
__global__ void sway_and_flip_weights_kernel(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int kernel_size, int angle, int reverse)
__global__ void sway_and_flip_weights_kernel(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int kernel_size, int angle, int reverse)
{
{
const int index = blockIdx.x*blockDim.x + threadIdx.x;
const int index = blockIdx.x*blockDim.x + threadIdx.x;
@ -1616,6 +1612,173 @@ extern "C" void rotate_weights_gpu(const float *src_weight_gpu, float *weight_de
// Bilinearly samples one filter at the fractional position (x_s, y_s).
//
// filter       - pointer to the first weight of a kernel_size x kernel_size
//                filter, indexed as filter[x + y*kernel_size]
// kernel_size  - side length of the filter
// x_s, y_s     - source coordinates to sample
// dropout_sum  - in/out accumulator; receives the interpolation weight of
//                every neighbour that falls outside the filter
//
// Returns the weighted sum of the (up to four) neighbours that lie inside the
// filter. Neighbours are visited in the order (x0,y0), (x1,y0), (x0,y1),
// (x1,y1).
static __device__ float ssf_sample_filter_bilinear(const float *filter, int kernel_size, float x_s, float y_s, float *dropout_sum)
{
    const int x_0 = (int)floorf(x_s);   // round down
    const int y_0 = (int)floorf(y_s);
    // The upper neighbour is always one past the lower one: for a fractional
    // coordinate ceil == floor + 1, and for an integral coordinate the upper
    // neighbour is pushed to floor + 1 so the pair never collapses.
    const int x_1 = x_0 + 1;
    const int y_1 = y_0 + 1;

    // Interpolation coefficients: distance to the opposite neighbour.
    const float c_x_0 = x_1 - x_s;
    const float c_x_1 = x_s - x_0;
    const float c_y_0 = y_1 - y_s;
    const float c_y_1 = y_s - y_0;

    const bool x_0_in = (x_0 >= 0 && x_0 < kernel_size);
    const bool x_1_in = (x_1 >= 0 && x_1 < kernel_size);
    const bool y_0_in = (y_0 >= 0 && y_0 < kernel_size);
    const bool y_1_in = (y_1 >= 0 && y_1 < kernel_size);

    float val = 0;

    if (x_0_in && y_0_in) val += filter[x_0 + y_0*kernel_size] * c_x_0 * c_y_0;
    else *dropout_sum += c_x_0 * c_y_0;

    if (x_1_in && y_0_in) val += filter[x_1 + y_0*kernel_size] * c_x_1 * c_y_0;
    else *dropout_sum += c_x_1 * c_y_0;

    if (x_0_in && y_1_in) val += filter[x_0 + y_1*kernel_size] * c_x_0 * c_y_1;
    else *dropout_sum += c_x_0 * c_y_1;

    if (x_1_in && y_1_in) val += filter[x_1 + y_1*kernel_size] * c_x_1 * c_y_1;
    else *dropout_sum += c_x_1 * c_y_1;

    return val;
}

// Writes a deformed copy of every filter from src_weight_gpu into
// weight_deform_gpu. Each thread handles one kernel_size x kernel_size filter
// (thread `index` owns weights [index*kernel_area, (index+1)*kernel_area)).
//
// The filters are split into 8 consecutive groups ("stages") of
// stage_step = (nweights / kernel_area) / 8 filters each:
//   stage 0     - plain copy
//   stage 1..4  - rescale about the centre by 0.65, 0.8, 1.2, 1.4
//                 (reciprocal factor when `reverse` is non-zero); results are
//                 additionally multiplied by the factor when it exceeds 1
//   stage 5, 6  - rotate by +angle / -angle degrees (sign flipped again when
//                 `reverse` is non-zero), then renormalise by the fraction of
//                 interpolation weight that fell outside the filter
//   stage 7     - horizontal flip
//
// nweights     - total number of weights in src_weight_gpu
// n            - not used by this kernel
// kernel_size  - side length of one filter
// angle        - rotation angle in degrees for stages 5 and 6
// reverse      - non-zero selects the inverse scale / opposite rotation
//
// NOTE: when the filter count is not a multiple of 8, the trailing filters get
// stage_id >= 8 and are left unwritten, exactly as before.
__global__ void stretch_sway_flip_weights_kernel(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int kernel_size, float angle, int reverse)
{
    const int index = blockIdx.x*blockDim.x + threadIdx.x;
    const int kernel_area = kernel_size * kernel_size;
    const int i = index * kernel_area;

    // nweights = (c / groups) * n * size * size;
    // kernel_area = size*size
    if (i >= nweights) return;

    // With fewer than 8 filters the integer quotient is 0, which previously
    // caused a division by zero below. Use at least one filter per stage.
    int stage_step = (nweights / kernel_area) / 8;  // 8 stages
    if (stage_step < 1) stage_step = 1;
    const int stage_id = index / stage_step;

    const float *src = src_weight_gpu + i;
    float *dst = weight_deform_gpu + i;

    const int x_c = kernel_size / 2;
    const int y_c = kernel_size / 2;

    if (stage_id == 0) {
        // simple copy
        for (int k = 0; k < kernel_area; ++k) {
            dst[k] = src[k];
        }
    }
    else if (stage_id >= 1 && stage_id <= 4)
    {
        float scale;
        switch (stage_id) {
        case 1:  scale = 0.65f; break;
        case 2:  scale = 0.8f;  break;
        case 3:  scale = 1.2f;  break;
        default: scale = 1.4f;  break;  // stage 4
        }
        if (reverse) scale = 1 / scale;

        // Enlarged filters are multiplied by the scale; shrunk ones are not.
        const float gain = (scale > 1) ? scale : 1.0f;

        // Out-of-range interpolation weight is not used for scaling stages.
        float dropped_unused = 0;

        for (int y = 0; y < kernel_size; ++y) {
            for (int x = 0; x < kernel_size; ++x) {
                // Xsource = x_c + (x_d - x_c) / scale
                // Ysource = y_c + (y_d - y_c) / scale
                const float x_s = x_c + (x - x_c) / scale;
                const float y_s = y_c + (y - y_c) / scale;

                const float val = ssf_sample_filter_bilinear(src, kernel_size, x_s, y_s, &dropped_unused);
                dst[x + y*kernel_size] = val * gain;
            }
        }
    }
    else if (stage_id == 5 || stage_id == 6)
    {
        // rotate left or right
        float rot_deg = angle;
        if (stage_id == 6) rot_deg = -rot_deg;
        if (reverse) rot_deg = -rot_deg;

        const float rot_rad = rot_deg * 3.14159265f / 180.0f;
        const float cos_a = cosf(rot_rad);
        const float sin_a = sinf(rot_rad);

        float dropout_sum = 0;
        for (int y = 0; y < kernel_size; ++y) {
            for (int x = 0; x < kernel_size; ++x) {
                // Xsource = x*cos(alpha) + y*sin(alpha)
                // Ysource = -x*sin(alpha) + y*cos(alpha)
                const float x_s = x_c + (x - x_c)*cos_a + (y - y_c)*sin_a;
                const float y_s = y_c - (x - x_c)*sin_a + (y - y_c)*cos_a;

                dst[x + y*kernel_size] = ssf_sample_filter_bilinear(src, kernel_size, x_s, y_s, &dropout_sum);
            }
        }

        // compensate for dropped items (needs the total, hence a second pass)
        const float coef = kernel_area / (kernel_area - dropout_sum);
        for (int k = 0; k < kernel_area; ++k) {
            dst[k] *= coef;
        }
    }
    else if (stage_id == 7)
    {
        // flip: mirror every row left-to-right
        for (int y = 0; y < kernel_size; ++y) {
            for (int x = 0; x < kernel_size; ++x) {
                dst[(kernel_size - x - 1) + y*kernel_size] = src[x + y*kernel_size];
            }
        }
    }
}
// Host-side launcher for stretch_sway_flip_weights_kernel.
//
// Sizes the launch from the number of size x size filters (nweights divided
// by size*size), matching the kernel's one-thread-per-filter indexing, and
// enqueues it on the stream returned by get_cuda_stream(). The launch status
// is checked with cudaPeekAtLastError(); no synchronisation is performed here.
// `angle` is passed as an int and converted to the kernel's float parameter.
extern "C" void stretch_sway_flip_weights_gpu(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int size, int angle, int reverse)
{
    const int filter_count = nweights / (size*size);
    const int threads_per_block = BLOCK;
    // presumably a ceil-div of filter_count by threads_per_block; see get_number_of_blocks
    const int grid_blocks = get_number_of_blocks(filter_count, threads_per_block);

    stretch_sway_flip_weights_kernel<<<grid_blocks, threads_per_block, 0, get_cuda_stream()>>>(
        src_weight_gpu, weight_deform_gpu, nweights, n, size, angle, reverse);

    CHECK_CUDA(cudaPeekAtLastError());
}
__global__ void reduce_and_expand_array_kernel(const float *src_gpu, float *dst_gpu, int current_size, int groups)
__global__ void reduce_and_expand_array_kernel(const float *src_gpu, float *dst_gpu, int current_size, int groups)
{
{
const int index = blockIdx.x*blockDim.x + threadIdx.x;
const int index = blockIdx.x*blockDim.x + threadIdx.x;