/*
 * DropBlock ("fast" variant): for each (batch item, filter) plane pick at most
 * one seed position and zero a block_size x block_size square of `output`
 * around it.
 *
 * Launch layout: 1-D grid with one thread block per plane, where
 * blockIdx.x == b * filters + f. Any 1-D block width works; the loops stride
 * by blockDim.x.
 *
 * Parameters:
 *   rand              - random values, indexed exactly like `output`
 *                       (NOTE(review): assumes batch * filters * spatial
 *                       entries - confirm against the caller).
 *   prob              - a position is a seed candidate when rand[...] < prob.
 *   w, h              - plane width / height.
 *   spatial           - elements per plane (NOTE(review): assumed to equal
 *                       w * h - confirm against the caller).
 *   filters           - planes per batch item.
 *   block_size        - side length of the square to zero; it is clipped to
 *                       the plane so no write leaves the current plane.
 *   drop_blocks_scale - optional (may be NULL); element b is incremented by
 *                       1.0f for every plane of batch item b that dropped a
 *                       block.
 *   output            - tensor modified in place.
 *
 * Fixes relative to the previous revision:
 *   - plane offset was b*spatial*f + f*spatial, which addresses the wrong
 *     plane for every b > 0; it is now (b*filters + f)*spatial;
 *   - the random draw read rand[id] and stored `id`, ignoring the plane and
 *     the loop offset; it now reads this plane's own entry and stores the
 *     spatial position;
 *   - the zeroing loop ignored threadIdx (all threads wrote the same cell) and
 *     decomposed the counter by `w` instead of the block width;
 *   - the shared seed is published with atomicMax instead of racing stores.
 */
__global__ void dropblock_fast_kernel(float *rand, float prob, int w, int h, int spatial, int filters, int block_size, float *drop_blocks_scale, float *output)
{
    // Stride by the real launch width so coverage does not depend on the
    // launch using one particular block size.
    const int threads = (int)blockDim.x;

    // Seed position inside the plane chosen for this thread block; -1 = none.
    __shared__ int index_block;

    const int id = threadIdx.x;
    const int f = blockIdx.x % filters;
    const int b = blockIdx.x / filters;

    // Offset of this (b, f) plane; size_t avoids int overflow on big tensors.
    const size_t plane = ((size_t)b * filters + f) * spatial;

    if (id == 0) index_block = -1;
    __syncthreads();

    // Every thread scans its share of the plane. atomicMax makes the result
    // deterministic for a given `rand`: the highest qualifying position wins.
    for (int pos = id; pos < spatial; pos += threads) {
        if (rand[plane + pos] < prob) {
            atomicMax(&index_block, pos);
        }
    }
    __syncthreads();

    // Uniform across the thread block: all threads read the same shared value
    // after the barrier, and no barrier follows this point.
    const int seed = index_block;
    if (seed == -1) return;

    // Clip the square to the plane, then shift its top-left corner so the
    // whole square fits inside [0, w) x [0, h).
    const int block_w = max(0, min(block_size, w));
    const int block_h = max(0, min(block_size, h));
    const int b_x = min(seed % w, w - block_w);
    const int b_y = min(seed / w, h - block_h);
    const int cells = block_w * block_h;

    // Threads cooperatively zero the square, one cell per thread per pass.
    for (int i = id; i < cells; i += threads) {
        const int x = b_x + i % block_w;
        const int y = b_y + i / block_w;
        output[plane + (size_t)y * w + x] = 0.0f;
    }

    // One increment per plane that dropped a block.
    if (id == 0 && drop_blocks_scale) {
        atomicAdd(&drop_blocks_scale[b], 1.0f);
    }
}
__global__ void scale_dropblock_kernel(float *output, int size, int outputs, float *drop_blocks_scale)
{
const int index = blockIdx.x*blockDim.x + threadIdx.x;