-### (neural network for object detection) - Tensor Cores can be used on [Linux](https://github.com/AlexeyAB/darknet#how-to-compile-on-linux) and [Windows](https://github.com/AlexeyAB/darknet#how-to-compile-on-windows-using-vcpkg)
+### (neural network for object detection) - Tensor Cores can be used on [Linux](https://github.com/AlexeyAB/darknet#how-to-compile-on-linux) and [Windows](https://github.com/AlexeyAB/darknet#how-to-compile-on-windows-using-cmake-gui)
More details: http://pjreddie.com/darknet/yolo/
@@ -9,6 +9,7 @@ More details: http://pjreddie.com/darknet/yolo/
-* change [`filters=255`] to filters=(classes + 5)x3 in the 3 `[convolutional]` before each `[yolo]` layer
+* change [`filters=255`] to filters=(classes + 5)x3 in the 3 `[convolutional]` before each `[yolo]` layer, keep in mind that it only has to be the last `[convolutional]` before each of the `[yolo]` layers.
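(For example, with `classes=2` those three layers get filters=(2 + 5)x3 = 21, while the default `classes=80` gives filters=(80 + 5)x3 = 255; the `classes + 5` term is the class scores plus 4 box coordinates and 1 objectness score, multiplied by the 3 anchors predicted at each `[yolo]` scale.)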
__global__ void axpy_kernel(int N, float ALPHA, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY)
{
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
@@ -572,7 +590,7 @@ extern "C" void fast_variance_gpu(float *x, float *mean, int batch, int filters,
}
-__global__ void fast_v_cbn_kernel(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, float *m_avg, float *v_avg, float *variance,
+__global__ void fast_v_cbn_kernel(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, int max_minibatch_index, float *m_avg, float *v_avg, float *variance,
extern "C" void fast_v_cbn_gpu(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, float *m_avg, float *v_avg, float *variance,
extern "C" void fast_v_cbn_gpu(const float *x, float *mean, int batch, int filters, int spatial, int minibatch_index, int max_minibatch_index, float *m_avg, float *v_avg, float *variance,
__global__ void shortcut_singlelayer_simple_kernel(int size, int src_outputs, int batch, int n, int *outputs_of_layers_gpu, float **layers_output_gpu, float *out, float *in, float *weights_gpu, int nweights, WEIGHTS_NORMALIZATION_T weights_normalizion)
{
const int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
@@ -849,7 +876,7 @@ __global__ void shortcut_multilayer_kernel(int size, int src_outputs, int batch,
for (int i = 0; i < (n + 1); ++i) {
const int weights_index = src_i / step + i*layer_step; // [0 or c or (c, h ,w)]
const float w = weights_gpu[weights_index];
-if (weights_normalizion == RELU_NORMALIZATION) sum += relu(w);
+if (weights_normalizion == RELU_NORMALIZATION) sum += lrelu(w);
else if (weights_normalizion == SOFTMAX_NORMALIZATION) sum += expf(w - max_val);
}
}
@@ -858,7 +885,7 @@ __global__ void shortcut_multilayer_kernel(int size, int src_outputs, int batch,
if (weights_gpu) {
float w = weights_gpu[src_i / step];
-if (weights_normalizion == RELU_NORMALIZATION) w = relu(w) / sum;
+if (weights_normalizion == RELU_NORMALIZATION) w = lrelu(w) / sum;
else if (weights_normalizion == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
out_val = in[id] * w; // [0 or c or (c, h ,w)]
@@ -876,7 +903,7 @@ __global__ void shortcut_multilayer_kernel(int size, int src_outputs, int batch,
if (weights_gpu) {
const int weights_index = src_i / step + (i + 1)*layer_step; // [0 or c or (c, h ,w)]
float w = weights_gpu[weights_index];
-if (weights_normalizion == RELU_NORMALIZATION) w = relu(w) / sum;
+if (weights_normalizion == RELU_NORMALIZATION) w = lrelu(w) / sum;
else if (weights_normalizion == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
out_val += add[add_index] * w; // [0 or c or (c, h ,w)]
@@ -932,34 +959,27 @@ __global__ void backward_shortcut_multilayer_kernel(int size, int src_outputs, i
for (i = 0; i < (n + 1); ++i) {
const int weights_index = src_i / step + i*layer_step; // [0 or c or (c, h ,w)]
const float w = weights_gpu[weights_index];
-if (weights_normalizion == RELU_NORMALIZATION) sum += relu(w);
+if (weights_normalizion == RELU_NORMALIZATION) sum += lrelu(w);
else if (weights_normalizion == SOFTMAX_NORMALIZATION) sum += expf(w - max_val);
}
-/*
-grad = 0;
-for (i = 0; i < (n + 1); ++i) {
-const int weights_index = src_i / step + i*layer_step; // [0 or c or (c, h ,w)]
-const float delta_w = delta_in[id] * in[id];
-const float w = weights_gpu[weights_index];
-if (weights_normalizion == RELU_NORMALIZATION) grad += delta_w * relu(w) / sum;
-else if (weights_normalizion == SOFTMAX_NORMALIZATION) grad += delta_w * expf(w - max_val) / sum;
-}
-*/
}
if (weights_gpu) {
float w = weights_gpu[src_i / step];
-if (weights_normalizion == RELU_NORMALIZATION) w = relu(w) / sum;
+if (weights_normalizion == RELU_NORMALIZATION) w = lrelu(w) / sum;
else if (weights_normalizion == SOFTMAX_NORMALIZATION) w = expf(w - max_val) / sum;
+if (weights_normalizion == RELU_NORMALIZATION) grad = w;
+else if (weights_normalizion == SOFTMAX_NORMALIZATION) grad = w*(1-w);
delta_out[id] += delta_in[id] * w; // [0 or c or (c, h ,w)]
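The repeated swap of `relu(w)` for `lrelu(w)` above matters because `sum` is later used as a divisor: with a plain ReLU, a set of all-non-positive shortcut weights makes the accumulated sum zero and `w / sum` undefined. Below is a minimal sketch of the two device helpers, assuming definitions modeled on darknet's `blas_kernels.cu` (the exact epsilon value is an assumption); the property the kernels rely on is that `lrelu` never returns less than a small positive constant, so the normalization denominator stays non-zero.

```cuda
// Sketch only, assuming helper definitions along these lines (the eps value is a guess).
__device__ float relu(float src) {
    // Plain ReLU: returns 0 for any non-positive weight, so a sum of these can be 0.
    if (src > 0) return src;
    return 0;
}

__device__ float lrelu(float src) {
    // Clamped variant: never returns less than eps, keeping the normalization
    // sum strictly positive before it is used as a divisor.
    const float eps = 0.001f;
    if (src > eps) return src;
    return eps;
}
```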
printf("\n Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n",class_id,l.classes,l.classes-1);