pull/2272/head
AlexeyAB 6 years ago
parent 81f7fc2c7b
commit 46be08db37
  1. 19
      src/convolutional_layer.c
  2. 3
      src/cuda.c
  3. 4
      src/detector.c
  4. 28
      src/http_stream.cpp
  5. 2
      src/http_stream.h
  6. 15
      src/im2col_kernels.cu

@ -285,25 +285,6 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference)
//l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
//l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
//l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;
int fw = 0, bd = 0, bf = 0;
if (l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1;
//printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM \n");
if (l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2;
//printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED \n");
if (l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1;
//printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 \n");
if (l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2;
//printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED \n");
if (l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1;
//printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 \n");
if (l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2;
//printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED \n");
//if (fw == 2 && bd == 2 && bf == 2) printf("TF ");
//else if (fw == 1 && bd == 1 && bf == 1) printf("TH ");
}
}
#endif

@ -91,7 +91,7 @@ cudaStream_t get_cuda_memcpy_stream() {
cudaError_t status = cudaStreamCreate(&streamsArray2[i]);
//cudaError_t status = cudaStreamCreateWithFlags(&streamsArray2[i], cudaStreamNonBlocking);
if (status != cudaSuccess) {
printf(" cudaStreamCreate Memcpy error: %d \n", status);
printf(" cudaStreamCreate-Memcpy error: %d \n", status);
const char *s = cudaGetErrorString(status);
char buffer[256];
printf("CUDA Error: %s\n", s);
@ -180,6 +180,7 @@ int *cuda_make_int_array(size_t n)
int *x_gpu;
size_t size = sizeof(int)*n;
cudaError_t status = cudaMalloc((void **)&x_gpu, size);
if(status != cudaSuccess) fprintf(stderr, " Try to set subdivisions=64 in your cfg-file. \n");
check_error(status);
return x_gpu;
}

@ -1274,9 +1274,11 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam
if (!input) break;
strtok(input, "\n");
}
//image im = load_image_resize(input, net.w, net.h, net.c);
//image sized = copy_image(im);
image im = load_image(input, 0, 0, net.c);
int letterbox = 0;
image sized = resize_image(im, net.w, net.h);
int letterbox = 0;
//image sized = letterbox_image(im, net.w, net.h); letterbox = 1;
layer l = net.layers[net.n - 1];

@ -593,6 +593,34 @@ image image_data_augmentation(IplImage* ipl, int w, int h,
}
image load_image_resize(char *filename, int w, int h, int c)
{
image out;
cv::Mat img(h, w, CV_8UC3);
try {
int flag = -1;
if (c == 0) flag = 1;
else if (c == 1) { flag = 0; img = cv::Mat(h, w, CV_8UC1); }
else if (c == 3) { flag = 1; img = cv::Mat(h, w, CV_8UC3); }
else {
fprintf(stderr, "OpenCV can't force load with %d channels\n", c);
}
//throw std::runtime_error("runtime_error");
cv::Mat loaded_image = cv::imread(filename, flag);
cv::resize(loaded_image, img, cv::Size(w, h), 0, 0, CV_INTER_LINEAR);
cv::cvtColor(img, img, cv::COLOR_RGB2BGR);
IplImage tmp = img;
out = ipl_to_image(&tmp);
}
catch (...) {
fprintf(stderr, "OpenCV can't load image %s channels\n", filename);
out = make_image(w, h, c);
}
return out;
}
#endif // OPENCV
// -----------------------------------------------------

@ -25,6 +25,8 @@ int get_stream_fps_cpp(CvCapture *cap);
image image_data_augmentation(IplImage* ipl, int w, int h,
int pleft, int ptop, int swidth, int sheight, int flip,
float jitter, float dhue, float dsat, float dexp);
image load_image_resize(char *filename, int w, int h, int c);
#endif // OPENCV
#ifdef __cplusplus

@ -1880,6 +1880,21 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
}
*/
// further optimization - use WMMA GEMM for using Tensor Cores
// https://github.com/NVIDIA-developer-blog/code-samples/blob/master/posts/tensor-cores/simpleTensorCoreGEMM.cu
// https://github.com/NVIDIA/cuda-samples/blob/master/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#wmma-subbyte
// nvcuda::wmma::col_major -> cutlass::MatrixLayout::kColumnMajor (matrix is not transposed)
// Matrix A Matrix B Accumulator Matrix Size (m-n-k)
// precision::b1 precision::b1 int 8x8x128
// The only dimensions currently supported by WMMA for XNOR
// const int WMMA_M = 8;
// const int WMMA_N = 8;
// const int WMMA_K = 128;
// GOOD
void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
unsigned char *A, int lda,

Loading…
Cancel
Save