From 46be08db37ac1de7d877e460f58c9f7839035733 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Tue, 22 Jan 2019 16:23:44 +0300 Subject: [PATCH] Minor fix --- src/convolutional_layer.c | 19 ------------------- src/cuda.c | 3 ++- src/detector.c | 4 +++- src/http_stream.cpp | 28 ++++++++++++++++++++++++++++ src/http_stream.h | 2 ++ src/im2col_kernels.cu | 15 +++++++++++++++ 6 files changed, 50 insertions(+), 21 deletions(-) diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 04dd88b8..ec196bd7 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -285,25 +285,6 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) //l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; //l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED; //l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED; - - int fw = 0, bd = 0, bf = 0; - if (l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1; - //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM \n"); - if (l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2; - //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED \n"); - - if (l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1; - //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 \n"); - if (l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2; - //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED \n"); - - if (l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1; - //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 \n"); - if (l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2; - //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED \n"); - - //if (fw == 2 && bd == 2 && bf == 2) printf("TF "); - //else if (fw == 1 && bd == 1 && bf == 1) printf("TH "); } } #endif diff --git a/src/cuda.c b/src/cuda.c index 16a50c4e..7705b27d 100644 --- a/src/cuda.c +++ b/src/cuda.c @@ -91,7 +91,7 @@ cudaStream_t get_cuda_memcpy_stream() { cudaError_t status = cudaStreamCreate(&streamsArray2[i]); //cudaError_t status = cudaStreamCreateWithFlags(&streamsArray2[i], cudaStreamNonBlocking); if (status != cudaSuccess) { - printf(" cudaStreamCreate Memcpy error: %d \n", status); + printf(" cudaStreamCreate-Memcpy error: %d \n", status); const char *s = cudaGetErrorString(status); char buffer[256]; printf("CUDA Error: %s\n", s); @@ -180,6 +180,7 @@ int *cuda_make_int_array(size_t n) int *x_gpu; size_t size = sizeof(int)*n; cudaError_t status = cudaMalloc((void **)&x_gpu, size); + if(status != cudaSuccess) fprintf(stderr, " Try to set subdivisions=64 in your cfg-file. \n"); check_error(status); return x_gpu; } diff --git a/src/detector.c b/src/detector.c index 747cf422..193df65e 100644 --- a/src/detector.c +++ b/src/detector.c @@ -1274,9 +1274,11 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam if (!input) break; strtok(input, "\n"); } + //image im = load_image_resize(input, net.w, net.h, net.c); + //image sized = copy_image(im); image im = load_image(input, 0, 0, net.c); - int letterbox = 0; image sized = resize_image(im, net.w, net.h); + int letterbox = 0; //image sized = letterbox_image(im, net.w, net.h); letterbox = 1; layer l = net.layers[net.n - 1]; diff --git a/src/http_stream.cpp b/src/http_stream.cpp index 408fae97..5938d8e6 100644 --- a/src/http_stream.cpp +++ b/src/http_stream.cpp @@ -593,6 +593,34 @@ image image_data_augmentation(IplImage* ipl, int w, int h, } +image load_image_resize(char *filename, int w, int h, int c) +{ + image out; + cv::Mat img(h, w, CV_8UC3); + try { + int flag = -1; + if (c == 0) flag = 1; + else if (c == 1) { flag = 0; img = cv::Mat(h, w, CV_8UC1); } + else if (c == 3) { flag = 1; img = cv::Mat(h, w, CV_8UC3); } + else { + fprintf(stderr, "OpenCV can't force load with %d channels\n", c); + } + //throw std::runtime_error("runtime_error"); + cv::Mat loaded_image = cv::imread(filename, flag); + cv::resize(loaded_image, img, cv::Size(w, h), 0, 0, CV_INTER_LINEAR); + cv::cvtColor(img, img, cv::COLOR_RGB2BGR); + + IplImage tmp = img; + out = ipl_to_image(&tmp); + } + catch (...) { + fprintf(stderr, "OpenCV can't load image %s channels\n", filename); + out = make_image(w, h, c); + } + return out; +} + + #endif // OPENCV // ----------------------------------------------------- diff --git a/src/http_stream.h b/src/http_stream.h index 96e619d5..34091064 100644 --- a/src/http_stream.h +++ b/src/http_stream.h @@ -25,6 +25,8 @@ int get_stream_fps_cpp(CvCapture *cap); image image_data_augmentation(IplImage* ipl, int w, int h, int pleft, int ptop, int swidth, int sheight, int flip, float jitter, float dhue, float dsat, float dexp); + +image load_image_resize(char *filename, int w, int h, int c); #endif // OPENCV #ifdef __cplusplus diff --git a/src/im2col_kernels.cu b/src/im2col_kernels.cu index 3dd7a43b..fc016cef 100644 --- a/src/im2col_kernels.cu +++ b/src/im2col_kernels.cu @@ -1880,6 +1880,21 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int } */ +// further optimization - use WMMA GEMM for using Tensor Cores +// https://github.com/NVIDIA-developer-blog/code-samples/blob/master/posts/tensor-cores/simpleTensorCoreGEMM.cu +// https://github.com/NVIDIA/cuda-samples/blob/master/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#wmma-subbyte +// nvcuda::wmma::col_major -> cutlass::MatrixLayout::kColumnMajor (matrix is not transposed) + +// Matrix A Matrix B Accumulator Matrix Size (m-n-k) +// precision::b1 precision::b1 int 8x8x128 + +// The only dimensions currently supported by WMMA for XNOR +// const int WMMA_M = 8; +// const int WMMA_N = 8; +// const int WMMA_K = 128; + + // GOOD void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K, unsigned char *A, int lda,