Minor fix

6 years ago · 46be08db37
parent 81f7fc2c7b
commit 46be08db37
6 changed files with 50 additions and 21 deletions
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@ -285,25 +285,6 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference)
        //l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
        //l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
        //l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;
-
-        int fw = 0, bd = 0, bf = 0;
-        if (l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1;
-            //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM \n");
-        if (l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2;
-            //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED \n");
-
-        if (l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1;
-            //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1  \n");
-        if (l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2;
-            //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED \n");
-
-        if (l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1;
-            //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1   \n");
-        if (l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2;
-            //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED \n");
-
-        //if (fw == 2 && bd == 2 && bf == 2) printf("TF ");
-        //else if (fw == 1 && bd == 1 && bf == 1) printf("TH ");
    }
 }
 #endif
--- a/src/cuda.c
+++ b/src/cuda.c
@ -91,7 +91,7 @@ cudaStream_t get_cuda_memcpy_stream() {
        cudaError_t status = cudaStreamCreate(&streamsArray2[i]);
        //cudaError_t status = cudaStreamCreateWithFlags(&streamsArray2[i], cudaStreamNonBlocking);
        if (status != cudaSuccess) {
-            printf(" cudaStreamCreate Memcpy error: %d \n", status);
+            printf(" cudaStreamCreate-Memcpy error: %d \n", status);
            const char *s = cudaGetErrorString(status);
            char buffer[256];
            printf("CUDA Error: %s\n", s);
@ -180,6 +180,7 @@ int *cuda_make_int_array(size_t n)
    int *x_gpu;
    size_t size = sizeof(int)*n;
    cudaError_t status = cudaMalloc((void **)&x_gpu, size);
+    if(status != cudaSuccess) fprintf(stderr, " Try to set subdivisions=64 in your cfg-file. \n");
    check_error(status);
    return x_gpu;
 }
--- a/src/detector.c
+++ b/src/detector.c
@ -1274,9 +1274,11 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam
            if (!input) break;
            strtok(input, "\n");
        }
+        //image im = load_image_resize(input, net.w, net.h, net.c);
+        //image sized = copy_image(im);
        image im = load_image(input, 0, 0, net.c);
-        int letterbox = 0;
        image sized = resize_image(im, net.w, net.h);
+        int letterbox = 0;
        //image sized = letterbox_image(im, net.w, net.h); letterbox = 1;
        layer l = net.layers[net.n - 1];

--- a/src/http_stream.cpp
+++ b/src/http_stream.cpp
@ -593,6 +593,34 @@ image image_data_augmentation(IplImage* ipl, int w, int h,
 }


+image load_image_resize(char *filename, int w, int h, int c)
+{
+    image out;
+    cv::Mat img(h, w, CV_8UC3);
+    try {
+        int flag = -1;
+        if (c == 0) flag = 1;
+        else if (c == 1) { flag = 0; img = cv::Mat(h, w, CV_8UC1); }
+        else if (c == 3) { flag = 1; img = cv::Mat(h, w, CV_8UC3); }
+        else {
+            fprintf(stderr, "OpenCV can't force load with %d channels\n", c);
+        }
+        //throw std::runtime_error("runtime_error");
+        cv::Mat loaded_image = cv::imread(filename, flag);
+        cv::resize(loaded_image, img, cv::Size(w, h), 0, 0, CV_INTER_LINEAR);
+        cv::cvtColor(img, img, cv::COLOR_RGB2BGR);
+
+        IplImage tmp = img;
+        out = ipl_to_image(&tmp);
+    }
+    catch (...) {
+        fprintf(stderr, "OpenCV can't load image %s channels\n", filename);
+        out = make_image(w, h, c);
+    }
+    return out;
+}
+
+
 #endif    // OPENCV

 // -----------------------------------------------------
--- a/src/http_stream.h
+++ b/src/http_stream.h
@ -25,6 +25,8 @@ int get_stream_fps_cpp(CvCapture *cap);
 image image_data_augmentation(IplImage* ipl, int w, int h,
    int pleft, int ptop, int swidth, int sheight, int flip,
    float jitter, float dhue, float dsat, float dexp);
+
+image load_image_resize(char *filename, int w, int h, int c);
 #endif  // OPENCV

 #ifdef __cplusplus
--- a/src/im2col_kernels.cu
+++ b/src/im2col_kernels.cu
@ -1880,6 +1880,21 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
 }
 */

+// further optimization - use WMMA GEMM for using Tensor Cores
+// https://github.com/NVIDIA-developer-blog/code-samples/blob/master/posts/tensor-cores/simpleTensorCoreGEMM.cu
+// https://github.com/NVIDIA/cuda-samples/blob/master/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#wmma-subbyte
+// nvcuda::wmma::col_major ->  cutlass::MatrixLayout::kColumnMajor (matrix is not transposed)
+
+// Matrix A	Matrix B	Accumulator	Matrix Size (m-n-k)
+// precision::b1	precision::b1	int	8x8x128
+
+// The only dimensions currently supported by WMMA for XNOR
+// const int WMMA_M = 8;
+// const int WMMA_N = 8;
+// const int WMMA_K = 128;
+
+
 // GOOD
 void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
    unsigned char *A, int lda,