pull/2272/head
AlexeyAB 6 years ago
parent 81f7fc2c7b
commit 46be08db37
  1. src/convolutional_layer.c (19 changed lines)
  2. src/cuda.c (3 changed lines)
  3. src/detector.c (4 changed lines)
  4. src/http_stream.cpp (28 changed lines)
  5. src/http_stream.h (2 changed lines)
  6. src/im2col_kernels.cu (15 changed lines)

@@ -285,25 +285,6 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference)
 //l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
 //l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
 //l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;
-int fw = 0, bd = 0, bf = 0;
-if (l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1;
-//printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM \n");
-if (l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2;
-//printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED \n");
-if (l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1;
-//printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 \n");
-if (l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2;
-//printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED \n");
-if (l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1;
-//printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 \n");
-if (l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2;
-//printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED \n");
-//if (fw == 2 && bd == 2 && bf == 2) printf("TF ");
-//else if (fw == 1 && bd == 1 && bf == 1) printf("TH ");
 }
 }
 #endif
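
The removed block above was purely diagnostic: it decoded which algorithm cuDNN had chosen for the FP16 path into the fw/bd/bf codes (1 = IMPLICIT_PRECOMP_GEMM, 2 = WINOGRAD_NONFUSED) so that "TF"/"TH" could be printed. For reference, a minimal sketch of how the Tensor Core path is requested in the first place, assuming cuDNN 7+; l->convDesc, the *16 descriptors, and cudnn_handle() follow darknet's conventions, but treat the exact field names here as assumptions:

    // Sketch, not committed code: ask cuDNN for Tensor Core math, then let it
    // pick the fastest algorithm; the chosen algo is what the block above decoded.
    cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH);
    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
        l->srcTensorDesc16, l->weightDesc16, l->convDesc, l->dstTensorDesc16,
        CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &l->fw_algo16);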

@@ -91,7 +91,7 @@ cudaStream_t get_cuda_memcpy_stream() {
 cudaError_t status = cudaStreamCreate(&streamsArray2[i]);
 //cudaError_t status = cudaStreamCreateWithFlags(&streamsArray2[i], cudaStreamNonBlocking);
 if (status != cudaSuccess) {
-printf(" cudaStreamCreate Memcpy error: %d \n", status);
+printf(" cudaStreamCreate-Memcpy error: %d \n", status);
 const char *s = cudaGetErrorString(status);
 char buffer[256];
 printf("CUDA Error: %s\n", s);
@@ -180,6 +180,7 @@ int *cuda_make_int_array(size_t n)
 int *x_gpu;
 size_t size = sizeof(int)*n;
 cudaError_t status = cudaMalloc((void **)&x_gpu, size);
+if(status != cudaSuccess) fprintf(stderr, " Try to set subdivisions=64 in your cfg-file. \n");
 check_error(status);
 return x_gpu;
 }
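
The new hint prints on every cudaMalloc failure. A slightly narrower variant (a sketch only; cuda_malloc_checked is a hypothetical helper, check_error is darknet's existing handler) would key on the out-of-memory status specifically, since raising subdivisions only helps when the card actually ran out of memory:

    static void *cuda_malloc_checked(size_t size)
    {
        void *ptr = NULL;
        cudaError_t status = cudaMalloc(&ptr, size);
        // Larger subdivisions -> smaller per-pass mini-batch -> less GPU memory.
        if (status == cudaErrorMemoryAllocation)
            fprintf(stderr, " CUDA out of memory: try to set subdivisions=64 in your cfg-file.\n");
        check_error(status);
        return ptr;
    }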

@@ -1274,9 +1274,11 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam
 if (!input) break;
 strtok(input, "\n");
 }
+//image im = load_image_resize(input, net.w, net.h, net.c);
+//image sized = copy_image(im);
 image im = load_image(input, 0, 0, net.c);
-int letterbox = 0;
 image sized = resize_image(im, net.w, net.h);
+int letterbox = 0;
 //image sized = letterbox_image(im, net.w, net.h); letterbox = 1;
 layer l = net.layers[net.n - 1];
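
Moving int letterbox = 0; after the resize keeps the flag next to the choice it records: resize_image stretches the frame (letterbox = 0), while the commented letterbox_image path pads it to preserve aspect ratio (letterbox = 1). The flag matters downstream when detections are mapped back to the original image; a sketch of the call it feeds (assuming this fork's get_network_boxes signature with a trailing letter parameter):

    // letterbox tells the decoder whether to undo a padded resize when mapping
    // network-space boxes back to im.w x im.h coordinates.
    detection *dets = get_network_boxes(&net, im.w, im.h, thresh,
                                        hier_thresh, 0, 1, &nboxes, letterbox);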

@@ -593,6 +593,34 @@ image image_data_augmentation(IplImage* ipl, int w, int h,
 }
+image load_image_resize(char *filename, int w, int h, int c)
+{
+image out;
+cv::Mat img(h, w, CV_8UC3);
+try {
+int flag = -1;
+if (c == 0) flag = 1;
+else if (c == 1) { flag = 0; img = cv::Mat(h, w, CV_8UC1); }
+else if (c == 3) { flag = 1; img = cv::Mat(h, w, CV_8UC3); }
+else {
+fprintf(stderr, "OpenCV can't force load with %d channels\n", c);
+}
+//throw std::runtime_error("runtime_error");
+cv::Mat loaded_image = cv::imread(filename, flag);
+cv::resize(loaded_image, img, cv::Size(w, h), 0, 0, CV_INTER_LINEAR);
+// imread returns BGR order, darknet expects RGB; skip the swap for grayscale
+if (img.channels() == 3) cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
+IplImage tmp = img;
+out = ipl_to_image(&tmp);
+}
+catch (...) {
+fprintf(stderr, "OpenCV can't load image %s\n", filename);
+out = make_image(w, h, c);
+}
+return out;
+}
 #endif // OPENCV
 // -----------------------------------------------------
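
The commented-out lines in detector.c above show the intended call pattern: decode and resize to network input size in a single OpenCV pass, instead of loading at full resolution and resampling again with resize_image. A sketch (not committed code; uses darknet's usual image helpers):

    image im = load_image_resize(input, net.w, net.h, net.c);  // already net-sized
    image sized = copy_image(im);   // no second resample needed
    // ... run inference on sized ...
    free_image(im);
    free_image(sized);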

@@ -25,6 +25,8 @@ int get_stream_fps_cpp(CvCapture *cap);
 image image_data_augmentation(IplImage* ipl, int w, int h,
 int pleft, int ptop, int swidth, int sheight, int flip,
 float jitter, float dhue, float dsat, float dexp);
+image load_image_resize(char *filename, int w, int h, int c);
 #endif // OPENCV
 #ifdef __cplusplus

@@ -1880,6 +1880,21 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
 }
 */
+// further optimization - use WMMA GEMM for using Tensor Cores
+// https://github.com/NVIDIA-developer-blog/code-samples/blob/master/posts/tensor-cores/simpleTensorCoreGEMM.cu
+// https://github.com/NVIDIA/cuda-samples/blob/master/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#wmma-subbyte
+// nvcuda::wmma::col_major -> cutlass::MatrixLayout::kColumnMajor (matrix is not transposed)
+//   Matrix A        Matrix B        Accumulator   Matrix Size (m-n-k)
+//   precision::b1   precision::b1   int           8x8x128
+// The only dimensions currently supported by WMMA for XNOR:
+// const int WMMA_M = 8;
+// const int WMMA_N = 8;
+// const int WMMA_K = 128;
 // GOOD
 void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
 unsigned char *A, int lda,
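
A minimal sketch of the WMMA path the new comment block points at (an illustration under stated assumptions, not the committed kernel): one warp produces one 8x8 int tile from bit-packed inputs, A row-major and B column-major, K a multiple of 128, compute capability 7.5+, CUDA 10+. bmma_sync accumulates popc(A xor B), so the usual +/-1 XNOR dot product is recovered as K - 2*popc at the end:

    #include <mma.h>
    using namespace nvcuda;

    __global__ void xnor_gemm_wmma_sketch(const unsigned *A, const unsigned *B,
                                          int *C, int M, int N, int K)
    {
        const int WMMA_M = 8, WMMA_N = 8, WMMA_K = 128;
        int warp = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
        int ti = (warp / (N / WMMA_N)) * WMMA_M;   // tile row in C
        int tj = (warp % (N / WMMA_N)) * WMMA_N;   // tile column in C
        if (ti >= M) return;

        // leading dimensions for the b1 type are given in bits
        wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K,
                       wmma::experimental::precision::b1, wmma::row_major> a_frag;
        wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K,
                       wmma::experimental::precision::b1, wmma::col_major> b_frag;
        wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, int> c_frag;
        wmma::fill_fragment(c_frag, 0);

        for (int k = 0; k < K; k += WMMA_K) {
            wmma::load_matrix_sync(a_frag, A + (ti * K + k) / 32, K);
            wmma::load_matrix_sync(b_frag, B + (tj * K + k) / 32, K);
            wmma::bmma_sync(c_frag, a_frag, b_frag, c_frag);  // XOR then POPC
        }
        // c_frag holds popc(a xor b); convert to the +/-1 dot product
        for (int t = 0; t < c_frag.num_elements; ++t)
            c_frag.x[t] = K - 2 * c_frag.x[t];
        wmma::store_matrix_sync(C + ti * N + tj, c_frag, N, wmma::mem_row_major);
    }

Launch with one warp per output tile, i.e. (M/8)*(N/8) warps in total.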
