fully separate C-API from CPP-API

pull/2394/head
Stefano Sinigardi 6 years ago
parent 6e7c7976d7
commit 00de023601
  1. 2
      include/darknet.h
  2. 28
      include/yolo_v2_class.hpp
  3. 8
      src/convolutional_layer.c
  4. 11
      src/cpu_gemm.c
  5. 37
      src/gemm.c
  6. 3
      src/image.c
  7. 3
      src/utils.c
  8. 1
      src/yolo_v2_class.cpp

@ -12,6 +12,7 @@
#include <assert.h> #include <assert.h>
#include <pthread.h> #include <pthread.h>
#ifndef LIB_API
#ifdef LIB_EXPORTS #ifdef LIB_EXPORTS
#if defined(_MSC_VER) #if defined(_MSC_VER)
#define LIB_API __declspec(dllexport) #define LIB_API __declspec(dllexport)
@ -25,6 +26,7 @@
#define LIB_API #define LIB_API
#endif #endif
#endif #endif
#endif
#define NFRAMES 3 #define NFRAMES 3
#define SECRET_NUM -1234 #define SECRET_NUM -1234

@ -1,23 +1,37 @@
#ifndef YOLO_V2_CLASS_HPP #ifndef YOLO_V2_CLASS_HPP
#define YOLO_V2_CLASS_HPP #define YOLO_V2_CLASS_HPP
#include "darknet.h" #ifndef LIB_API
#ifdef LIB_EXPORTS
#if defined(_MSC_VER)
#define LIB_API __declspec(dllexport)
#else
#define LIB_API __attribute__((visibility("default")))
#endif
#else
#if defined(_MSC_VER)
#define LIB_API
#else
#define LIB_API
#endif
#endif
#endif
#define C_SHARP_MAX_OBJECTS 1000 #define C_SHARP_MAX_OBJECTS 1000
struct bbox_t { struct bbox_t {
unsigned int x, y, w, h; // (x,y) - top-left corner, (w, h) - width & height of bounding box unsigned int x, y, w, h; // (x,y) - top-left corner, (w, h) - width & height of bounding box
float prob; // confidence - probability that the object was found correctly float prob; // confidence - probability that the object was found correctly
unsigned int obj_id; // class of object - from range [0, classes-1] unsigned int obj_id; // class of object - from range [0, classes-1]
unsigned int track_id; // tracking id for video (0 - untracked, 1 - inf - tracked object) unsigned int track_id; // tracking id for video (0 - untracked, 1 - inf - tracked object)
unsigned int frames_counter;// counter of frames on which the object was detected unsigned int frames_counter; // counter of frames on which the object was detected
}; };
struct image_t { struct image_t {
int h; // height int h; // height
int w; // width int w; // width
int c; // number of channels (3 - for RGB) int c; // number of channels (3 - for RGB)
float *data; // pointer to the image data float *data; // pointer to the image data
}; };
struct bbox_t_container { struct bbox_t_container {
@ -34,7 +48,7 @@ struct bbox_t_container {
#include <opencv2/opencv.hpp> // C++ #include <opencv2/opencv.hpp> // C++
#include <opencv2/highgui/highgui_c.h> // C #include <opencv2/highgui/highgui_c.h> // C
#include <opencv2/imgproc/imgproc_c.h> // C #include <opencv2/imgproc/imgproc_c.h> // C
#endif // OPENCV #endif
extern "C" LIB_API int init(const char *configurationFilename, const char *weightsFilename, int gpu); extern "C" LIB_API int init(const char *configurationFilename, const char *weightsFilename, int gpu);
extern "C" LIB_API int detect_image(const char *filename, bbox_t_container &container); extern "C" LIB_API int detect_image(const char *filename, bbox_t_container &container);

@ -18,6 +18,12 @@
#include "xnor_layer.h" #include "xnor_layer.h"
#endif #endif
#ifdef __cplusplus
#define PUT_IN_REGISTER
#else
#define PUT_IN_REGISTER register
#endif
#ifndef AI2 #ifndef AI2
#define AI2 0 #define AI2 0
void forward_xnor_layer(layer l, network_state state); void forward_xnor_layer(layer l, network_state state);
@ -644,7 +650,7 @@ void gemm_nn_custom(int M, int N, int K, float ALPHA,
int i, j, k; int i, j, k;
for (i = 0; i < M; ++i) { for (i = 0; i < M; ++i) {
for (k = 0; k < K; ++k) { for (k = 0; k < K; ++k) {
float A_PART = ALPHA * A[i * lda + k]; PUT_IN_REGISTER float A_PART = ALPHA * A[i * lda + k];
//printf("\n weight = %f \n", A_PART); //printf("\n weight = %f \n", A_PART);
for (j = 0; j < N; ++j) { for (j = 0; j < N; ++j) {
C[i*ldc + j] += A_PART*B[k*ldb + j]; C[i*ldc + j] += A_PART*B[k*ldb + j];

@ -1,4 +1,9 @@
//#include "mini_blas.h" //#include "mini_blas.h"
#ifdef __cplusplus
#define PUT_IN_REGISTER
#else
#define PUT_IN_REGISTER register
#endif
void cpu_gemm_nn(int TA, int TB, int M, int N, int K, float ALPHA, void cpu_gemm_nn(int TA, int TB, int M, int N, int K, float ALPHA,
float *A, int lda, float *A, int lda,
@ -9,7 +14,7 @@ void cpu_gemm_nn(int TA, int TB, int M, int N, int K, float ALPHA,
int i,j,k; int i,j,k;
for(i = 0; i < M; ++i){ for(i = 0; i < M; ++i){
for(k = 0; k < K; ++k){ for(k = 0; k < K; ++k){
float A_PART = ALPHA * A[i * lda + k]; PUT_IN_REGISTER float A_PART = ALPHA * A[i * lda + k];
for(j = 0; j < N; ++j){ for(j = 0; j < N; ++j){
C[i*ldc+j] += A_PART*B[k*ldb+j]; C[i*ldc+j] += A_PART*B[k*ldb+j];
} }
@ -26,7 +31,7 @@ void cpu_gemm_nt(int TA, int TB, int M, int N, int K, float ALPHA,
int i,j,k; int i,j,k;
for(i = 0; i < M; ++i){ for(i = 0; i < M; ++i){
for(j = 0; j < N; ++j){ for(j = 0; j < N; ++j){
float sum = 0; PUT_IN_REGISTER float sum = 0;
for(k = 0; k < K; ++k){ for(k = 0; k < K; ++k){
sum += ALPHA*A[i*lda+k]*B[k+j*ldb]; sum += ALPHA*A[i*lda+k]*B[k+j*ldb];
} }
@ -44,7 +49,7 @@ void cpu_gemm_tn(int TA, int TB, int M, int N, int K, float ALPHA,
int i,j,k; int i,j,k;
for(i = 0; i < M; ++i){ for(i = 0; i < M; ++i){
for(k = 0; k < K; ++k){ for(k = 0; k < K; ++k){
float A_PART = ALPHA * A[k * lda + i]; PUT_IN_REGISTER float A_PART = ALPHA * A[k * lda + i];
for(j = 0; j < N; ++j){ for(j = 0; j < N; ++j){
C[i*ldc+j] += A_PART*B[k*ldb+j]; C[i*ldc+j] += A_PART*B[k*ldb+j];
} }

@ -18,6 +18,11 @@
#define TILE_M 4 // 4 ops #define TILE_M 4 // 4 ops
#define TILE_N 16 // AVX2 = 2 ops * 8 floats #define TILE_N 16 // AVX2 = 2 ops * 8 floats
#define TILE_K 16 // loop #define TILE_K 16 // loop
#ifdef __cplusplus
#define PUT_IN_REGISTER
#else
#define PUT_IN_REGISTER register
#endif
void gemm_bin(int M, int N, int K, float ALPHA, void gemm_bin(int M, int N, int K, float ALPHA,
char *A, int lda, char *A, int lda,
@ -713,7 +718,7 @@ void gemm_nn(int M, int N, int K, float ALPHA,
else { else {
for (i = 0; i < M; ++i) { for (i = 0; i < M; ++i) {
for (k = 0; k < K; ++k) { for (k = 0; k < K; ++k) {
float A_PART = ALPHA * A[i * lda + k]; PUT_IN_REGISTER float A_PART = ALPHA * A[i * lda + k];
for (j = 0; j < N; ++j) { for (j = 0; j < N; ++j) {
C[i*ldc + j] += A_PART*B[k*ldb + j]; C[i*ldc + j] += A_PART*B[k*ldb + j];
} }
@ -845,7 +850,7 @@ void gemm_nn_fast(int M, int N, int K, float ALPHA,
{ {
for (k_d = k; k_d < (k + TILE_K); ++k_d) for (k_d = k; k_d < (k + TILE_K); ++k_d)
{ {
register float A_PART = ALPHA*A[i_d*lda + k_d]; PUT_IN_REGISTER float A_PART = ALPHA*A[i_d*lda + k_d];
C[i_d*ldc + j] += A_PART*B[k_d*ldb + j]; C[i_d*ldc + j] += A_PART*B[k_d*ldb + j];
} }
} }
@ -856,7 +861,7 @@ void gemm_nn_fast(int M, int N, int K, float ALPHA,
{ {
for (i_d = i; i_d < (i + TILE_M); ++i_d) for (i_d = i; i_d < (i + TILE_M); ++i_d)
{ {
register float A_PART = ALPHA*A[i_d*lda + k]; PUT_IN_REGISTER float A_PART = ALPHA*A[i_d*lda + k];
for (j = 0; j < N; ++j) { for (j = 0; j < N; ++j) {
C[i_d*ldc + j] += A_PART*B[k*ldb + j]; C[i_d*ldc + j] += A_PART*B[k*ldb + j];
} }
@ -867,7 +872,7 @@ void gemm_nn_fast(int M, int N, int K, float ALPHA,
for (i = (M / TILE_M)*TILE_M; i < M; ++i) { for (i = (M / TILE_M)*TILE_M; i < M; ++i) {
int j, k; int j, k;
for (k = 0; k < K; ++k) { for (k = 0; k < K; ++k) {
register float A_PART = ALPHA*A[i*lda + k]; PUT_IN_REGISTER float A_PART = ALPHA*A[i*lda + k];
for (j = 0; j < N; ++j) { for (j = 0; j < N; ++j) {
C[i*ldc + j] += A_PART*B[k*ldb + j]; C[i*ldc + j] += A_PART*B[k*ldb + j];
} }
@ -890,7 +895,7 @@ void gemm_nn_bin_32bit_packed(int M, int N, int K, float ALPHA,
//printf(" l.mean_arr[i] = %d \n ", l.mean_arr[i]); //printf(" l.mean_arr[i] = %d \n ", l.mean_arr[i]);
for (s = 0; s < K; ++s) // l.size*l.size*l.c/32 or (l.size*l.size*l.c) for (s = 0; s < K; ++s) // l.size*l.size*l.c/32 or (l.size*l.size*l.c)
{ {
register uint32_t A_PART = A[i*lda + s]; PUT_IN_REGISTER uint32_t A_PART = A[i*lda + s];
__m256i a256 = _mm256_set1_epi32(A_PART); __m256i a256 = _mm256_set1_epi32(A_PART);
for (j = 0; j < N - 8; j += 8) for (j = 0; j < N - 8; j += 8)
@ -927,7 +932,7 @@ void gemm_nn_bin_32bit_packed(int M, int N, int K, float ALPHA,
for (; j < N; ++j) // out_h*out_w; for (; j < N; ++j) // out_h*out_w;
{ {
register uint32_t B_PART = B[s*ldb + j]; PUT_IN_REGISTER uint32_t B_PART = B[s*ldb + j];
uint32_t xnor_result = ~(A_PART ^ B_PART); uint32_t xnor_result = ~(A_PART ^ B_PART);
int32_t count = popcnt_32(xnor_result); // must be Signed int int32_t count = popcnt_32(xnor_result); // must be Signed int
@ -1950,7 +1955,7 @@ void gemm_nn(int M, int N, int K, float ALPHA,
int i, j, k; int i, j, k;
for (i = 0; i < M; ++i) { for (i = 0; i < M; ++i) {
for (k = 0; k < K; ++k) { for (k = 0; k < K; ++k) {
float A_PART = ALPHA * A[i * lda + k]; PUT_IN_REGISTER float A_PART = ALPHA * A[i * lda + k];
for (j = 0; j < N; ++j) { for (j = 0; j < N; ++j) {
C[i*ldc + j] += A_PART*B[k*ldb + j]; C[i*ldc + j] += A_PART*B[k*ldb + j];
} }
@ -1967,7 +1972,7 @@ void gemm_nn_fast(int M, int N, int K, float ALPHA,
#pragma omp parallel for #pragma omp parallel for
for (i = 0; i < M; ++i) { for (i = 0; i < M; ++i) {
for (k = 0; k < K; ++k) { for (k = 0; k < K; ++k) {
register float A_PART = ALPHA*A[i*lda + k]; PUT_IN_REGISTER float A_PART = ALPHA*A[i*lda + k];
for (j = 0; j < N; ++j) { for (j = 0; j < N; ++j) {
C[i*ldc + j] += A_PART*B[k*ldb + j]; C[i*ldc + j] += A_PART*B[k*ldb + j];
} }
@ -1988,12 +1993,12 @@ void gemm_nn_bin_32bit_packed(int M, int N, int K, float ALPHA,
//printf(" l.mean_arr[i] = %d \n ", l.mean_arr[i]); //printf(" l.mean_arr[i] = %d \n ", l.mean_arr[i]);
for (s = 0; s < K; ++s) // l.size*l.size*l.c/32 or (l.size*l.size*l.c) for (s = 0; s < K; ++s) // l.size*l.size*l.c/32 or (l.size*l.size*l.c)
{ {
//register float A_PART = 1*a[i*k + s]; //PUT_IN_REGISTER float A_PART = 1*a[i*k + s];
register uint32_t A_PART = A[i*lda + s]; PUT_IN_REGISTER uint32_t A_PART = A[i * lda + s];
for (j = 0; j < N; ++j) // out_h*out_w; for (j = 0; j < N; ++j) // out_h*out_w;
{ {
//c[i*n + j] += A_PART*b[s*n + j]; //c[i*n + j] += A_PART*b[s*n + j];
register uint32_t B_PART = B[s*ldb + j]; PUT_IN_REGISTER uint32_t B_PART = B[s * ldb + j];
uint32_t xnor_result = ~(A_PART ^ B_PART); uint32_t xnor_result = ~(A_PART ^ B_PART);
//printf(" xnor_result = %d, ", xnor_result); //printf(" xnor_result = %d, ", xnor_result);
int32_t count = popcnt_32(xnor_result); // must be Signed int int32_t count = popcnt_32(xnor_result); // must be Signed int
@ -2490,8 +2495,8 @@ void gemm_nn_bin_transposed_32bit_packed(int M, int N, int K, float ALPHA,
float val = 0; float val = 0;
for (s = 0; s < K; ++s) // l.size*l.size*l.c/32 or (l.size*l.size*l.c) for (s = 0; s < K; ++s) // l.size*l.size*l.c/32 or (l.size*l.size*l.c)
{ {
register uint32_t A_PART = ((uint32_t*)A)[i*lda + s]; PUT_IN_REGISTER uint32_t A_PART = ((uint32_t*)A)[i*lda + s];
register uint32_t B_PART = ((uint32_t*)B)[j*ldb + s]; PUT_IN_REGISTER uint32_t B_PART = ((uint32_t*)B)[j * ldb + s];
uint32_t xnor_result = ~(A_PART ^ B_PART); uint32_t xnor_result = ~(A_PART ^ B_PART);
int32_t count = popcnt_32(xnor_result); // must be Signed int int32_t count = popcnt_32(xnor_result); // must be Signed int
@ -2576,7 +2581,7 @@ void gemm_nt(int M, int N, int K, float ALPHA,
int i,j,k; int i,j,k;
for(i = 0; i < M; ++i){ for(i = 0; i < M; ++i){
for(j = 0; j < N; ++j){ for(j = 0; j < N; ++j){
float sum = 0; PUT_IN_REGISTER float sum = 0;
for(k = 0; k < K; ++k){ for(k = 0; k < K; ++k){
sum += ALPHA*A[i*lda+k]*B[j*ldb + k]; sum += ALPHA*A[i*lda+k]*B[j*ldb + k];
} }
@ -2593,7 +2598,7 @@ void gemm_tn(int M, int N, int K, float ALPHA,
int i,j,k; int i,j,k;
for(i = 0; i < M; ++i){ for(i = 0; i < M; ++i){
for(k = 0; k < K; ++k){ for(k = 0; k < K; ++k){
float A_PART = ALPHA * A[k * lda + i]; PUT_IN_REGISTER float A_PART = ALPHA * A[k * lda + i];
for(j = 0; j < N; ++j){ for(j = 0; j < N; ++j){
C[i*ldc+j] += A_PART*B[k*ldb+j]; C[i*ldc+j] += A_PART*B[k*ldb+j];
} }
@ -2609,7 +2614,7 @@ void gemm_tt(int M, int N, int K, float ALPHA,
int i,j,k; int i,j,k;
for(i = 0; i < M; ++i){ for(i = 0; i < M; ++i){
for(j = 0; j < N; ++j){ for(j = 0; j < N; ++j){
float sum = 0; PUT_IN_REGISTER float sum = 0;
for(k = 0; k < K; ++k){ for(k = 0; k < K; ++k){
sum += ALPHA*A[i+k*lda]*B[k+j*ldb]; sum += ALPHA*A[i+k*lda]*B[k+j*ldb];
} }

@ -3,6 +3,9 @@
#include "blas.h" #include "blas.h"
#include "cuda.h" #include "cuda.h"
#include <stdio.h> #include <stdio.h>
#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#endif
#include <math.h> #include <math.h>
#ifndef STB_IMAGE_IMPLEMENTATION #ifndef STB_IMAGE_IMPLEMENTATION

@ -2,6 +2,9 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#endif
#include <math.h> #include <math.h>
#include <assert.h> #include <assert.h>
#include <float.h> #include <float.h>

@ -1,3 +1,4 @@
#include "darknet.h"
#include "yolo_v2_class.hpp" #include "yolo_v2_class.hpp"
#include "network.h" #include "network.h"

Loading…
Cancel
Save