diff --git a/src/crnn_layer.c b/src/crnn_layer.c index 56ead3af..5a9d6f58 100644 --- a/src/crnn_layer.c +++ b/src/crnn_layer.c @@ -215,6 +215,16 @@ void forward_crnn_layer_gpu(layer l, network_state state) layer input_layer = *(l.input_layer); layer self_layer = *(l.self_layer); layer output_layer = *(l.output_layer); + /* +#ifdef CUDNN_HALF +// slow and bad + s.index = state.index; + s.net = state.net; + cuda_convert_f32_to_f16(input_layer.weights_gpu, input_layer.c*input_layer.n*input_layer.size*input_layer.size, input_layer.weights_gpu16); + cuda_convert_f32_to_f16(self_layer.weights_gpu, self_layer.c*self_layer.n*self_layer.size*self_layer.size, self_layer.weights_gpu16); + cuda_convert_f32_to_f16(output_layer.weights_gpu, output_layer.c*output_layer.n*output_layer.size*output_layer.size, output_layer.weights_gpu16); +#endif //CUDNN_HALF + */ fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1); fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1); diff --git a/src/network_kernels.cu b/src/network_kernels.cu index edb4cd5e..cb6c0f38 100644 --- a/src/network_kernels.cu +++ b/src/network_kernels.cu @@ -1,6 +1,7 @@ -#include "cuda_runtime.h" -#include "curand.h" -#include "cublas_v2.h" +//#include "cuda_runtime.h" +//#include "curand.h" +//#include "cublas_v2.h" +#include "cuda.h" extern "C" { #include @@ -61,7 +62,7 @@ void forward_network_gpu(network net, network_state state) //printf("\n layer %d - type: %d - \n", i, l.type); //start_timer(); l.forward_gpu(l, state); - //cudaDeviceSynchronize(); + //CHECK_CUDA(cudaDeviceSynchronize()); //stop_timer_and_show(); if(net.wait_stream) @@ -152,7 +153,7 @@ void forward_backward_network_gpu(network net, float *x, float *y) int i; for (i = 0; i < net.n; ++i) { layer l = net.layers[i]; - if (l.weights_gpu) { + if (l.weights_gpu && l.weights_gpu16) { assert((l.c*l.n*l.size*l.size) > 0); cuda_convert_f32_to_f16(l.weights_gpu, l.c*l.n*l.size*l.size, l.weights_gpu16); }