Fixed RNN (LSTM, RNN, CRNN, GRU) for CUDNN_HALF=1

pull/4477/head
AlexeyAB 6 years ago
parent 9e07605bc5
commit f154d2070a
  1. 10
      src/crnn_layer.c
  2. 11
      src/network_kernels.cu

@@ -215,6 +215,16 @@ void forward_crnn_layer_gpu(layer l, network_state state)
layer input_layer = *(l.input_layer); layer input_layer = *(l.input_layer);
layer self_layer = *(l.self_layer); layer self_layer = *(l.self_layer);
layer output_layer = *(l.output_layer); layer output_layer = *(l.output_layer);
/*
#ifdef CUDNN_HALF
// slow and bad
s.index = state.index;
s.net = state.net;
cuda_convert_f32_to_f16(input_layer.weights_gpu, input_layer.c*input_layer.n*input_layer.size*input_layer.size, input_layer.weights_gpu16);
cuda_convert_f32_to_f16(self_layer.weights_gpu, self_layer.c*self_layer.n*self_layer.size*self_layer.size, self_layer.weights_gpu16);
cuda_convert_f32_to_f16(output_layer.weights_gpu, output_layer.c*output_layer.n*output_layer.size*output_layer.size, output_layer.weights_gpu16);
#endif //CUDNN_HALF
*/
fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1); fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1); fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);

@@ -1,6 +1,7 @@
#include "cuda_runtime.h" //#include "cuda_runtime.h"
#include "curand.h" //#include "curand.h"
#include "cublas_v2.h" //#include "cublas_v2.h"
#include "cuda.h"
extern "C" { extern "C" {
#include <stdio.h> #include <stdio.h>
@@ -61,7 +62,7 @@ void forward_network_gpu(network net, network_state state)
//printf("\n layer %d - type: %d - \n", i, l.type); //printf("\n layer %d - type: %d - \n", i, l.type);
//start_timer(); //start_timer();
l.forward_gpu(l, state); l.forward_gpu(l, state);
//cudaDeviceSynchronize(); //CHECK_CUDA(cudaDeviceSynchronize());
//stop_timer_and_show(); //stop_timer_and_show();
if(net.wait_stream) if(net.wait_stream)
@@ -152,7 +153,7 @@ void forward_backward_network_gpu(network net, float *x, float *y)
int i; int i;
for (i = 0; i < net.n; ++i) { for (i = 0; i < net.n; ++i) {
layer l = net.layers[i]; layer l = net.layers[i];
if (l.weights_gpu) { if (l.weights_gpu && l.weights_gpu16) {
assert((l.c*l.n*l.size*l.size) > 0); assert((l.c*l.n*l.size*l.size) > 0);
cuda_convert_f32_to_f16(l.weights_gpu, l.c*l.n*l.size*l.size, l.weights_gpu16); cuda_convert_f32_to_f16(l.weights_gpu, l.c*l.n*l.size*l.size, l.weights_gpu16);
} }

Loading…
Cancel
Save