Commit f9446ac

🐍 🐍 🐍 🐍 🐍
pjreddie committed Jun 11, 2017
1 parent d8c5cfd commit f9446ac
Showing 4 changed files with 32 additions and 25 deletions.
18 changes: 11 additions & 7 deletions Makefile
@@ -13,14 +13,15 @@ ARCH= -gencode arch=compute_20,code=[sm_20,sm_21] \
 # ARCH= -gencode arch=compute_52,code=compute_52
 
 VPATH=./src/:./examples
-LIB=libdarknet.so
+SLIB=libdarknet.so
+ALIB=libdarknet.a
 EXEC=darknet
 OBJDIR=./obj/
 
 CC=gcc
 NVCC=nvcc --compiler-options '-fPIC'
 AR=ar
-ARFLAGS=-rv
+ARFLAGS=rcs
 OPTS=-Ofast
 LDFLAGS= -lm -pthread
 COMMON= -Iinclude/ -Isrc/
@@ -62,13 +63,16 @@ EXECOBJ = $(addprefix $(OBJDIR), $(EXECOBJA))
 OBJS = $(addprefix $(OBJDIR), $(OBJ))
 DEPS = $(wildcard src/*.h) Makefile include/darknet.h
 
-all: obj backup results $(LIB) $(EXEC)
+all: obj backup results $(SLIB) $(ALIB) $(EXEC)
 
 
-$(EXEC): $(EXECOBJ) $(LIB)
-	$(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LIB)
+$(EXEC): $(EXECOBJ) $(ALIB)
+	$(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(ALIB)
 
-$(LIB): $(OBJS)
+$(ALIB): $(OBJS)
 	$(AR) $(ARFLAGS) $@ $^
 
+$(SLIB): $(OBJS)
+	$(CC) $(CFLAGS) -shared $^ -o $@ $(LDFLAGS)
 
 $(OBJDIR)%.o: %.c $(DEPS)
@@ -87,5 +91,5 @@ results:
 .PHONY: clean
 
 clean:
-	rm -rf $(OBJS) $(LIB) $(EXEC) $(EXECOBJ)
+	rm -rf $(OBJS) $(SLIB) $(ALIB) $(EXEC) $(EXECOBJ)

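Note on this change: the Makefile now builds both a static archive, libdarknet.a (ALIB, packed with "ar rcs": r inserts the object files, c creates the archive if it does not exist, s writes a symbol index), and a shared object, libdarknet.so (SLIB), which is why nvcc forwards -fPIC to the host compiler; the darknet executable itself links against the static archive. Below is a minimal sketch of consuming the shared library at runtime via the POSIX dlopen API; "some_darknet_function" is a placeholder symbol, not a real darknet API.

/* Sketch: open the freshly built libdarknet.so at runtime (POSIX).
 * Build: gcc demo.c -ldl -o demo, and run from the darknet directory. */
#include <stdio.h>
#include <dlfcn.h>

int main(void)
{
    void *handle = dlopen("./libdarknet.so", RTLD_NOW);
    if(!handle){
        fprintf(stderr, "dlopen failed: %s\n", dlerror());
        return 1;
    }
    /* placeholder name; substitute any function declared in include/darknet.h */
    void *fn = dlsym(handle, "some_darknet_function");
    if(!fn) fprintf(stderr, "dlsym failed: %s\n", dlerror());
    dlclose(handle);
    return 0;
}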
19 changes: 11 additions & 8 deletions src/blas_kernels.cu
@@ -53,21 +53,24 @@ void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size,
     check_error(cudaPeekAtLastError());
 }
 
-__global__ void add_bias_kernel(float *output, float *biases, int n, int size)
+__global__ void add_bias_kernel(float *output, float *biases, int batch, int n, int size)
 {
-    int offset = blockIdx.x * blockDim.x + threadIdx.x;
-    int filter = blockIdx.y;
-    int batch = blockIdx.z;
+    int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (index >= n*size*batch) return;
+    int i = index % size;
+    index /= size;
+    int j = index % n;
+    index /= n;
+    int k = index;
 
-    if(offset < size) output[(batch*n+filter)*size + offset] += biases[filter];
+    output[(k*n+j)*size + i] += biases[j];
 }
 
 void add_bias_gpu(float *output, float *biases, int batch, int n, int size)
 {
-    dim3 dimGrid((size-1)/BLOCK + 1, n, batch);
-    dim3 dimBlock(BLOCK, 1, 1);
+    int num = n*size*batch;
 
-    add_bias_kernel<<<dimGrid, dimBlock>>>(output, biases, n, size);
+    add_bias_kernel<<<cuda_gridsize(num), BLOCK>>>(output, biases, batch, n, size);
     check_error(cudaPeekAtLastError());
 }

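Note on this change: add_bias_kernel previously used a 3-D launch grid (x blocks over spatial positions, y over filters, z over batch images), so the launch shape was tied to the data shape and any block tail beyond size sat idle. The rewrite launches a flat pool of n*size*batch threads; darknet's cuda_gridsize() helper in src/cuda.c spreads that count across grid x and y when it exceeds the per-dimension limit, which is why the kernel rebuilds a linear index as (blockIdx.x + blockIdx.y*gridDim.x)*blockDim.x + threadIdx.x and then decodes spatial position i, filter j, and batch image k by mod/div. A CPU sketch of the same indexing, for clarity (darknet has an equivalent CPU routine, but treat this version as illustrative):

/* Sketch: output is laid out [batch][filter][spatial], so element
 * (k, j, i) lives at output[(k*n + j)*size + i]; every element of
 * filter j gets biases[j] added. */
void add_bias_cpu_sketch(float *output, float *biases, int batch, int n, int size)
{
    int i, j, k;
    for(k = 0; k < batch; ++k){         /* image in the batch */
        for(j = 0; j < n; ++j){         /* filter / channel   */
            for(i = 0; i < size; ++i){  /* spatial position   */
                output[(k*n + j)*size + i] += biases[j];
            }
        }
    }
}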
2 changes: 1 addition & 1 deletion src/cuda.c
@@ -26,7 +26,7 @@ int cuda_get_device()

 void check_error(cudaError_t status)
 {
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     cudaError_t status2 = cudaGetLastError();
     if (status != cudaSuccess)
     {
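Note on this change: check_error() runs after every kernel launch, so the unconditional cudaDeviceSynchronize() stalled the CPU until the GPU drained on each call; commenting it out removes that serialization. The trade-off is that errors from asynchronous kernel launches can now surface at a later CUDA call rather than at the offending one. A sketch of an opt-in way to recover the old behavior while debugging; DARKNET_SYNC_DEBUG is a hypothetical flag, not something defined in the repo:

/* Sketch: wrapper that restores synchronous error checking on demand.
 * "cuda.h" is darknet's src/cuda.h, which declares check_error(). */
#include "cuda.h"

void check_error_debug(cudaError_t status)
{
#ifdef DARKNET_SYNC_DEBUG
    /* force pending asynchronous kernel errors to surface here */
    cudaDeviceSynchronize();
#endif
    check_error(status);
}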
18 changes: 9 additions & 9 deletions src/gru_layer.c
@@ -68,15 +68,6 @@ layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_no
     *(l.wh) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
     l.wh->batch = batch;
 
-#ifdef CUDNN
-    cudnnSetTensor4dDescriptor(l.uz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uz->out_c, l.uz->out_h, l.uz->out_w);
-    cudnnSetTensor4dDescriptor(l.uh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uh->out_c, l.uh->out_h, l.uh->out_w);
-    cudnnSetTensor4dDescriptor(l.ur->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ur->out_c, l.ur->out_h, l.ur->out_w);
-    cudnnSetTensor4dDescriptor(l.wz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wz->out_c, l.wz->out_h, l.wz->out_w);
-    cudnnSetTensor4dDescriptor(l.wh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wh->out_c, l.wh->out_h, l.wh->out_w);
-    cudnnSetTensor4dDescriptor(l.wr->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wr->out_c, l.wr->out_h, l.wr->out_w);
-#endif
-
     l.batch_normalize = batch_normalize;
 
 
@@ -110,6 +101,15 @@ layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_no
     l.r_gpu = cuda_make_array(0, batch*outputs);
     l.z_gpu = cuda_make_array(0, batch*outputs);
     l.h_gpu = cuda_make_array(0, batch*outputs);
+
+#ifdef CUDNN
+    cudnnSetTensor4dDescriptor(l.uz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uz->out_c, l.uz->out_h, l.uz->out_w);
+    cudnnSetTensor4dDescriptor(l.uh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uh->out_c, l.uh->out_h, l.uh->out_w);
+    cudnnSetTensor4dDescriptor(l.ur->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ur->out_c, l.ur->out_h, l.ur->out_w);
+    cudnnSetTensor4dDescriptor(l.wz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wz->out_c, l.wz->out_h, l.wz->out_w);
+    cudnnSetTensor4dDescriptor(l.wh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wh->out_c, l.wh->out_h, l.wh->out_w);
+    cudnnSetTensor4dDescriptor(l.wr->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wr->out_c, l.wr->out_h, l.wr->out_w);
+#endif
 #endif
 
     return l;

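Note on this change: the six cudnnSetTensor4dDescriptor() calls are unchanged line for line; they have only been moved after the GPU buffer allocations, nested inside both #ifdef GPU and #ifdef CUDNN (hence the two consecutive #endif lines), so the cuDNN setup now sits with the rest of the GPU-side initialization. Each call describes a sub-layer's dstTensorDesc as an NCHW float tensor using the per-step batch (note the batch argument, where the sub-layers themselves were created with batch*steps rows). A minimal sketch of that cuDNN descriptor pattern, with error checking omitted:

/* Sketch: a cuDNN tensor descriptor is created once, then
 * cudnnSetTensor4dDescriptor() describes the buffer as an NCHW
 * float tensor of shape (batch, c, h, w). */
#include <cudnn.h>

static void describe_nchw(cudnnTensorDescriptor_t *desc,
                          int batch, int c, int h, int w)
{
    cudnnCreateTensorDescriptor(desc);
    cudnnSetTensor4dDescriptor(*desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
                               batch, c, h, w);
}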