Skip to content

Commit

Permalink
Merge branch 'master' of github.com:lsds/Crossbow
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexandros Koliousis committed May 4, 2019
2 parents b0ed097 + ec212bb commit a3cc725
Show file tree
Hide file tree
Showing 88 changed files with 4,220 additions and 3,036 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@
checkpoints
.DS_Store
/target/
/bin/
4 changes: 2 additions & 2 deletions clib-multigpu/GPU.c
Original file line number Diff line number Diff line change
Expand Up @@ -774,15 +774,15 @@ JNIEXPORT jint JNICALL Java_uk_ac_imperial_lsds_crossbow_device_TheGPU_setLearni
}

JNIEXPORT jint JNICALL Java_uk_ac_imperial_lsds_crossbow_device_TheGPU_setLearningRateDecayPolicyMultiStep
(JNIEnv *env, jobject obj, jfloat learningRate, jdouble gamma, jintArray steps) {
(JNIEnv *env, jobject obj, jfloat learningRate, jdouble gamma, jint warmuptasks, jintArray steps) {

(void) env;
(void) obj;

jsize argc = (*env)->GetArrayLength(env, steps);
jint *argv = (*env)->GetIntArrayElements(env, steps, 0);

crossbowExecutionContextSetLearningRateDecayPolicyMultiStep (theGPU, learningRate, gamma, argc, argv);
crossbowExecutionContextSetLearningRateDecayPolicyMultiStep (theGPU, learningRate, gamma, warmuptasks, argc, argv);

(*env)->ReleaseIntArrayElements (env, steps, argv, JNI_ABORT);

Expand Down
107 changes: 90 additions & 17 deletions clib-multigpu/Makefile
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
CLASS_PATH := ../target/classes
vpath %.class $(CLASS_PATH)

# Makefile for Crossbow C/C++ library
# Customised for platypus2 running Linux on Wed 1 May 14:27:34 UTC 2019
#
OS = Linux
CUDA_PATH := /usr/local/cuda
JAVA_PATH := /usr/lib/jvm/java-8-openjdk-amd64
BLAS_PATH := /opt/OpenBLAS
NCCL_PATH := /home/akolious/nccl/nccl-2.1.15
NCCL_PATH :=
JPEG_PATH := /home/akolious/libjpeg-turbo
JAVA_PATH := /usr/lib/jvm/java-8-oracle
# Note: for Java 7, use: /usr/lib/jvm/java-7-openjdk-amd64/
CBOW_PATH := /home/akolious/pre-release
CBOW_PATH := /home/akolious/crossbow

CLASS_PATH := ../target/classes
vpath %.class $(CLASS_PATH)

ARCH := $(shell uname -m)
ifneq (,$(filter $(ARCH),x86_64))
Expand Down Expand Up @@ -65,13 +68,19 @@ INCLUDES := -I/usr/include -D_GNU_SOURCE
INCLUDES += -I$(CUDA_PATH)/include

# OpenBLAS
INCLUDES += -I$(BLAS_PATH)/include
# Add the OpenBLAS include directory only when BLAS_PATH is non-empty,
# so that an empty path never expands to "-I/include".
ifneq ($(BLAS_PATH),)
INCLUDES += -I$(BLAS_PATH)/include
endif

# NCCL
INCLUDES += -I$(NCCL_PATH)/include
# Add the NCCL include directory only when NCCL_PATH is non-empty,
# so that an empty path never expands to "-I/include".
ifneq ($(NCCL_PATH),)
INCLUDES += -I$(NCCL_PATH)/include
endif

# Turbo-JPEG
INCLUDES += -I$(JPEG_PATH)
# Add the Turbo-JPEG directory to the include path only when JPEG_PATH is
# non-empty, so that an empty path never expands to a bare "-I".
ifneq ($(JPEG_PATH),)
INCLUDES += -I$(JPEG_PATH)
endif

# JNI
ifeq ($(OS),Darwin)
Expand All @@ -98,7 +107,11 @@ LIBS += -L$(CUDA_PATH)/lib$(POSTFIX) -lcudart -lcublas -lcudnn -lcurand -lnvTool
# Link against OpenBLAS. When BLAS_PATH is empty, pass only -lopenblas and
# let the linker use its default search path; an unguarded
# "-L$(BLAS_PATH)/lib" would otherwise expand to "-L/lib".
ifneq ($(BLAS_PATH),)
LIBS += -L$(BLAS_PATH)/lib -lopenblas
else
LIBS += -lopenblas
endif

# NCCL
LIBS += -L$(NCCL_PATH)/lib -lnccl
# With NCCL_PATH set, search $(NCCL_PATH)/lib for libnccl; otherwise pass
# only -lnccl and leave the lookup to the linker's default search path.
ifneq ($(NCCL_PATH),)
LIBS += -L$(NCCL_PATH)/lib -lnccl
else
LIBS += -lnccl
endif

# PTX code generation
#
Expand All @@ -115,8 +128,8 @@ ifneq ($(SMMAX),)
endif
endif

OBJS := executioncontext.o timer.o threadsafequeue.o waitfreequeue.o thetaqueue.o memorymanager.o list.o bytebuffer.o bufferpool.o arraylist.o stream.o kernel.o operator.o operatordependency.o dataflow.o variableschema.o variable.o localvariable.o kernelconfigurationparameter.o kernelscalar.o model.o modelmanager.o resulthandler.o databuffer.o kernelmap.o batch.o callbackhandler.o taskhandler.o solverconfiguration.o measurementlist.o device.o lightweightdatasethandler.o recorddataset.o doublebuffer.o cudnn/cudnntensor.o cudnn/cudnnconvparams.o cudnn/cudnnpoolparams.o cudnn/cudnnreluparams.o cudnn/cudnnsoftmaxparams.o cudnn/cudnnbatchnormparams.o cudnn/cudnndropoutparams.o cudnn/cudnnhelper.o
KNLS := kernels/classify.o kernels/accuracy.o kernels/gradientdescentoptimiser.o kernels/innerproduct.o kernels/innerproductgradient.o kernels/matmul.o kernels/noop.o kernels/noopstateless.o kernels/softmax.o kernels/softmaxgradient.o kernels/softmaxloss.o kernels/softmaxlossgradient.o kernels/pool.o kernels/poolgradient.o kernels/relu.o kernels/relugradient.o kernels/conv.o kernels/convgradient.o kernels/dropout.o kernels/dropoutgradient.o kernels/lrn.o kernels/lrngradient.o kernels/matfact.o kernels/cudnnconv.o kernels/cudnnconvgradient.o kernels/cudnnpool.o kernels/cudnnpoolgradient.o kernels/cudnnrelu.o kernels/cudnnrelugradient.o kernels/cudnnsoftmax.o kernels/cudnnsoftmaxgradient.o kernels/datatransform.o kernels/batchnorm.o kernels/batchnormgradient.o kernels/cudnnbatchnorm.o kernels/cudnnbatchnormgradient.o kernels/cudnndropout.o kernels/cudnndropoutgradient.o kernels/elementwiseop.o kernels/elementwiseopgradient.o kernels/concat.o kernels/concatgradient.o kernels/sleep.o
OBJS := executioncontext.o timer.o threadsafequeue.o waitfreequeue.o thetaqueue.o memorymanager.o list.o bytebuffer.o bufferpool.o arraylist.o stream.o kernel.o operator.o operatordependency.o dataflow.o variableschema.o variable.o localvariable.o kernelconfigurationparameter.o kernelscalar.o model.o modelmanager.o resulthandler.o databuffer.o kernelmap.o batch.o callbackhandler.o taskhandler.o solverconfiguration.o measurementlist.o device.o lightweightdatasethandler.o recorddataset.o doublebuffer.o synch/common.o synch/default.o synch/downpour.o synch/eamsgd.o synch/hogwild.o synch/polyakruppert.o synch/sma.o synch/synchronouseamsgd.o synch/synchronoussgd.o cudnn/cudnntensor.o cudnn/cudnnconvparams.o cudnn/cudnnpoolparams.o cudnn/cudnnreluparams.o cudnn/cudnnsoftmaxparams.o cudnn/cudnnbatchnormparams.o cudnn/cudnndropoutparams.o cudnn/cudnnhelper.o
KNLS := kernels/classify.o kernels/accuracy.o kernels/gradientdescentoptimiser.o kernels/innerproduct.o kernels/innerproductgradient.o kernels/matmul.o kernels/noop.o kernels/noopstateless.o kernels/softmax.o kernels/softmaxgradient.o kernels/softmaxloss.o kernels/softmaxlossgradient.o kernels/pool.o kernels/poolgradient.o kernels/relu.o kernels/relugradient.o kernels/conv.o kernels/convgradient.o kernels/dropout.o kernels/dropoutgradient.o kernels/lrn.o kernels/lrngradient.o kernels/matfact.o kernels/cudnnconv.o kernels/cudnnconvgradient.o kernels/cudnnpool.o kernels/cudnnpoolgradient.o kernels/cudnnrelu.o kernels/cudnnrelugradient.o kernels/cudnnsoftmax.o kernels/cudnnsoftmaxgradient.o kernels/datatransform.o kernels/batchnorm.o kernels/batchnormgradient.o kernels/cudnnbatchnorm.o kernels/cudnnbatchnormgradient.o kernels/cudnndropout.o kernels/cudnndropoutgradient.o kernels/elementwiseop.o kernels/elementwiseopgradient.o kernels/concat.o kernels/concatgradient.o kernels/sleep.o kernels/optimisers/default.o kernels/optimisers/hogwild.o kernels/optimisers/downpour.o kernels/optimisers/eamsgd.o kernels/optimisers/synchronouseamsgd.o kernels/optimisers/synchronoussgd.o kernels/optimisers/sma.o kernels/optimisers/polyakruppert.o

CROSSBOWBASEINCLUDES := memorymanager.h debug.h utils.h

Expand All @@ -136,7 +149,7 @@ uk_ac_imperial_lsds_crossbow_device_ObjectRef.h:
# Shared library produced from CPU.o and linked against $(LIBS).
# $@ is the target (libCPU.so) and $< its first prerequisite (CPU.o).
libCPU.so: CPU.o
	$(NV) $(LFL) -shared -o $@ $< $(LIBS)

libGPU.so: GPU.o image/recordreader.o image/recordfile.o image/record.o image/image.o image/boundingbox.o image/rectangle.o image/yarng.o $(OBJS) $(KNLS)
# Shared library for the GPU back end.
# The link line uses $^ (all prerequisites, duplicates removed) so that the
# objects that are linked can never drift from the objects that are listed
# as prerequisites. Previously random/random.o and random/generator.o were
# built as prerequisites but left out of the link command.
libGPU.so: GPU.o image/recordreader.o image/recordfile.o image/record.o image/image.o image/boundingbox.o image/rectangle.o image/yarng.o random/random.o random/generator.o $(OBJS) $(KNLS)
	$(NV) $(LFL) -shared -o $@ $^ $(LIBS)

libBLAS.so: BLAS.o $(OBJS) $(KNLS)
Expand Down Expand Up @@ -170,7 +183,7 @@ random/generator.o: random/generator.cpp random/generator.hpp
$(CPP) $(INCLUDES) -W -Wall -DWARNING -fPIC -Wno-unused-function -c $< -o $@

dataset.o: dataset.c uk_ac_imperial_lsds_crossbow_device_dataset_DatasetMemoryManager.h
$(NV) $(INCLUDES) $(LFL) $(GENCODE) -c $< -o $@
$(NV) $(INCLUDES) $(LFL) $(GENCODE) -c $< -o $@

# Builds datasetfilemanager.o from datasetfilemanager.c ($<); rebuilt when
# the source, its header, or any file in $(CROSSBOWBASEINCLUDES) changes.
datasetfilemanager.o: datasetfilemanager.c datasetfilemanager.h $(CROSSBOWBASEINCLUDES)
$(NV) $(INCLUDES) $(LFL) $(GENCODE) -c $< -o $@
Expand All @@ -191,7 +204,7 @@ memoryregistry.o: memoryregistry.c memoryregistry.h memoryregistrynode.h $(CROSS
$(NV) $(INCLUDES) $(LFL) $(GENCODE) -c $< -o $@

lightweightdataset.o: lightweightdataset.c uk_ac_imperial_lsds_crossbow_device_dataset_LightWeightDatasetMemoryManager.h
$(NV) $(INCLUDES) $(LFL) $(GENCODE) -c $< -o $@
$(NV) $(INCLUDES) $(LFL) $(GENCODE) -c $< -o $@

# Builds lightweightdatasetmanager.o from lightweightdatasetmanager.c ($<);
# rebuilt when the source, its header, or any file in
# $(CROSSBOWBASEINCLUDES) changes.
lightweightdatasetmanager.o: lightweightdatasetmanager.c lightweightdatasetmanager.h $(CROSSBOWBASEINCLUDES)
$(NV) $(INCLUDES) $(LFL) $(GENCODE) -c $< -o $@
Expand All @@ -211,6 +224,37 @@ recorddataset.o: recorddataset.c recorddataset.h $(CROSSBOWBASEINCLUDES)
# Builds doublebuffer.o from doublebuffer.c ($<); rebuilt when the source,
# its header, or any file in $(CROSSBOWBASEINCLUDES) changes.
doublebuffer.o: doublebuffer.c doublebuffer.h $(CROSSBOWBASEINCLUDES)
$(NV) $(INCLUDES) $(LFL) $(GENCODE) -c $< -o $@

# === [Helpers for SGD (cross-replica synchronisation variants)] ===
#

# Every object under synch/ is compiled with the same command and the same
# dependency shape: its own .c and .h, plus synch/common.h,
# executioncontext.h and the base includes. A single static pattern rule
# therefore replaces one hand-written rule per variant.
# (For synch/common.o the pattern lists synch/common.h twice; a repeated
# prerequisite does not change what is built, and $< is still the .c file.)
SYNCH_VARIANTS := common default downpour eamsgd hogwild polyakruppert sma synchronouseamsgd synchronoussgd
SYNCH_OBJS := $(patsubst %,synch/%.o,$(SYNCH_VARIANTS))

$(SYNCH_OBJS): synch/%.o: synch/%.c synch/%.h synch/common.h executioncontext.h $(CROSSBOWBASEINCLUDES)
	$(NV) $(INCLUDES) $(LFL) $(GENCODE) -c $< -o $@

# === [End of SGD helpers] ===

# Builds image/recordreader.o from image/recordreader.c ($<); rebuilt when
# the source, its header, or any file in $(CROSSBOWBASEINCLUDES) changes.
image/recordreader.o: image/recordreader.c image/recordreader.h $(CROSSBOWBASEINCLUDES)
$(NV) $(INCLUDES) $(LFL) $(GENCODE) -c $< -o $@
Expand Down Expand Up @@ -410,7 +454,7 @@ kernels/cudnndropoutgradient.o: kernels/cudnndropoutgradient.cu kernels/cudnndro

# === [Kernel compilation] ===
#

# Builds kernels/accuracy.o from kernels/accuracy.cu ($<); rebuilt when the
# source or its header changes.
kernels/accuracy.o: kernels/accuracy.cu kernels/accuracy.h
$(NV) $(INCLUDES) $(LFL) $(GENCODE) -c $< -o $@

Expand All @@ -420,6 +464,34 @@ kernels/classify.o: kernels/classify.cu kernels/classify.h
# Builds kernels/gradientdescentoptimiser.o from
# kernels/gradientdescentoptimiser.cu ($<); rebuilt when the source or its
# header changes.
kernels/gradientdescentoptimiser.o: kernels/gradientdescentoptimiser.cu kernels/gradientdescentoptimiser.h
$(NV) $(INCLUDES) $(LFL) $(GENCODE) -c $< -o $@

# === [Helpers for SGD (per replica synchronisation variants)] ===

# Every object under kernels/optimisers/ is compiled with the same command
# from a .cu source and its matching .h header, so a single static pattern
# rule replaces one hand-written rule per optimiser.
OPTIMISER_NAMES := default hogwild downpour eamsgd synchronouseamsgd synchronoussgd sma polyakruppert
OPTIMISER_OBJS := $(OPTIMISER_NAMES:%=kernels/optimisers/%.o)

$(OPTIMISER_OBJS): kernels/optimisers/%.o: kernels/optimisers/%.cu kernels/optimisers/%.h
	$(NV) $(INCLUDES) $(LFL) $(GENCODE) -c $< -o $@

# === [End of SGD helpers] ===

# Builds kernels/innerproduct.o from kernels/innerproduct.cu ($<); rebuilt
# when the source or its header changes.
kernels/innerproduct.o: kernels/innerproduct.cu kernels/innerproduct.h
$(NV) $(INCLUDES) $(LFL) $(GENCODE) -c $< -o $@

Expand Down Expand Up @@ -522,3 +594,4 @@ clean:
rm -f image/testrecordreader
rm -f image/testbatchreader
rm -f testrecorddataset

9 changes: 9 additions & 0 deletions clib-multigpu/callbackhandler.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#include "debug.h"
#include "utils.h"

#include "recorddataset.h"

#include <pthread.h>

#include <cuda.h>
Expand Down Expand Up @@ -88,6 +90,13 @@ static void *handle (void *args) {
*
* while (cudaEventQuery(s->event) != cudaSuccess);
*/

/*
* Notify record dataset that a task completed
* (and therefore its data can be overwritten)
*/
if (s->dataset)
crossbowRecordDatasetNotify (s->dataset);

#ifdef INTRA_TASK_MEASUREMENTS
checkCudaErrors(cudaEventElapsedTime (&dt, s->start, s->event));
Expand Down
4 changes: 4 additions & 0 deletions clib-multigpu/cudnn/cudnnhelper.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ const char *cudnnActivationModeString (cudnnActivationMode_t mode) {
#if CUDNN_MAJOR >= 6
case CUDNN_ACTIVATION_ELU: return "CUDNN_ACTIVATION_ELU";
#endif
#if CUDNN_MAJOR >= 7
case CUDNN_ACTIVATION_IDENTITY: return "CUDNN_ACTIVATION_IDENTITY";
#endif
default: return "CUDNN_ACTIVATION_UNKNOWN";
}

return NULL;
Expand Down
2 changes: 1 addition & 1 deletion clib-multigpu/debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include <nccl.h>

#undef GPU_VERBOSE
// #define GPU_VERBOSE
/* #define GPU_VERBOSE */

#undef KERNEL_CHECKSUM
/* #define COMPUTE_CHECKSUM */
Expand Down
Loading

0 comments on commit a3cc725

Please sign in to comment.