diff --git a/brian2cuda/device.py b/brian2cuda/device.py index b4610ee3..0f134042 100644 --- a/brian2cuda/device.py +++ b/brian2cuda/device.py @@ -49,6 +49,17 @@ ''', ), + gpu_heap_size = BrianPreference( + docs=''' + Size of the heap (in MB) used by malloc() and free() device system calls, which + are used in the `cudaVector` implementation. `cudaVectors` are used to + dynamically allocate device memory for `Spikemonitors` and the synapse + queues in the `CudaSpikeQueue` implementation for networks with + heterogeneously distributed delays. + ''', + validator=lambda v: isinstance(v, int) and v >= 0, + default=128), + curand_float_type=BrianPreference( docs=''' Floating point type of generated random numbers (float/double). @@ -373,7 +384,8 @@ def generate_main_source(self, writer, main_includes): code_objects=self.code_objects.values(), report_func=self.report_func, dt=float(defaultclock.dt), - additional_headers=main_includes + additional_headers=main_includes, + gpu_heap_size=prefs['devices.cuda_standalone.gpu_heap_size'] ) writer.write('main.cu', main_tmp) diff --git a/brian2cuda/templates/main.cu b/brian2cuda/templates/main.cu index af7cedf0..6c56044a 100644 --- a/brian2cuda/templates/main.cu +++ b/brian2cuda/templates/main.cu @@ -28,7 +28,7 @@ int main(int argc, char **argv) cudaDeviceProp props; cudaGetDeviceProperties(&props, 0); - size_t limit = 128 * 1024 * 1024; + size_t limit = {{gpu_heap_size}} * 1024 * 1024; cudaDeviceSetLimit(cudaLimitMallocHeapSize, limit); cudaDeviceSynchronize(); diff --git a/brian2cuda/templates/synapses_initialise_queue.cu b/brian2cuda/templates/synapses_initialise_queue.cu index 13245dd3..0deaabb6 100644 --- a/brian2cuda/templates/synapses_initialise_queue.cu +++ b/brian2cuda/templates/synapses_initialise_queue.cu @@ -17,7 +17,7 @@ namespace { __global__ void _run_{{codeobj_name}}_kernel( unsigned int _source_N, unsigned int _num_blocks, - unsigned int _num_threads_per_block, + unsigned int _num_threads, double _dt, unsigned 
int _syn_N, unsigned int num_delays, @@ -29,7 +29,7 @@ __global__ void _run_{{codeobj_name}}_kernel( {{pathobj}}.queue->prepare( tid, - _num_threads_per_block, + _num_threads, _num_blocks, 0, _source_N, @@ -364,10 +364,48 @@ void _run_{{pathobj}}_initialise_queue() { num_threads = max_threads_per_block; } - _run_{{codeobj_name}}_kernel<<<1, num_threads>>>( + unsigned int num_blocks = 1; + + // check if we have enough ressources to call kernel with given number + // of blocks and threads + struct cudaFuncAttributes funcAttrib; + cudaFuncGetAttributes(&funcAttrib, _run_{{codeobj_name}}_kernel); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_{{codeobj_name}}_kernel " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. (Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + } + else + { + printf("INFO _run_{{codeobj_name}}_kernel\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + } + + + _run_{{codeobj_name}}_kernel<<<num_blocks, num_threads>>>( source_N, num_parallel_blocks, - max_threads_per_block, + num_threads, dt, syn_N, num_delays, diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/README.md b/dev/benchmarks/results_2017_04_05_complete_after_talk/README.md new file
mode 100644 index 00000000..d5f47013 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/README.md @@ -0,0 +1,1752 @@ + +# Benchmark results from 05.04.2017 +## Description: + + + +## Last git log: +``` +commit 49e59d6b8fe0d84a3a1650e30e80e7caa023d987 +Author: Denis Alevi +Date: Wed Mar 29 20:14:08 2017 +0200 + + Revert to using cudaMemset to reset eventspace counter + + `__threadfence()` does not work in this + +``` +There is also a `git diff` saved in the current directory. + +## Results + +### AdaptationOscillation +![](plots/speed_test_AdaptationOscillation_absolute.png) +![](plots/speed_test_AdaptationOscillation_profiling.png) +![](plots/speed_test_AdaptationOscillation_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==27090== NVPROF is profiling process 27090, command: ./main +==27090== Profiling application: ./main +==27090== Profiling result: +Time(%) Time Calls Avg Min Max Name + 54.38% 151.00ms 10000 15.100us 2.8800us 70.592us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, int*, int, int*, double, double*, int*, int, bool*) + 18.09% 50.227ms 10000 5.0220us 4.7040us 6.8800us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double, double*, double*, double*, bool*, float*) + 11.30% 31.386ms 10000 3.1380us 3.0400us 4.2560us [CUDA memset] + 8.01% 22.246ms 10000 2.2240us 1.8560us 2.7520us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 7.90% 21.951ms 10000 2.1950us 1.5360us 3.0400us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, int*, double*, double*, bool*) + 0.32% 881.25us 1 881.25us 881.25us 881.25us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + +==27090== API calls: +Time(%) Time Calls Avg Min Max Name + 65.77% 370.03ms 40001 9.2500us 8.1820us 8.8454ms cudaLaunch + 16.57% 93.193ms 10000 9.3190us 8.6380us 24.859us cudaMemset + 13.98% 78.650ms 390005 201ns 149ns 319.77us cudaSetupArgument + 1.93% 10.868ms 40001 271ns 200ns 313.28us cudaConfigureCall + 1.70% 9.5546ms 40002 238ns 207ns 5.1700us cudaGetLastError + 0.03% 174.94us 1 174.94us 174.94us 174.94us cudaMalloc + 0.01% 50.180us 1 50.180us 50.180us 50.180us cudaMemGetInfo + 0.00% 23.192us 38 610ns 476ns 1.5970us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 19.120us 7 2.7310us 2.0000us 5.0070us cudaFuncGetAttributes + 0.00% 17.862us 1 17.862us 17.862us 17.862us cudaDeviceSynchronize + 0.00% 5.0460us 12 420ns 293ns 1.1020us cudaDeviceGetAttribute + 0.00% 3.2580us 3 1.0860us 659ns 1.8660us 
cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==27315== NVPROF is profiling process 27315, command: ./main test 1.0 1 +==27315== Profiling application: ./main test 1.0 1 +==27315== Profiling result: +Time(%) Time Calls Avg Min Max Name + 53.41% 151.83ms 10000 15.183us 1.9200us 1.1186ms calcSynapses + 46.17% 131.26ms 10000 13.126us 10.560us 20.288us calcNeurons + 0.32% 903.46us 48 18.822us 960ns 129.47us [CUDA memcpy HtoD] + 0.10% 283.36us 14 20.240us 1.9840us 122.88us [CUDA memcpy DtoH] + +==27315== API calls: +Time(%) Time Calls Avg Min Max Name + 48.83% 298.28ms 13 22.945ms 9.2060us 295.80ms cudaHostAlloc + 46.42% 283.54ms 20000 14.176us 7.6710us 1.1119ms cudaLaunch + 2.61% 15.926ms 64 248.85us 409ns 13.875ms cudaMemcpy + 1.10% 6.6997ms 20000 334ns 268ns 303.73us cudaConfigureCall + 0.84% 5.1253ms 20000 256ns 228ns 5.1490us cudaSetupArgument + 0.14% 867.56us 13 66.735us 7.8370us 174.67us cudaMalloc + 0.04% 257.35us 83 3.1000us 186ns 109.74us cuDeviceGetAttribute + 0.01% 39.793us 1 39.793us 39.793us 39.793us cuDeviceGetName + 0.01% 36.797us 1 36.797us 36.797us 36.797us cuDeviceTotalMem + 0.00% 16.271us 1 16.271us 16.271us 16.271us cudaSetDevice + 0.00% 15.322us 13 1.1780us 539ns 3.3530us cudaGetSymbolAddress + 0.00% 2.6060us 2 1.3030us 777ns 1.8290us cuDeviceGetCount + 0.00% 1.8590us 1 1.8590us 1.8590us 1.8590us cudaGetDeviceCount + 0.00% 975ns 2 487ns 397ns 578ns cuDeviceGet + +``` + +

+ + +*** + +### BrunelHakimModelHeterogeneousDelay +![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png) +![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png) +![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==13252== NVPROF is profiling process 13252, command: ./main +==13252== Profiling application: ./main +==13252== Profiling result: +Time(%) Time Calls Avg Min Max Name + 86.34% 3.21777s 10000 321.78us 1.5360us 4.9107ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 9.78% 364.56ms 10000 36.455us 2.2080us 84.928us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 1.24% 46.150ms 10000 4.6140us 4.4480us 6.7520us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 0.86% 32.053ms 10000 3.2050us 2.9120us 4.2240us [CUDA memset] + 0.70% 25.923ms 10000 2.5920us 2.3680us 3.6160us _run_synapses_pre_push_spikes_advance_kernel(void) + 0.58% 21.708ms 10000 2.1700us 1.8880us 2.7200us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 0.48% 17.725ms 10000 1.7720us 1.6960us 2.0480us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 0.02% 880.45us 1 880.45us 880.45us 880.45us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + +==13252== API calls: +Time(%) Time Calls Avg Min Max Name + 93.08% 3.54282s 60001 59.045us 7.8910us 6.6818ms cudaLaunch + 2.78% 105.95ms 10000 10.595us 8.3520us 305.06us cudaMemset + 1.61% 61.198ms 1 61.198ms 61.198ms 61.198ms cudaDeviceSynchronize + 1.60% 60.805ms 370005 164ns 130ns 296.03us cudaSetupArgument + 0.49% 18.710ms 60002 311ns 237ns 312.79us cudaGetLastError + 0.43% 16.481ms 60001 274ns 181ns 299.24us cudaConfigureCall + 0.00% 182.53us 1 182.53us 182.53us 182.53us cudaMalloc + 0.00% 71.394us 1 71.394us 71.394us 71.394us cudaMemGetInfo + 0.00% 20.387us 38 536ns 474ns 1.4760us 
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 18.951us 7 2.7070us 1.9760us 5.3870us cudaFuncGetAttributes + 0.00% 4.9460us 12 412ns 263ns 1.1520us cudaDeviceGetAttribute + 0.00% 2.8500us 3 950ns 608ns 1.6040us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==13488== NVPROF is profiling process 13488, command: ./main test 1.0 1 +==13488== Profiling application: ./main test 1.0 1 +==13488== Profiling result: +Time(%) Time Calls Avg Min Max Name + 74.47% 118.07ms 10000 11.806us 10.016us 17.664us calcNeurons + 18.42% 29.207ms 10000 2.9200us 1.9200us 17.664us calcSynapses + 5.59% 8.8552ms 40 221.38us 960ns 2.5145ms [CUDA memcpy HtoD] + 1.52% 2.4178ms 10 241.78us 1.9520us 2.3869ms [CUDA memcpy DtoH] + +==13488== API calls: +Time(%) Time Calls Avg Min Max Name + 58.76% 270.99ms 11 24.635ms 17.531us 265.27ms cudaHostAlloc + 36.00% 166.02ms 20000 8.3000us 7.6090us 315.35us cudaLaunch + 2.62% 12.069ms 53 227.72us 334ns 2.5281ms cudaMemcpy + 1.36% 6.2887ms 20000 314ns 240ns 302.98us cudaConfigureCall + 1.00% 4.6085ms 20000 230ns 217ns 2.8530us cudaSetupArgument + 0.19% 860.67us 11 78.243us 12.662us 173.88us cudaMalloc + 0.05% 234.84us 83 2.8290us 158ns 100.64us cuDeviceGetAttribute + 0.01% 32.245us 1 32.245us 32.245us 32.245us cuDeviceTotalMem + 0.01% 27.894us 1 27.894us 27.894us 27.894us cuDeviceGetName + 0.00% 14.621us 11 1.3290us 791ns 3.3800us cudaGetSymbolAddress + 0.00% 12.561us 1 12.561us 12.561us 12.561us cudaSetDevice + 0.00% 1.4740us 2 737ns 495ns 979ns cuDeviceGetCount + 0.00% 1.4370us 1 1.4370us 1.4370us 1.4370us cudaGetDeviceCount + 0.00% 524ns 2 262ns 227ns 297ns cuDeviceGet + +``` + +

+ + +*** + +### BrunelHakimModelScalarDelay +![](plots/speed_test_BrunelHakimModelScalarDelay_absolute.png) +![](plots/speed_test_BrunelHakimModelScalarDelay_profiling.png) +![](plots/speed_test_BrunelHakimModelScalarDelay_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==2491== NVPROF is profiling process 2491, command: ./main +==2491== Profiling application: ./main +==2491== Profiling result: +Time(%) Time Calls Avg Min Max Name + 28.57% 48.196ms 10000 4.8190us 4.5440us 6.7840us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 27.77% 46.841ms 10000 4.6840us 2.8800us 31.584us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 19.44% 32.782ms 10000 3.2780us 3.2320us 3.7760us [CUDA memset] + 12.58% 21.215ms 10000 2.1210us 1.9840us 2.5600us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 11.12% 18.762ms 10000 1.8760us 1.7920us 2.1120us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 0.52% 880.90us 1 880.90us 880.90us 880.90us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + +==2491== API calls: +Time(%) Time Calls Avg Min Max Name + 67.81% 358.71ms 40001 8.9670us 7.9890us 10.112ms cudaLaunch + 16.69% 88.268ms 10000 8.8260us 8.3570us 34.808us cudaMemset + 11.38% 60.182ms 330005 182ns 150ns 304.26us cudaSetupArgument + 2.12% 11.226ms 40001 280ns 197ns 305.80us cudaConfigureCall + 1.95% 10.335ms 40002 258ns 217ns 14.869us cudaGetLastError + 0.03% 178.47us 1 178.47us 178.47us 178.47us cudaMalloc + 0.01% 51.372us 1 51.372us 51.372us 51.372us cudaMemGetInfo + 0.00% 21.822us 38 574ns 469ns 3.0220us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 19.460us 7 2.7800us 2.0130us 5.1840us cudaFuncGetAttributes + 0.00% 17.572us 1 17.572us 17.572us 17.572us cudaDeviceSynchronize + 0.00% 5.0120us 12 417ns 283ns 1.0740us cudaDeviceGetAttribute + 0.00% 2.8560us 3 952ns 570ns 1.6710us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==2741== NVPROF is profiling process 2741, command: ./main test 1.0 1 +==2741== Profiling application: ./main test 1.0 1 +==2741== Profiling result: +Time(%) Time Calls Avg Min Max Name + 71.49% 120.00ms 10000 11.999us 10.016us 18.144us calcNeurons + 21.75% 36.501ms 10000 3.6500us 2.4960us 29.185us calcSynapses + 5.33% 8.9404ms 41 218.06us 960ns 2.5144ms [CUDA memcpy HtoD] + 1.43% 2.4037ms 10 240.37us 2.0480us 2.3725ms [CUDA memcpy DtoH] + +==2741== API calls: +Time(%) Time Calls Avg Min Max Name + 59.17% 284.47ms 11 25.861ms 13.934us 278.41ms cudaHostAlloc + 35.49% 170.60ms 20000 8.5300us 7.5850us 307.94us cudaLaunch + 2.68% 12.860ms 53 242.63us 394ns 2.5288ms cudaMemcpy + 1.36% 6.5596ms 20000 327ns 257ns 308.28us cudaConfigureCall + 1.04% 5.0131ms 20000 250ns 228ns 9.1940us cudaSetupArgument + 0.19% 898.78us 11 81.706us 9.2360us 153.32us cudaMalloc + 0.05% 226.47us 83 2.7280us 137ns 97.777us cuDeviceGetAttribute + 0.01% 31.138us 1 31.138us 31.138us 31.138us cuDeviceTotalMem + 0.01% 27.215us 1 27.215us 27.215us 27.215us cuDeviceGetName + 0.00% 12.953us 11 1.1770us 575ns 2.8170us cudaGetSymbolAddress + 0.00% 12.076us 1 12.076us 12.076us 12.076us cudaMemcpyToSymbol + 0.00% 10.837us 1 10.837us 10.837us 10.837us cudaSetDevice + 0.00% 1.5250us 1 1.5250us 1.5250us 1.5250us cudaGetDeviceCount + 0.00% 1.4930us 2 746ns 490ns 1.0030us cuDeviceGetCount + 0.00% 498ns 2 249ns 224ns 274ns cuDeviceGet + +``` + +

+ + +*** + +### BrunelHakimModelScalarDelayNoMultiPrePost +![](plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_absolute.png) +![](plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_profiling.png) +![](plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==23945== NVPROF is profiling process 23945, command: ./main +==23945== Profiling application: ./main +==23945== Profiling result: +Time(%) Time Calls Avg Min Max Name + 28.82% 47.429ms 10000 4.7420us 2.8800us 34.464us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 28.42% 46.768ms 10000 4.6760us 4.4480us 6.8800us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 18.77% 30.887ms 10000 3.0880us 3.0400us 3.6160us [CUDA memset] + 13.20% 21.722ms 10000 2.1720us 2.0160us 2.5920us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 10.25% 16.871ms 10000 1.6870us 1.5680us 1.9840us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 0.54% 881.31us 1 881.31us 881.31us 881.31us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + +==23945== API calls: +Time(%) Time Calls Avg Min Max Name + 68.47% 378.42ms 40001 9.4600us 8.3920us 11.185ms cudaLaunch + 16.96% 93.726ms 10000 9.3720us 8.8820us 22.956us cudaMemset + 10.76% 59.491ms 330005 180ns 148ns 309.86us cudaSetupArgument + 1.90% 10.527ms 40001 263ns 182ns 298.24us cudaConfigureCall + 1.84% 10.177ms 40002 254ns 225ns 10.282us cudaGetLastError + 0.03% 178.62us 1 178.62us 178.62us 178.62us cudaMalloc + 0.01% 52.598us 1 52.598us 52.598us 52.598us cudaMemGetInfo + 0.00% 25.078us 38 659ns 560ns 2.7750us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 19.936us 7 2.8480us 2.0920us 5.4650us cudaFuncGetAttributes + 0.00% 17.187us 1 17.187us 17.187us 17.187us cudaDeviceSynchronize + 0.00% 5.0920us 12 424ns 278ns 1.0780us cudaDeviceGetAttribute + 0.00% 3.1170us 3 1.0390us 523ns 1.9660us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==24196== NVPROF is profiling process 24196, command: ./main test 1.0 1 +==24196== Profiling application: ./main test 1.0 1 +==24196== Profiling result: +Time(%) Time Calls Avg Min Max Name + 71.41% 120.56ms 10000 12.055us 10.048us 17.952us calcNeurons + 21.88% 36.941ms 10000 3.6940us 2.5280us 26.912us calcSynapses + 5.29% 8.9319ms 41 217.85us 992ns 2.5123ms [CUDA memcpy HtoD] + 1.42% 2.3983ms 10 239.83us 2.0160us 2.3673ms [CUDA memcpy DtoH] + +==24196== API calls: +Time(%) Time Calls Avg Min Max Name + 58.26% 272.15ms 11 24.741ms 19.067us 265.67ms cudaHostAlloc + 36.33% 169.74ms 20000 8.4860us 7.6190us 310.62us cudaLaunch + 2.72% 12.686ms 53 239.35us 323ns 2.5267ms cudaMemcpy + 1.36% 6.3732ms 20000 318ns 242ns 300.70us cudaConfigureCall + 1.03% 4.8351ms 20000 241ns 210ns 10.299us cudaSetupArgument + 0.22% 1.0265ms 11 93.320us 12.594us 179.95us cudaMalloc + 0.05% 240.26us 83 2.8940us 152ns 104.47us cuDeviceGetAttribute + 0.01% 32.415us 1 32.415us 32.415us 32.415us cuDeviceTotalMem + 0.01% 28.407us 1 28.407us 28.407us 28.407us cuDeviceGetName + 0.00% 14.808us 11 1.3460us 741ns 3.2100us cudaGetSymbolAddress + 0.00% 14.772us 1 14.772us 14.772us 14.772us cudaMemcpyToSymbol + 0.00% 12.168us 1 12.168us 12.168us 12.168us cudaSetDevice + 0.00% 1.4860us 1 1.4860us 1.4860us 1.4860us cudaGetDeviceCount + 0.00% 1.4580us 2 729ns 473ns 985ns cuDeviceGetCount + 0.00% 537ns 2 268ns 226ns 311ns cuDeviceGet + +``` + +

+ + +*** + +### COBAHH +![](plots/speed_test_COBAHH_absolute.png) +![](plots/speed_test_COBAHH_profiling.png) +![](plots/speed_test_COBAHH_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==11907== NVPROF is profiling process 11907, command: ./main +==11907== Profiling application: ./main +==11907== Profiling result: +Time(%) Time Calls Avg Min Max Name + 39.16% 186.02ms 10000 18.602us 17.856us 21.568us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*, bool*, double*, double*, double*, double*, double, double*) + 29.93% 142.18ms 10000 14.218us 3.2320us 35.680us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*) + 19.08% 90.630ms 10000 9.0620us 3.1680us 24.448us kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*) + 6.67% 31.670ms 10000 3.1660us 3.0400us 4.1920us [CUDA memset] + 5.15% 24.481ms 10000 2.4480us 2.0480us 2.7840us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + +==11907== API calls: +Time(%) Time Calls Avg Min Max Name + 66.01% 376.74ms 40000 9.4180us 8.4480us 6.9662ms cudaLaunch + 15.97% 91.133ms 10000 9.1130us 8.5190us 28.283us cudaMemset + 13.95% 79.611ms 470000 169ns 149ns 316.22us cudaSetupArgument + 2.29% 13.092ms 40000 327ns 202ns 311.93us cudaConfigureCall + 1.76% 10.072ms 40000 251ns 230ns 5.0760us cudaGetLastError + 0.01% 50.252us 1 50.252us 50.252us 50.252us cudaMemGetInfo + 0.00% 22.121us 1 22.121us 22.121us 22.121us cudaDeviceSynchronize + 0.00% 16.912us 6 2.8180us 2.0980us 4.5270us cudaFuncGetAttributes + 0.00% 13.875us 21 660ns 520ns 1.5110us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 3.9730us 8 496ns 302ns 1.1490us cudaDeviceGetAttribute + 0.00% 2.3840us 2 1.1920us 836ns 1.5480us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==12169== NVPROF is profiling process 12169, command: ./main test 1.0 1 +==12169== Profiling application: ./main test 1.0 1 +==12169== Profiling result: +Time(%) Time Calls Avg Min Max Name + 64.38% 254.25ms 10000 25.425us 23.777us 28.416us calcNeurons + 35.52% 140.25ms 10000 14.025us 2.4320us 41.696us calcSynapses + 0.07% 285.47us 68 4.1980us 960ns 42.944us [CUDA memcpy HtoD] + 0.03% 108.42us 18 6.0230us 1.9840us 40.736us [CUDA memcpy DtoH] + +==12169== API calls: +Time(%) Time Calls Avg Min Max Name + 52.49% 378.74ms 20000 18.937us 7.6840us 358.81us cudaLaunch + 42.10% 303.75ms 19 15.987ms 8.2320us 301.68ms cudaHostAlloc + 3.34% 24.097ms 88 273.83us 330ns 22.690ms cudaMemcpy + 1.06% 7.6642ms 20000 383ns 262ns 335.28us cudaConfigureCall + 0.86% 6.2250ms 20000 311ns 242ns 336.35us cudaSetupArgument + 0.10% 707.36us 19 37.229us 6.2200us 126.23us cudaMalloc + 0.03% 241.14us 83 2.9050us 137ns 109.48us cuDeviceGetAttribute + 0.00% 31.485us 1 31.485us 31.485us 31.485us cuDeviceTotalMem + 0.00% 30.190us 1 30.190us 30.190us 30.190us cuDeviceGetName + 0.00% 12.302us 19 647ns 344ns 2.1110us cudaGetSymbolAddress + 0.00% 11.562us 1 11.562us 11.562us 11.562us cudaSetDevice + 0.00% 1.5290us 2 764ns 561ns 968ns cuDeviceGetCount + 0.00% 1.4620us 1 1.4620us 1.4620us 1.4620us cudaGetDeviceCount + 0.00% 480ns 2 240ns 218ns 262ns cuDeviceGet + +``` + +

+ + +*** + +### COBAHHFixedConnectivity +![](plots/speed_test_COBAHHFixedConnectivity_absolute.png) +![](plots/speed_test_COBAHHFixedConnectivity_profiling.png) +![](plots/speed_test_COBAHHFixedConnectivity_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==17632== NVPROF is profiling process 17632, command: ./main +==17632== Profiling application: ./main +==17632== Profiling result: +Time(%) Time Calls Avg Min Max Name + 44.90% 349.33ms 10000 34.933us 1.6640us 111.13ms kernel_spikemonitor_codeobject(unsigned int, int*, double, int*, int*, int*, int, int*, double*, int, int*, int*) + 23.60% 183.61ms 10000 18.361us 17.824us 21.856us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*, bool*, double*, double*, double*, double*, double, double*) + 14.85% 115.52ms 10000 11.551us 3.0720us 36.353us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*) + 9.49% 73.847ms 10000 7.3840us 3.0720us 24.064us kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*) + 4.03% 31.352ms 10000 3.1350us 3.0400us 4.2880us [CUDA memset] + 3.12% 24.285ms 10000 2.4280us 2.0480us 2.7840us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 0.01% 68.000us 1 68.000us 68.000us 68.000us _run_spikemonitor_codeobject_init(void) + +==17632== API calls: +Time(%) Time Calls Avg Min Max Name + 73.31% 632.36ms 50001 12.646us 8.2740us 95.930ms cudaLaunch + 12.10% 104.36ms 590000 176ns 149ns 346.69us cudaSetupArgument + 11.27% 97.201ms 10000 9.7200us 8.6440us 1.1383ms cudaMemset + 1.55% 13.390ms 50001 267ns 192ns 331.43us cudaConfigureCall + 1.55% 13.349ms 50001 266ns 220ns 330.51us cudaGetLastError + 0.21% 1.8328ms 1 1.8328ms 1.8328ms 1.8328ms cudaDeviceSynchronize + 0.01% 51.143us 1 51.143us 51.143us 51.143us cudaMemGetInfo + 0.00% 18.972us 7 2.7100us 2.0070us 4.6510us cudaFuncGetAttributes + 0.00% 14.003us 22 636ns 470ns 1.4930us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 4.3080us 8 538ns 317ns 
1.2590us cudaDeviceGetAttribute + 0.00% 2.2780us 2 1.1390us 764ns 1.5140us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==17891== NVPROF is profiling process 17891, command: ./main test 1.0 1 +==17891== Profiling application: ./main test 1.0 1 +==17891== Profiling result: +Time(%) Time Calls Avg Min Max Name + 66.58% 251.53ms 10000 25.153us 23.840us 28.000us calcNeurons + 23.34% 88.193ms 10000 8.8190us 2.4320us 41.472us calcSynapses + 9.86% 37.269ms 18461 2.0180us 1.9520us 153.18us [CUDA memcpy DtoH] + 0.22% 820.87us 68 12.071us 960ns 164.23us [CUDA memcpy HtoD] + +==17891== API calls: +Time(%) Time Calls Avg Min Max Name + 52.66% 509.16ms 20088 25.346us 320ns 371.03us cudaMemcpy + 26.73% 258.42ms 19 13.601ms 8.8970us 255.30ms cudaHostAlloc + 19.10% 184.67ms 20000 9.2330us 7.8160us 348.55us cudaLaunch + 0.81% 7.7916ms 20000 389ns 275ns 331.45us cudaConfigureCall + 0.56% 5.4451ms 20000 272ns 241ns 4.6710us cudaSetupArgument + 0.10% 1.0098ms 19 53.145us 6.4240us 173.26us cudaMalloc + 0.02% 226.52us 83 2.7290us 143ns 97.659us cuDeviceGetAttribute + 0.00% 31.331us 1 31.331us 31.331us 31.331us cuDeviceTotalMem + 0.00% 30.487us 1 30.487us 30.487us 30.487us cuDeviceGetName + 0.00% 18.126us 19 954ns 368ns 3.5740us cudaGetSymbolAddress + 0.00% 11.311us 1 11.311us 11.311us 11.311us cudaSetDevice + 0.00% 1.7800us 2 890ns 658ns 1.1220us cuDeviceGetCount + 0.00% 1.4830us 1 1.4830us 1.4830us 1.4830us cudaGetDeviceCount + 0.00% 640ns 2 320ns 242ns 398ns cuDeviceGet + +``` + +

+ + +*** + +### CUBA +![](plots/speed_test_CUBA_absolute.png) +![](plots/speed_test_CUBA_profiling.png) +![](plots/speed_test_CUBA_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==31291== NVPROF is profiling process 31291, command: ./main +==31291== Profiling application: ./main +==31291== Profiling result: +Time(%) Time Calls Avg Min Max Name + 31.18% 76.419ms 10000 7.6410us 7.3920us 8.7360us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*) + 19.96% 48.924ms 10000 4.8920us 3.4560us 20.384us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*) + 18.13% 44.432ms 10000 4.4430us 3.2960us 17.952us kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*) + 13.38% 32.789ms 10000 3.2780us 3.2320us 3.7760us [CUDA memset] + 9.59% 23.496ms 10000 2.3490us 2.0480us 2.7520us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 7.76% 19.020ms 10000 1.9010us 1.6640us 2.0800us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*) + +==31291== API calls: +Time(%) Time Calls Avg Min Max Name + 68.69% 471.10ms 50000 9.4220us 8.2170us 19.231ms cudaLaunch + 13.91% 95.387ms 10000 9.5380us 8.7960us 312.26us cudaMemset + 13.50% 92.578ms 510000 181ns 148ns 324.51us cudaSetupArgument + 2.05% 14.040ms 50000 280ns 237ns 5.2940us cudaConfigureCall + 1.83% 12.581ms 50000 251ns 217ns 12.226us cudaGetLastError + 0.01% 51.575us 1 51.575us 51.575us 51.575us cudaMemGetInfo + 0.00% 21.460us 39 550ns 461ns 1.4270us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 21.129us 8 2.6410us 1.9560us 4.4310us cudaFuncGetAttributes + 0.00% 16.670us 1 16.670us 16.670us 16.670us cudaDeviceSynchronize + 0.00% 5.5840us 12 465ns 285ns 1.2870us cudaDeviceGetAttribute + 0.00% 3.3860us 3 1.1280us 653ns 1.8010us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==31529== NVPROF is profiling process 31529, command: ./main test 1.0 1 +==31529== Profiling application: ./main test 1.0 1 +==31529== Profiling result: +Time(%) Time Calls Avg Min Max Name + 74.56% 131.02ms 10000 13.101us 11.808us 14.624us calcNeurons + 24.85% 43.662ms 10000 4.3660us 2.1760us 25.760us calcSynapses + 0.45% 796.80us 56 14.228us 960ns 163.59us [CUDA memcpy HtoD] + 0.13% 234.31us 13 18.023us 1.9520us 155.27us [CUDA memcpy DtoH] + +==31529== API calls: +Time(%) Time Calls Avg Min Max Name + 57.53% 276.80ms 16 17.300ms 8.5100us 274.32ms cudaHostAlloc + 38.37% 184.60ms 20000 9.2300us 7.6370us 342.36us cudaLaunch + 1.48% 7.1407ms 73 97.817us 343ns 5.2594ms cudaMemcpy + 1.31% 6.3266ms 20000 316ns 249ns 315.38us cudaConfigureCall + 1.06% 5.1071ms 20000 255ns 220ns 4.6570us cudaSetupArgument + 0.17% 819.17us 16 51.198us 6.2400us 136.59us cudaMalloc + 0.05% 241.67us 83 2.9110us 138ns 103.86us cuDeviceGetAttribute + 0.01% 32.371us 1 32.371us 32.371us 32.371us cuDeviceTotalMem + 0.01% 28.436us 1 28.436us 28.436us 28.436us cuDeviceGetName + 0.00% 12.399us 16 774ns 424ns 2.0180us cudaGetSymbolAddress + 0.00% 12.047us 1 12.047us 12.047us 12.047us cudaSetDevice + 0.00% 1.6800us 1 1.6800us 1.6800us 1.6800us cudaGetDeviceCount + 0.00% 1.4560us 2 728ns 455ns 1.0010us cuDeviceGetCount + 0.00% 575ns 2 287ns 235ns 340ns cuDeviceGet + +``` + +

+ + +*** + +### CUBAFixedConnectivity +![](plots/speed_test_CUBAFixedConnectivity_absolute.png) +![](plots/speed_test_CUBAFixedConnectivity_profiling.png) +![](plots/speed_test_CUBAFixedConnectivity_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==28333== NVPROF is profiling process 28333, command: ./main +==28333== Profiling application: ./main +==28333== Profiling result: +Time(%) Time Calls Avg Min Max Name + 23.53% 75.188ms 10000 7.5180us 7.1360us 8.8960us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*) + 20.88% 66.723ms 10000 6.6720us 1.6960us 14.967ms kernel_spikemonitor_codeobject(unsigned int, int*, double, int*, int*, int*, int, int*, double*, int, int*, int*) + 17.07% 54.561ms 10000 5.4560us 3.2960us 21.920us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*) + 15.31% 48.929ms 10000 4.8920us 3.2960us 18.784us kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*) + 10.24% 32.716ms 10000 3.2710us 3.1360us 4.1920us [CUDA memset] + 7.36% 23.508ms 10000 2.3500us 2.0160us 2.7200us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 5.59% 17.866ms 10000 1.7860us 1.5360us 2.0800us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*) + 0.02% 67.328us 1 67.328us 67.328us 67.328us _run_spikemonitor_codeobject_init(void) + +==28333== API calls: +Time(%) Time Calls Avg Min Max Name + 70.32% 550.58ms 60001 9.1760us 8.3390us 6.9445ms cudaLaunch + 14.00% 109.65ms 630000 174ns 148ns 343.93us cudaSetupArgument + 11.69% 91.573ms 10000 9.1570us 8.5300us 165.12us cudaMemset + 1.99% 15.611ms 60001 260ns 222ns 327.19us cudaConfigureCall + 1.98% 15.472ms 60001 257ns 208ns 1.1493ms cudaGetLastError + 0.01% 51.353us 1 51.353us 51.353us 51.353us cudaMemGetInfo + 0.00% 24.711us 40 617ns 509ns 1.7610us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 23.494us 9 2.6100us 2.0080us 
4.3370us cudaFuncGetAttributes + 0.00% 17.566us 1 17.566us 17.566us 17.566us cudaDeviceSynchronize + 0.00% 5.4430us 12 453ns 281ns 1.1050us cudaDeviceGetAttribute + 0.00% 3.0770us 3 1.0250us 646ns 1.6320us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==28592== NVPROF is profiling process 28592, command: ./main test 1.0 1 +==28592== Profiling application: ./main test 1.0 1 +==28592== Profiling result: +Time(%) Time Calls Avg Min Max Name + 63.11% 133.95ms 10000 13.394us 12.384us 14.432us calcNeurons + 22.74% 48.266ms 10000 4.8260us 2.7200us 24.896us calcSynapses + 13.78% 29.240ms 14081 2.0760us 2.0160us 154.95us [CUDA memcpy DtoH] + 0.37% 793.60us 56 14.171us 960ns 163.11us [CUDA memcpy HtoD] + +==28592== API calls: +Time(%) Time Calls Avg Min Max Name + 38.67% 315.20ms 20073 15.702us 324ns 773.07us cudaMemcpy + 37.36% 304.57ms 16 19.036ms 8.7600us 301.99ms cudaHostAlloc + 22.40% 182.59ms 20000 9.1290us 7.6730us 821.14us cudaLaunch + 0.78% 6.3728ms 20000 318ns 250ns 5.2440us cudaConfigureCall + 0.66% 5.3441ms 20000 267ns 226ns 332.81us cudaSetupArgument + 0.10% 800.29us 16 50.018us 6.1360us 126.53us cudaMalloc + 0.03% 230.87us 83 2.7810us 153ns 99.066us cuDeviceGetAttribute + 0.00% 32.084us 1 32.084us 32.084us 32.084us cuDeviceTotalMem + 0.00% 30.780us 1 30.780us 30.780us 30.780us cuDeviceGetName + 0.00% 12.549us 16 784ns 421ns 2.2350us cudaGetSymbolAddress + 0.00% 11.671us 1 11.671us 11.671us 11.671us cudaSetDevice + 0.00% 1.8440us 1 1.8440us 1.8440us 1.8440us cudaGetDeviceCount + 0.00% 1.7500us 2 875ns 690ns 1.0600us cuDeviceGetCount + 0.00% 626ns 2 313ns 253ns 373ns cuDeviceGet + +``` + +

+ + +*** + +### DenseMediumRateSynapsesOnly +![](plots/speed_test_DenseMediumRateSynapsesOnly_absolute.png) +![](plots/speed_test_DenseMediumRateSynapsesOnly_profiling.png) +![](plots/speed_test_DenseMediumRateSynapsesOnly_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==30551== NVPROF is profiling process 30551, command: ./main +==30551== Profiling application: ./main +==30551== Profiling result: +Time(%) Time Calls Avg Min Max Name + 56.01% 59.694ms 10000 5.9690us 5.6000us 6.4960us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*) + 28.93% 30.830ms 10000 3.0820us 3.0400us 3.5200us [CUDA memset] + 15.06% 16.055ms 10000 1.6050us 1.5040us 2.4000us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*) + +==30551== API calls: +Time(%) Time Calls Avg Min Max Name + 59.17% 191.07ms 20000 9.5530us 8.3220us 11.129ms cudaLaunch + 27.89% 90.062ms 10000 9.0060us 8.4390us 27.616us cudaMemset + 9.32% 30.084ms 170000 176ns 153ns 306.97us cudaSetupArgument + 1.82% 5.8925ms 20000 294ns 213ns 303.17us cudaConfigureCall + 1.77% 5.7023ms 20000 285ns 216ns 302.98us cudaGetLastError + 0.01% 46.403us 1 46.403us 46.403us 46.403us cudaMemGetInfo + 0.01% 18.635us 1 18.635us 18.635us 18.635us cudaDeviceSynchronize + 0.00% 8.8700us 3 2.9560us 2.1570us 3.7290us cudaFuncGetAttributes + 0.00% 6.7130us 3 2.2370us 629ns 3.5200us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 1.7730us 4 443ns 369ns 586ns cudaDeviceGetAttribute + 0.00% 848ns 1 848ns 848ns 848ns cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==30762== NVPROF is profiling process 30762, command: ./main test 1.0 1 +==30762== Profiling application: ./main test 1.0 1 +==30762== Profiling result: +Time(%) Time Calls Avg Min Max Name + 64.08% 52.562ms 10000 5.2560us 3.4240us 5.9200us calcSynapses + 35.80% 29.364ms 10000 2.9360us 2.8800us 3.8080us calcNeurons + 0.07% 57.888us 44 1.3150us 960ns 2.2400us [CUDA memcpy HtoD] + 0.05% 38.240us 14 2.7310us 2.0160us 4.7360us [CUDA memcpy DtoH] + +==30762== API calls: +Time(%) Time Calls Avg Min Max Name + 61.72% 283.35ms 12 23.613ms 14.143us 281.71ms cudaHostAlloc + 35.34% 162.27ms 20000 8.1130us 7.4880us 334.11us cudaLaunch + 1.34% 6.1571ms 20000 307ns 256ns 322.44us cudaConfigureCall + 1.16% 5.3454ms 20000 267ns 224ns 332.57us cudaSetupArgument + 0.23% 1.0363ms 61 16.988us 318ns 37.131us cudaMemcpy + 0.14% 644.11us 12 53.676us 11.831us 178.21us cudaMalloc + 0.05% 226.72us 83 2.7310us 138ns 97.611us cuDeviceGetAttribute + 0.01% 31.315us 1 31.315us 31.315us 31.315us cuDeviceTotalMem + 0.01% 26.553us 1 26.553us 26.553us 26.553us cuDeviceGetName + 0.00% 13.976us 12 1.1640us 709ns 3.1230us cudaGetSymbolAddress + 0.00% 11.238us 1 11.238us 11.238us 11.238us cudaSetDevice + 0.00% 1.4430us 2 721ns 438ns 1.0050us cuDeviceGetCount + 0.00% 1.4380us 1 1.4380us 1.4380us 1.4380us cudaGetDeviceCount + 0.00% 582ns 2 291ns 214ns 368ns cuDeviceGet + +``` + +

+ + +*** + +### HHNeuronsOnly +![](plots/speed_test_HHNeuronsOnly_absolute.png) +![](plots/speed_test_HHNeuronsOnly_profiling.png) +![](plots/speed_test_HHNeuronsOnly_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==25014== NVPROF is profiling process 25014, command: ./main +==25014== Profiling application: ./main +==25014== Profiling result: +Time(%) Time Calls Avg Min Max Name + 76.60% 171.78ms 10000 17.177us 14.880us 18.080us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, bool*, double*, double*, double*, double*) + 13.61% 30.516ms 10000 3.0510us 2.8160us 3.5840us [CUDA memset] + 9.79% 21.945ms 10000 2.1940us 1.8240us 2.9120us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + +==25014== API calls: +Time(%) Time Calls Avg Min Max Name + 58.23% 179.09ms 20000 8.9540us 8.0160us 5.8117ms cudaLaunch + 28.13% 86.520ms 10000 8.6520us 8.0220us 324.89us cudaMemset + 10.05% 30.914ms 160000 193ns 150ns 347.54us cudaSetupArgument + 1.94% 5.9702ms 20000 298ns 223ns 315.53us cudaConfigureCall + 1.61% 4.9531ms 20000 247ns 210ns 327.22us cudaGetLastError + 0.02% 46.728us 1 46.728us 46.728us 46.728us cudaMemGetInfo + 0.01% 17.432us 35 498ns 471ns 917ns cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 10.745us 1 10.745us 10.745us 10.745us cudaDeviceSynchronize + 0.00% 10.378us 4 2.5940us 2.0060us 3.1740us cudaFuncGetAttributes + 0.00% 3.1700us 8 396ns 284ns 677ns cudaDeviceGetAttribute + 0.00% 1.6580us 2 829ns 801ns 857ns cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==25225== NVPROF is profiling process 25225, command: ./main test 1.0 1 +==25225== Profiling application: ./main test 1.0 1 +==25225== Profiling result: +Time(%) Time Calls Avg Min Max Name + 99.94% 177.51ms 10000 17.750us 14.944us 26.400us calcNeurons + 0.04% 62.626us 40 1.5650us 960ns 2.1760us [CUDA memcpy HtoD] + 0.02% 38.560us 11 3.5050us 2.0160us 4.6720us [CUDA memcpy DtoH] + +==25225== API calls: +Time(%) Time Calls Avg Min Max Name + 55.84% 235.54ms 10 23.554ms 16.992us 233.93ms cudaHostAlloc + 37.45% 157.95ms 10000 15.795us 7.9250us 353.53us cudaLaunch + 4.97% 20.977ms 53 395.80us 389ns 20.008ms cudaMemcpy + 0.81% 3.4097ms 10000 340ns 278ns 5.0220us cudaConfigureCall + 0.70% 2.9582ms 10000 295ns 232ns 339.82us cudaSetupArgument + 0.15% 630.64us 10 63.063us 12.457us 174.83us cudaMalloc + 0.05% 227.15us 83 2.7360us 140ns 98.109us cuDeviceGetAttribute + 0.01% 31.635us 1 31.635us 31.635us 31.635us cuDeviceTotalMem + 0.01% 31.273us 1 31.273us 31.273us 31.273us cuDeviceGetName + 0.00% 12.870us 10 1.2870us 741ns 3.5550us cudaGetSymbolAddress + 0.00% 10.918us 1 10.918us 10.918us 10.918us cudaSetDevice + 0.00% 1.9240us 2 962ns 718ns 1.2060us cuDeviceGetCount + 0.00% 1.4330us 1 1.4330us 1.4330us 1.4330us cudaGetDeviceCount + 0.00% 657ns 2 328ns 303ns 354ns cuDeviceGet + +``` + +

+ + +*** + +### LinearNeuronsOnly +![](plots/speed_test_LinearNeuronsOnly_absolute.png) +![](plots/speed_test_LinearNeuronsOnly_profiling.png) +![](plots/speed_test_LinearNeuronsOnly_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==19640== NVPROF is profiling process 19640, command: ./main +==19640== Profiling application: ./main +==19640== Profiling result: +Time(%) Time Calls Avg Min Max Name +100.00% 247.35ms 100000 2.4730us 2.3360us 3.6800us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*) + +==19640== API calls: +Time(%) Time Calls Avg Min Max Name + 87.43% 837.87ms 100000 8.3780us 7.7260us 7.8274ms cudaLaunch + 7.01% 67.186ms 400000 167ns 147ns 10.910us cudaSetupArgument + 2.81% 26.904ms 100000 269ns 241ns 10.142us cudaConfigureCall + 2.74% 26.287ms 100000 262ns 235ns 11.074us cudaGetLastError + 0.01% 70.067us 1 70.067us 70.067us 70.067us cudaMemGetInfo + 0.00% 14.560us 2 7.2800us 4.1830us 10.377us cudaFuncGetAttributes + 0.00% 9.6320us 1 9.6320us 9.6320us 9.6320us cudaDeviceSynchronize + 0.00% 5.2800us 2 2.6400us 1.1150us 4.1650us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 3.9840us 1 3.9840us 3.9840us 3.9840us cudaGetDevice + 0.00% 3.7360us 4 934ns 668ns 1.5690us cudaDeviceGetAttribute + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==19869== NVPROF is profiling process 19869, command: ./main test 10.0 1 +==19869== Profiling application: ./main test 10.0 1 +==19869== Profiling result: +Time(%) Time Calls Avg Min Max Name + 99.99% 264.71ms 100000 2.6470us 2.5920us 3.1680us calcNeurons + 0.01% 22.656us 16 1.4160us 960ns 2.0800us [CUDA memcpy HtoD] + 0.01% 14.624us 5 2.9240us 2.0480us 4.6720us [CUDA memcpy DtoH] + +==19869== API calls: +Time(%) Time Calls Avg Min Max Name + 73.18% 822.50ms 100000 8.2250us 7.6370us 361.19us cudaLaunch + 21.57% 242.48ms 4 60.620ms 23.163us 240.97ms cudaHostAlloc + 2.95% 33.155ms 100000 331ns 251ns 369.91us cudaConfigureCall + 2.18% 24.551ms 100000 245ns 222ns 14.790us cudaSetupArgument + 0.05% 525.28us 4 131.32us 12.450us 178.02us cudaMalloc + 0.04% 460.82us 23 20.035us 384ns 39.476us cudaMemcpy + 0.02% 226.65us 83 2.7300us 142ns 97.695us cuDeviceGetAttribute + 0.00% 31.478us 1 31.478us 31.478us 31.478us cuDeviceTotalMem + 0.00% 30.578us 1 30.578us 30.578us 30.578us cuDeviceGetName + 0.00% 10.794us 1 10.794us 10.794us 10.794us cudaSetDevice + 0.00% 7.9740us 4 1.9930us 876ns 3.7070us cudaGetSymbolAddress + 0.00% 1.5520us 2 776ns 553ns 999ns cuDeviceGetCount + 0.00% 1.4290us 1 1.4290us 1.4290us 1.4290us cudaGetDeviceCount + 0.00% 545ns 2 272ns 256ns 289ns cuDeviceGet + +``` + +

+ + +*** + +### STDP +![](plots/speed_test_STDP_absolute.png) +![](plots/speed_test_STDP_profiling.png) +![](plots/speed_test_STDP_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==30259== NVPROF is profiling process 30259, command: ./main +==30259== Profiling application: ./main +==30259== Profiling result: +Time(%) Time Calls Avg Min Max Name + 29.51% 119.04ms 10000 11.903us 1.4720us 28.312ms kernel_spikemonitor_codeobject(unsigned int, int*, double, int*, int*, int*, int, int*, double*, int, int*, int*) + 19.38% 78.154ms 10000 7.8150us 3.0400us 25.729us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int) + 15.01% 60.555ms 20000 3.0270us 2.8480us 4.2880us [CUDA memset] + 13.45% 54.257ms 10000 5.4250us 4.9280us 8.0000us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 8.78% 35.407ms 10000 3.5400us 3.2000us 7.1360us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int) + 6.25% 25.200ms 10000 2.5190us 2.1760us 2.8800us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 3.84% 15.476ms 10000 1.5470us 1.4080us 2.4960us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 3.64% 14.677ms 10000 1.4670us 1.3440us 1.9520us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 0.13% 535.30us 1 535.30us 535.30us 535.30us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + 0.02% 69.760us 1 69.760us 69.760us 69.760us _run_spikemonitor_codeobject_init(void) + +==30259== API calls: +Time(%) Time Calls Avg Min Max Name + 66.59% 656.39ms 70002 9.3760us 8.0560us 14.291ms cudaLaunch + 18.06% 178.04ms 20000 8.9010us 7.9370us 1.1364ms cudaMemset + 11.56% 113.99ms 680005 167ns 152ns 60.368us cudaSetupArgument + 2.00% 19.667ms 70003 280ns 
237ns 57.739us cudaGetLastError + 1.77% 17.418ms 70002 248ns 194ns 139.14us cudaConfigureCall + 0.01% 139.28us 1 139.28us 139.28us 139.28us cudaMalloc + 0.00% 48.635us 1 48.635us 48.635us 48.635us cudaMemGetInfo + 0.00% 27.603us 11 2.5090us 1.9830us 4.1880us cudaFuncGetAttributes + 0.00% 23.673us 42 563ns 472ns 1.2600us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 18.501us 1 18.501us 18.501us 18.501us cudaDeviceSynchronize + 0.00% 6.2050us 16 387ns 285ns 719ns cudaDeviceGetAttribute + 0.00% 3.4000us 4 850ns 590ns 1.2110us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==30505== NVPROF is profiling process 30505, command: ./main test 1.0 1 +==30505== Profiling application: ./main test 1.0 1 +==30505== Profiling result: +Time(%) Time Calls Avg Min Max Name + 50.58% 115.54ms 10000 11.553us 1.7280us 50.209us calcSynapses + 21.49% 49.104ms 10000 4.9100us 4.0640us 6.1440us calcNeurons + 16.03% 36.625ms 17853 2.0510us 2.0160us 4.7360us [CUDA memcpy DtoH] + 11.86% 27.088ms 10000 2.7080us 2.5920us 11.392us learnSynapsesPost + 0.04% 93.633us 70 1.3370us 960ns 2.1440us [CUDA memcpy HtoD] + +==30505== API calls: +Time(%) Time Calls Avg Min Max Name + 35.14% 309.15ms 20095 15.384us 188ns 352.42us cudaMemcpy + 32.84% 288.94ms 20 14.447ms 7.6290us 287.79ms cudaHostAlloc + 29.91% 263.12ms 30000 8.7700us 7.6720us 331.70us cudaLaunch + 1.17% 10.291ms 30000 343ns 248ns 319.74us cudaConfigureCall + 0.84% 7.4251ms 30000 247ns 223ns 10.549us cudaSetupArgument + 0.06% 487.96us 20 24.398us 6.1080us 126.07us cudaMalloc + 0.03% 225.93us 83 2.7220us 138ns 97.475us cuDeviceGetAttribute + 0.00% 31.137us 1 31.137us 31.137us 31.137us cuDeviceTotalMem + 0.00% 27.695us 1 27.695us 27.695us 27.695us cuDeviceGetName + 0.00% 11.547us 20 577ns 375ns 2.1780us cudaGetSymbolAddress + 0.00% 11.033us 1 11.033us 11.033us 11.033us cudaSetDevice + 0.00% 1.4410us 2 720ns 488ns 953ns cuDeviceGetCount + 0.00% 1.3060us 1 1.3060us 1.3060us 1.3060us cudaGetDeviceCount + 0.00% 575ns 2 287ns 226ns 349ns cuDeviceGet + +``` + +

+ + +*** + +### STDPEventDriven +![](plots/speed_test_STDPEventDriven_absolute.png) +![](plots/speed_test_STDPEventDriven_profiling.png) +![](plots/speed_test_STDPEventDriven_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==13883== NVPROF is profiling process 13883, command: ./main +==13883== Profiling application: ./main +==13883== Profiling result: +Time(%) Time Calls Avg Min Max Name + 29.16% 88.869ms 10000 8.8860us 3.4880us 32.064us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int) + 20.89% 63.662ms 20000 3.1830us 3.0400us 3.6800us [CUDA memset] + 17.94% 54.662ms 10000 5.4660us 5.1840us 7.5200us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 12.41% 37.829ms 10000 3.7820us 3.6480us 7.2000us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int) + 7.99% 24.357ms 10000 2.4350us 2.1760us 2.8800us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 5.78% 17.601ms 10000 1.7600us 1.5360us 2.4960us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 5.65% 17.232ms 10000 1.7230us 1.6640us 1.9840us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 0.17% 532.84us 1 532.84us 532.84us 532.84us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + +==13883== API calls: +Time(%) Time Calls Avg Min Max Name + 62.59% 547.05ms 60001 9.1170us 8.1770us 7.2312ms cudaLaunch + 20.36% 177.95ms 20000 8.8970us 8.1030us 336.69us cudaMemset + 13.38% 116.92ms 560005 208ns 150ns 330.03us cudaSetupArgument + 1.91% 16.702ms 60001 278ns 208ns 316.80us cudaConfigureCall + 1.74% 15.203ms 60002 253ns 222ns 313.88us cudaGetLastError + 0.02% 138.47us 1 138.47us 138.47us 138.47us cudaMalloc + 0.01% 47.825us 1 47.825us 47.825us 47.825us cudaMemGetInfo + 0.00% 24.670us 10 2.4670us 1.9950us 
3.8850us cudaFuncGetAttributes + 0.00% 22.588us 41 550ns 471ns 1.2300us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 17.416us 1 17.416us 17.416us 17.416us cudaDeviceSynchronize + 0.00% 5.6370us 16 352ns 276ns 664ns cudaDeviceGetAttribute + 0.00% 3.1450us 4 786ns 601ns 1.1830us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==14124== NVPROF is profiling process 14124, command: ./main test 1.0 1 +==14124== Profiling application: ./main test 1.0 1 +==14124== Profiling result: +Time(%) Time Calls Avg Min Max Name + 62.29% 109.79ms 10000 10.979us 1.4400us 50.176us calcSynapses + 23.83% 42.003ms 10000 4.2000us 3.3280us 6.2080us calcNeurons + 13.80% 24.321ms 10000 2.4320us 2.0800us 10.848us learnSynapsesPost + 0.05% 93.824us 70 1.3400us 960ns 2.1760us [CUDA memcpy HtoD] + 0.03% 53.856us 19 2.8340us 1.9520us 4.6400us [CUDA memcpy DtoH] + +==14124== API calls: +Time(%) Time Calls Avg Min Max Name + 54.33% 315.51ms 20 15.776ms 7.4360us 314.37ms cudaHostAlloc + 42.46% 246.58ms 30000 8.2190us 7.6810us 352.29us cudaLaunch + 1.62% 9.4165ms 30000 313ns 235ns 338.10us cudaConfigureCall + 1.25% 7.2565ms 30000 241ns 219ns 10.061us cudaSetupArgument + 0.20% 1.1638ms 95 12.250us 188ns 29.618us cudaMemcpy + 0.08% 485.57us 20 24.278us 6.1510us 122.08us cudaMalloc + 0.04% 225.75us 83 2.7190us 136ns 97.167us cuDeviceGetAttribute + 0.01% 31.148us 1 31.148us 31.148us 31.148us cuDeviceTotalMem + 0.00% 27.209us 1 27.209us 27.209us 27.209us cuDeviceGetName + 0.00% 25.053us 20 1.2520us 370ns 14.749us cudaGetSymbolAddress + 0.00% 11.323us 1 11.323us 11.323us 11.323us cudaSetDevice + 0.00% 1.4040us 1 1.4040us 1.4040us 1.4040us cudaGetDeviceCount + 0.00% 1.3580us 2 679ns 456ns 902ns cuDeviceGetCount + 0.00% 492ns 2 246ns 220ns 272ns cuDeviceGet + +``` + +

+ + +*** + +### STDPMultiPost +![](plots/speed_test_STDPMultiPost_absolute.png) +![](plots/speed_test_STDPMultiPost_profiling.png) +![](plots/speed_test_STDPMultiPost_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==13752== NVPROF is profiling process 13752, command: ./main +==13752== Profiling application: ./main +==13752== Profiling result: +Time(%) Time Calls Avg Min Max Name + 26.01% 63.681ms 20000 3.1840us 3.0400us 3.8080us [CUDA memset] + 21.90% 53.615ms 10000 5.3610us 5.1840us 7.2640us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 16.08% 39.373ms 10000 3.9370us 3.5840us 10.720us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int) + 14.74% 36.097ms 10000 3.6090us 3.4880us 105.60us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int) + 8.31% 20.344ms 10000 2.0340us 1.8560us 2.4320us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 6.61% 16.187ms 10000 1.6180us 1.5040us 2.8160us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 6.34% 15.535ms 10000 1.5530us 1.4720us 1.9840us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 0.01% 22.881us 1 22.881us 22.881us 22.881us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + +==13752== API calls: +Time(%) Time Calls Avg Min Max Name + 64.39% 566.77ms 60001 9.4450us 8.5300us 7.6226ms cudaLaunch + 20.37% 179.35ms 20000 8.9670us 8.0990us 320.51us cudaMemset + 11.68% 102.80ms 560005 183ns 154ns 320.82us cudaSetupArgument + 1.91% 16.807ms 60001 280ns 234ns 314.83us cudaConfigureCall + 1.62% 14.260ms 60002 237ns 197ns 325.01us cudaGetLastError + 0.01% 125.15us 1 125.15us 125.15us 125.15us cudaMalloc + 0.01% 50.027us 1 50.027us 50.027us 50.027us cudaMemGetInfo + 0.00% 25.943us 10 2.5940us 1.9990us 
4.6510us cudaFuncGetAttributes + 0.00% 23.402us 41 570ns 490ns 1.2400us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 17.044us 1 17.044us 17.044us 17.044us cudaDeviceSynchronize + 0.00% 6.0160us 16 376ns 279ns 1.0150us cudaDeviceGetAttribute + 0.00% 3.0950us 4 773ns 532ns 1.3840us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==13992== NVPROF is profiling process 13992, command: ./main test 1.0 1 +==13992== Profiling application: ./main test 1.0 1 +==13992== Profiling result: +Time(%) Time Calls Avg Min Max Name + 47.47% 40.621ms 10000 4.0620us 3.9680us 12.064us calcNeurons + 29.19% 24.977ms 10000 2.4970us 2.4000us 360.29us learnSynapsesPost + 23.19% 19.844ms 10000 1.9840us 1.5680us 15.904us calcSynapses + 0.10% 83.488us 70 1.1920us 960ns 2.0480us [CUDA memcpy HtoD] + 0.05% 45.344us 17 2.6670us 2.0480us 4.7040us [CUDA memcpy DtoH] + +==13992== API calls: +Time(%) Time Calls Avg Min Max Name + 49.24% 255.49ms 20 12.774ms 7.1470us 254.39ms cudaHostAlloc + 47.05% 244.13ms 30000 8.1370us 7.4970us 325.41us cudaLaunch + 1.88% 9.7505ms 30000 325ns 240ns 313.30us cudaConfigureCall + 1.44% 7.4897ms 30000 249ns 228ns 4.6460us cudaSetupArgument + 0.23% 1.1712ms 95 12.328us 191ns 29.827us cudaMemcpy + 0.10% 498.07us 20 24.903us 6.1390us 124.17us cudaMalloc + 0.04% 225.66us 83 2.7180us 135ns 97.278us cuDeviceGetAttribute + 0.01% 31.145us 1 31.145us 31.145us 31.145us cuDeviceTotalMem + 0.01% 27.598us 1 27.598us 27.598us 27.598us cuDeviceGetName + 0.00% 11.370us 20 568ns 348ns 2.0700us cudaGetSymbolAddress + 0.00% 11.183us 1 11.183us 11.183us 11.183us cudaSetDevice + 0.00% 1.4160us 2 708ns 453ns 963ns cuDeviceGetCount + 0.00% 1.3950us 1 1.3950us 1.3950us 1.3950us cudaGetDeviceCount + 0.00% 533ns 2 266ns 241ns 292ns cuDeviceGet + +``` + +

+ + +*** + +### STDPMultiPostNeuronalTraces +![](plots/speed_test_STDPMultiPostNeuronalTraces_absolute.png) +![](plots/speed_test_STDPMultiPostNeuronalTraces_profiling.png) +![](plots/speed_test_STDPMultiPostNeuronalTraces_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==31645== NVPROF is profiling process 31645, command: ./main +==31645== Profiling application: ./main +==31645== Profiling result: +Time(%) Time Calls Avg Min Max Name + 23.09% 63.632ms 20000 3.1810us 3.0400us 3.8080us [CUDA memset] + 21.51% 59.284ms 10000 5.9280us 5.6320us 7.6160us kernel_neurongroup_1_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*, double*) + 13.19% 36.348ms 10000 3.6340us 3.4240us 12.288us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, double*, double, double*, int, int*, int, int*, int, double*) + 12.65% 34.859ms 10000 3.4850us 3.3920us 94.048us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, double, double*, int, double*, int*, int, int) + 9.89% 27.258ms 10000 2.7250us 2.5280us 2.9760us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*) + 6.72% 18.518ms 10000 1.8510us 1.7600us 2.8160us kernel_neurongroup_1_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 6.69% 18.444ms 10000 1.8440us 1.6000us 2.4320us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 6.26% 17.266ms 10000 1.7260us 1.6640us 2.4000us kernel_neurongroup_1_resetter_codeobject(unsigned int, unsigned int, double*, int*, double*) + 0.01% 22.689us 1 22.689us 22.689us 22.689us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + +==31645== API calls: +Time(%) Time Calls Avg Min Max Name + 66.34% 631.89ms 70001 9.0260us 7.8240us 7.5683ms cudaLaunch + 18.61% 177.26ms 20000 8.8630us 8.0310us 327.63us cudaMemset + 11.06% 105.29ms 570005 184ns 147ns 324.54us cudaSetupArgument + 1.98% 18.868ms 70002 269ns 211ns 316.30us cudaGetLastError + 1.98% 18.848ms 70001 269ns 196ns 10.259us cudaConfigureCall + 0.01% 123.44us 1 123.44us 
123.44us 123.44us cudaMalloc + 0.01% 48.253us 1 48.253us 48.253us 48.253us cudaMemGetInfo + 0.00% 38.693us 74 522ns 468ns 1.2040us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 30.351us 12 2.5290us 2.0130us 4.4000us cudaFuncGetAttributes + 0.00% 17.703us 1 17.703us 17.703us 17.703us cudaDeviceSynchronize + 0.00% 8.0120us 20 400ns 315ns 771ns cudaDeviceGetAttribute + 0.00% 3.7350us 5 747ns 588ns 1.2880us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==31875== NVPROF is profiling process 31875, command: ./main test 1.0 1 +==31875== Profiling application: ./main test 1.0 1 +==31875== Profiling result: +Time(%) Time Calls Avg Min Max Name + 51.59% 44.978ms 10000 4.4970us 4.4160us 13.216us calcNeurons + 28.08% 24.482ms 10000 2.4480us 2.4000us 108.48us learnSynapsesPost + 20.19% 17.604ms 10000 1.7600us 1.5680us 8.0320us calcSynapses + 0.09% 77.888us 70 1.1120us 960ns 2.0160us [CUDA memcpy HtoD] + 0.05% 40.704us 17 2.3940us 2.0480us 4.6720us [CUDA memcpy DtoH] + +==31875== API calls: +Time(%) Time Calls Avg Min Max Name + 49.08% 242.98ms 30000 8.0990us 7.4830us 330.16us cudaLaunch + 46.99% 232.62ms 20 11.631ms 13.742us 230.95ms cudaHostAlloc + 1.93% 9.5539ms 30000 318ns 249ns 316.27us cudaConfigureCall + 1.50% 7.4449ms 30000 248ns 228ns 9.5620us cudaSetupArgument + 0.29% 1.4169ms 93 15.235us 341ns 34.925us cudaMemcpy + 0.15% 732.26us 20 36.613us 11.241us 173.89us cudaMalloc + 0.05% 225.85us 83 2.7210us 144ns 97.097us cuDeviceGetAttribute + 0.01% 31.104us 1 31.104us 31.104us 31.104us cuDeviceTotalMem + 0.01% 27.342us 1 27.342us 27.342us 27.342us cuDeviceGetName + 0.00% 19.527us 20 976ns 638ns 3.5660us cudaGetSymbolAddress + 0.00% 11.180us 1 11.180us 11.180us 11.180us cudaSetDevice + 0.00% 1.5790us 2 789ns 579ns 1.0000us cuDeviceGetCount + 0.00% 1.4070us 1 1.4070us 1.4070us 1.4070us cudaGetDeviceCount + 0.00% 534ns 2 267ns 238ns 296ns cuDeviceGet + +``` + +

+ + +*** + +### STDPNeuronalTraces +![](plots/speed_test_STDPNeuronalTraces_absolute.png) +![](plots/speed_test_STDPNeuronalTraces_profiling.png) +![](plots/speed_test_STDPNeuronalTraces_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==22958== NVPROF is profiling process 22958, command: ./main +==22958== Profiling application: ./main +==22958== Profiling result: +Time(%) Time Calls Avg Min Max Name + 23.34% 76.426ms 10000 7.6420us 3.2960us 26.944us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, double*, double, double*, int, int*, int, int*, int, double*) + 19.43% 63.625ms 20000 3.1810us 3.0400us 3.7120us [CUDA memset] + 18.23% 59.686ms 10000 5.9680us 5.6320us 8.0960us kernel_neurongroup_1_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*, double*) + 11.04% 36.142ms 10000 3.6140us 3.3920us 7.0730us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, double, double*, int, double*, int*, int, int) + 9.09% 29.761ms 10000 2.9760us 2.8800us 3.5840us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*) + 7.99% 26.155ms 10000 2.6150us 2.2080us 2.8800us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 5.47% 17.908ms 10000 1.7900us 1.7280us 2.4640us kernel_neurongroup_1_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 5.26% 17.212ms 10000 1.7210us 1.6640us 2.3680us kernel_neurongroup_1_resetter_codeobject(unsigned int, unsigned int, double*, int*, double*) + 0.16% 534.91us 1 534.91us 534.91us 534.91us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + +==22958== API calls: +Time(%) Time Calls Avg Min Max Name + 66.19% 628.57ms 70001 8.9790us 7.8060us 7.0815ms cudaLaunch + 18.98% 180.22ms 20000 9.0110us 8.1910us 325.17us cudaMemset + 10.84% 102.92ms 570005 180ns 148ns 322.77us cudaSetupArgument + 2.05% 19.421ms 70002 277ns 224ns 322.72us cudaGetLastError + 1.92% 18.237ms 70001 260ns 204ns 7.6100us cudaConfigureCall + 0.01% 139.26us 1 139.26us 
139.26us 139.26us cudaMalloc + 0.01% 47.740us 1 47.740us 47.740us 47.740us cudaMemGetInfo + 0.00% 38.641us 74 522ns 463ns 1.3230us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 31.070us 12 2.5890us 2.0180us 4.6520us cudaFuncGetAttributes + 0.00% 17.325us 1 17.325us 17.325us 17.325us cudaDeviceSynchronize + 0.00% 7.2280us 20 361ns 279ns 764ns cudaDeviceGetAttribute + 0.00% 3.4300us 5 686ns 519ns 1.2200us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==23186== NVPROF is profiling process 23186, command: ./main test 1.0 1 +==23186== Profiling application: ./main test 1.0 1 +==23186== Profiling result: +Time(%) Time Calls Avg Min Max Name + 45.72% 59.376ms 10000 5.9370us 1.4400us 22.209us calcSynapses + 36.59% 47.519ms 10000 4.7510us 3.7440us 7.2000us calcNeurons + 17.59% 22.844ms 10000 2.2840us 2.0800us 5.8240us learnSynapsesPost + 0.07% 90.016us 70 1.2850us 928ns 2.0480us [CUDA memcpy HtoD] + 0.04% 51.168us 19 2.6930us 1.9520us 4.6080us [CUDA memcpy DtoH] + +==23186== API calls: +Time(%) Time Calls Avg Min Max Name + 48.78% 251.54ms 20 12.577ms 7.1400us 250.44ms cudaHostAlloc + 47.58% 245.35ms 30000 8.1780us 7.6280us 342.38us cudaLaunch + 1.85% 9.5606ms 30000 318ns 255ns 320.84us cudaConfigureCall + 1.41% 7.2598ms 30000 241ns 222ns 5.1580us cudaSetupArgument + 0.22% 1.1470ms 93 12.333us 278ns 32.150us cudaMemcpy + 0.10% 513.51us 20 25.675us 6.0810us 139.05us cudaMalloc + 0.04% 228.09us 83 2.7480us 140ns 98.263us cuDeviceGetAttribute + 0.01% 31.411us 1 31.411us 31.411us 31.411us cuDeviceTotalMem + 0.01% 27.452us 1 27.452us 27.452us 27.452us cuDeviceGetName + 0.00% 12.004us 1 12.004us 12.004us 12.004us cudaSetDevice + 0.00% 11.525us 20 576ns 352ns 2.0890us cudaGetSymbolAddress + 0.00% 1.6280us 2 814ns 489ns 1.1390us cuDeviceGetCount + 0.00% 1.5650us 1 1.5650us 1.5650us 1.5650us cudaGetDeviceCount + 0.00% 594ns 2 297ns 230ns 364ns cuDeviceGet + +``` + +

+ + +*** + +### STDPNotEventDriven +![](plots/speed_test_STDPNotEventDriven_absolute.png) +![](plots/speed_test_STDPNotEventDriven_profiling.png) +![](plots/speed_test_STDPNotEventDriven_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==5309== NVPROF is profiling process 5309, command: ./main +==5309== Profiling application: ./main +==5309== Profiling result: +Time(%) Time Calls Avg Min Max Name + 23.35% 73.232ms 10000 7.3230us 3.4560us 24.544us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double*, double, double*, int, int*, int, int*) + 20.25% 63.528ms 20000 3.1760us 3.0400us 3.7440us [CUDA memset] + 17.18% 53.899ms 10000 5.3890us 5.0240us 7.6480us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 11.40% 35.764ms 10000 3.5760us 3.3920us 6.2720us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*) + 9.18% 28.794ms 10000 2.8790us 2.7840us 3.3600us kernel_synapses_stateupdater_codeobject(unsigned int, unsigned int, double*, int, double*, int, double*, int*) + 7.72% 24.206ms 10000 2.4200us 2.2080us 2.8480us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 5.48% 17.200ms 10000 1.7190us 1.6640us 1.9840us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 5.26% 16.509ms 10000 1.6500us 1.5360us 2.4960us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 0.17% 534.31us 1 534.31us 534.31us 534.31us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + +==5309== API calls: +Time(%) Time Calls Avg Min Max Name + 65.37% 632.10ms 70001 9.0290us 7.8220us 7.1147ms cudaLaunch + 18.21% 176.05ms 20000 8.8020us 7.9140us 65.993us cudaMemset + 11.98% 115.80ms 640005 180ns 150ns 325.82us cudaSetupArgument + 2.23% 21.584ms 70002 308ns 218ns 325.68us cudaGetLastError + 2.19% 21.175ms 70001 302ns 199ns 314.30us cudaConfigureCall + 
0.01% 138.56us 1 138.56us 138.56us 138.56us cudaMalloc + 0.00% 48.141us 1 48.141us 48.141us 48.141us cudaMemGetInfo + 0.00% 40.939us 74 553ns 496ns 1.2830us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 30.402us 12 2.5330us 2.0360us 4.5650us cudaFuncGetAttributes + 0.00% 17.493us 1 17.493us 17.493us 17.493us cudaDeviceSynchronize + 0.00% 6.8790us 20 343ns 280ns 612ns cudaDeviceGetAttribute + 0.00% 3.7860us 5 757ns 587ns 1.2530us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==5547== NVPROF is profiling process 5547, command: ./main test 1.0 1 +==5547== Profiling application: ./main test 1.0 1 +==5547== Profiling result: +Time(%) Time Calls Avg Min Max Name + 38.01% 64.497ms 10000 6.4490us 1.4720us 25.121us calcSynapses + 24.89% 42.225ms 10000 4.2220us 3.3600us 6.1120us calcNeurons + 22.75% 38.605ms 10000 3.8600us 3.2320us 5.5680us calcSynapseDynamics + 14.26% 24.189ms 10000 2.4180us 2.1120us 6.5920us learnSynapsesPost + 0.06% 96.512us 72 1.3400us 928ns 2.0800us [CUDA memcpy HtoD] + 0.03% 54.080us 19 2.8460us 1.9840us 4.6720us [CUDA memcpy DtoH] + +==5547== API calls: +Time(%) Time Calls Avg Min Max Name + 53.26% 318.06ms 40000 7.9510us 7.3870us 323.19us cudaLaunch + 42.53% 254.01ms 21 12.096ms 7.5310us 252.89ms cudaHostAlloc + 2.21% 13.204ms 40000 330ns 252ns 332.54us cudaConfigureCall + 1.66% 9.9116ms 40000 247ns 233ns 5.2730us cudaSetupArgument + 0.20% 1.1942ms 97 12.311us 197ns 30.710us cudaMemcpy + 0.08% 498.29us 21 23.728us 6.1100us 122.22us cudaMalloc + 0.04% 227.33us 83 2.7380us 149ns 97.591us cuDeviceGetAttribute + 0.01% 31.273us 1 31.273us 31.273us 31.273us cuDeviceTotalMem + 0.00% 27.431us 1 27.431us 27.431us 27.431us cuDeviceGetName + 0.00% 11.816us 1 11.816us 11.816us 11.816us cudaSetDevice + 0.00% 11.690us 21 556ns 357ns 2.1550us cudaGetSymbolAddress + 0.00% 1.4320us 2 716ns 525ns 907ns cuDeviceGetCount + 0.00% 1.3390us 1 1.3390us 1.3390us 1.3390us cudaGetDeviceCount + 0.00% 577ns 2 288ns 252ns 325ns cuDeviceGet + +``` + +

+ + +*** + +### SparseHighRateSynapsesOnly +![](plots/speed_test_SparseHighRateSynapsesOnly_absolute.png) +![](plots/speed_test_SparseHighRateSynapsesOnly_profiling.png) +![](plots/speed_test_SparseHighRateSynapsesOnly_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==29929== NVPROF is profiling process 29929, command: ./main +==29929== Profiling application: ./main +==29929== Profiling result: +Time(%) Time Calls Avg Min Max Name + 86.04% 284.29ms 10000 28.429us 27.328us 32.544us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*) + 8.93% 29.521ms 10000 2.9520us 2.8800us 4.4480us [CUDA memset] + 5.03% 16.619ms 10000 1.6610us 1.5360us 2.4000us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*) + +==29929== API calls: +Time(%) Time Calls Avg Min Max Name + 58.38% 206.98ms 20000 10.348us 8.5120us 8.2431ms cudaLaunch + 28.06% 99.491ms 10000 9.9490us 8.5150us 27.390us cudaMemset + 8.91% 31.590ms 170000 185ns 150ns 313.25us cudaSetupArgument + 1.79% 6.3337ms 20000 316ns 206ns 303.30us cudaConfigureCall + 1.73% 6.1183ms 20000 305ns 199ns 315.94us cudaGetLastError + 1.12% 3.9780ms 1 3.9780ms 3.9780ms 3.9780ms cudaDeviceSynchronize + 0.01% 46.286us 1 46.286us 46.286us 46.286us cudaMemGetInfo + 0.00% 8.3370us 3 2.7790us 2.1280us 3.2430us cudaFuncGetAttributes + 0.00% 5.4670us 3 1.8220us 649ns 2.4930us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 1.5130us 4 378ns 295ns 546ns cudaDeviceGetAttribute + 0.00% 820ns 1 820ns 820ns 820ns cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==30148== NVPROF is profiling process 30148, command: ./main test 1.0 1 +==30148== Profiling application: ./main test 1.0 1 +==30148== Profiling result: +Time(%) Time Calls Avg Min Max Name + 88.25% 301.73ms 10000 30.173us 3.3920us 32.704us calcSynapses + 11.72% 40.058ms 10000 4.0050us 3.8080us 4.8640us calcNeurons + 0.02% 61.280us 44 1.3920us 960ns 3.2000us [CUDA memcpy HtoD] + 0.01% 39.392us 14 2.8130us 1.9840us 6.8480us [CUDA memcpy DtoH] + +==30148== API calls: +Time(%) Time Calls Avg Min Max Name + 54.90% 442.78ms 12 36.898ms 14.006us 441.12ms cudaHostAlloc + 40.88% 329.68ms 20000 16.483us 7.7050us 338.70us cudaLaunch + 2.49% 20.082ms 61 329.22us 400ns 18.995ms cudaMemcpy + 0.94% 7.5995ms 20000 379ns 255ns 310.22us cudaConfigureCall + 0.67% 5.4120ms 20000 270ns 222ns 314.38us cudaSetupArgument + 0.08% 639.34us 12 53.278us 11.895us 172.21us cudaMalloc + 0.03% 235.92us 83 2.8420us 155ns 101.36us cuDeviceGetAttribute + 0.00% 32.471us 1 32.471us 32.471us 32.471us cuDeviceTotalMem + 0.00% 30.953us 1 30.953us 30.953us 30.953us cuDeviceGetName + 0.00% 14.056us 12 1.1710us 746ns 3.5320us cudaGetSymbolAddress + 0.00% 12.473us 1 12.473us 12.473us 12.473us cudaSetDevice + 0.00% 1.5390us 1 1.5390us 1.5390us 1.5390us cudaGetDeviceCount + 0.00% 1.4990us 2 749ns 424ns 1.0750us cuDeviceGetCount + 0.00% 514ns 2 257ns 199ns 315ns cuDeviceGet + +``` + +

+ + +*** + +### SparseLowRateSynapsesOnly +![](plots/speed_test_SparseLowRateSynapsesOnly_absolute.png) +![](plots/speed_test_SparseLowRateSynapsesOnly_profiling.png) +![](plots/speed_test_SparseLowRateSynapsesOnly_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==8193== NVPROF is profiling process 8193, command: ./main +==8193== Profiling application: ./main +==8193== Profiling result: +Time(%) Time Calls Avg Min Max Name + 55.84% 593.43ms 100000 5.9340us 5.4400us 6.9120us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*) + 28.97% 307.88ms 100000 3.0780us 3.0400us 3.6800us [CUDA memset] + 15.19% 161.38ms 100000 1.6130us 1.5040us 2.5920us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*) + +==8193== API calls: +Time(%) Time Calls Avg Min Max Name + 55.92% 1.79370s 200000 8.9680us 7.6320us 7.2529ms cudaLaunch + 29.82% 956.72ms 100000 9.5670us 8.2580us 21.256ms cudaMemset + 10.51% 337.16ms 1700000 198ns 139ns 340.09us cudaSetupArgument + 1.91% 61.333ms 200000 306ns 217ns 368.29us cudaGetLastError + 1.83% 58.844ms 200000 294ns 168ns 332.73us cudaConfigureCall + 0.00% 45.848us 1 45.848us 45.848us 45.848us cudaMemGetInfo + 0.00% 12.992us 1 12.992us 12.992us 12.992us cudaDeviceSynchronize + 0.00% 8.6600us 3 2.8860us 2.0910us 3.5820us cudaFuncGetAttributes + 0.00% 5.3760us 3 1.7920us 594ns 2.4470us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 1.5830us 4 395ns 305ns 591ns cudaDeviceGetAttribute + 0.00% 829ns 1 829ns 829ns 829ns cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==8451== NVPROF is profiling process 8451, command: ./main test 10.0 1 +==8451== Profiling application: ./main test 10.0 1 +==8451== Profiling result: +Time(%) Time Calls Avg Min Max Name + 66.88% 550.62ms 100000 5.5060us 3.4560us 6.4000us calcSynapses + 33.11% 272.64ms 100000 2.7260us 2.6560us 3.7760us calcNeurons + 0.01% 53.984us 44 1.2260us 960ns 2.0800us [CUDA memcpy HtoD] + 0.00% 35.072us 14 2.5050us 1.9520us 4.7040us [CUDA memcpy DtoH] + +==8451== API calls: +Time(%) Time Calls Avg Min Max Name + 81.32% 1.60600s 200000 8.0290us 7.4920us 354.55us cudaLaunch + 12.69% 250.71ms 12 20.893ms 15.503us 249.06ms cudaHostAlloc + 3.37% 66.566ms 200000 332ns 257ns 334.65us cudaConfigureCall + 2.52% 49.683ms 200000 248ns 225ns 334.65us cudaSetupArgument + 0.05% 1.0155ms 61 16.647us 343ns 35.922us cudaMemcpy + 0.03% 641.50us 12 53.458us 12.040us 174.09us cudaMalloc + 0.01% 225.49us 83 2.7160us 135ns 97.180us cuDeviceGetAttribute + 0.00% 31.170us 1 31.170us 31.170us 31.170us cuDeviceTotalMem + 0.00% 26.897us 1 26.897us 26.897us 26.897us cuDeviceGetName + 0.00% 13.730us 12 1.1440us 698ns 3.1800us cudaGetSymbolAddress + 0.00% 11.132us 1 11.132us 11.132us 11.132us cudaSetDevice + 0.00% 1.3520us 2 676ns 376ns 976ns cuDeviceGetCount + 0.00% 1.3320us 1 1.3320us 1.3320us 1.3320us cudaGetDeviceCount + 0.00% 542ns 2 271ns 213ns 329ns cuDeviceGet + +``` + +

+ + +*** + +### SparseMediumRateSynapsesOnly +![](plots/speed_test_SparseMediumRateSynapsesOnly_absolute.png) +![](plots/speed_test_SparseMediumRateSynapsesOnly_profiling.png) +![](plots/speed_test_SparseMediumRateSynapsesOnly_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==16276== NVPROF is profiling process 16276, command: ./main +==16276== Profiling application: ./main +==16276== Profiling result: +Time(%) Time Calls Avg Min Max Name + 55.93% 59.598ms 10000 5.9590us 5.6000us 6.8480us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*) + 28.96% 30.864ms 10000 3.0860us 3.0400us 3.5840us [CUDA memset] + 15.11% 16.106ms 10000 1.6100us 1.5040us 2.4000us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*) + +==16276== API calls: +Time(%) Time Calls Avg Min Max Name + 57.38% 194.03ms 20000 9.7010us 8.5280us 7.3801ms cudaLaunch + 27.54% 93.116ms 10000 9.3110us 8.6920us 28.380us cudaMemset + 10.82% 36.579ms 170000 215ns 184ns 349.92us cudaSetupArgument + 2.15% 7.2682ms 20000 363ns 248ns 327.47us cudaConfigureCall + 2.09% 7.0721ms 20000 353ns 266ns 337.12us cudaGetLastError + 0.01% 46.564us 1 46.564us 46.564us 46.564us cudaMemGetInfo + 0.01% 18.278us 1 18.278us 18.278us 18.278us cudaDeviceSynchronize + 0.00% 8.5460us 3 2.8480us 2.1440us 3.4910us cudaFuncGetAttributes + 0.00% 5.2380us 3 1.7460us 617ns 2.4330us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 1.7410us 4 435ns 339ns 632ns cudaDeviceGetAttribute + 0.00% 956ns 1 956ns 956ns 956ns cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==16495== NVPROF is profiling process 16495, command: ./main test 1.0 1 +==16495== Profiling application: ./main test 1.0 1 +==16495== Profiling result: +Time(%) Time Calls Avg Min Max Name + 67.04% 60.321ms 10000 6.0320us 3.4560us 6.5280us calcSynapses + 32.86% 29.567ms 10000 2.9560us 2.9120us 3.7440us calcNeurons + 0.06% 54.017us 44 1.2270us 960ns 2.0480us [CUDA memcpy HtoD] + 0.04% 36.032us 14 2.5730us 2.0480us 4.7360us [CUDA memcpy DtoH] + +==16495== API calls: +Time(%) Time Calls Avg Min Max Name + 62.23% 290.68ms 12 24.223ms 7.8400us 289.60ms cudaHostAlloc + 35.13% 164.11ms 20000 8.2050us 7.5690us 348.13us cudaLaunch + 1.32% 6.1557ms 20000 307ns 255ns 328.87us cudaConfigureCall + 1.01% 4.7095ms 20000 235ns 202ns 341.44us cudaSetupArgument + 0.16% 750.68us 61 12.306us 358ns 28.177us cudaMemcpy + 0.09% 419.68us 12 34.973us 6.2030us 120.19us cudaMalloc + 0.05% 227.14us 83 2.7360us 145ns 97.726us cuDeviceGetAttribute + 0.01% 31.327us 1 31.327us 31.327us 31.327us cuDeviceTotalMem + 0.01% 26.548us 1 26.548us 26.548us 26.548us cuDeviceGetName + 0.00% 11.315us 1 11.315us 11.315us 11.315us cudaSetDevice + 0.00% 7.9470us 12 662ns 405ns 1.9600us cudaGetSymbolAddress + 0.00% 1.5460us 2 773ns 495ns 1.0510us cuDeviceGetCount + 0.00% 1.4000us 1 1.4000us 1.4000us 1.4000us cudaGetDeviceCount + 0.00% 578ns 2 289ns 223ns 355ns cuDeviceGet + +``` + +

+ + +*** + +### VerySparseMediumRateSynapsesOnly +![](plots/speed_test_VerySparseMediumRateSynapsesOnly_absolute.png) +![](plots/speed_test_VerySparseMediumRateSynapsesOnly_profiling.png) +![](plots/speed_test_VerySparseMediumRateSynapsesOnly_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==6005== NVPROF is profiling process 6005, command: ./main +==6005== Profiling application: ./main +==6005== Profiling result: +Time(%) Time Calls Avg Min Max Name + 55.29% 580.67ms 100000 5.8060us 5.2160us 6.6240us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*) + 29.34% 308.08ms 100000 3.0800us 3.0400us 3.7120us [CUDA memset] + 15.37% 161.45ms 100000 1.6140us 1.5040us 2.5920us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*) + +==6005== API calls: +Time(%) Time Calls Avg Min Max Name + 56.44% 1.83924s 200000 9.1960us 7.9810us 7.4326ms cudaLaunch + 29.07% 947.22ms 100000 9.4720us 8.1380us 21.897ms cudaMemset + 10.90% 355.11ms 1700000 208ns 171ns 355.90us cudaSetupArgument + 1.82% 59.307ms 200000 296ns 177ns 333.92us cudaConfigureCall + 1.77% 57.629ms 200000 288ns 202ns 337.07us cudaGetLastError + 0.00% 46.411us 1 46.411us 46.411us 46.411us cudaMemGetInfo + 0.00% 13.163us 1 13.163us 13.163us 13.163us cudaDeviceSynchronize + 0.00% 8.2890us 3 2.7630us 2.0680us 3.3230us cudaFuncGetAttributes + 0.00% 5.4810us 3 1.8270us 565ns 2.5590us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 1.5840us 4 396ns 318ns 545ns cudaDeviceGetAttribute + 0.00% 924ns 1 924ns 924ns 924ns cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==6274== NVPROF is profiling process 6274, command: ./main test 10.0 1 +==6274== Profiling application: ./main test 10.0 1 +==6274== Profiling result: +Time(%) Time Calls Avg Min Max Name + 69.30% 617.28ms 100000 6.1720us 3.3600us 7.5200us calcSynapses + 30.70% 273.43ms 100000 2.7340us 2.6560us 3.7440us calcNeurons + 0.01% 53.472us 44 1.2150us 960ns 2.0480us [CUDA memcpy HtoD] + 0.00% 34.560us 14 2.4680us 1.9520us 4.6080us [CUDA memcpy DtoH] + +==6274== API calls: +Time(%) Time Calls Avg Min Max Name + 82.48% 1.61117s 200000 8.0550us 7.0380us 353.83us cudaLaunch + 11.62% 226.99ms 12 18.916ms 7.8850us 225.88ms cudaHostAlloc + 3.30% 64.540ms 200000 322ns 238ns 338.74us cudaConfigureCall + 2.52% 49.132ms 200000 245ns 211ns 344.36us cudaSetupArgument + 0.04% 744.26us 61 12.200us 293ns 32.120us cudaMemcpy + 0.02% 421.09us 12 35.090us 6.1780us 119.69us cudaMalloc + 0.01% 226.88us 83 2.7330us 137ns 97.756us cuDeviceGetAttribute + 0.00% 31.259us 1 31.259us 31.259us 31.259us cuDeviceTotalMem + 0.00% 28.119us 1 28.119us 28.119us 28.119us cuDeviceGetName + 0.00% 11.457us 1 11.457us 11.457us 11.457us cudaSetDevice + 0.00% 8.0410us 12 670ns 397ns 1.9590us cudaGetSymbolAddress + 0.00% 1.6770us 2 838ns 479ns 1.1980us cuDeviceGetCount + 0.00% 1.4060us 1 1.4060us 1.4060us 1.4060us cudaGetDeviceCount + 0.00% 507ns 2 253ns 231ns 276ns cuDeviceGet + +``` + +

+ + +*** + +### Vogels +![](plots/speed_test_Vogels_absolute.png) +![](plots/speed_test_Vogels_profiling.png) +![](plots/speed_test_Vogels_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==12243== NVPROF is profiling process 12243, command: ./main +==12243== Profiling application: ./main +==12243== Profiling result: +Time(%) Time Calls Avg Min Max Name + 27.91% 192.82ms 10000 19.281us 3.1360us 2.1170ms kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int*, int, int*, double, int*, int) + 25.45% 175.79ms 10000 17.578us 3.3280us 1.7610ms kernel_synapses_2_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*) + 15.82% 109.25ms 10000 10.925us 3.3600us 1.1837ms kernel_synapses_2_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, int*, int, double, double*, int, double*, int*) + 14.27% 98.554ms 10000 9.8550us 3.1680us 1.0373ms kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, int*, double*, double, int*, int) + 5.95% 41.110ms 10000 4.1110us 3.7760us 5.3120us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*) + 4.53% 31.297ms 10000 3.1290us 2.9440us 4.3200us [CUDA memset] + 3.54% 24.435ms 10000 2.4430us 2.0160us 6.0160us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 2.53% 17.499ms 10000 1.7490us 1.5360us 2.8160us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*) + +==12243== API calls: +Time(%) Time Calls Avg Min Max Name + 69.99% 645.08ms 70000 9.2150us 8.1890us 7.3493ms cudaLaunch + 16.20% 149.30ms 860000 173ns 144ns 1.1943ms cudaSetupArgument + 10.32% 95.084ms 10000 9.5080us 8.7600us 327.83us cudaMemset + 1.76% 16.177ms 70000 231ns 200ns 10.120us cudaGetLastError + 1.72% 15.875ms 70000 226ns 181ns 5.3450us 
cudaConfigureCall + 0.01% 51.450us 1 51.450us 51.450us 51.450us cudaMemGetInfo + 0.00% 25.843us 10 2.5840us 2.0060us 4.6820us cudaFuncGetAttributes + 0.00% 25.773us 41 628ns 481ns 2.9340us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 17.259us 1 17.259us 17.259us 17.259us cudaDeviceSynchronize + 0.00% 5.8620us 12 488ns 313ns 1.3830us cudaDeviceGetAttribute + 0.00% 3.0770us 3 1.0250us 630ns 1.5860us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==12518== NVPROF is profiling process 12518, command: ./main test 1.0 1 +==12518== Profiling application: ./main test 1.0 1 +==12518== Profiling result: +Time(%) Time Calls Avg Min Max Name + 59.61% 415.51ms 10000 41.550us 2.0480us 6.0015ms learnSynapsesPost + 29.39% 204.87ms 10000 20.486us 1.5680us 2.4941ms calcSynapses + 10.93% 76.180ms 10000 7.6170us 6.6240us 14.560us calcNeurons + 0.06% 385.28us 86 4.4800us 960ns 42.752us [CUDA memcpy HtoD] + 0.02% 130.11us 20 6.5050us 1.9840us 40.641us [CUDA memcpy DtoH] + +==12518== API calls: +Time(%) Time Calls Avg Min Max Name + 66.01% 690.75ms 30000 23.025us 7.6920us 649.80us cudaLaunch + 29.49% 308.57ms 26 11.868ms 7.6940us 306.48ms cudaHostAlloc + 2.65% 27.715ms 112 247.46us 184ns 25.977ms cudaMemcpy + 0.97% 10.186ms 30000 339ns 250ns 318.13us cudaConfigureCall + 0.77% 8.0652ms 30000 268ns 222ns 319.03us cudaSetupArgument + 0.07% 763.51us 26 29.365us 6.1460us 121.30us cudaMalloc + 0.02% 226.59us 83 2.7300us 136ns 97.714us cuDeviceGetAttribute + 0.00% 31.319us 1 31.319us 31.319us 31.319us cuDeviceTotalMem + 0.00% 28.107us 1 28.107us 28.107us 28.107us cuDeviceGetName + 0.00% 15.639us 26 601ns 388ns 2.0380us cudaGetSymbolAddress + 0.00% 11.574us 1 11.574us 11.574us 11.574us cudaSetDevice + 0.00% 1.7010us 2 850ns 538ns 1.1630us cuDeviceGetCount + 0.00% 1.5690us 1 1.5690us 1.5690us 1.5690us cudaGetDeviceCount + 0.00% 540ns 2 270ns 227ns 313ns cuDeviceGet + +``` + +

+ + +*** + +### VogelsWithSynapticDynamic +![](plots/speed_test_VogelsWithSynapticDynamic_absolute.png) +![](plots/speed_test_VogelsWithSynapticDynamic_profiling.png) +![](plots/speed_test_VogelsWithSynapticDynamic_relative.png) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==6312== NVPROF is profiling process 6312, command: ./main +==6312== Profiling application: ./main +==6312== Profiling result: +Time(%) Time Calls Avg Min Max Name + 27.18% 194.20ms 10000 19.419us 3.1680us 2.1194ms kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int*, int, int*, double, int*, int) + 22.86% 163.34ms 10000 16.333us 3.1040us 1.6753ms kernel_synapses_2_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int, double*, int, int*, double*, int) + 14.99% 107.12ms 10000 10.711us 3.2960us 1.1295ms kernel_synapses_2_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, int*, double, double*, int, double*, double*, int, int*, int, double*, int) + 14.22% 101.59ms 10000 10.158us 3.2960us 1.0383ms kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, int*, double*, double, int*, int) + 5.84% 41.697ms 10000 4.1690us 3.8720us 5.5360us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*) + 4.71% 33.655ms 10000 3.3650us 3.2320us 4.1280us kernel_synapses_2_stateupdater_codeobject(unsigned int, unsigned int, int*, double*, int, double*, int, double*) + 4.37% 31.213ms 10000 3.1210us 3.0400us 4.1920us [CUDA memset] + 3.37% 24.073ms 10000 2.4070us 2.0160us 5.7920us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 2.45% 17.497ms 10000 1.7490us 1.5360us 2.7840us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*) + +==6312== API calls: +Time(%) Time Calls Avg Min Max Name + 70.49% 724.20ms 80000 9.0520us 7.8180us 7.3109ms cudaLaunch + 16.18% 166.25ms 940000 176ns 148ns 532.24us cudaSetupArgument + 9.28% 95.356ms 
10000 9.5350us 8.8100us 1.1346ms cudaMemset + 2.07% 21.258ms 80000 265ns 188ns 322.95us cudaConfigureCall + 1.97% 20.198ms 80000 252ns 221ns 60.788us cudaGetLastError + 0.00% 51.002us 1 51.002us 51.002us 51.002us cudaMemGetInfo + 0.00% 42.841us 1 42.841us 42.841us 42.841us cudaDeviceSynchronize + 0.00% 41.487us 74 560ns 469ns 2.5840us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 31.858us 12 2.6540us 1.9920us 4.7290us cudaFuncGetAttributes + 0.00% 6.5530us 16 409ns 280ns 1.1330us cudaDeviceGetAttribute + 0.00% 3.9370us 4 984ns 604ns 1.7060us cudaGetDevice + +``` + +

+ + diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/AdaptationOscillation.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/AdaptationOscillation.pkl new file mode 100644 index 00000000..8ab15a42 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/AdaptationOscillation.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelHeterogeneousDelay.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelHeterogeneousDelay.pkl new file mode 100644 index 00000000..8049b0a5 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelHeterogeneousDelay.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelScalarDelay.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelScalarDelay.pkl new file mode 100644 index 00000000..6e2d78a8 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelScalarDelay.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelScalarDelayNoMultiPrePost.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelScalarDelayNoMultiPrePost.pkl new file mode 100644 index 00000000..e89cbdf3 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelScalarDelayNoMultiPrePost.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/COBAHH.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/COBAHH.pkl new file mode 100644 index 00000000..8e266503 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/COBAHH.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/COBAHHFixedConnectivity.pkl 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/COBAHHFixedConnectivity.pkl new file mode 100644 index 00000000..fc9586d3 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/COBAHHFixedConnectivity.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/CUBA.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/CUBA.pkl new file mode 100644 index 00000000..38353279 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/CUBA.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/CUBAFixedConnectivity.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/CUBAFixedConnectivity.pkl new file mode 100644 index 00000000..2902dd42 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/CUBAFixedConnectivity.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/DenseMediumRateSynapsesOnly.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/DenseMediumRateSynapsesOnly.pkl new file mode 100644 index 00000000..08896f41 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/DenseMediumRateSynapsesOnly.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/HHNeuronsOnly.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/HHNeuronsOnly.pkl new file mode 100644 index 00000000..4c05ff62 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/HHNeuronsOnly.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/LinearNeuronsOnly.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/LinearNeuronsOnly.pkl new file mode 100644 index 00000000..671146b7 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/LinearNeuronsOnly.pkl differ diff --git 
a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDP.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDP.pkl new file mode 100644 index 00000000..e2f0bea2 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDP.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPEventDriven.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPEventDriven.pkl new file mode 100644 index 00000000..07f9c8be Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPEventDriven.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPMultiPost.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPMultiPost.pkl new file mode 100644 index 00000000..9e27d459 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPMultiPost.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPMultiPostNeuronalTraces.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPMultiPostNeuronalTraces.pkl new file mode 100644 index 00000000..5a3077b1 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPMultiPostNeuronalTraces.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPNeuronalTraces.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPNeuronalTraces.pkl new file mode 100644 index 00000000..2fd67b40 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPNeuronalTraces.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPNotEventDriven.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPNotEventDriven.pkl new file mode 100644 index 00000000..d3f2bf50 Binary files /dev/null and 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPNotEventDriven.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseHighRateSynapsesOnly.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseHighRateSynapsesOnly.pkl new file mode 100644 index 00000000..e0097385 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseHighRateSynapsesOnly.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseLowRateSynapsesOnly.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseLowRateSynapsesOnly.pkl new file mode 100644 index 00000000..874d47a9 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseLowRateSynapsesOnly.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseMediumRateSynapsesOnly.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseMediumRateSynapsesOnly.pkl new file mode 100644 index 00000000..f400c591 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseMediumRateSynapsesOnly.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/VerySparseMediumRateSynapsesOnly.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/VerySparseMediumRateSynapsesOnly.pkl new file mode 100644 index 00000000..9faa447b Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/VerySparseMediumRateSynapsesOnly.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/Vogels.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/Vogels.pkl new file mode 100644 index 00000000..cac93dd3 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/Vogels.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/VogelsWithSynapticDynamic.pkl 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/VogelsWithSynapticDynamic.pkl new file mode 100644 index 00000000..3d2d4750 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/VogelsWithSynapticDynamic.pkl differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/git.diff b/dev/benchmarks/results_2017_04_05_complete_after_talk/git.diff new file mode 100644 index 00000000..d891c27a --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/git.diff @@ -0,0 +1,1000 @@ +diff --git a/brian2cuda/device.py b/brian2cuda/device.py +index 44ca331..b446937 100644 +--- a/brian2cuda/device.py ++++ b/brian2cuda/device.py +@@ -910,10 +910,13 @@ class CUDAStandaloneDevice(CPPStandaloneDevice): + if clock not in all_clocks: + run_lines.append('{net.name}.add(&{clock.name}, NULL, NULL, NULL, NULL);'.format(clock=clock, net=net)) + ++ run_lines.append('cudaProfilerStart();') + run_lines.append('{net.name}.run({duration!r}, {report_call}, {report_period!r});'.format(net=net, + duration=float(duration), + report_call=report_call, + report_period=float(report_period))) ++ run_lines.append('cudaDeviceSynchronize();') ++ run_lines.append('cudaProfilerStop();') + self.main_queue.append(('run_network', (net, run_lines))) + + # Manually set the cache for the clocks, simulation scripts might +diff --git a/brian2cuda/templates/common_group.cu b/brian2cuda/templates/common_group.cu +index a77b071..85f2639 100644 +--- a/brian2cuda/templates/common_group.cu ++++ b/brian2cuda/templates/common_group.cu +@@ -145,6 +145,7 @@ void _run_{{codeobj_name}}() + } + {% block extra_info_msg %} + {% endblock %} ++ {% block kernel_info %} + else + { + printf("INFO calling kernel_{{codeobj_name}} with %u blocks and %u threads. 
" +@@ -156,6 +157,7 @@ void _run_{{codeobj_name}}() + funcAttrib.localSizeBytes, funcAttrib.constSizeBytes{% if calc_occupancy %},occupancy{%endif%}); + + } ++ {% endblock %} + first_run = false; + } + {% endblock prepare_kernel %} +diff --git a/brian2cuda/templates/main.cu b/brian2cuda/templates/main.cu +index 8402033..af7cedf 100644 +--- a/brian2cuda/templates/main.cu ++++ b/brian2cuda/templates/main.cu +@@ -16,6 +16,7 @@ + + #include + #include ++#include "cuda_profiler_api.h" + + {{report_func|autoindent}} + +diff --git a/brian2cuda/templates/statemonitor.cu b/brian2cuda/templates/statemonitor.cu +index f43ddf7..73c1b13 100644 +--- a/brian2cuda/templates/statemonitor.cu ++++ b/brian2cuda/templates/statemonitor.cu +@@ -7,6 +7,8 @@ + {# remove this once we have properly defined num_threads, num_blocks here... #} + {% block occupancy %} + {% endblock occupancy %} ++{% block kernel_info %} ++{% endblock %} + + {% block prepare_kernel_inner %} + {% for varname, var in _recorded_variables | dictsort %} +diff --git a/brian2cuda/templates/synapses_create_array.cu b/brian2cuda/templates/synapses_create_array.cu +index 5741b08..e5ae279 100644 +--- a/brian2cuda/templates/synapses_create_array.cu ++++ b/brian2cuda/templates/synapses_create_array.cu +@@ -17,6 +17,9 @@ + {% block occupancy %} + {% endblock occupancy %} + ++{% block kernel_info %} ++{% endblock %} ++ + {% block define_N %} + {% endblock %} + +diff --git a/brian2cuda/templates/synapses_create_generator.cu b/brian2cuda/templates/synapses_create_generator.cu +index cb06a4f..e663cf3 100644 +--- a/brian2cuda/templates/synapses_create_generator.cu ++++ b/brian2cuda/templates/synapses_create_generator.cu +@@ -20,6 +20,9 @@ + {% block occupancy %} + {% endblock %} + ++{% block kernel_info %} ++{% endblock %} ++ + {% block define_N %} + {% endblock %} + +diff --git a/brian2cuda/templates/synapses_initialise_queue.cu b/brian2cuda/templates/synapses_initialise_queue.cu +index 869812f..1fae5cb 100644 +--- 
a/brian2cuda/templates/synapses_initialise_queue.cu ++++ b/brian2cuda/templates/synapses_initialise_queue.cu +@@ -161,10 +161,12 @@ void _run_{{pathobj}}_initialise_queue() + {% endif %} + + ++ int size_connectivity_matrix = 0; + //fill temp arrays with device pointers + for(int i = 0; i < num_parallel_blocks*source_N; i++) // loop through connectivity matrix + { + int num_elements = h_synapses_by_pre_id[i].size(); ++ size_connectivity_matrix += num_elements; + temp_size_by_pre_id[i] = num_elements; + if (num_elements > {{pathobj}}_max_size) + {{pathobj}}_max_size = num_elements; +@@ -281,6 +283,7 @@ void _run_{{pathobj}}_initialise_queue() + {% endif %} + } + } ++ printf("INFO connectivity matrix has size %i\n", size_connectivity_matrix); + + + //copy temp arrays to device +diff --git a/brian2cuda/tests/features/cuda_configuration.py b/brian2cuda/tests/features/cuda_configuration.py +index acaae7d..b2b93a1 100644 +--- a/brian2cuda/tests/features/cuda_configuration.py ++++ b/brian2cuda/tests/features/cuda_configuration.py +@@ -26,6 +26,22 @@ class CUDAStandaloneConfiguration(Configuration): + brian2.device.build(directory='cuda_standalone', compile=True, run=True, + with_output=False) + ++class CUDAStandaloneConfigurationNoAssert(Configuration): ++ name = 'CUDA standalone (asserts disabled)' ++ def before_run(self): ++ brian2.set_device('cuda_standalone', build_on_run=False, disable_asserts=True) ++ if socket.gethostname() == 'elnath': ++ if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']: ++ prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) ++ ++ def after_run(self): ++ if os.path.exists('cuda_standalone'): ++ shutil.rmtree('cuda_standalone') ++ brian2.device.build(directory='cuda_standalone', compile=True, run=True, ++ with_output=False) ++ + class 
CUDAStandaloneConfigurationCurandDouble(Configuration): + name = 'CUDA standalone (curand_float_type = double)' + def before_run(self): +diff --git a/brian2cuda/tests/features/speed.py b/brian2cuda/tests/features/speed.py +index 49c6e52..c0b4887 100644 +--- a/brian2cuda/tests/features/speed.py ++++ b/brian2cuda/tests/features/speed.py +@@ -7,8 +7,23 @@ from brian2.tests.features.speed import * + + from brian2.tests.features.speed import __all__ + __all__.extend(['AdaptationOscillation', ++ 'ThresholderOnlyAlwaysSpiking', ++ 'ThresholderOnlyPoissonLowRate', ++ 'ThresholderOnlyPoissonMediumRate', ++ 'ThresholderOnlyPoissonHighRate', ++ 'BrunelHakimNeuronsOnly', ++ 'BrunelHakimStateupdateOnly', ++ 'BrunelHakimStateupdateOnlyDouble', ++ 'BrunelHakimStateupdateOnlyTriple', ++ 'BrunelHakimStateupdateThresholdOnly', ++ 'BrunelHakimStateupdateThresholdResetOnly', ++ 'BrunelHakimNeuronsOnlyNoXi', ++ 'BrunelHakimNeuronsOnlyNoRand', + 'BrunelHakimModelScalarDelay', ++ 'BrunelHakimModelScalarDelayNoMultiPrePost', ++ 'BrunelHakimModelScalarDelayShort', + 'BrunelHakimModelHeterogeneousDelay', ++ 'CUBA', + 'COBAHH', + 'STDPEventDriven', + 'STDPNotEventDriven', +@@ -73,12 +88,334 @@ class AdaptationOscillation(SpeedTest): + + self.timed_run(self.duration) + ++class BrunelHakimNeuronsOnly(SpeedTest): ++ ++ category = "Neurons only" ++ name = "Brunel Hakim" ++ tags = ["Neurons"] ++ n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 10000000] ++ n_label = 'Num neurons' ++ ++ # configuration options ++ duration = 1*second ++ ++ def run(self): ++ N = self.n ++ Vr = 10*mV ++ theta = 20*mV ++ tau = 20*ms ++ delta = 2*ms ++ taurefr = 2*ms ++ C = 1000 ++ sparseness = float(C)/N ++ J = .1*mV ++ muext = 25*mV ++ sigmaext = 1*mV ++ ++ eqs = """ ++ dV/dt = (-V+muext + sigmaext * sqrt(tau) * xi)/tau : volt ++ """ ++ ++ self.group = group = NeuronGroup(N, eqs, threshold='V>theta', ++ reset='V=Vr', refractory=taurefr) ++ group.V = Vr ++ ++ self.timed_run(self.duration) ++ ++class 
BrunelHakimStateupdateOnlyTriple(SpeedTest): ++ ++ category = "Neurons only" ++ name = "Brunel Hakim (3 x stateupdate)" ++ tags = ["Neurons"] ++ n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 10000000] ++ n_label = 'Num neurons' ++ ++ # configuration options ++ duration = 1*second ++ ++ def run(self): ++ N = self.n ++ Vr = 10*mV ++ theta = 20*mV ++ tau = 20*ms ++ delta = 2*ms ++ taurefr = 2*ms ++ C = 1000 ++ sparseness = float(C)/N ++ J = .1*mV ++ muext = 25*mV ++ sigmaext = 1*mV ++ ++ eqs = """ ++ dV/dt = (-V+muext + sigmaext * sqrt(tau) * xi)/tau : volt ++ """ ++ ++ self.group = group = NeuronGroup(N, eqs)#, threshold='V>theta', ++ #reset='V=Vr', refractory=taurefr) ++ group.V = Vr ++ ++ self.group2 = group2 = NeuronGroup(N, eqs) ++ group2.V = Vr ++ ++ self.group3 = group3 = NeuronGroup(N, eqs) ++ group3.V = Vr ++ ++ self.timed_run(self.duration) ++ ++ ++class BrunelHakimStateupdateOnlyDouble(SpeedTest): ++ ++ category = "Neurons only" ++ name = "Brunel Hakim (2 x stateupdate)" ++ tags = ["Neurons"] ++ n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 10000000] ++ n_label = 'Num neurons' ++ ++ # configuration options ++ duration = 1*second ++ ++ def run(self): ++ N = self.n ++ Vr = 10*mV ++ theta = 20*mV ++ tau = 20*ms ++ delta = 2*ms ++ taurefr = 2*ms ++ C = 1000 ++ sparseness = float(C)/N ++ J = .1*mV ++ muext = 25*mV ++ sigmaext = 1*mV ++ ++ eqs = """ ++ dV/dt = (-V+muext + sigmaext * sqrt(tau) * xi)/tau : volt ++ """ ++ ++ self.group = group = NeuronGroup(N, eqs)#, threshold='V>theta', ++ #reset='V=Vr', refractory=taurefr) ++ group.V = Vr ++ ++ self.group2 = group2 = NeuronGroup(N, eqs) ++ group2.V = Vr ++ ++ self.timed_run(self.duration) ++ ++ ++class BrunelHakimStateupdateOnly(SpeedTest): ++ ++ category = "Neurons only" ++ name = "Brunel Hakim (stateupdate)" ++ tags = ["Neurons"] ++ n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 10000000] ++ n_label = 'Num neurons' ++ ++ # configuration options ++ duration = 
1*second ++ ++ def run(self): ++ N = self.n ++ Vr = 10*mV ++ theta = 20*mV ++ tau = 20*ms ++ delta = 2*ms ++ taurefr = 2*ms ++ C = 1000 ++ sparseness = float(C)/N ++ J = .1*mV ++ muext = 25*mV ++ sigmaext = 1*mV ++ ++ eqs = """ ++ dV/dt = (-V+muext + sigmaext * sqrt(tau) * xi)/tau : volt ++ """ ++ ++ self.group = group = NeuronGroup(N, eqs)#, threshold='V>theta', ++ #reset='V=Vr', refractory=taurefr) ++ group.V = Vr ++ ++ self.timed_run(self.duration) ++ ++class BrunelHakimStateupdateThresholdOnly(SpeedTest): ++ ++ category = "Neurons only" ++ name = "Brunel Hakim (stateupdate + threshold)" ++ tags = ["Neurons"] ++ n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 10000000] ++ n_label = 'Num neurons' ++ ++ # configuration options ++ duration = 1*second ++ ++ def run(self): ++ N = self.n ++ Vr = 10*mV ++ theta = 20*mV ++ tau = 20*ms ++ delta = 2*ms ++ taurefr = 2*ms ++ C = 1000 ++ sparseness = float(C)/N ++ J = .1*mV ++ muext = 25*mV ++ sigmaext = 1*mV ++ ++ eqs = """ ++ dV/dt = (-V+muext + sigmaext * sqrt(tau) * xi)/tau : volt ++ """ ++ ++ self.group = group = NeuronGroup(N, eqs, threshold='V>theta') ++ group.V = Vr ++ ++ self.timed_run(self.duration) ++ ++class BrunelHakimStateupdateThresholdResetOnly(SpeedTest): ++ ++ category = "Neurons only" ++ name = "Brunel Hakim (stateupdate + threshold + reset)" ++ tags = ["Neurons"] ++ n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 10000000] ++ n_label = 'Num neurons' ++ ++ # configuration options ++ duration = 1*second ++ ++ def run(self): ++ N = self.n ++ Vr = 10*mV ++ theta = 20*mV ++ tau = 20*ms ++ delta = 2*ms ++ taurefr = 2*ms ++ C = 1000 ++ sparseness = float(C)/N ++ J = .1*mV ++ muext = 25*mV ++ sigmaext = 1*mV ++ ++ eqs = """ ++ dV/dt = (-V+muext + sigmaext * sqrt(tau) * xi)/tau : volt ++ """ ++ ++ self.group = group = NeuronGroup(N, eqs, threshold='V>theta', ++ reset='V=Vr') ++ group.V = Vr ++ ++ self.timed_run(self.duration) ++ ++class BrunelHakimNeuronsOnlyNoXi(SpeedTest): ++ ++ 
category = "Neurons only" ++ name = "Brunel Hakim (no xi)" ++ tags = ["Neurons"] ++ n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 10000000] ++ n_label = 'Num neurons' ++ ++ # configuration options ++ duration = 1*second ++ ++ def run(self): ++ N = self.n ++ Vr = 10*mV ++ theta = 20*mV ++ tau = 20*ms ++ delta = 2*ms ++ taurefr = 2*ms ++ C = 1000 ++ sparseness = float(C)/N ++ J = .1*mV ++ muext = 25*mV ++ sigmaext = 1*mV ++ ++ eqs = """ ++ dV/dt = (-V+muext + sigmaext * sqrt(tau/ms))/tau : volt ++ """ ++ ++ self.group = group = NeuronGroup(N, eqs, threshold='V>theta', ++ reset='V=Vr', refractory=taurefr) ++ group.V = Vr ++ ++ self.timed_run(self.duration) ++ ++class BrunelHakimNeuronsOnlyNoRand(SpeedTest): ++ ++ category = "Neurons only" ++ name = "Brunel Hakim (no rand)" ++ tags = ["Neurons"] ++ n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 10000000] ++ n_label = 'Num neurons' ++ ++ # configuration options ++ duration = 1*second ++ ++ def run(self): ++ N = self.n ++ Vr = 10*mV ++ theta = 20*mV ++ tau = 20*ms ++ delta = 2*ms ++ taurefr = 2*ms ++ C = 1000 ++ sparseness = float(C)/N ++ J = .1*mV ++ muext = 25*mV ++ sigmaext = 1*mV ++ ++ myxi = np.random.randn(N) ++ ++ eqs = """ ++ dV/dt = (-V+muext + sigmaext * sqrt(tau) * myxi/sqrt(ms))/tau : volt ++ myxi : 1 ++ """ ++ ++ self.group = group = NeuronGroup(N, eqs, threshold='V>theta', ++ reset='V=Vr', refractory=taurefr) ++ group.V = Vr ++ group.myxi = myxi ++ ++ self.timed_run(self.duration) ++ ++class BrunelHakimModelScalarDelayNoMultiPrePost(SpeedTest): ++ ++ category = "Full examples" ++ name = "Brunel Hakim with scalar delay (1s, no multip pre-post connections)" ++ tags = ["Neurons", "Synapses"] ++ n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 250000]#, 350000]#500000, 1000000] ++ n_label = 'Num neurons' ++ ++ # configuration options ++ duration = 1*second ++ ++ def run(self): ++ N = self.n ++ Vr = 10*mV ++ theta = 20*mV ++ tau = 20*ms ++ delta = 2*ms ++ taurefr = 2*ms 
++ C = 1000 ++ sparseness = float(C)/N ++ J = .1*mV ++ muext = 25*mV ++ sigmaext = 1*mV ++ ++ eqs = """ ++ dV/dt = (-V+muext + sigmaext * sqrt(tau) * xi)/tau : volt ++ """ ++ ++ group = NeuronGroup(N, eqs, threshold='V>theta', ++ reset='V=Vr', refractory=taurefr) ++ group.V = Vr ++ conn = Synapses(group, group, on_pre='V += -J', delay=delta) ++ conn.connect('i!=j and rand()=Ne', p=80. / N) ++ ++ self.timed_run(self.duration) + + class COBAHH(SpeedTest): + +@@ -223,7 +673,7 @@ class STDPEventDriven(SpeedTest): + category = "Full examples" + name = "STDP (event-driven)" + tags = ["Neurons", "Synapses"] +- n_range = [10, 100, 1000, 10000, 20000, 50000, 100000] ++ n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 5000000] + n_label = 'Num neurons' + + # configuration options +@@ -543,18 +993,18 @@ class Vogels(SpeedTest): + + eqs_stdp_inhib = ''' + w : 1 +- dA_pre/dt=-A_pre/tau_stdp : 1 (event-driven) +- dA_post/dt=-A_post/tau_stdp : 1 (event-driven) ++ dApre/dt=-Apre/tau_stdp : 1 (event-driven) ++ dApost/dt=-Apost/tau_stdp : 1 (event-driven) + ''' + alpha = 3*Hz*tau_stdp*2 # Target rate parameter + gmax = 100 # Maximum inhibitory weight + + con_ie = Synapses(Pi, Pe, model=eqs_stdp_inhib, +- on_pre='''A_pre += 1. +- w = clip(w+(A_post-alpha)*eta, 0, gmax) ++ on_pre='''Apre += 1. ++ w = clip(w+(Apost-alpha)*eta, 0, gmax) + g_gaba += w*nS''', +- on_post='''A_post += 1. +- w = clip(w+A_pre*eta, 0, gmax) ++ on_post='''Apost += 1. 
++ w = clip(w+Apre*eta, 0, gmax) + ''' + ) + con_ie.connect('rand()=3200', p=0.02) + + s_mon = SpikeMonitor(P) + +-run(1 * second, report='text') ++run(10 * second, report='text') + +-plot(s_mon.t/ms, s_mon.i, '.k', ms=1) +-title('CUBA_CUDA') +-xlabel('Time (ms)') +-ylabel('Neuron index') +-savefig('CUBA_CUDA/CUBA_CUDA_rasterplot.png') ++print(profiling_summary()) ++ ++#plot(s_mon.t/ms, s_mon.i, '.k', ms=1) ++#title('CUBA_CUDA') ++#xlabel('Time (ms)') ++#ylabel('Neuron index') ++#savefig('CUBA_CUDA/CUBA_CUDA_rasterplot.png') + #show() +diff --git a/frozen_repos/brian2 b/frozen_repos/brian2 +--- a/frozen_repos/brian2 ++++ b/frozen_repos/brian2 +@@ -1 +1 @@ +-Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67 ++Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67-dirty +diff --git a/frozen_repos/brian2genn b/frozen_repos/brian2genn +--- a/frozen_repos/brian2genn ++++ b/frozen_repos/brian2genn +@@ -1 +1 @@ +-Subproject commit 0553cafeab49ea5403c0230411035df504d4db06 ++Subproject commit 0553cafeab49ea5403c0230411035df504d4db06-dirty +diff --git a/frozen_repos/genn b/frozen_repos/genn +--- a/frozen_repos/genn ++++ b/frozen_repos/genn +@@ -1 +1 @@ +-Subproject commit e01c85f18339249558d6e570ae976609dc972846 ++Subproject commit e01c85f18339249558d6e570ae976609dc972846-dirty diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/log_complete_after_talk b/dev/benchmarks/results_2017_04_05_complete_after_talk/log_complete_after_talk new file mode 100644 index 00000000..13ab8e8d --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/log_complete_after_talk @@ -0,0 +1,1903 @@ +Directory with name `results_2017_04_05_complete_after_talk` already exists. Renaming it to `results_2017_04_05_complete_after_talk_bak_1491422611`. +Saving results in results_2017_04_05_complete_after_talk/plots. +Starting CUBA on 05.04.2017 at 22:03:31. 
+Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Full examples: CUBA fixed connectivity: n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] n=1000000 [E..]WARNING /home/denisalevi/anaconda2/envs/dev_b2c/lib/python2.7/site-packages/matplotlib/__init__.py:892: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter. + warnings.warn(self.msg_depr % (key, alt_key)) + [py.warnings] + + +TRACEBACK CUDA standalone N=1000000 +INFO: setting cudaDevice stuff took 0.265023 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 15999736 +INFO connectivity matrix has size 64013467 +INFO calling kernel_neurongroup_stateupdater_codeobject with 1303 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1101 invalid argument + +('debug syn effect mdoe ', 'target') +('debug syn effect mdoe ', 'target') +INFO: setting cudaDevice stuff took 0.265023 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. 
Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 15999736 +INFO connectivity matrix has size 64013467 +INFO calling kernel_neurongroup_stateupdater_codeobject with 1303 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1101 invalid argument + + +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c' + 'Could not copy script file to temp directory: %s' % ex) +INFO No numerical integration method specified for group 'neurongroup', using method 'linear' (took 1.49s). 
[brian2.stateupdaters.base.method_choice] + +Traceback (most recent call last): + File "", line 21, in + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run + with_output=False) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build + self.run(directory, with_output, run_args) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run + "%s)" % os.path.abspath(directory)) +RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone) + + + +Running CUBA took 2:08:31. +INFO relative performance for Full examples: CUBA fixed connectivity N=10 CUDA standalone: 1.0 +INFO relative performance for Full examples: CUBA fixed connectivity N=100 CUDA standalone: 1.0 +INFO relative performance for Full examples: CUBA fixed connectivity N=1000 CUDA standalone: 1.0 +INFO relative performance for Full examples: CUBA fixed connectivity N=10000 CUDA standalone: 1.0 +INFO relative performance for Full examples: CUBA fixed connectivity N=100000 CUDA standalone: 1.0 +INFO relative performance for Full examples: CUBA fixed connectivity N=1000000 CUDA standalone: nan +INFO relative performance for Full examples: CUBA fixed connectivity N=10 C++ standalone: 9.50331627245 +INFO relative performance for Full examples: CUBA fixed connectivity N=100 C++ standalone: 8.5446853951 +INFO relative performance for Full examples: CUBA fixed connectINFO: setting cudaDevice stuff took 0.172666 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. 
Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 16108 +INFO connectivity matrix has size 64158 +INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_1_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. 
+Number of synapses: 64158 +Number of synapses: 16108 +INFO: main_lines took 5.325889 seconds +INFO: main function took 5.523285 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. +ivity N=1000 C++ standalone: 3.79737620222 +INFO relative performance for Full examples: CUBA fixed connectivity N=10000 C++ standalone: 0.797678384644 +INFO relative performance for Full examples: CUBA fixed connectivity N=100000 C++ standalone: 0.440223048132 +INFO relative performance for Full examples: CUBA fixed connectivity N=1000000 C++ standalone: nan +INFO relative performance for Full examples: CUBA fixed connectivity N=10 GeNN_optimized: 2.12816895677 +INFO relative performance for Full examples: CUBA fixed connectivity N=100 GeNN_optimized: 1.93923218414 +INFO relative performance for Full examples: CUBA fixed connectivity N=1000 GeNN_optimized: 1.75435949027 +INFO relative performance for Full examples: CUBA fixed connectivity N=10000 GeNN_optimized: 1.43439787216 +INFO relative performance for Full examples: CUBA fixed connectivity N=100000 GeNN_optimized: 0.625130402294 +INFO relative performance for Full examples: CUBA fixed connectivity N=1000000 GeNN_optimized: nan +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:10 for runtime of 0.355534 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBA_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:02 for runtime of 0.199734 +Starting COBAHH on 06.04.2017 at 00:13:16. 
+Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Full examples: COBAHH: n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [...]INFO: setting cudaDevice stuff took 0.156221 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_1 with 1 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_2 with 1 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 3949 +INFO connectivity matrix has size 15742 +INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 512 threads. Kernel needs 109 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.250000. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_1_pre_codeobject with 15 blocks and 1024 threads. 
Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 3949 +Number of synapses: 15742 +INFO: main_lines took 4.652397 seconds +INFO: main function took 4.834268 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. + +Running COBAHH took 0:55:44. +INFO relative performance for Full examples: COBAHH N=10 CUDA standalone: 1.0 +INFO relative performance for Full examples: COBAHH N=100 CUDA standalone: 1.0 +INFO relative performance for Full examples: COBAHH N=1000 CUDA standalone: 1.0 +INFO relative performance for Full examples: COBAHH N=10000 CUDA standalone: 1.0 +INFO relative performance for Full examples: COBAHH N=20000 CUDA standalone: 1.0 +INFO relative performance for Full examples: COBAHH N=50000 CUDA standalone: 1.0 +INFO relative performance for Full examples: COBAHH N=100000 CUDA standalone: 1.0 +INFO relative performance for Full examples: COBAHH N=10 C++ standalone: 5.97680043162 +INFO relative performance for Full examples: COBAHH N=100 C++ standalone: 1.00371226203 +INFO relative performance for Full examples: COBAHH N=1000 C++ standalone: 0.154618074816 +INFO relative performance for Full examples: COBAHH N=10000 C++ standalone: 0.0396381192455 +INFO relative performance for Full examples: COBAHH N=20000 C++ standalone: 0.0253531218652 +INFO relative performance for Full examples: COBAHH N=50000 C++ standalone: 0.0106447160514 +INFO 
relative performance for Full examples: COBAHH N=100000 C++ standalone: 0.00914741719581 +INFO relative performance for Full examples: COBAHH N=10 GeNN_optimized: 1.08714146487 +INFO relative performance for Full examples: COBAHH N=100 GeNN_optimized: 1.20794915957 +INFO relative performance for Full examples: COBAHH N=1000 GeNN_optimized: 1.32626500874 +INFO relative performance for Full examples: COBAHH N=10000 GeNN_optimized: 1.27813023083 +INFO relative performance for Full examples: COBAHH N=20000 GeNN_optimized: 2.12777862025 +INFO relative performance for Full examples: COBAHH N=50000 GeNN_optimized: 1.22069742475 +INFO relative performance for Full examples: COBAHH N=100000 GeNN_optimized: 1.09531764141 +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHH_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:09 for runtime of 0.531296 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHH_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:03 for runtime of 0.409076 +Starting AdaptationOscillation on 06.04.2017 at 01:10:20. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Full examples: Adaptation oscillation: n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [E.E] + +TRACEBACK CUDA standalone N=100000 +INFO: setting cudaDevice stuff took 0.311282 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 98 blocks and 1024 threads. 
Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_1 with 98 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory + +('debug syn effect mdoe ', 'target') +INFO: setting cudaDevice stuff took 0.311282 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 98 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_1 with 98 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory + + +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c' + 'Could not copy script file to temp directory: %s' % ex) +INFO No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.14s, trying other methods took 0.00s). 
[brian2.stateupdaters.base.method_choice] +terminate called after throwing an instance of 'thrust::system::detail::bad_alloc' + what(): std::bad_alloc: out of memory + +Traceback (most recent call last): + File "", line 21, in + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run + with_output=False) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build + self.run(directory, with_output, run_args) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run + "%s)" % os.path.abspath(directory)) +RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone) + + + + +TRACEBACK GeNN_optimized N=100000 +no stdout file found, cwd = /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNWorkspace/results/stdout.txt +running brian code generation ... +building genn executable ... +executing genn binary on GPU ... + +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c' + 'Could not copy script file to temp directory: %s' % ex) +WARNING The selected device 'genn' only supports a fixed schedule, but this schedule is not consistent with the network's schedule. The simulation will use the device's schedule. 
+Device schedule: ['start', 'synapses', 'groups', 'thresholds', 'resets', 'end'] +Network schedule: ['start', 'groups', 'thresholds', 'synapses', 'resets', 'end'] +Set the network schedule explicitly or set the core.network.default_schedule preference to avoid this warning. [brian2.core.network.schedule_conflict] +INFO No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.12s, trying other methods took 0.00s). [brian2.stateupdaters.base.method_choice] +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +definitions.h: 102: cuda runtime error 2: out of memory + +Traceback (most recent call last): + File "", line 14, in + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/speed.py", line 89, in run + self.timed_run(self.duration) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/tests/features/base.py", line 63, in timed_run + brian2.run(duration, level=1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/units/fundamentalunits.py", line 2428, in new_f + result = f(*args, **kwds) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 371, in run + namespace=namespace, profile=profile, level=2+level) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 231, in run + namespace=namespace, profile=profile, level=level+1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/base.py", line 276, in device_override_decorated_function + return getattr(curdev, name)(*args, **kwds) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 
1206, in network_run + super(GeNNDevice, self).network_run(net=net, duration=duration, report=report, report_period=report_period, namespace=namespace, level=level+1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 1171, in network_run + self.build(direct_call=False, **self.build_options) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 592, in build + returncode=ex.returncode) +RuntimeError: Project run failed (Command ['./main', 'test', '1.0', '1'] failed with error code 1). +See the output above (if any) for more details. + + + +Running AdaptationOscillation took 1:41:42. +INFO relative performance for Full examples: Adaptation oscillation N=10 CUDA standalone: 1.0 +INFO relative performance for Full examples: Adaptation oscillation N=100 CUDA standalone: 1.0 +INFO relative performance for Full examples: Adaptation oscillation N=1000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Adaptation oscillation N=10000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Adaptation oscillation N=20000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Adaptation oscillation N=50000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Adaptation oscillation N=100000 CUDA standalone: nan +INFO relative performance for Full examples: Adaptation oscillation N=10 C++ standalone: 8.59983763369 +INFO relative performance for Full examples: Adaptation oscillation N=100 C++ standalone: 2.8440796921 +INFO relative performance for Full examples: Adaptation oscillation N=1000 C++ standalone: 0.506496466226 +INFO relative performance for Full examples: Adaptation oscillation N=10000 C++ standalone: 0.214522626694 +INFO relative performance for Full examples: Adaptation oscillation N=20000 C++ standalone: 0.170045431942 +INFO relative 
performance for Full examples: Adaptation oscillation N=50000 C++ standalone: 0.109420242272 +INFO relative performance for Full examples: Adaptation oscillation N=100000 C++ standalone: nan +INFO relative performance for Full examples: Adaptation oscillation N=10 GeNN_optimized: 1.54028322165 +INFO relative performance for Full examples: Adaptation oscillation N=100 GeNN_optimized: 1.37097363869 +INFO relative performance for Full examples: Adaptation oscillation N=1000 GeNN_optimized: 1.2270477INFO: setting cudaDevice stuff took 0.149325 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_1 with 1 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 49 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO calling kernel_synapses_group_variable_set_conditional_codeobject_1 with 49 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 49931 +INFO generating 10000000 randn every 13107 clock cycles for neurongroup_stateupdater_codeobject +INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 576 threads. 
Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 28 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 49931 +INFO: main_lines took 4.071034 seconds +INFO: main function took 4.240770 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. 
+4328 +INFO relative performance for Full examples: Adaptation oscillation N=10000 GeNN_optimized: 1.08010242282 +INFO relative performance for Full examples: Adaptation oscillation N=20000 GeNN_optimized: 1.04654988332 +INFO relative performance for Full examples: Adaptation oscillation N=50000 GeNN_optimized: 0.961954575197 +INFO relative performance for Full examples: Adaptation oscillation N=100000 GeNN_optimized: nan +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_AdaptationOscillation_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:07 for runtime of 0.372608 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_AdaptationOscillation_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:03 for runtime of 0.311908 +Starting Vogels on 06.04.2017 at 02:53:45. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Full examples: Vogels et al 2011 (event-driven synapses): n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [E..] + +TRACEBACK CUDA standalone N=100000 +INFO: setting cudaDevice stuff took 0.263336 seconds +INFO connectivity matrix has size 7997654 +INFO connectivity matrix has size 31988320 +INFO connectivity matrix has size 159989507 +INFO connectivity matrix has size 31988320 +INFO calling kernel_neurongroup_stateupdater_codeobject with 131 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. 
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1066 invalid argument + +('debug syn effect mdoe ', 'target') +('debug syn effect mdoe ', 'target') +('debug syn effect mdoe ', 'target') +('debug syn effect mdoe ', 'synapse') +INFO: setting cudaDevice stuff took 0.263336 seconds +INFO connectivity matrix has size 7997654 +INFO connectivity matrix has size 31988320 +INFO connectivity matrix has size 159989507 +INFO connectivity matrix has size 31988320 +INFO calling kernel_neurongroup_stateupdater_codeobject with 131 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1066 invalid argument + + +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c' + 'Could not copy script file to temp directory: %s' % ex) +INFO No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.06s, trying other methods took 0.11s). 
[brian2.stateupdaters.base.method_choice] + +Traceback (most recent call last): + File "", line 21, in + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run + with_output=False) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build + self.run(directory, with_output, run_args) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run + "%s)" % os.path.abspath(directory)) +RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone) + + + +Running Vogels took 1:24:04. +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=10 CUDA standalone: 1.0 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=100 CUDA standalone: 1.0 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=1000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=10000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=20000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=50000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=100000 CUDA standalone: nan +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=10 C++ standalone: 5.81708231538 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=100 C++ standalone: 8.52257702632 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven 
synapses) N=1000 C++ standalone: 3.6497INFO: setting cudaDevice stuff took 0.201820 seconds +INFO connectivity matrix has size 776 +INFO connectivity matrix has size 3224 +INFO connectivity matrix has size 15811 +INFO connectivity matrix has size 3224 +INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_1_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_2_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 42 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_2_post_codeobject with 15 blocks and 1024 threads. 
Kernel needs 34 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 3224 +Number of synapses: 3224 +Number of synapses: 15811 +Number of synapses: 776 +INFO: main_lines took 7.959584 seconds +INFO: main function took 8.189923 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. +7700535 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=10000 C++ standalone: 1.27385995798 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=20000 C++ standalone: 0.756183613604 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=50000 C++ standalone: 0.427712217638 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=100000 C++ standalone: nan +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=10 GeNN_optimized: 1.48314163249 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=100 GeNN_optimized: 2.17682514992 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=1000 GeNN_optimized: 1.16598307673 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=10000 GeNN_optimized: 0.540424662951 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) 
N=20000 GeNN_optimized: 0.560432413335 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=50000 GeNN_optimized: 0.536937885623 +INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=100000 GeNN_optimized: nan +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_Vogels_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:15 for runtime of 0.85962 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_Vogels_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:04 for runtime of 0.740019 +Starting STDP on 06.04.2017 at 04:19:26. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Full examples: STDP with Poisson input: n=100 [...] n=500 [...] n=1000 [...] n=5000 [...] n=10000 [...] n=50000 [...] n=100000 [...] n=500000 [...] n=1000000 [...]INFO: setting cudaDevice stuff took 0.313461 seconds +INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 1000 +INFO connectivity matrix has size 1000 +INFO generating 10000000 rand every 13107 clock cycles for poissongroup_thresholder_codeobject +INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. 
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_poissongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_spikemonitor_codeobject with 1 blocks and 1 threads. Kernel needs 37 registers per block, 0 bytes of statically-allocated shared memory per block, 16 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 42 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 34 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. 
+Number of synapses: 1000 +Number of synapses: 1000 +INFO: main_lines took 6.937627 seconds +Number of spikes: 14845 +INFO: main function took 7.299253 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. + +Running STDP took 1:20:32. +INFO relative performance for Full examples: STDP with Poisson input N=100 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with Poisson input N=500 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with Poisson input N=1000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with Poisson input N=5000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with Poisson input N=10000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with Poisson input N=50000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with Poisson input N=100000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with Poisson input N=500000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with Poisson input N=1000000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with Poisson input N=100 C++ standalone: 8.88338818143 +INFO relative performance for Full examples: STDP with Poisson input N=500 C++ standalone: 5.09683126402 +INFO relative performance for Full examples: STDP with Poisson input N=1000 C++ standalone: 3.43176369337 +INFO relative performance for Full examples: STDP with Poisson input N=5000 C++ standalone: 1.23369418761 +INFO relative performance for Full examples: STDP with Poisson input N=10000 C++ standalone: 0.740582551232 +INFO relative performance for Full examples: STDP with Poisson input N=50000 C++ standalone: 0.373019575212 +INFO relative performance for Full examples: STDP with Poisson input N=100000 C++ 
standalone: 0.364799663468 +INFO relative performance for Full examples: STDP with Poisson input N=500000 C++ standalone: 2.96351279504 +INFO relative performance for Full examples: STDP with Poisson input N=1000000 C++ standalone: 4.70050294368 +INFO relative performance for Full examples: STDP with Poisson input N=100 GeNN_optimized: 1.30974438102 +INFO relative performance for Full examples: STDP with Poisson input N=500 GeNN_optimized: 1.18471683703 +INFO relative performance for Full examples: STDP with Poisson input N=1000 GeNN_optimized: 1.16060458009 +INFO relative performance for Full examples: STDP with Poisson input N=5000 GeNN_optimized: 1.56393492832 +INFO relative performance for Full examples: STDP with Poisson input N=10000 GeNN_optimized: 1.7254821012 +INFO relative performance for Full examples: STDP with Poisson input N=50000 GeNN_optimized: 1.69780564815 +INFO relative performance for Full examples: STDP with Poisson input N=100000 GeNN_optimized: 1.68674045475 +INFO relative performance for Full examples: STDP with Poisson input N=500000 GeNN_optimized: 14.7726180354 +INFO relative performance for Full examples: STDP with Poisson input N=1000000 GeNN_optimized: 23.369044153 +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:12 for runtime of 0.558918 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDP_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:04 for runtime of 0.479759 +Starting STDPEventDriven on 06.04.2017 at 05:41:19. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Full examples: STDP (event-driven): n=10 [...] n=100 [...] n=1000 [...] 
n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [...] n=1000000 [...] n=5000000 [...]INFO: setting cudaDevice stuff took 0.328379 seconds +INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 1000 +INFO connectivity matrix has size 1000 +INFO generating 10000000 rand every 13107 clock cycles for poissongroup_thresholder_codeobject +INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_poissongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 42 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. 
Kernel needs 34 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 1000 +Number of synapses: 1000 +INFO: main_lines took 5.794570 seconds +INFO: main function took 6.145385 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. + +Running STDPEventDriven took 1:19:53. +INFO relative performance for Full examples: STDP (event-driven) N=10 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP (event-driven) N=100 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP (event-driven) N=1000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP (event-driven) N=10000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP (event-driven) N=20000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP (event-driven) N=50000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP (event-driven) N=100000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP (event-driven) N=1000000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP (event-driven) N=5000000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP (event-driven) N=10 C++ standalone: 11.2497943354 +INFO relative performance for Full examples: STDP (event-driven) N=100 C++ standalone: 8.31207995112 +INFO relative performance for Full examples: STDP 
(event-driven) N=1000 C++ standalone: 2.61972877926 +INFO relative performance for Full examples: STDP (event-driven) N=10000 C++ standalone: 0.288795267181 +INFO relative performance for Full examples: STDP (event-driven) N=20000 C++ standalone: 0.142747274164 +INFO relative performance for Full examples: STDP (event-driven) N=50000 C++ standalone: 0.127502322902 +INFO relative performance for Full examples: STDP (event-driven) N=100000 C++ standalone: 0.124020384785 +INFO relative performance for Full examples: STDP (event-driven) N=1000000 C++ standalone: 0.111562986518 +INFO relative performance for Full examples: STDP (event-driven) N=5000000 C++ standalone: 0.115750281196 +INFO relative performance for Full examples: STDP (event-driven) N=10 GeNN_optimized: 2.76946556051 +INFO relative performance for Full examples: STDP (event-driven) N=100 GeNN_optimized: 2.56034789856 +INFO relative performance for Full examples: STDP (event-driven) N=1000 GeNN_optimized: 1.81374117769 +INFO relative performance for Full examples: STDP (event-driven) N=10000 GeNN_optimized: 0.854456214358 +INFO relative performance for Full examples: STDP (event-driven) N=20000 GeNN_optimized: 0.725383414514 +INFO relative performance for Full examples: STDP (event-driven) N=50000 GeNN_optimized: 0.633313911843 +INFO relative performance for Full examples: STDP (event-driven) N=100000 GeNN_optimized: 0.609749101371 +INFO relative performance for Full examples: STDP (event-driven) N=1000000 GeNN_optimized: 0.578689800398 +INFO relative performance for Full examples: STDP (event-driven) N=5000000 GeNN_optimized: 0.607900794592 +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:10 for runtime of 0.415781 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof 
profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPEventDriven_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:02 for runtime of 0.232967 +Starting BrunelHakimModelScalarDelay on 06.04.2017 at 07:02:28. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Full examples: Brunel Hakim with scalar delay (1s): n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [...] n=250000 [...]INFO: setting cudaDevice stuff took 0.303419 seconds +INFO connectivity matrix has size 1000000 +INFO generating 10000000 randn every 13107 clock cycles for neurongroup_stateupdater_codeobject +INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 576 threads. Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. 
+Number of synapses: 1000000 +INFO: main_lines took 4.771391 seconds +INFO: main function took 5.118664 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. + +Running BrunelHakimModelScalarDelay took 2:11:11. +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=10 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=100 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=1000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=10000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=20000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=50000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=100000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=250000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=10 C++ standalone: 9.78843893145 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=100 C++ standalone: 2.88689717066 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=1000 C++ standalone: 0.39678676061 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=10000 C++ standalone: 0.0579548817132 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=20000 C++ standalone: 0.043471563887 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=50000 C++ standalone: 0.032781986812 +INFO relative performance for Full examples: 
Brunel Hakim with scalar delay (1s) N=100000 C++ standalone: 0.0294157569658 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=250000 C++ standalone: 0.0305941911305 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=10 GeNN_optimized: 1.58033861921 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=100 GeNN_optimized: 1.39364952685 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=1000 GeNN_optimized: 1.28814277906 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=10000 GeNN_optimized: 1.15879715758 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=20000 GeNN_optimized: 1.08175176136 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=50000 GeNN_optimized: 0.884117890361 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=100000 GeNN_optimized: 0.770480233171 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=250000 GeNN_optimized: 0.457457403795 +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelay_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:09 for runtime of 0.248229 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelay_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:04 for runtime of 0.195981 +Starting BrunelHakimModelScalarDelayNoMultiPrePost on 06.04.2017 at 09:15:13. 
+Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections): n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [...] n=250000 [...]INFO: setting cudaDevice stuff took 0.171453 seconds +INFO connectivity matrix has size 999000 +INFO generating 10000000 randn every 13107 clock cycles for neurongroup_stateupdater_codeobject +INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 576 threads. Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 999000 +INFO: main_lines took 4.843546 seconds +INFO: main function took 5.059406 seconds + +Running BrunelHakimModelScalarDelayNoMultiPrePost took 2:09:50. 
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=10 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=100 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=1000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=10000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=20000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=50000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=100000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=250000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=10 C++ standalone: 8.20634264047 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=100 C++ standalone: 2.89509603547 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=1000 C++ standalone: 0.362216843115 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=10000 C++ standalone: 0.0614173832433 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=20000 C++ standalone: 0.0440634627728 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post 
connections) N=50000 C++ standalone: 0.0323258939735 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=100000 C++ standalone: 0.0295636812952 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=250000 C++ standalone: 0.0301825412509 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=10 GeNN_optimized: 1.55619035168 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=100 GeNN_optimized: 1.39476926975 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=1000 GeNN_optimized: 1.32290219232 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=10000 GeNN_optimized: 1.19834218479 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=20000 GeNN_optimized: 1.08968345615 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=50000 GeNN_optimized: 0.88300034153 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=100000 GeNN_optimized: 0.76787687807 +INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=250000 GeNN_optimized: 0.4580926795 +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelayNoMultiPrePost_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:10 for runtime of 0.251194 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && 
nvprof --log-file ../results_2017_04_05_complete_after_talk/mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. +nvprof/nvprof_BrunelHakimModelScalarDelayNoMultiPrePost_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:04 for runtime of 0.194764 +Starting VerySparseMediumRateSynapsesOnly on 06.04.2017 at 11:26:46. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Synapses only: Very sparse, medium rate (10s duration): n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] n=500000 [...]INFO: setting cudaDevice stuff took 0.189294 seconds +INFO connectivity matrix has size 32 +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 32 +INFO: main_lines took 19.422106 seconds +INFO: main function took 19.638434 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 10.000000 +# We are running with fixed time step 0.000100 +9.9999 done ... +everything finished. + +Running VerySparseMediumRateSynapsesOnly took 1:34:44. 
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=10 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=100 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=1000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=10000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=100000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=500000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=10 C++ standalone: 7.46232140337 +INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=100 C++ standalone: 7.35680132588 +INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=1000 C++ standalone: 11.8771841425 +INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=10000 C++ standalone: 5.02984481209 +INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=100000 C++ standalone: 0.318651307727 +INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=500000 C++ standalone: 0.0791039271891 +INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=10 GeNN_optimized: 0.882604947291 +INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=100 GeNN_optimized: 0.865446555301 +INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=1000 GeNN_optimized: 1.15609826329 +INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=10000 GeNN_optimized: 0.847073701942 +INFO relative performance for Synapses only: Very sparse, medium rate (10s 
duration) N=100000 GeNN_optimized: 0.706869490288 +INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=500000 GeNN_optimized: 0.280107509868 +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_VerySparseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:30 for runtime of 1.45008 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_VerySparseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log ./main test 10.0 1 +Profiling took 0:00:12 for runtime of 1.244563 +Starting SparseMediumRateSynapsesOnly on 06.04.2017 at 13:03:04. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Synapses only: Sparse, medium rate (1s duration): n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] n=500000 [...]INFO: setting cudaDevice stuff took 0.337554 seconds +INFO connectivity matrix has size 203 +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. 
+Number of synapses: 203 +INFO: main_lines took 2.003913 seconds +INFO: main function took 2.360594 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. + +Running SparseMediumRateSynapsesOnly took 1:10:08. +INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=10 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=100 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=1000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=10000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=100000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=500000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=10 C++ standalone: 9.03811951624 +INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=100 C++ standalone: 12.5731575893 +INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=1000 C++ standalone: 7.14094900056 +INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=10000 C++ standalone: 0.618089685795 +INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=100000 C++ standalone: 0.0489483464931 +INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=500000 C++ standalone: 0.049623428215 +INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=10 GeNN_optimized: 1.29097406281 +INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=100 GeNN_optimized: 1.23388192416 +INFO relative performance for Synapses only: Sparse, medium rate 
(1s duration) N=1000 GeNN_optimized: 1.20240950591 +INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=10000 GeNN_optimized: 1.07033546948 +INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=100000 GeNN_optimized: 0.721667335794 +INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=500000 GeNN_optimized: 0.715129849711 +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:04 for runtime of 0.152218 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:02 for runtime of 0.128374 +Starting DenseMediumRateSynapsesOnly on 06.04.2017 at 14:14:07. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Synapses only: Dense, medium rate (1s duration): n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] n=500000 [E..] + +TRACEBACK CUDA standalone N=500000 +INFO: setting cudaDevice stuff took 0.113374 seconds +INFO connectivity matrix has size 250000000 +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. 
+ERROR launching kernel_neurongroup_thresholder_codeobject in code_objects/neurongroup_thresholder_codeobject.cu:1008 invalid argument + +('debug syn effect mdoe ', 'target') +INFO: setting cudaDevice stuff took 0.113374 seconds +INFO connectivity matrix has size 250000000 +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +ERROR launching kernel_neurongroup_thresholder_codeobject in code_objects/neurongroup_thresholder_codeobject.cu:1008 invalid argument + + +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c' + 'Could not copy script file to temp directory: %s' % ex) + +Traceback (most recent call last): + File "", line 21, in + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run + with_output=False) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build + self.run(directory, with_output, run_args) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run + "%s)" % os.path.abspath(directory)) +RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone) + + + +Running DenseMediumRateSynapsesOnly took 1:52:54. 
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=10 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=100 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=1000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=10000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=100000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=500000 CUDA standalone: nan +INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=10 C++ standalone: 8.7685522422 +INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=100 C++ standalone: 8.55977504315 +INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=1000 C++ standalone: 3.94245172467 +INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=10000 C++ standalone: 0.142538734289 +INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=100000 C++ standalone: 0.0356173923641 +INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=500000 C++ standalone: nan +INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=10 GeNN_optimized: 1.30697848012 +INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=100 GeNN_optimized: 1.27150537634 +INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=1000 GeNN_optimized: 1.26404476027 +INFO relative performance for Synapses only: DeINFO: setting cudaDevice stuff took 0.177828 seconds +INFO connectivity matrix has size 1000 +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. 
Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 1000 +INFO: main_lines took 1.945162 seconds +INFO: main function took 2.141915 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. +nse, medium rate (1s duration) N=10000 GeNN_optimized: 1.24454603496 +INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=100000 GeNN_optimized: 1.35448135542 +INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=500000 GeNN_optimized: nan +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_DenseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:04 for runtime of 0.154969 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_DenseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:03 for runtime of 0.120151 +Starting SparseLowRateSynapsesOnly on 06.04.2017 at 16:08:31. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Synapses only: Sparse, low rate (10s duration): n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] 
n=500000 [...]INFO: setting cudaDevice stuff took 0.315501 seconds +INFO connectivity matrix has size 191 +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 191 +INFO: main_lines took 18.990017 seconds +INFO: main function took 19.330690 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 10.000000 +# We are running with fixed time step 0.000100 +9.9999 done ... +everything finished. + +Running SparseLowRateSynapsesOnly took 1:06:45. 
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=10 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=100 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=1000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=10000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=100000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=500000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=10 C++ standalone: 11.3532916318 +INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=100 C++ standalone: 11.7559788497 +INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=1000 C++ standalone: 8.13595845714 +INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=10000 C++ standalone: 2.08954888412 +INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=100000 C++ standalone: 0.0786977354705 +INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=500000 C++ standalone: 0.050991577758 +INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=10 GeNN_optimized: 1.24713089977 +INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=100 GeNN_optimized: 1.20253101745 +INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=1000 GeNN_optimized: 1.16650382499 +INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=10000 GeNN_optimized: 1.13437197572 +INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=100000 GeNN_optimized: 0.553632482835 +INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=500000 
GeNN_optimized: 0.521572319612 +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseLowRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:31 for runtime of 1.45718 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseLowRateSynapsesOnly_GeNNConfigurationOptimized_1000.log ./main test 10.0 1 +Profiling took 0:00:12 for runtime of 1.260501 +Starting SparseHighRateSynapsesOnly on 06.04.2017 at 17:16:49. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Synapses only: Sparse, high rate (1s duration): n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] n=500000 [E..]INFO: setting cudaDevice stuff took 0.143830 seconds +INFO connectivity matrix has size 2030 +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 2030 +INFO: main_lines took 1.986260 seconds +INFO: main function took 2.148834 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. 
+ + +TRACEBACK CUDA standalone N=500000 +INFO: setting cudaDevice stuff took 0.286275 seconds + +('debug syn effect mdoe ', 'target') +INFO: setting cudaDevice stuff took 0.286275 seconds + + +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c' + 'Could not copy script file to temp directory: %s' % ex) +terminate called after throwing an instance of 'thrust::system::detail::bad_alloc' + what(): std::bad_alloc: out of memory + +Traceback (most recent call last): + File "", line 21, in + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run + with_output=False) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build + self.run(directory, with_output, run_args) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run + "%s)" % os.path.abspath(directory)) +RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone) + + + +Running SparseHighRateSynapsesOnly took 3:04:44. 
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=10 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=100 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=1000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=10000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=100000 CUDA standalone: 1.0 +INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=500000 CUDA standalone: nan +INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=10 C++ standalone: 13.027787307 +INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=100 C++ standalone: 8.31720400679 +INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=1000 C++ standalone: 5.57225034169 +INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=10000 C++ standalone: 0.42275766627 +INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=100000 C++ standalone: 0.0495905212469 +INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=500000 C++ standalone: nan +INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=10 GeNN_optimized: 1.29306411522 +INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=100 GeNN_optimized: 1.23894750575 +INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=1000 GeNN_optimized: 1.03327020554 +INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=10000 GeNN_optimized: 1.0713338171 +INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=100000 GeNN_optimized: 0.743955000458 +INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=500000 GeNN_optimized: 
nan +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseHighRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:04 for runtime of 0.374927 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseHighRateSynapsesOnly_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:03 for runtime of 0.362964 +Starting STDPNotEventDriven on 06.04.2017 at 20:23:12. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Full examples: STDP (not event-driven): n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [...]INFO: setting cudaDevice stuff took 0.153160 seconds +INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 1000 +INFO connectivity matrix has size 1000 +INFO generating 10000000 rand every 13107 clock cycles for poissongroup_thresholder_codeobject +INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_synapses_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. 
Theoretical occupancy is 0.750000. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_poissongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 33 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 1000 +Number of synapses: 1000 +INFO: main_lines took 6.799618 seconds +INFO: main function took 6.974729 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. + +Running STDPNotEventDriven took 0:08:16. 
+INFO relative performance for Full examples: STDP (not event-driven) N=10 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP (not event-driven) N=100 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP (not event-driven) N=1000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP (not event-driven) N=10000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP (not event-driven) N=20000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP (not event-driven) N=50000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP (not event-driven) N=100000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP (not event-driven) N=10 C++ standalone: 13.4856872038 +INFO relative performance for Full examples: STDP (not event-driven) N=100 C++ standalone: 8.68819435966 +INFO relative performance for Full examples: STDP (not event-driven) N=1000 C++ standalone: 2.66795987943 +INFO relative performance for Full examples: STDP (not event-driven) N=10000 C++ standalone: 0.487504968552 +INFO relative performance for Full examples: STDP (not event-driven) N=20000 C++ standalone: 0.335405214753 +INFO relative performance for Full examples: STDP (not event-driven) N=50000 C++ standalone: 0.278810141551 +INFO relative performance for Full examples: STDP (not event-driven) N=100000 C++ standalone: 0.248035751829 +INFO relative performance for Full examples: STDP (not event-driven) N=10 GeNN_optimized: 2.23964192763 +INFO relative performance for Full examples: STDP (not event-driven) N=100 GeNN_optimized: 2.14290015959 +INFO relative performance for Full examples: STDP (not event-driven) N=1000 GeNN_optimized: 1.86995794984 +INFO relative performance for Full examples: STDP (not event-driven) N=10000 GeNN_optimized: 1.30586540569 +INFO relative performance for Full examples: STDP (not event-driven) N=20000 GeNN_optimized: 1.1519252236 +INFO relative 
performance for Full examples: STDP (not event-driven) N=50000 GeNN_optimized: 0.970393398749 +INFO relative performance for Full examples: STDP (not event-driven) N=100000 GeNN_optimized: 0.923158814456 +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNotEventDriven_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:11 for runtime of 0.459476 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNotEventDriven_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:03 for runtime of 0.244255 +Starting STDPMultiPost on 06.04.2017 at 20:32:42. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Full examples: STDP with multiple pre- and postsynaptic neurons: n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [...] n=1000000 [...]INFO: setting cudaDevice stuff took 0.144291 seconds +INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 961 +INFO connectivity matrix has size 961 +INFO generating 310000 rand every 422812 clock cycles for poissongroup_thresholder_codeobject +INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. 
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_poissongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 42 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 34 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 961 +Number of synapses: 961 +INFO: main_lines took 5.925237 seconds +INFO: main function took 6.091588 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. + +Running STDPMultiPost took 0:09:20. 
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=10 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=100 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=1000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=10000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=20000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=50000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=100000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=1000000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=10 C++ standalone: 12.7474018667 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=100 C++ standalone: 11.5913458596 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=1000 C++ standalone: 9.54428741786 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=10000 C++ standalone: 7.34011397663 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=20000 C++ standalone: 4.6434939759 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=50000 C++ standalone: 3.54781164235 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=100000 C++ standalone: 3.07057913489 +INFO relative performance for Full examples: STDP with multiple pre- and 
postsynaptic neurons N=1000000 C++ standalone: 0.10910483462 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=10 GeNN_optimized: 2.81045227011 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=100 GeNN_optimized: 2.74426018098 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=1000 GeNN_optimized: 2.62544842893 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=10000 GeNN_optimized: 2.48962114561 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=20000 GeNN_optimized: 2.45380922669 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=50000 GeNN_optimized: 2.3428861001 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=100000 GeNN_optimized: 2.18909976103 +INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=1000000 GeNN_optimized: 0.801428258577 +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPost_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:10 for runtime of 0.379254 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPost_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:03 for runtime of 0.142824 +Starting STDPNeuronalTraces on 06.04.2017 at 20:43:19. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Full examples: STDP with traces in neurons: n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [...] 
n=1000000 [...]INFO: setting cudaDevice stuff took 0.176256 seconds +INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 1000 +INFO connectivity matrix has size 1000 +INFO generating 10000000 rand every 13107 clock cycles for neurongroup_thresholder_codeobject +INFO calling kernel_neurongroup_1_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_neurongroup_1_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. 
Kernel needs 31 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 28 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_1_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 1000 +Number of synapses: 1000 +INFO: main_lines took 6.268831 seconds +INFO: main function took 6.467315 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. + +Running STDPNeuronalTraces took 0:14:08. 
+INFO relative performance for Full examples: STDP with traces in neurons N=10 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with traces in neurons N=100 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with traces in neurons N=1000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with traces in neurons N=10000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with traces in neurons N=20000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with traces in neurons N=50000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with traces in neurons N=100000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with traces in neurons N=1000000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with traces in neurons N=10 C++ standalone: 11.6440202443 +INFO relative performance for Full examples: STDP with traces in neurons N=100 C++ standalone: 8.66784869976 +INFO relative performance for Full examples: STDP with traces in neurons N=1000 C++ standalone: 2.18593367648 +INFO relative performance for Full examples: STDP with traces in neurons N=10000 C++ standalone: 0.621996996463 +INFO relative performance for Full examples: STDP with traces in neurons N=20000 C++ standalone: 0.439453583339 +INFO relative performance for Full examples: STDP with traces in neurons N=50000 C++ standalone: 0.385339081617 +INFO relative performance for Full examples: STDP with traces in neurons N=100000 C++ standalone: 0.355185290586 +INFO relative performance for Full examples: STDP with traces in neurons N=1000000 C++ standalone: 0.288422965605 +INFO relative performance for Full examples: STDP with traces in neurons N=10 GeNN_optimized: 2.95565780195 +INFO relative performance for Full examples: STDP with traces in neurons N=100 GeNN_optimized: 2.81320236796 +INFO relative performance for Full examples: STDP with 
traces in neurons N=1000 GeNN_optimized: 2.48979624527 +INFO relative performance for Full examples: STDP with traces in neurons N=10000 GeNN_optimized: 1.74573261233 +INFO relative performance for Full examples: STDP with traces in neurons N=20000 GeNN_optimized: 1.53949294157 +INFO relative performance for Full examples: STDP with traces in neurons N=50000 GeNN_optimized: 1.27630217316 +INFO relative performance for Full examples: STDP with traces in neurons N=100000 GeNN_optimized: 1.20707261203 +INFO relative performance for Full examples: STDP with traces in neurons N=1000000 GeNN_optimized: 1.15485155451 +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNeuronalTraces_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:12 for runtime of 0.465582 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNeuronalTraces_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:03 for runtime of 0.186743 +Starting STDPMultiPostNeuronalTraces on 06.04.2017 at 20:58:43. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Full examples: STDP with multiple postsynaptic neurons and traces in neurons: n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [...] n=1000000 [...]INFO: setting cudaDevice stuff took 0.186410 seconds +INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. 
Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 961 +INFO connectivity matrix has size 961 +INFO generating 310000 rand every 422812 clock cycles for neurongroup_thresholder_codeobject +INFO calling kernel_neurongroup_1_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_neurongroup_1_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 31 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. 
Kernel needs 28 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_1_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 961 +Number of synapses: 961 +INFO: main_lines took 6.372802 seconds +INFO: main function took 6.581250 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. + +Running STDPMultiPostNeuronalTraces took 0:08:49. +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=10 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=100 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=1000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=10000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=20000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=50000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=100000 CUDA standalone: 1.0 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=1000000 CUDA standalone: 1.0 
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=10 C++ standalone: 11.2776436504 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=100 C++ standalone: 11.2039193789 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=1000 C++ standalone: 10.2038728186 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=10000 C++ standalone: 8.2861849847 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=20000 C++ standalone: 7.36851429155 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=50000 C++ standalone: 6.35608139671 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=100000 C++ standalone: 4.82692286419 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=1000000 C++ standalone: 0.603177630071 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=10 GeNN_optimized: 2.94135590849 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=100 GeNN_optimized: 2.95301604211 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=1000 GeNN_optimized: 2.90776556826 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=10000 GeNN_optimized: 2.78124403426 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=20000 GeNN_optimized: 2.82306874025 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons 
N=50000 GeNN_optimized: 2.77392207329 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=100000 GeNN_optimized: 2.67728338099 +INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=1000000 GeNN_optimized: 2.1046509768 +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPostNeuronalTraces_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:12 for runtime of 0.422016 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPostNeuronalTraces_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:03 for runtime of 0.146966 +Starting BrunelHakimModelHeterogeneousDelay on 06.04.2017 at 21:08:46. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Full examples: Brunel Hakim with heterogenous delays: n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [E..] + +TRACEBACK CUDA standalone N=100000 +INFO: setting cudaDevice stuff took 0.094517 seconds +INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 97661 blocks and 1024 threads. 
Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 100004409 +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO generating 204687 randn every 131 clock cycles for neurongroup_stateupdater_codeobject +INFO calling kernel_neurongroup_stateupdater_codeobject with 174 blocks and 576 threads. Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500. +ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1051 out of memory + +('debug syn effect mdoe ', 'target') +INFO: setting cudaDevice stuff took 0.094517 seconds +INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 97661 blocks and 1024 threads. 
Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 100004409 +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO generating 204687 randn every 131 clock cycles for neurongroup_stateupdater_codeobject +INFO calling kernel_neurongroup_stateupdater_codeobject with 174 blocks and 576 threads. Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500. 
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1051 out of memory + + +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c' + 'Could not copy script file to temp directory: %s' % ex) +INFO No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.07s, trying other methods took 0.00s). [brian2.stateupdaters.base.method_choice] +terminINFO: setting cudaDevice stuff took 0.179905 seconds +INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 1000000 +INFO generating 10000000 randn every 13107 clock cycles for neurongroup_stateupdater_codeobject +INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 576 threads. Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1 threads. 
Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.000000. +INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 1000000 +INFO: main_lines took 9.590645 seconds +INFO: main function took 9.829623 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. +ate called after throwing an instance of 'thrust::system::system_error' + what(): cudaFree in free: an illegal memory access was encountered + +Traceback (most recent call last): + File "", line 21, in + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run + with_output=False) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build + self.run(directory, with_output, run_args) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run + "%s)" % os.path.abspath(directory)) +RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone) + + + +Running BrunelHakimModelHeterogeneousDelay took 1:11:22. 
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=10 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=100 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=1000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=10000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=20000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=50000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=100000 CUDA standalone: nan +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=10 C++ standalone: 11.4029279828 +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=100 C++ standalone: 8.7440567782 +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=1000 C++ standalone: 5.15311757921 +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=10000 C++ standalone: 6.3911140671 +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=20000 C++ standalone: 12.3422729105 +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=50000 C++ standalone: 15.0350916602 +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=100000 C++ standalone: nan +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=10 GeNN_optimized: 2.23856530406 +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=100 GeNN_optimized: 4.36762611789 +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=1000 GeNN_optimized: 20.15374328 +INFO relative 
performance for Full examples: Brunel Hakim with heterogenous delays N=10000 GeNN_optimized: 146.679676807 +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=20000 GeNN_optimized: 357.995725987 +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=50000 GeNN_optimized: 492.68840486 +INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=100000 GeNN_optimized: nan +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:15 for runtime of 3.81853 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:04 for runtime of 0.188529 +Starting LinearNeuronsOnly on 06.04.2017 at 22:21:28. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Neurons only: Linear 1D: n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] n=1000000 [...] n=10000000 [...]INFO: setting cudaDevice stuff took 0.326520 seconds +INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 1024 threads. Kernel needs 12 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 0 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO: main_lines took 5.766263 seconds +INFO: main function took 6.105103 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 10.000000 +# We are running with fixed time step 0.000100 +9.9999 done ... +everything finished. 
+ +Running LinearNeuronsOnly took 0:33:02. +INFO relative performance for Neurons only: Linear 1D N=10 CUDA standalone: 1.0 +INFO relative performance for Neurons only: Linear 1D N=100 CUDA standalone: 1.0 +INFO relative performance for Neurons only: Linear 1D N=1000 CUDA standalone: 1.0 +INFO relative performance for Neurons only: Linear 1D N=10000 CUDA standalone: 1.0 +INFO relative performance for Neurons only: Linear 1D N=100000 CUDA standalone: 1.0 +INFO relative performance for Neurons only: Linear 1D N=1000000 CUDA standalone: 1.0 +INFO relative performance for Neurons only: Linear 1D N=10000000 CUDA standalone: 1.0 +INFO relative performance for Neurons only: Linear 1D N=10 C++ standalone: 7.51083775606 +INFO relative performance for Neurons only: Linear 1D N=100 C++ standalone: 6.6944353215 +INFO relative performance for Neurons only: Linear 1D N=1000 C++ standalone: 3.76942753323 +INFO relative performance for Neurons only: Linear 1D N=10000 C++ standalone: 0.799040137212 +INFO relative performance for Neurons only: Linear 1D N=100000 C++ standalone: 0.144238328209 +INFO relative performance for Neurons only: Linear 1D N=1000000 C++ standalone: 0.104833409762 +INFO relative performance for Neurons only: Linear 1D N=10000000 C++ standalone: 0.0490665649822 +INFO relative performance for Neurons only: Linear 1D N=10 GeNN_optimized: 0.973583595511 +INFO relative performance for Neurons only: Linear 1D N=100 GeNN_optimized: 0.984380282075 +INFO relative performance for Neurons only: Linear 1D N=1000 GeNN_optimized: 1.0333623878 +INFO relative performance for Neurons only: Linear 1D N=10000 GeNN_optimized: 0.907366342055 +INFO relative performance for Neurons only: Linear 1D N=100000 GeNN_optimized: 0.65065534916 +INFO relative performance for Neurons only: Linear 1D N=1000000 GeNN_optimized: 0.603726581657 +INFO relative performance for Neurons only: Linear 1D N=10000000 GeNN_optimized: 0.583361983096 +Rerunning CUDAStandaloneConfiguration with n = 1000 for 
nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_LinearNeuronsOnly_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:10 for runtime of 0.48209 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_LinearNeuronsOnly_GeNNConfigurationOptimized_1000.log ./main test 10.0 1 +Profiling took 0:00:06 for runtime of 0.45749 +Starting HHNeuronsOnly on 06.04.2017 at 22:55:24. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Neurons only: Hodgkin-Huxley: n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] n=1000000 [...]INFO: setting cudaDevice stuff took 0.143355 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 0 bytes of user-allocated constant memory +INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 1024 threads. Kernel needs 62 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 0 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 0 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO: main_lines took 1.875142 seconds +INFO: main function took 2.032155 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... 
+everything finished. + +Running HHNeuronsOnly took 0:21:09. +INFO relative performance for Neurons only: Hodgkin-Huxley N=10 CUDA standalone: 1.0 +INFO relative performance for Neurons only: Hodgkin-Huxley N=100 CUDA standalone: 1.0 +INFO relative performance for Neurons only: Hodgkin-Huxley N=1000 CUDA standalone: 1.0 +INFO relative performance for Neurons only: Hodgkin-Huxley N=10000 CUDA standalone: 1.0 +INFO relative performance for Neurons only: Hodgkin-Huxley N=100000 CUDA standalone: 1.0 +INFO relative performance for Neurons only: Hodgkin-Huxley N=1000000 CUDA standalone: 1.0 +INFO relative performance for Neurons only: Hodgkin-Huxley N=10 C++ standalone: 8.88708231981 +INFO relative performance for Neurons only: Hodgkin-Huxley N=100 C++ standalone: 1.93233246921 +INFO relative performance for Neurons only: Hodgkin-Huxley N=1000 C++ standalone: 0.273829639851 +INFO relative performance for Neurons only: Hodgkin-Huxley N=10000 C++ standalone: 0.0301623262336 +INFO relative performance for Neurons only: Hodgkin-Huxley N=100000 C++ standalone: 0.0140154114169 +INFO relative performance for Neurons only: Hodgkin-Huxley N=1000000 C++ standalone: 0.0117289494235 +INFO relative performance for Neurons only: Hodgkin-Huxley N=10 GeNN_optimized: 1.17820348169 +INFO relative performance for Neurons only: Hodgkin-Huxley N=100 GeNN_optimized: 1.1193610315 +INFO relative performance for Neurons only: Hodgkin-Huxley N=1000 GeNN_optimized: 1.40084936828 +INFO relative performance for Neurons only: Hodgkin-Huxley N=10000 GeNN_optimized: 1.15423786453 +INFO relative performance for Neurons only: Hodgkin-Huxley N=100000 GeNN_optimized: 1.03595178005 +INFO relative performance for Neurons only: Hodgkin-Huxley N=1000000 GeNN_optimized: 0.955525776817 +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file 
../results_2017_04_05_complete_after_talk/nvprof/nvprof_HHNeuronsOnly_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:04 for runtime of 0.251227 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_HHNeuronsOnly_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:02 for runtime of 0.179052 +Starting VogelsWithSynapticDynamic on 06.04.2017 at 23:17:20. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Full examples: Vogels et al 2011 (not event-driven synapses): n=10 [E.E] n=100 [..E] n=1000 [..E] n=10000 [..E] n=20000 [..E] n=50000 [..E] n=100000 [E.E] + +TRACEBACK CUDA standalone N=10 +INFO: setting cudaDevice stuff took 0.268672 seconds +INFO connectivity matrix has size 1 +INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_synapses_2_stateupdater_codeobject with 0 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +ERROR launching kernel_synapses_2_stateupdater_codeobject in code_objects/synapses_2_stateupdater_codeobject.cu:1044 invalid configuration argument + +('debug syn effect mdoe ', 'target') +('debug syn effect mdoe ', 'target') +('debug syn effect mdoe ', 'target') +('debug syn effect mdoe ', 'synapse') +INFO: setting cudaDevice stuff took 0.268672 seconds +INFO connectivity matrix has size 1 +INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. 
Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_synapses_2_stateupdater_codeobject with 0 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +ERROR launching kernel_synapses_2_stateupdater_codeobject in code_objects/synapses_2_stateupdater_codeobject.cu:1044 invalid configuration argument + + +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c' + 'Could not copy script file to temp directory: %s' % ex) +INFO The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven] +INFO The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. 
[brian2.synapses.synapses.clock_driven] +INFO No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.06s, trying other methods took 0.12s). [brian2.stateupdaters.base.method_choice] +INFO No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.14s). [brian2.stateupdaters.base.method_choice] + +Traceback (most recent call last): + File "", line 21, in + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run + with_output=False) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build + self.run(directory, with_output, run_args) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run + "%s)" % os.path.abspath(directory)) +RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone) + + + + +TRACEBACK GeNN_optimized N=10 +no stdout file found, cwd = /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNWorkspace/results/stdout.txt +running brian code generation ... +building genn executable ... 
+ar -rcs /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib/libgenn.a /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/global.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/modelSpec.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/neuronModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/synapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/postSynapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/utils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/stringUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/sparseUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/hr_time.o +g++ -std=c++11 -DNVCC=\""/usr/local/cuda/bin/nvcc"\" -DMODEL=\"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model.cpp\" -o /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/generateALL /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generate*.cc -I"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/include" -I"/usr/local/cuda/include" -L"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib" -L"/usr/local/cuda/lib64" -lgenn -lcuda -lcudart +call was ./generateALL /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace +optimizing block size... +Global memory required for core model: 0.000654 MB. 
+6440894464 for device 0 +dry-run compile for device 0 +"/usr/local/cuda/bin/nvcc" -cubin -x cu -arch sm_35 -O3 -I"$GENN_PATH/lib/include" -o "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/runner.cubin" "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/runner.cc" +genn-buildmodel.sh:70: error 50: command failure + +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c' + 'Could not copy script file to temp directory: %s' % ex) +INFO The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven] +INFO The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven] +WARNING The selected device 'genn' only supports a fixed schedule, but this schedule is not consistent with the network's schedule. The simulation will use the device's schedule. 
+Device schedule: ['start', 'synapses', 'groups', 'thresholds', 'resets', 'end'] +Network schedule: ['start', 'groups', 'thresholds', 'synapses', 'resets', 'end'] +Set the network schedule explicitly or set the core.network.default_schedule preference to avoid this warning. [brian2.core.network.schedule_conflict] +INFO No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.06s, trying other methods took 0.11s). [brian2.stateupdaters.base.method_choice] +INFO No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.15s). [brian2.stateupdaters.base.method_choice] +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/synapseKrnl.cc(23): error: name must be a namespace name + +1 error detected in the compilation of "/tmp/tmpxft_000066aa_00000000-7_runner.cpp1.ii". +/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generateALL.cc: 258: cuda driver error 301: CUDA_ERROR_FILE_NOT_FOUND + +Traceback (most recent call last): + File "", line 14, in + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/speed.py", line 1075, in run + self.timed_run(self.duration) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/tests/features/base.py", line 63, in timed_run + brian2.run(duration, level=1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/units/fundamentalunits.py", line 2428, in new_f + result = f(*args, **kwds) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 371, in run + namespace=namespace, profile=profile, level=2+level) + File 
"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 231, in run + namespace=namespace, profile=profile, level=level+1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/base.py", line 276, in device_override_decorated_function + return getattr(curdev, name)(*args, **kwds) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 1206, in network_run + super(GeNNDevice, self).network_run(net=net, duration=duration, report=report, report_period=report_period, namespace=namespace, level=level+1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 1171, in network_run + self.build(direct_call=False, **self.build_options) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 582, in build + returncode=ex.returncode) +RuntimeError: Project compilation failed (Command ['genn-buildmodel.sh', 'magicnetwork_model.cpp'] failed with error code 50). +See the output above (if any) for more details. + + + + +TRACEBACK GeNN_optimized N=100 +no stdout file found, cwd = /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNWorkspace/results/stdout.txt +running brian code generation ... +building genn executable ... 
+ar -rcs /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib/libgenn.a /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/global.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/modelSpec.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/neuronModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/synapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/postSynapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/utils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/stringUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/sparseUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/hr_time.o +g++ -std=c++11 -DNVCC=\""/usr/local/cuda/bin/nvcc"\" -DMODEL=\"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model.cpp\" -o /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/generateALL /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generate*.cc -I"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/include" -I"/usr/local/cuda/include" -L"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib" -L"/usr/local/cuda/lib64" -lgenn -lcuda -lcudart +call was ./generateALL /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace +optimizing block size... +Global memory required for core model: 0.006504 MB. 
+6440894464 for device 0 +dry-run compile for device 0 +"/usr/local/cuda/bin/nvcc" -cubin -x cu -arch sm_35 -O3 -I"$GENN_PATH/lib/include" -o "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/runner.cubin" "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/runner.cc" +genn-buildmodel.sh:70: error 50: command failure + +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c' + 'Could not copy script file to temp directory: %s' % ex) +INFO The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven] +INFO The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven] +WARNING The selected device 'genn' only supports a fixed schedule, but this schedule is not consistent with the network's schedule. The simulation will use the device's schedule. 
+Device schedule: ['start', 'synapses', 'groups', 'thresholds', 'resets', 'end'] +Network schedule: ['start', 'groups', 'thresholds', 'synapses', 'resets', 'end'] +Set the network schedule explicitly or set the core.network.default_schedule preference to avoid this warning. [brian2.core.network.schedule_conflict] +INFO No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.07s, trying other methods took 0.14s). [brian2.stateupdaters.base.method_choice] +INFO No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.13s). [brian2.stateupdaters.base.method_choice] +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/synapseKrnl.cc(23): error: name must be a namespace name + +1 error detected in the compilation of "/tmp/tmpxft_00006ab9_00000000-7_runner.cpp1.ii". +/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generateALL.cc: 258: cuda driver error 301: CUDA_ERROR_FILE_NOT_FOUND + +Traceback (most recent call last): + File "", line 14, in + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/speed.py", line 1075, in run + self.timed_run(self.duration) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/tests/features/base.py", line 63, in timed_run + brian2.run(duration, level=1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/units/fundamentalunits.py", line 2428, in new_f + result = f(*args, **kwds) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 371, in run + namespace=namespace, profile=profile, level=2+level) + File 
"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 231, in run + namespace=namespace, profile=profile, level=level+1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/base.py", line 276, in device_override_decorated_function + return getattr(curdev, name)(*args, **kwds) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 1206, in network_run + super(GeNNDevice, self).network_run(net=net, duration=duration, report=report, report_period=report_period, namespace=namespace, level=level+1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 1171, in network_run + self.build(direct_call=False, **self.build_options) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 582, in build + returncode=ex.returncode) +RuntimeError: Project compilation failed (Command ['genn-buildmodel.sh', 'magicnetwork_model.cpp'] failed with error code 50). +See the output above (if any) for more details. + + + + +TRACEBACK GeNN_optimized N=1000 +no stdout file found, cwd = /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNWorkspace/results/stdout.txt +running brian code generation ... +building genn executable ... 
+ar -rcs /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib/libgenn.a /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/global.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/modelSpec.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/neuronModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/synapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/postSynapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/utils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/stringUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/sparseUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/hr_time.o +g++ -std=c++11 -DNVCC=\""/usr/local/cuda/bin/nvcc"\" -DMODEL=\"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model.cpp\" -o /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/generateALL /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generate*.cc -I"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/include" -I"/usr/local/cuda/include" -L"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib" -L"/usr/local/cuda/lib64" -lgenn -lcuda -lcudart +call was ./generateALL /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace +optimizing block size... +Global memory required for core model: 0.065004 MB. 
+6440894464 for device 0 +dry-run compile for device 0 +"/usr/local/cuda/bin/nvcc" -cubin -x cu -arch sm_35 -O3 -I"$GENN_PATH/lib/include" -o "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/runner.cubin" "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/runner.cc" +genn-buildmodel.sh:70: error 50: command failure + +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c' + 'Could not copy script file to temp directory: %s' % ex) +INFO The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven] +INFO The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven] +WARNING The selected device 'genn' only supports a fixed schedule, but this schedule is not consistent with the network's schedule. The simulation will use the device's schedule. 
+Device schedule: ['start', 'synapses', 'groups', 'thresholds', 'resets', 'end'] +Network schedule: ['start', 'groups', 'thresholds', 'synapses', 'resets', 'end'] +Set the network schedule explicitly or set the core.network.default_schedule preference to avoid this warning. [brian2.core.network.schedule_conflict] +INFO No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.07s, trying other methods took 0.12s). [brian2.stateupdaters.base.method_choice] +INFO No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.16s). [brian2.stateupdaters.base.method_choice] +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/synapseKrnl.cc(23): error: name must be a namespace name + +1 error detected in the compilation of "/tmp/tmpxft_00006ee5_00000000-7_runner.cpp1.ii". +/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generateALL.cc: 258: cuda driver error 301: CUDA_ERROR_FILE_NOT_FOUND + +Traceback (most recent call last): + File "", line 14, in + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/speed.py", line 1075, in run + self.timed_run(self.duration) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/tests/features/base.py", line 63, in timed_run + brian2.run(duration, level=1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/units/fundamentalunits.py", line 2428, in new_f + result = f(*args, **kwds) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 371, in run + namespace=namespace, profile=profile, level=2+level) + File 
"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 231, in run + namespace=namespace, profile=profile, level=level+1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/base.py", line 276, in device_override_decorated_function + return getattr(curdev, name)(*args, **kwds) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 1206, in network_run + super(GeNNDevice, self).network_run(net=net, duration=duration, report=report, report_period=report_period, namespace=namespace, level=level+1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 1171, in network_run + self.build(direct_call=False, **self.build_options) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 582, in build + returncode=ex.returncode) +RuntimeError: Project compilation failed (Command ['genn-buildmodel.sh', 'magicnetwork_model.cpp'] failed with error code 50). +See the output above (if any) for more details. + + + + +TRACEBACK GeNN_optimized N=10000 +no stdout file found, cwd = /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNWorkspace/results/stdout.txt +running brian code generation ... +building genn executable ... 
+ar -rcs /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib/libgenn.a /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/global.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/modelSpec.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/neuronModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/synapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/postSynapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/utils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/stringUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/sparseUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/hr_time.o +g++ -std=c++11 -DNVCC=\""/usr/local/cuda/bin/nvcc"\" -DMODEL=\"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model.cpp\" -o /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/generateALL /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generate*.cc -I"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/include" -I"/usr/local/cuda/include" -L"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib" -L"/usr/local/cuda/lib64" -lgenn -lcuda -lcudart +call was ./generateALL /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace +optimizing block size... +Global memory required for core model: 0.650004 MB. 
+6440894464 for device 0 +dry-run compile for device 0 +"/usr/local/cuda/bin/nvcc" -cubin -x cu -arch sm_35 -O3 -I"$GENN_PATH/lib/include" -o "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/runner.cubin" "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/runner.cc" +genn-buildmodel.sh:70: error 50: command failure + +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c' + 'Could not copy script file to temp directory: %s' % ex) +INFO The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven] +INFO The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven] +WARNING The selected device 'genn' only supports a fixed schedule, but this schedule is not consistent with the network's schedule. The simulation will use the device's schedule. 
+Device schedule: ['start', 'synapses', 'groups', 'thresholds', 'resets', 'end'] +Network schedule: ['start', 'groups', 'thresholds', 'synapses', 'resets', 'end'] +Set the network schedule explicitly or set the core.network.default_schedule preference to avoid this warning. [brian2.core.network.schedule_conflict] +INFO No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.06s, trying other methods took 0.14s). [brian2.stateupdaters.base.method_choice] +INFO No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.13s). [brian2.stateupdaters.base.method_choice] +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/synapseKrnl.cc(23): error: name must be a namespace name + +1 error detected in the compilation of "/tmp/tmpxft_0000733c_00000000-7_runner.cpp1.ii". +/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generateALL.cc: 258: cuda driver error 301: CUDA_ERROR_FILE_NOT_FOUND + +Traceback (most recent call last): + File "", line 14, in + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/speed.py", line 1075, in run + self.timed_run(self.duration) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/tests/features/base.py", line 63, in timed_run + brian2.run(duration, level=1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/units/fundamentalunits.py", line 2428, in new_f + result = f(*args, **kwds) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 371, in run + namespace=namespace, profile=profile, level=2+level) + File 
"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 231, in run + namespace=namespace, profile=profile, level=level+1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/base.py", line 276, in device_override_decorated_function + return getattr(curdev, name)(*args, **kwds) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 1206, in network_run + super(GeNNDevice, self).network_run(net=net, duration=duration, report=report, report_period=report_period, namespace=namespace, level=level+1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 1171, in network_run + self.build(direct_call=False, **self.build_options) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 582, in build + returncode=ex.returncode) +RuntimeError: Project compilation failed (Command ['genn-buildmodel.sh', 'magicnetwork_model.cpp'] failed with error code 50). +See the output above (if any) for more details. + + + + +TRACEBACK GeNN_optimized N=20000 +no stdout file found, cwd = /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNWorkspace/results/stdout.txt +running brian code generation ... +building genn executable ... 
+ar -rcs /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib/libgenn.a /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/global.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/modelSpec.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/neuronModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/synapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/postSynapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/utils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/stringUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/sparseUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/hr_time.o +g++ -std=c++11 -DNVCC=\""/usr/local/cuda/bin/nvcc"\" -DMODEL=\"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model.cpp\" -o /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/generateALL /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generate*.cc -I"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/include" -I"/usr/local/cuda/include" -L"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib" -L"/usr/local/cuda/lib64" -lgenn -lcuda -lcudart +call was ./generateALL /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace +optimizing block size... +Global memory required for core model: 1.3 MB. 
+6440894464 for device 0 +dry-run compile for device 0 +"/usr/local/cuda/bin/nvcc" -cubin -x cu -arch sm_35 -O3 -I"$GENN_PATH/lib/include" -o "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/runner.cubin" "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/runner.cc" +genn-buildmodel.sh:70: error 50: command failure + +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c' + 'Could not copy script file to temp directory: %s' % ex) +INFO The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven] +INFO The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven] +WARNING The selected device 'genn' only supports a fixed schedule, but this schedule is not consistent with the network's schedule. The simulation will use the device's schedule. 
+Device schedule: ['start', 'synapses', 'groups', 'thresholds', 'resets', 'end'] +Network schedule: ['start', 'groups', 'thresholds', 'synapses', 'resets', 'end'] +Set the network schedule explicitly or set the core.network.default_schedule preference to avoid this warning. [brian2.core.network.schedule_conflict] +INFO No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.07s, trying other methods took 0.17s). [brian2.stateupdaters.base.method_choice] +INFO No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.14s). [brian2.stateupdaters.base.method_choice] +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/synapseKrnl.cc(23): error: name must be a namespace name + +1 error detected in the compilation of "/tmp/tmpxft_0000782e_00000000-7_runner.cpp1.ii". +/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generateALL.cc: 258: cuda driver error 301: CUDA_ERROR_FILE_NOT_FOUND + +Traceback (most recent call last): + File "", line 14, in + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/speed.py", line 1075, in run + self.timed_run(self.duration) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/tests/features/base.py", line 63, in timed_run + brian2.run(duration, level=1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/units/fundamentalunits.py", line 2428, in new_f + result = f(*args, **kwds) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 371, in run + namespace=namespace, profile=profile, level=2+level) + File 
"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 231, in run + namespace=namespace, profile=profile, level=level+1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/base.py", line 276, in device_override_decorated_function + return getattr(curdev, name)(*args, **kwds) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 1206, in network_run + super(GeNNDevice, self).network_run(net=net, duration=duration, report=report, report_period=report_period, namespace=namespace, level=level+1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 1171, in network_run + self.build(direct_call=False, **self.build_options) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 582, in build + returncode=ex.returncode) +RuntimeError: Project compilation failed (Command ['genn-buildmodel.sh', 'magicnetwork_model.cpp'] failed with error code 50). +See the output above (if any) for more details. + + + + +TRACEBACK GeNN_optimized N=50000 +no stdout file found, cwd = /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNWorkspace/results/stdout.txt +running brian code generation ... +building genn executable ... 
+ar -rcs /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib/libgenn.a /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/global.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/modelSpec.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/neuronModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/synapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/postSynapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/utils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/stringUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/sparseUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/hr_time.o +g++ -std=c++11 -DNVCC=\""/usr/local/cuda/bin/nvcc"\" -DMODEL=\"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model.cpp\" -o /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/generateALL /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generate*.cc -I"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/include" -I"/usr/local/cuda/include" -L"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib" -L"/usr/local/cuda/lib64" -lgenn -lcuda -lcudart +call was ./generateALL /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace +optimizing block size... +Global memory required for core model: 3.25 MB. 
+6440894464 for device 0 +dry-run compile for device 0 +"/usr/local/cuda/bin/nvcc" -cubin -x cu -arch sm_35 -O3 -I"$GENN_PATH/lib/include" -o "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/runner.cubin" "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/runner.cc" +genn-buildmodel.sh:70: error 50: command failure + +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c' + 'Could not copy script file to temp directory: %s' % ex) +INFO The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven] +INFO The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven] +WARNING The selected device 'genn' only supports a fixed schedule, but this schedule is not consistent with the network's schedule. The simulation will use the device's schedule. 
+Device schedule: ['start', 'synapses', 'groups', 'thresholds', 'resets', 'end'] +Network schedule: ['start', 'groups', 'thresholds', 'synapses', 'resets', 'end'] +Set the network schedule explicitly or set the core.network.default_schedule preference to avoid this warning. [brian2.core.network.schedule_conflict] +INFO No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.06s, trying other methods took 0.12s). [brian2.stateupdaters.base.method_choice] +INFO No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.14s). [brian2.stateupdaters.base.method_choice] +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/synapseKrnl.cc(23): error: name must be a namespace name + +1 error detected in the compilation of "/tmp/tmpxft_00000209_00000000-7_runner.cpp1.ii". +/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generateALL.cc: 258: cuda driver error 301: CUDA_ERROR_FILE_NOT_FOUND + +Traceback (most recent call last): + File "", line 14, in + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/speed.py", line 1075, in run + self.timed_run(self.duration) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/tests/features/base.py", line 63, in timed_run + brian2.run(duration, level=1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/units/fundamentalunits.py", line 2428, in new_f + result = f(*args, **kwds) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 371, in run + namespace=namespace, profile=profile, level=2+level) + File 
"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 231, in run + namespace=namespace, profile=profile, level=level+1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/base.py", line 276, in device_override_decorated_function + return getattr(curdev, name)(*args, **kwds) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 1206, in network_run + super(GeNNDevice, self).network_run(net=net, duration=duration, report=report, report_period=report_period, namespace=namespace, level=level+1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 1171, in network_run + self.build(direct_call=False, **self.build_options) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 582, in build + returncode=ex.returncode) +RuntimeError: Project compilation failed (Command ['genn-buildmodel.sh', 'magicnetwork_model.cpp'] failed with error code 50). +See the output above (if any) for more details. + + + + +TRACEBACK CUDA standalone N=100000 +INFO: setting cudaDevice stuff took 0.256535 seconds +INFO connectivity matrix has size 7999692 +INFO connectivity matrix has size 32002251 +INFO connectivity matrix has size 160017025 +INFO connectivity matrix has size 32002251 +INFO calling kernel_neurongroup_stateupdater_codeobject with 131 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. 
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1066 invalid argument + +('debug syn effect mdoe ', 'target') +('debug syn effect mdoe ', 'target') +('debug syn effect mdoe ', 'target') +('debug syn effect mdoe ', 'synapse') +INFO: setting cudaDevice stuff took 0.256535 seconds +INFO connectivity matrix has size 7999692 +INFO connectivity matrix has size 32002251 +INFO connectivity matrix has size 160017025 +INFO connectivity matrix has size 32002251 +INFO calling kernel_neurongroup_stateupdater_codeobject with 131 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1066 invalid argument + + +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c' + 'Could not copy script file to temp directory: %s' % ex) +INFO The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven] +INFO The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). 
It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven] +INFO No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.06s, trying other methods took 0.12s). [brian2.stateupdaters.base.method_choice] +INFO No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.19s). [brian2.stateupdaters.base.method_choice] + +Traceback (most recent call last): + File "", line 21, in + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run + with_output=False) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build + self.run(directory, with_output, run_args) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run + "%s)" % os.path.abspath(directory)) +RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone) + + + + +TRACEBACK GeNN_optimized N=100000 +no stdout file found, cwd = /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNWorkspace/results/stdout.txt +running brian code generation ... +building genn executable ... 
+ar -rcs /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib/libgenn.a /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/global.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/modelSpec.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/neuronModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/synapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/postSynapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/utils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/stringUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/sparseUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/hr_time.o +g++ -std=c++11 -DNVCC=\""/usr/local/cuda/bin/nvcc"\" -DMODEL=\"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model.cpp\" -o /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/generateALL /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generate*.cc -I"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/include" -I"/usr/local/cuda/include" -L"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib" -L"/usr/local/cuda/lib64" -lgenn -lcuda -lcudart +call was ./generateALL /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace +optimizing block size... +Global memory required for core model: 6.5 MB. 
+6440894464 for device 0 +dry-run compile for device 0 +"/usr/local/cuda/bin/nvcc" -cubin -x cu -arch sm_35 -O3 -I"$GENN_PATH/lib/include" -o "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/runner.cubin" "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/runner.cc" +genn-buildmodel.sh:70: error 50: command failure + +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c' + 'Could not copy script file to temp directory: %s' % ex) +INFO The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven] +INFO The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven] +WARNING The selected device 'genn' only supports a fixed schedule, but this schedule is not consistent with the network's schedule. The simulation will use the device's schedule. 
+Device schedule: ['start', 'synapses', 'groups', 'thresholds', 'resets', 'end'] +Network schedule: ['start', 'groups', 'thresholds', 'synapses', 'resets', 'end'] +Set the network schedule explicitly or set the core.network.default_schedule preference to avoid this warning. [brian2.core.network.schedule_conflict] +INFO No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.06s, trying other methods took 0.11s). [brian2.stateupdaters.base.method_choice] +INFO No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.20s). [brian2.stateupdaters.base.method_choice] +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/synapseKrnl.cc(23): error: name must be a namespace name + +1 error detected in the compilation of "/tmp/tmpxft_00001531_00000000-7_runner.cpp1.ii". +/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generateALL.cc: 258: cuda driver error 301: CUDA_ERROR_FILE_NOT_FOUND + +Traceback (most recent call last): + File "", line 14, in + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/speed.py", line 1075, in run + self.timed_run(self.duration) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/tests/features/base.py", line 63, in timed_run + brian2.run(duration, level=1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/units/fundamentalunits.py", line 2428, in new_f + result = f(*args, **kwds) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 371, in run + namespace=namespace, profile=profile, level=2+leINFO: setting cudaDevice stuff took 0.170318 seconds +INFO connectivity matrix has size 812 +INFO connectivity matrix has 
size 3241 +INFO connectivity matrix has size 16057 +INFO connectivity matrix has size 3241 +INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_synapses_2_stateupdater_codeobject with 5 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_1_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_2_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 31 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. 
+INFO calling kernel_synapses_2_post_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 3241 +Number of synapses: 3241 +Number of synapses: 16057 +Number of synapses: 812 +INFO: main_lines took 8.665926 seconds +INFO: main function took 8.864357 seconds +vel) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 231, in run + namespace=namespace, profile=profile, level=level+1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/base.py", line 276, in device_override_decorated_function + return getattr(curdev, name)(*args, **kwds) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 1206, in network_run + super(GeNNDevice, self).network_run(net=net, duration=duration, report=report, report_period=report_period, namespace=namespace, level=level+1) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 1171, in network_run + self.build(direct_call=False, **self.build_options) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 582, in build + returncode=ex.returncode) +RuntimeError: Project compilation failed (Command ['genn-buildmodel.sh', 
'magicnetwork_model.cpp'] failed with error code 50). +See the output above (if any) for more details. + + + +Running VogelsWithSynapticDynamic took 1:02:13. +INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=10 CUDA standalone: nan +INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=100 CUDA standalone: 1.0 +INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=1000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=10000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=20000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=50000 CUDA standalone: 1.0 +INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=100000 CUDA standalone: nan +INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=10 C++ standalone: nan +INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=100 C++ standalone: 8.86897457548 +INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=1000 C++ standalone: 4.54459720371 +INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=10000 C++ standalone: 1.30294701267 +INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=20000 C++ standalone: 0.533376382918 +INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=50000 C++ standalone: 0.285758703759 +INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=100000 C++ standalone: nan +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof 
--profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_VogelsWithSynapticDynamic_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:15 for runtime of 0.894734 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +Starting CUBAFixedConnectivity on 07.04.2017 at 00:20:43. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Full examples: CUBA fixed connectivity: n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] n=1000000 [E..] + +TRACEBACK CUDA standalone N=1000000 +INFO: setting cudaDevice stuff took 0.269366 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 15997784 +INFO connectivity matrix has size 63997779 +INFO calling kernel_neurongroup_stateupdater_codeobject with 1303 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1101 invalid argument + +('debug syn effect mdoe ', 'target') +('debug syn effect mdoe ', 'target') +INFO: setting cudaDevice stuff took 0.269366 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. 
Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 15997784 +INFO connectivity matrix has size 63997779 +INFO calling kernel_neurongroup_stateupdater_codeobject with 1303 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1101 invalid argument + + +/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c' + 'Could not copy script file to temp directory: %s' % ex) +INFO No numerical integration method specified for group 'neurongroup', using method 'linear' (took 1.52s). 
[brian2.stateupdaters.base.method_choice] + +Traceback (most recent call last): + File "", line 21, in + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run + with_output=False) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build + self.run(directory, with_output, run_args) + File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run + "%s)" % os.path.abspath(directory)) +RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone) + + + +Running CUBAFixedConnectivity took 2:08:42. +INFO relative performance for Full examples: CUBA fixed connectivity N=10 CUDA standalone: 1.0 +INFO relative performance for Full examples: CUBA fixed connectivity N=100 CUDA standalone: 1.0 +INFO relative performance for Full examples: CUBA fixed connectivity N=1000 CUDA standalone: 1.0 +INFO relative performance for Full examples: CUBA fixed connectivity N=10000 CUDA standalone: 1.0 +INFO relative performance for Full examples: CUBA fixed connectivity N=100000 CUDA standalone: 1.0 +INFO relative performance for Full examples: CUBA fixed connectivity N=1000000 CUDA standalone: nan +INFO relative performance for Full examples: CUBA fixed connectivity N=10 C++ standalone: 10.4753271459 +INFO relative performance for Full examples: CUBA fixed connectivity N=100 C++ standalone: 9.11815288259 +INFO relative performance for Full examples: INFO: setting cudaDevice stuff took 0.159560 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. 
Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 15885 +INFO connectivity matrix has size 63887 +INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_spikemonitor_codeobject with 1 blocks and 1 threads. Kernel needs 37 registers per block, 0 bytes of statically-allocated shared memory per block, 16 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.000000. +INFO calling kernel_synapses_1_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. 
Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 63887 +Number of synapses: 15885 +INFO: main_lines took 6.371682 seconds +Number of spikes: 5793 +INFO: main function took 6.564151 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. +CUBA fixed connectivity N=1000 C++ standalone: 4.59531396439 +INFO relative performance for Full examples: CUBA fixed connectivity N=10000 C++ standalone: 1.31468051433 +INFO relative performance for Full examples: CUBA fixed connectivity N=100000 C++ standalone: 1.18548470084 +INFO relative performance for Full examples: CUBA fixed connectivity N=1000000 C++ standalone: nan +INFO relative performance for Full examples: CUBA fixed connectivity N=10 GeNN_optimized: 1.11042839814 +INFO relative performance for Full examples: CUBA fixed connectivity N=100 GeNN_optimized: 1.04833449905 +INFO relative performance for Full examples: CUBA fixed connectivity N=1000 GeNN_optimized: 1.04557236752 +INFO relative performance for Full examples: CUBA fixed connectivity N=10000 GeNN_optimized: 1.50988461982 +INFO relative performance for Full examples: CUBA fixed connectivity N=100000 GeNN_optimized: 1.45615031203 +INFO relative performance for Full examples: CUBA fixed connectivity N=1000000 GeNN_optimized: nan +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBAFixedConnectivity_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:12 for runtime of 0.436839 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof 
--log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBAFixedConnectivity_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:03 for runtime of 0.414081 +Starting COBAHHFixedConnectivity on 07.04.2017 at 02:30:45. +Running speed tests +Configurations: CUDA standalone, C++ standalone, GeNN_optimized +Full examples: COBAHH fixed connectivity: n=100 [...] n=500 [...] n=1000 [...] n=5000 [...] n=10000 [...] n=50000 [...] n=100000 [...] n=500000 [...]INFO: setting cudaDevice stuff took 0.332072 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_1 with 1 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_2 with 1 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 15964 +INFO connectivity matrix has size 64222 +INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 512 threads. Kernel needs 109 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.250000. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. 
Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_spikemonitor_codeobject with 1 blocks and 1 threads. Kernel needs 37 registers per block, 0 bytes of statically-allocated shared memory per block, 16 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.000000. +INFO calling kernel_synapses_1_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 15964 +Number of synapses: 64222 +INFO: main_lines took 6.054959 seconds +Number of spikes: 37082 +INFO: main function took 6.485835 seconds +mkdir: cannot create directory ‘test_output’: File exists +# DT 0.000100 +# totalTime 1.000000 +# We are running with fixed time step 0.000100 +0.9999 done ... +everything finished. + +Running COBAHHFixedConnectivity took 2:02:00. 
+INFO relative performance for Full examples: COBAHH fixed connectivity N=100 CUDA standalone: 1.0 +INFO relative performance for Full examples: COBAHH fixed connectivity N=500 CUDA standalone: 1.0 +INFO relative performance for Full examples: COBAHH fixed connectivity N=1000 CUDA standalone: 1.0 +INFO relative performance for Full examples: COBAHH fixed connectivity N=5000 CUDA standalone: 1.0 +INFO relative performance for Full examples: COBAHH fixed connectivity N=10000 CUDA standalone: 1.0 +INFO relative performance for Full examples: COBAHH fixed connectivity N=50000 CUDA standalone: 1.0 +INFO relative performance for Full examples: COBAHH fixed connectivity N=100000 CUDA standalone: 1.0 +INFO relative performance for Full examples: COBAHH fixed connectivity N=500000 CUDA standalone: 1.0 +INFO relative performance for Full examples: COBAHH fixed connectivity N=100 C++ standalone: 1.08329904788 +INFO relative performance for Full examples: COBAHH fixed connectivity N=500 C++ standalone: 0.396811267606 +INFO relative performance for Full examples: COBAHH fixed connectivity N=1000 C++ standalone: 0.263636826134 +INFO relative performance for Full examples: COBAHH fixed connectivity N=5000 C++ standalone: 0.156838480711 +INFO relative performance for Full examples: COBAHH fixed connectivity N=10000 C++ standalone: 0.144342764605 +INFO relative performance for Full examples: COBAHH fixed connectivity N=50000 C++ standalone: 0.124221429972 +INFO relative performance for Full examples: COBAHH fixed connectivity N=100000 C++ standalone: 0.125880488867 +INFO relative performance for Full examples: COBAHH fixed connectivity N=500000 C++ standalone: 1.77958110994 +INFO relative performance for Full examples: COBAHH fixed connectivity N=100 GeNN_optimized: 0.849264812988 +INFO relative performance for Full examples: COBAHH fixed connectivity N=500 GeNN_optimized: 1.14449843751 +INFO relative performance for Full examples: COBAHH fixed connectivity N=1000 GeNN_optimized: 
1.29478794328 +INFO relative performance for Full examples: COBAHH fixed connectivity N=5000 GeNN_optimized: 2.10583634597 +INFO relative performance for Full examples: COBAHH fixed connectivity N=10000 GeNN_optimized: 2.64633239801 +INFO relative performance for Full examples: COBAHH fixed connectivity N=50000 GeNN_optimized: 2.23178989574 +INFO relative performance for Full examples: COBAHH fixed connectivity N=100000 GeNN_optimized: 1.21355706552 +INFO relative performance for Full examples: COBAHH fixed connectivity N=500000 GeNN_optimized: 3.68221005988 +Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling +cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHHFixedConnectivity_CUDAStandaloneConfiguration_1000.log ./main +Profiling took 0:00:11 for runtime of 0.881464 +Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling +cd GeNNworkspace && nvprof --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHHFixedConnectivity_GeNNConfigurationOptimized_1000.log ./main test 1.0 1 +Profiling took 0:00:04 for runtime of 0.6583 + +Summarized speed test results in results_2017_04_05_complete_after_talk/README.md +Finished speed test on 07.04.2017 at 04:34:29. Total time = 1 day, 6:30:58. 
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_AdaptationOscillation_cpp_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_AdaptationOscillation_cpp_standalone_100000.txt new file mode 100644 index 00000000..e0192f4f --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_AdaptationOscillation_cpp_standalone_100000.txt @@ -0,0 +1 @@ +Number of synapses: 500007644 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_AdaptationOscillation_cuda_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_AdaptationOscillation_cuda_standalone_100000.txt new file mode 100644 index 00000000..dfcb71f6 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_AdaptationOscillation_cuda_standalone_100000.txt @@ -0,0 +1,3 @@ +INFO: setting cudaDevice stuff took 0.311282 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 98 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_1 with 98 blocks and 1024 threads. 
Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelHeterogeneousDelay_cpp_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelHeterogeneousDelay_cpp_standalone_100000.txt new file mode 100644 index 00000000..074364c4 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelHeterogeneousDelay_cpp_standalone_100000.txt @@ -0,0 +1 @@ +Number of synapses: 100003108 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_100000.txt new file mode 100644 index 00000000..1be9a968 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_100000.txt @@ -0,0 +1,12 @@ +INFO: setting cudaDevice stuff took 0.094517 seconds +INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 97661 blocks and 1024 threads. 
Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 100004409 +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size +INFO generating 204687 randn every 131 clock cycles for neurongroup_stateupdater_codeobject +INFO calling kernel_neurongroup_stateupdater_codeobject with 174 blocks and 576 threads. Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500. 
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1051 out of memory diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelayNoMultiPrePost_cpp_standalone_250000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelayNoMultiPrePost_cpp_standalone_250000.txt new file mode 100644 index 00000000..797f590d --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelayNoMultiPrePost_cpp_standalone_250000.txt @@ -0,0 +1 @@ +Number of synapses: 250027209 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelayNoMultiPrePost_cuda_standalone_250000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelayNoMultiPrePost_cuda_standalone_250000.txt new file mode 100644 index 00000000..03cbcd84 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelayNoMultiPrePost_cuda_standalone_250000.txt @@ -0,0 +1,10 @@ +INFO: setting cudaDevice stuff took 0.282011 seconds +INFO connectivity matrix has size 249991134 +INFO generating 13000000 randn every 52 clock cycles for neurongroup_stateupdater_codeobject +INFO calling kernel_neurongroup_stateupdater_codeobject with 435 blocks and 576 threads. Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500. +INFO calling kernel_neurongroup_thresholder_codeobject with 245 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. 
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_resetter_codeobject with 245 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 249991134 +INFO: main_lines took 851.693748 seconds +INFO: main function took 860.695222 seconds diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelay_cpp_standalone_250000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelay_cpp_standalone_250000.txt new file mode 100644 index 00000000..15884f65 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelay_cpp_standalone_250000.txt @@ -0,0 +1 @@ +Number of synapses: 250014804 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelay_cuda_standalone_250000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelay_cuda_standalone_250000.txt new file mode 100644 index 00000000..ae002382 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelay_cuda_standalone_250000.txt @@ -0,0 +1,10 @@ +INFO: setting cudaDevice stuff took 0.232506 seconds +INFO connectivity matrix has size 249995860 +INFO generating 13000000 randn every 52 clock cycles for neurongroup_stateupdater_codeobject +INFO calling kernel_neurongroup_stateupdater_codeobject with 435 blocks and 576 threads. 
Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500. +INFO calling kernel_neurongroup_thresholder_codeobject with 245 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_resetter_codeobject with 245 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. 
+Number of synapses: 249995860 +INFO: main_lines took 871.342261 seconds +INFO: main function took 880.904159 seconds diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHHFixedConnectivity_cpp_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHHFixedConnectivity_cpp_standalone_500000.txt new file mode 100644 index 00000000..99f726d6 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHHFixedConnectivity_cpp_standalone_500000.txt @@ -0,0 +1,3 @@ +Number of spikes: 18384816 +Number of synapses: 7999481 +Number of synapses: 32003913 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHHFixedConnectivity_cuda_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHHFixedConnectivity_cuda_standalone_500000.txt new file mode 100644 index 00000000..e62943fc --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHHFixedConnectivity_cuda_standalone_500000.txt @@ -0,0 +1,20 @@ +INFO: setting cudaDevice stuff took 0.273473 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 489 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_1 with 489 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_2 with 489 blocks and 1024 threads. 
Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 7997964 +INFO connectivity matrix has size 31986009 +INFO calling kernel_neurongroup_stateupdater_codeobject with 977 blocks and 512 threads. Kernel needs 109 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.250000. +INFO calling kernel_neurongroup_thresholder_codeobject with 489 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_spikemonitor_codeobject with 1 blocks and 1 threads. Kernel needs 37 registers per block, 0 bytes of statically-allocated shared memory per block, 16 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.000000. +INFO calling kernel_synapses_1_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +... +ERROR while allocating 33554428 bytes in cudaVector.h/reserve() +ERROR while allocating 67108856 bytes in cudaVector.h/reserve() +... 
+Number of synapses: 7997964 +Number of synapses: 31986009 +INFO: main_lines took 3155.450103 seconds +Number of spikes: 4194303 +INFO: main function took 3164.489584 seconds diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHH_cpp_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHH_cpp_standalone_100000.txt new file mode 100644 index 00000000..057aa12b --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHH_cpp_standalone_100000.txt @@ -0,0 +1,2 @@ +Number of synapses: 40003665 +Number of synapses: 160004814 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHH_cuda_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHH_cuda_standalone_100000.txt new file mode 100644 index 00000000..2c98a46b --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHH_cuda_standalone_100000.txt @@ -0,0 +1,14 @@ +INFO: setting cudaDevice stuff took 0.293608 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 98 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_1 with 98 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_2 with 98 blocks and 1024 threads. 
Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 40000194 +INFO connectivity matrix has size 159996627 +INFO calling kernel_neurongroup_stateupdater_codeobject with 196 blocks and 512 threads. Kernel needs 109 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.250000. +INFO calling kernel_neurongroup_thresholder_codeobject with 98 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_1_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. 
+Number of synapses: 40000194 +Number of synapses: 159996627 +INFO: main_lines took 362.221271 seconds +INFO: main function took 368.960768 seconds diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBAFixedConnectivity_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBAFixedConnectivity_cpp_standalone_1000000.txt new file mode 100644 index 00000000..3d0c9a4f --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBAFixedConnectivity_cpp_standalone_1000000.txt @@ -0,0 +1,3 @@ +Number of spikes: 5834425 +Number of synapses: 63999971 +Number of synapses: 16002058 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBAFixedConnectivity_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBAFixedConnectivity_cuda_standalone_1000000.txt new file mode 100644 index 00000000..487ad897 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBAFixedConnectivity_cuda_standalone_1000000.txt @@ -0,0 +1,6 @@ +INFO: setting cudaDevice stuff took 0.269366 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 15997784 +INFO connectivity matrix has size 63997779 +INFO calling kernel_neurongroup_stateupdater_codeobject with 1303 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. 
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1101 invalid argument diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBA_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBA_cpp_standalone_1000000.txt new file mode 100644 index 00000000..39e2b088 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBA_cpp_standalone_1000000.txt @@ -0,0 +1,2 @@ +Number of synapses: 63985232 +Number of synapses: 16002041 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBA_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBA_cuda_standalone_1000000.txt new file mode 100644 index 00000000..7ce8ad87 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBA_cuda_standalone_1000000.txt @@ -0,0 +1,6 @@ +INFO: setting cudaDevice stuff took 0.265023 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 15999736 +INFO connectivity matrix has size 64013467 +INFO calling kernel_neurongroup_stateupdater_codeobject with 1303 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. 
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1101 invalid argument diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_DenseMediumRateSynapsesOnly_cpp_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_DenseMediumRateSynapsesOnly_cpp_standalone_500000.txt new file mode 100644 index 00000000..d304292b --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_DenseMediumRateSynapsesOnly_cpp_standalone_500000.txt @@ -0,0 +1 @@ +Number of synapses: 250000000 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_DenseMediumRateSynapsesOnly_cuda_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_DenseMediumRateSynapsesOnly_cuda_standalone_500000.txt new file mode 100644 index 00000000..8209f380 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_DenseMediumRateSynapsesOnly_cuda_standalone_500000.txt @@ -0,0 +1,4 @@ +INFO: setting cudaDevice stuff took 0.113374 seconds +INFO connectivity matrix has size 250000000 +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. 
+ERROR launching kernel_neurongroup_thresholder_codeobject in code_objects/neurongroup_thresholder_codeobject.cu:1008 invalid argument diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_HHNeuronsOnly_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_HHNeuronsOnly_cpp_standalone_1000000.txt new file mode 100644 index 00000000..e69de29b diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_HHNeuronsOnly_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_HHNeuronsOnly_cuda_standalone_1000000.txt new file mode 100644 index 00000000..0de01ad8 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_HHNeuronsOnly_cuda_standalone_1000000.txt @@ -0,0 +1,6 @@ +INFO: setting cudaDevice stuff took 0.135925 seconds +INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 0 bytes of user-allocated constant memory +INFO calling kernel_neurongroup_stateupdater_codeobject with 977 blocks and 1024 threads. Kernel needs 62 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 0 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_neurongroup_thresholder_codeobject with 977 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 0 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. 
+INFO: main_lines took 10.586006 seconds +INFO: main function took 10.832985 seconds diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_LinearNeuronsOnly_cpp_standalone_10000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_LinearNeuronsOnly_cpp_standalone_10000000.txt new file mode 100644 index 00000000..e69de29b diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_LinearNeuronsOnly_cuda_standalone_10000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_LinearNeuronsOnly_cuda_standalone_10000000.txt new file mode 100644 index 00000000..3d020cad --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_LinearNeuronsOnly_cuda_standalone_10000000.txt @@ -0,0 +1,4 @@ +INFO: setting cudaDevice stuff took 0.274551 seconds +INFO calling kernel_neurongroup_stateupdater_codeobject with 9766 blocks and 1024 threads. Kernel needs 12 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 0 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. 
+INFO: main_lines took 68.949131 seconds +INFO: main function took 69.505944 seconds diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPEventDriven_cpp_standalone_5000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPEventDriven_cpp_standalone_5000000.txt new file mode 100644 index 00000000..0d502595 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPEventDriven_cpp_standalone_5000000.txt @@ -0,0 +1,2 @@ +Number of synapses: 5000000 +Number of synapses: 5000000 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt new file mode 100644 index 00000000..5b160ed4 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt @@ -0,0 +1,15 @@ +INFO: setting cudaDevice stuff took 0.241168 seconds +INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 4883 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 5000000 +INFO connectivity matrix has size 5000000 +INFO generating 10000000 rand every 2 clock cycles for poissongroup_thresholder_codeobject +INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. 
Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_poissongroup_thresholder_codeobject with 4883 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 42 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 34 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. 
+Number of synapses: 5000000 +Number of synapses: 5000000 +INFO: main_lines took 382.905001 seconds +INFO: main function took 383.607174 seconds diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPostNeuronalTraces_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPostNeuronalTraces_cpp_standalone_1000000.txt new file mode 100644 index 00000000..f8e93de7 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPostNeuronalTraces_cpp_standalone_1000000.txt @@ -0,0 +1,2 @@ +Number of synapses: 1000000 +Number of synapses: 1000000 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPostNeuronalTraces_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPostNeuronalTraces_cuda_standalone_1000000.txt new file mode 100644 index 00000000..3278dcc4 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPostNeuronalTraces_cuda_standalone_1000000.txt @@ -0,0 +1,16 @@ +INFO: setting cudaDevice stuff took 0.264461 seconds +INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 1000000 +INFO connectivity matrix has size 1000000 +INFO generating 10000000 rand every 13107 clock cycles for neurongroup_thresholder_codeobject +INFO calling kernel_neurongroup_1_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. 
+INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_neurongroup_1_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 31 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 28 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_1_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. 
+Number of synapses: 1000000 +Number of synapses: 1000000 +INFO: main_lines took 1.961364 seconds +INFO: main function took 2.291993 seconds diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPost_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPost_cpp_standalone_1000000.txt new file mode 100644 index 00000000..f8e93de7 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPost_cpp_standalone_1000000.txt @@ -0,0 +1,2 @@ +Number of synapses: 1000000 +Number of synapses: 1000000 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPost_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPost_cuda_standalone_1000000.txt new file mode 100644 index 00000000..0d03941f --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPost_cuda_standalone_1000000.txt @@ -0,0 +1,15 @@ +INFO: setting cudaDevice stuff took 0.135497 seconds +INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 1000000 +INFO connectivity matrix has size 1000000 +INFO generating 10000000 rand every 13107 clock cycles for poissongroup_thresholder_codeobject +INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. 
Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_poissongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 42 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 34 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. 
+Number of synapses: 1000000 +Number of synapses: 1000000 +INFO: main_lines took 2.266672 seconds +INFO: main function took 2.490897 seconds diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNeuronalTraces_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNeuronalTraces_cpp_standalone_1000000.txt new file mode 100644 index 00000000..f8e93de7 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNeuronalTraces_cpp_standalone_1000000.txt @@ -0,0 +1,2 @@ +Number of synapses: 1000000 +Number of synapses: 1000000 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNeuronalTraces_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNeuronalTraces_cuda_standalone_1000000.txt new file mode 100644 index 00000000..54a46611 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNeuronalTraces_cuda_standalone_1000000.txt @@ -0,0 +1,16 @@ +INFO: setting cudaDevice stuff took 0.104386 seconds +INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 1000000 +INFO connectivity matrix has size 1000000 +INFO generating 13000000 rand every 13 clock cycles for neurongroup_thresholder_codeobject +INFO calling kernel_neurongroup_1_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_neurongroup_stateupdater_codeobject with 1303 blocks and 768 threads. 
Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_neurongroup_1_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_thresholder_codeobject with 977 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 31 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 28 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_1_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. 
+Number of synapses: 1000000 +Number of synapses: 1000000 +INFO: main_lines took 65.679538 seconds +INFO: main function took 65.888984 seconds diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNotEventDriven_cpp_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNotEventDriven_cpp_standalone_100000.txt new file mode 100644 index 00000000..2ed54ef7 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNotEventDriven_cpp_standalone_100000.txt @@ -0,0 +1,2 @@ +Number of synapses: 100000 +Number of synapses: 100000 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNotEventDriven_cuda_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNotEventDriven_cuda_standalone_100000.txt new file mode 100644 index 00000000..289fd80d --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNotEventDriven_cuda_standalone_100000.txt @@ -0,0 +1,16 @@ +INFO: setting cudaDevice stuff took 0.096839 seconds +INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 98 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 100000 +INFO connectivity matrix has size 100000 +INFO generating 13100000 rand every 131 clock cycles for poissongroup_thresholder_codeobject +INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_synapses_stateupdater_codeobject with 131 blocks and 768 threads. 
Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_poissongroup_thresholder_codeobject with 98 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 33 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. 
+Number of synapses: 100000 +Number of synapses: 100000 +INFO: main_lines took 6.587633 seconds +INFO: main function took 6.708345 seconds diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDP_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDP_cpp_standalone_1000000.txt new file mode 100644 index 00000000..f9d65d91 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDP_cpp_standalone_1000000.txt @@ -0,0 +1,3 @@ +Number of synapses: 1000000 +Number of spikes: 15000963 +Number of synapses: 1000000 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDP_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDP_cuda_standalone_1000000.txt new file mode 100644 index 00000000..5b8787a6 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDP_cuda_standalone_1000000.txt @@ -0,0 +1,21 @@ +INFO: setting cudaDevice stuff took 0.275236 seconds +INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory +INFO connectivity matrix has size 1000000 +INFO connectivity matrix has size 1000000 +INFO generating 13000000 rand every 13 clock cycles for poissongroup_thresholder_codeobject +INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. 
Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_poissongroup_thresholder_codeobject with 977 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_spikemonitor_codeobject with 1 blocks and 1 threads. Kernel needs 37 registers per block, 0 bytes of statically-allocated shared memory per block, 16 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 42 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 34 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000. +INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +... +ERROR while allocating 33554428 bytes in cudaVector.h/reserve() +ERROR while allocating 67108856 bytes in cudaVector.h/reserve() +... 
+Number of synapses: 1000000 +Number of synapses: 1000000 +INFO: main_lines took 2321.434166 seconds +Number of spikes: 4194303 +INFO: main function took 2329.753789 seconds diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseHighRateSynapsesOnly_cpp_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseHighRateSynapsesOnly_cpp_standalone_500000.txt new file mode 100644 index 00000000..d6ff351a --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseHighRateSynapsesOnly_cpp_standalone_500000.txt @@ -0,0 +1 @@ +Number of synapses: 500027365 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseHighRateSynapsesOnly_cuda_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseHighRateSynapsesOnly_cuda_standalone_500000.txt new file mode 100644 index 00000000..13230949 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseHighRateSynapsesOnly_cuda_standalone_500000.txt @@ -0,0 +1 @@ +INFO: setting cudaDevice stuff took 0.286275 seconds diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseLowRateSynapsesOnly_cpp_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseLowRateSynapsesOnly_cpp_standalone_500000.txt new file mode 100644 index 00000000..a4e24f5d --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseLowRateSynapsesOnly_cpp_standalone_500000.txt @@ -0,0 +1 @@ +Number of synapses: 5004129 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseLowRateSynapsesOnly_cuda_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseLowRateSynapsesOnly_cuda_standalone_500000.txt new file mode 100644 index 00000000..2e03a450 --- /dev/null +++ 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseLowRateSynapsesOnly_cuda_standalone_500000.txt @@ -0,0 +1,7 @@ +INFO: setting cudaDevice stuff took 0.127070 seconds +INFO connectivity matrix has size 5000898 +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 5000898 +INFO: main_lines took 162.284184 seconds +INFO: main function took 162.573362 seconds diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseMediumRateSynapsesOnly_cpp_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseMediumRateSynapsesOnly_cpp_standalone_500000.txt new file mode 100644 index 00000000..f5dcf831 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseMediumRateSynapsesOnly_cpp_standalone_500000.txt @@ -0,0 +1 @@ +Number of synapses: 50001289 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseMediumRateSynapsesOnly_cuda_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseMediumRateSynapsesOnly_cuda_standalone_500000.txt new file mode 100644 index 00000000..d6f0b0c1 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseMediumRateSynapsesOnly_cuda_standalone_500000.txt @@ -0,0 +1,7 @@ +INFO: setting cudaDevice stuff took 0.129188 seconds +INFO connectivity matrix has 
size 49998381 +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 49998381 +INFO: main_lines took 223.189514 seconds +INFO: main function took 224.554622 seconds diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VerySparseMediumRateSynapsesOnly_cpp_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VerySparseMediumRateSynapsesOnly_cpp_standalone_500000.txt new file mode 100644 index 00000000..6e23dd8f --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VerySparseMediumRateSynapsesOnly_cpp_standalone_500000.txt @@ -0,0 +1 @@ +Number of synapses: 5000485 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VerySparseMediumRateSynapsesOnly_cuda_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VerySparseMediumRateSynapsesOnly_cuda_standalone_500000.txt new file mode 100644 index 00000000..9111f9e9 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VerySparseMediumRateSynapsesOnly_cuda_standalone_500000.txt @@ -0,0 +1,7 @@ +INFO: setting cudaDevice stuff took 0.117737 seconds +INFO connectivity matrix has size 5002948 +INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. 
Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000. +Number of synapses: 5002948 +INFO: main_lines took 316.438656 seconds +INFO: main function took 316.716762 seconds diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VogelsWithSynapticDynamic_cpp_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VogelsWithSynapticDynamic_cpp_standalone_100000.txt new file mode 100644 index 00000000..5fbe1aaf --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VogelsWithSynapticDynamic_cpp_standalone_100000.txt @@ -0,0 +1,4 @@ +Number of synapses: 32011543 +Number of synapses: 32011543 +Number of synapses: 160020171 +Number of synapses: 8000773 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VogelsWithSynapticDynamic_cuda_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VogelsWithSynapticDynamic_cuda_standalone_100000.txt new file mode 100644 index 00000000..58c2f26b --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VogelsWithSynapticDynamic_cuda_standalone_100000.txt @@ -0,0 +1,7 @@ +INFO: setting cudaDevice stuff took 0.256535 seconds +INFO connectivity matrix has size 7999692 +INFO connectivity matrix has size 32002251 +INFO connectivity matrix has size 160017025 +INFO connectivity matrix has size 32002251 +INFO calling kernel_neurongroup_stateupdater_codeobject with 131 blocks and 768 threads. 
Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. +ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1066 invalid argument diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_Vogels_cpp_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_Vogels_cpp_standalone_100000.txt new file mode 100644 index 00000000..7990bb47 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_Vogels_cpp_standalone_100000.txt @@ -0,0 +1,4 @@ +Number of synapses: 32000183 +Number of synapses: 32000183 +Number of synapses: 159996515 +Number of synapses: 7996913 diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_Vogels_cuda_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_Vogels_cuda_standalone_100000.txt new file mode 100644 index 00000000..da99c451 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_Vogels_cuda_standalone_100000.txt @@ -0,0 +1,7 @@ +INFO: setting cudaDevice stuff took 0.263336 seconds +INFO connectivity matrix has size 7997654 +INFO connectivity matrix has size 31988320 +INFO connectivity matrix has size 159989507 +INFO connectivity matrix has size 31988320 +INFO calling kernel_neurongroup_stateupdater_codeobject with 131 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000. 
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1066 invalid argument diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_AdaptationOscillation_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_AdaptationOscillation_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..c0fa0dd0 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_AdaptationOscillation_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,25 @@ +==27090== NVPROF is profiling process 27090, command: ./main +==27090== Profiling application: ./main +==27090== Profiling result: +Time(%) Time Calls Avg Min Max Name + 54.38% 151.00ms 10000 15.100us 2.8800us 70.592us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, int*, int, int*, double, double*, int*, int, bool*) + 18.09% 50.227ms 10000 5.0220us 4.7040us 6.8800us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double, double*, double*, double*, bool*, float*) + 11.30% 31.386ms 10000 3.1380us 3.0400us 4.2560us [CUDA memset] + 8.01% 22.246ms 10000 2.2240us 1.8560us 2.7520us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 7.90% 21.951ms 10000 2.1950us 1.5360us 3.0400us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, int*, double*, double*, bool*) + 0.32% 881.25us 1 881.25us 881.25us 881.25us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + +==27090== API calls: +Time(%) Time Calls Avg Min Max Name + 65.77% 370.03ms 40001 9.2500us 8.1820us 8.8454ms cudaLaunch + 16.57% 93.193ms 10000 9.3190us 8.6380us 24.859us cudaMemset + 13.98% 78.650ms 390005 201ns 149ns 319.77us cudaSetupArgument + 1.93% 
10.868ms 40001 271ns 200ns 313.28us cudaConfigureCall + 1.70% 9.5546ms 40002 238ns 207ns 5.1700us cudaGetLastError + 0.03% 174.94us 1 174.94us 174.94us 174.94us cudaMalloc + 0.01% 50.180us 1 50.180us 50.180us 50.180us cudaMemGetInfo + 0.00% 23.192us 38 610ns 476ns 1.5970us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 19.120us 7 2.7310us 2.0000us 5.0070us cudaFuncGetAttributes + 0.00% 17.862us 1 17.862us 17.862us 17.862us cudaDeviceSynchronize + 0.00% 5.0460us 12 420ns 293ns 1.1020us cudaDeviceGetAttribute + 0.00% 3.2580us 3 1.0860us 659ns 1.8660us cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_AdaptationOscillation_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_AdaptationOscillation_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..47f193b0 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_AdaptationOscillation_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,25 @@ +==27315== NVPROF is profiling process 27315, command: ./main test 1.0 1 +==27315== Profiling application: ./main test 1.0 1 +==27315== Profiling result: +Time(%) Time Calls Avg Min Max Name + 53.41% 151.83ms 10000 15.183us 1.9200us 1.1186ms calcSynapses + 46.17% 131.26ms 10000 13.126us 10.560us 20.288us calcNeurons + 0.32% 903.46us 48 18.822us 960ns 129.47us [CUDA memcpy HtoD] + 0.10% 283.36us 14 20.240us 1.9840us 122.88us [CUDA memcpy DtoH] + +==27315== API calls: +Time(%) Time Calls Avg Min Max Name + 48.83% 298.28ms 13 22.945ms 9.2060us 295.80ms cudaHostAlloc + 46.42% 283.54ms 20000 14.176us 7.6710us 1.1119ms cudaLaunch + 2.61% 15.926ms 64 248.85us 409ns 13.875ms cudaMemcpy + 1.10% 6.6997ms 20000 334ns 268ns 303.73us cudaConfigureCall + 0.84% 5.1253ms 20000 256ns 228ns 5.1490us cudaSetupArgument + 0.14% 867.56us 13 66.735us 7.8370us 174.67us cudaMalloc + 0.04% 257.35us 83 3.1000us 186ns 109.74us cuDeviceGetAttribute + 
0.01% 39.793us 1 39.793us 39.793us 39.793us cuDeviceGetName + 0.01% 36.797us 1 36.797us 36.797us 36.797us cuDeviceTotalMem + 0.00% 16.271us 1 16.271us 16.271us 16.271us cudaSetDevice + 0.00% 15.322us 13 1.1780us 539ns 3.3530us cudaGetSymbolAddress + 0.00% 2.6060us 2 1.3030us 777ns 1.8290us cuDeviceGetCount + 0.00% 1.8590us 1 1.8590us 1.8590us 1.8590us cudaGetDeviceCount + 0.00% 975ns 2 487ns 397ns 578ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..0d6b351a --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,27 @@ +==13252== NVPROF is profiling process 13252, command: ./main +==13252== Profiling application: ./main +==13252== Profiling result: +Time(%) Time Calls Avg Min Max Name + 86.34% 3.21777s 10000 321.78us 1.5360us 4.9107ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 9.78% 364.56ms 10000 36.455us 2.2080us 84.928us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 1.24% 46.150ms 10000 4.6140us 4.4480us 6.7520us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 0.86% 32.053ms 10000 3.2050us 2.9120us 4.2240us [CUDA memset] + 0.70% 25.923ms 10000 2.5920us 2.3680us 3.6160us _run_synapses_pre_push_spikes_advance_kernel(void) + 0.58% 21.708ms 10000 2.1700us 1.8880us 2.7200us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 0.48% 17.725ms 10000 1.7720us 1.6960us 2.0480us 
kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 0.02% 880.45us 1 880.45us 880.45us 880.45us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + +==13252== API calls: +Time(%) Time Calls Avg Min Max Name + 93.08% 3.54282s 60001 59.045us 7.8910us 6.6818ms cudaLaunch + 2.78% 105.95ms 10000 10.595us 8.3520us 305.06us cudaMemset + 1.61% 61.198ms 1 61.198ms 61.198ms 61.198ms cudaDeviceSynchronize + 1.60% 60.805ms 370005 164ns 130ns 296.03us cudaSetupArgument + 0.49% 18.710ms 60002 311ns 237ns 312.79us cudaGetLastError + 0.43% 16.481ms 60001 274ns 181ns 299.24us cudaConfigureCall + 0.00% 182.53us 1 182.53us 182.53us 182.53us cudaMalloc + 0.00% 71.394us 1 71.394us 71.394us 71.394us cudaMemGetInfo + 0.00% 20.387us 38 536ns 474ns 1.4760us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 18.951us 7 2.7070us 1.9760us 5.3870us cudaFuncGetAttributes + 0.00% 4.9460us 12 412ns 263ns 1.1520us cudaDeviceGetAttribute + 0.00% 2.8500us 3 950ns 608ns 1.6040us cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..1c7f7e6f --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,25 @@ +==13488== NVPROF is profiling process 13488, command: ./main test 1.0 1 +==13488== Profiling application: ./main test 1.0 1 +==13488== Profiling result: +Time(%) Time Calls Avg Min Max Name + 74.47% 118.07ms 10000 11.806us 10.016us 17.664us calcNeurons + 18.42% 29.207ms 10000 2.9200us 1.9200us 17.664us calcSynapses + 5.59% 8.8552ms 40 221.38us 960ns 2.5145ms [CUDA memcpy HtoD] + 
1.52% 2.4178ms 10 241.78us 1.9520us 2.3869ms [CUDA memcpy DtoH] + +==13488== API calls: +Time(%) Time Calls Avg Min Max Name + 58.76% 270.99ms 11 24.635ms 17.531us 265.27ms cudaHostAlloc + 36.00% 166.02ms 20000 8.3000us 7.6090us 315.35us cudaLaunch + 2.62% 12.069ms 53 227.72us 334ns 2.5281ms cudaMemcpy + 1.36% 6.2887ms 20000 314ns 240ns 302.98us cudaConfigureCall + 1.00% 4.6085ms 20000 230ns 217ns 2.8530us cudaSetupArgument + 0.19% 860.67us 11 78.243us 12.662us 173.88us cudaMalloc + 0.05% 234.84us 83 2.8290us 158ns 100.64us cuDeviceGetAttribute + 0.01% 32.245us 1 32.245us 32.245us 32.245us cuDeviceTotalMem + 0.01% 27.894us 1 27.894us 27.894us 27.894us cuDeviceGetName + 0.00% 14.621us 11 1.3290us 791ns 3.3800us cudaGetSymbolAddress + 0.00% 12.561us 1 12.561us 12.561us 12.561us cudaSetDevice + 0.00% 1.4740us 2 737ns 495ns 979ns cuDeviceGetCount + 0.00% 1.4370us 1 1.4370us 1.4370us 1.4370us cudaGetDeviceCount + 0.00% 524ns 2 262ns 227ns 297ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelayNoMultiPrePost_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelayNoMultiPrePost_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..607585c5 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelayNoMultiPrePost_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,25 @@ +==23945== NVPROF is profiling process 23945, command: ./main +==23945== Profiling application: ./main +==23945== Profiling result: +Time(%) Time Calls Avg Min Max Name + 28.82% 47.429ms 10000 4.7420us 2.8800us 34.464us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 28.42% 46.768ms 10000 4.6760us 4.4480us 6.8800us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, 
double, double*, double*, double*, bool*, float*) + 18.77% 30.887ms 10000 3.0880us 3.0400us 3.6160us [CUDA memset] + 13.20% 21.722ms 10000 2.1720us 2.0160us 2.5920us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 10.25% 16.871ms 10000 1.6870us 1.5680us 1.9840us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 0.54% 881.31us 1 881.31us 881.31us 881.31us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + +==23945== API calls: +Time(%) Time Calls Avg Min Max Name + 68.47% 378.42ms 40001 9.4600us 8.3920us 11.185ms cudaLaunch + 16.96% 93.726ms 10000 9.3720us 8.8820us 22.956us cudaMemset + 10.76% 59.491ms 330005 180ns 148ns 309.86us cudaSetupArgument + 1.90% 10.527ms 40001 263ns 182ns 298.24us cudaConfigureCall + 1.84% 10.177ms 40002 254ns 225ns 10.282us cudaGetLastError + 0.03% 178.62us 1 178.62us 178.62us 178.62us cudaMalloc + 0.01% 52.598us 1 52.598us 52.598us 52.598us cudaMemGetInfo + 0.00% 25.078us 38 659ns 560ns 2.7750us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 19.936us 7 2.8480us 2.0920us 5.4650us cudaFuncGetAttributes + 0.00% 17.187us 1 17.187us 17.187us 17.187us cudaDeviceSynchronize + 0.00% 5.0920us 12 424ns 278ns 1.0780us cudaDeviceGetAttribute + 0.00% 3.1170us 3 1.0390us 523ns 1.9660us cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelayNoMultiPrePost_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelayNoMultiPrePost_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..1c090bcc --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelayNoMultiPrePost_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,26 @@ +==24196== NVPROF is profiling 
process 24196, command: ./main test 1.0 1 +==24196== Profiling application: ./main test 1.0 1 +==24196== Profiling result: +Time(%) Time Calls Avg Min Max Name + 71.41% 120.56ms 10000 12.055us 10.048us 17.952us calcNeurons + 21.88% 36.941ms 10000 3.6940us 2.5280us 26.912us calcSynapses + 5.29% 8.9319ms 41 217.85us 992ns 2.5123ms [CUDA memcpy HtoD] + 1.42% 2.3983ms 10 239.83us 2.0160us 2.3673ms [CUDA memcpy DtoH] + +==24196== API calls: +Time(%) Time Calls Avg Min Max Name + 58.26% 272.15ms 11 24.741ms 19.067us 265.67ms cudaHostAlloc + 36.33% 169.74ms 20000 8.4860us 7.6190us 310.62us cudaLaunch + 2.72% 12.686ms 53 239.35us 323ns 2.5267ms cudaMemcpy + 1.36% 6.3732ms 20000 318ns 242ns 300.70us cudaConfigureCall + 1.03% 4.8351ms 20000 241ns 210ns 10.299us cudaSetupArgument + 0.22% 1.0265ms 11 93.320us 12.594us 179.95us cudaMalloc + 0.05% 240.26us 83 2.8940us 152ns 104.47us cuDeviceGetAttribute + 0.01% 32.415us 1 32.415us 32.415us 32.415us cuDeviceTotalMem + 0.01% 28.407us 1 28.407us 28.407us 28.407us cuDeviceGetName + 0.00% 14.808us 11 1.3460us 741ns 3.2100us cudaGetSymbolAddress + 0.00% 14.772us 1 14.772us 14.772us 14.772us cudaMemcpyToSymbol + 0.00% 12.168us 1 12.168us 12.168us 12.168us cudaSetDevice + 0.00% 1.4860us 1 1.4860us 1.4860us 1.4860us cudaGetDeviceCount + 0.00% 1.4580us 2 729ns 473ns 985ns cuDeviceGetCount + 0.00% 537ns 2 268ns 226ns 311ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelay_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelay_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..a21ea00e --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelay_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,25 @@ +==2491== NVPROF is profiling process 2491, command: ./main +==2491== Profiling application: ./main +==2491== Profiling 
result: +Time(%) Time Calls Avg Min Max Name + 28.57% 48.196ms 10000 4.8190us 4.5440us 6.7840us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 27.77% 46.841ms 10000 4.6840us 2.8800us 31.584us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 19.44% 32.782ms 10000 3.2780us 3.2320us 3.7760us [CUDA memset] + 12.58% 21.215ms 10000 2.1210us 1.9840us 2.5600us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 11.12% 18.762ms 10000 1.8760us 1.7920us 2.1120us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 0.52% 880.90us 1 880.90us 880.90us 880.90us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + +==2491== API calls: +Time(%) Time Calls Avg Min Max Name + 67.81% 358.71ms 40001 8.9670us 7.9890us 10.112ms cudaLaunch + 16.69% 88.268ms 10000 8.8260us 8.3570us 34.808us cudaMemset + 11.38% 60.182ms 330005 182ns 150ns 304.26us cudaSetupArgument + 2.12% 11.226ms 40001 280ns 197ns 305.80us cudaConfigureCall + 1.95% 10.335ms 40002 258ns 217ns 14.869us cudaGetLastError + 0.03% 178.47us 1 178.47us 178.47us 178.47us cudaMalloc + 0.01% 51.372us 1 51.372us 51.372us 51.372us cudaMemGetInfo + 0.00% 21.822us 38 574ns 469ns 3.0220us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 19.460us 7 2.7800us 2.0130us 5.1840us cudaFuncGetAttributes + 0.00% 17.572us 1 17.572us 17.572us 17.572us cudaDeviceSynchronize + 0.00% 5.0120us 12 417ns 283ns 1.0740us cudaDeviceGetAttribute + 0.00% 2.8560us 3 952ns 570ns 1.6710us cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelay_GeNNConfigurationOptimized_1000.log 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelay_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..e28fbaf4 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelay_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,26 @@ +==2741== NVPROF is profiling process 2741, command: ./main test 1.0 1 +==2741== Profiling application: ./main test 1.0 1 +==2741== Profiling result: +Time(%) Time Calls Avg Min Max Name + 71.49% 120.00ms 10000 11.999us 10.016us 18.144us calcNeurons + 21.75% 36.501ms 10000 3.6500us 2.4960us 29.185us calcSynapses + 5.33% 8.9404ms 41 218.06us 960ns 2.5144ms [CUDA memcpy HtoD] + 1.43% 2.4037ms 10 240.37us 2.0480us 2.3725ms [CUDA memcpy DtoH] + +==2741== API calls: +Time(%) Time Calls Avg Min Max Name + 59.17% 284.47ms 11 25.861ms 13.934us 278.41ms cudaHostAlloc + 35.49% 170.60ms 20000 8.5300us 7.5850us 307.94us cudaLaunch + 2.68% 12.860ms 53 242.63us 394ns 2.5288ms cudaMemcpy + 1.36% 6.5596ms 20000 327ns 257ns 308.28us cudaConfigureCall + 1.04% 5.0131ms 20000 250ns 228ns 9.1940us cudaSetupArgument + 0.19% 898.78us 11 81.706us 9.2360us 153.32us cudaMalloc + 0.05% 226.47us 83 2.7280us 137ns 97.777us cuDeviceGetAttribute + 0.01% 31.138us 1 31.138us 31.138us 31.138us cuDeviceTotalMem + 0.01% 27.215us 1 27.215us 27.215us 27.215us cuDeviceGetName + 0.00% 12.953us 11 1.1770us 575ns 2.8170us cudaGetSymbolAddress + 0.00% 12.076us 1 12.076us 12.076us 12.076us cudaMemcpyToSymbol + 0.00% 10.837us 1 10.837us 10.837us 10.837us cudaSetDevice + 0.00% 1.5250us 1 1.5250us 1.5250us 1.5250us cudaGetDeviceCount + 0.00% 1.4930us 2 746ns 490ns 1.0030us cuDeviceGetCount + 0.00% 498ns 2 249ns 224ns 274ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHHFixedConnectivity_CUDAStandaloneConfiguration_1000.log 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHHFixedConnectivity_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..4beaf7b5 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHHFixedConnectivity_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,25 @@ +==17632== NVPROF is profiling process 17632, command: ./main +==17632== Profiling application: ./main +==17632== Profiling result: +Time(%) Time Calls Avg Min Max Name + 44.90% 349.33ms 10000 34.933us 1.6640us 111.13ms kernel_spikemonitor_codeobject(unsigned int, int*, double, int*, int*, int*, int, int*, double*, int, int*, int*) + 23.60% 183.61ms 10000 18.361us 17.824us 21.856us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*, bool*, double*, double*, double*, double*, double, double*) + 14.85% 115.52ms 10000 11.551us 3.0720us 36.353us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*) + 9.49% 73.847ms 10000 7.3840us 3.0720us 24.064us kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*) + 4.03% 31.352ms 10000 3.1350us 3.0400us 4.2880us [CUDA memset] + 3.12% 24.285ms 10000 2.4280us 2.0480us 2.7840us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 0.01% 68.000us 1 68.000us 68.000us 68.000us _run_spikemonitor_codeobject_init(void) + +==17632== API calls: +Time(%) Time Calls Avg Min Max Name + 73.31% 632.36ms 50001 12.646us 8.2740us 95.930ms cudaLaunch + 12.10% 104.36ms 590000 176ns 149ns 346.69us cudaSetupArgument + 11.27% 97.201ms 10000 9.7200us 8.6440us 1.1383ms cudaMemset + 1.55% 13.390ms 50001 267ns 192ns 331.43us cudaConfigureCall + 1.55% 13.349ms 50001 266ns 220ns 330.51us cudaGetLastError + 0.21% 1.8328ms 1 
1.8328ms 1.8328ms 1.8328ms cudaDeviceSynchronize + 0.01% 51.143us 1 51.143us 51.143us 51.143us cudaMemGetInfo + 0.00% 18.972us 7 2.7100us 2.0070us 4.6510us cudaFuncGetAttributes + 0.00% 14.003us 22 636ns 470ns 1.4930us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 4.3080us 8 538ns 317ns 1.2590us cudaDeviceGetAttribute + 0.00% 2.2780us 2 1.1390us 764ns 1.5140us cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHHFixedConnectivity_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHHFixedConnectivity_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..32b766b8 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHHFixedConnectivity_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,25 @@ +==17891== NVPROF is profiling process 17891, command: ./main test 1.0 1 +==17891== Profiling application: ./main test 1.0 1 +==17891== Profiling result: +Time(%) Time Calls Avg Min Max Name + 66.58% 251.53ms 10000 25.153us 23.840us 28.000us calcNeurons + 23.34% 88.193ms 10000 8.8190us 2.4320us 41.472us calcSynapses + 9.86% 37.269ms 18461 2.0180us 1.9520us 153.18us [CUDA memcpy DtoH] + 0.22% 820.87us 68 12.071us 960ns 164.23us [CUDA memcpy HtoD] + +==17891== API calls: +Time(%) Time Calls Avg Min Max Name + 52.66% 509.16ms 20088 25.346us 320ns 371.03us cudaMemcpy + 26.73% 258.42ms 19 13.601ms 8.8970us 255.30ms cudaHostAlloc + 19.10% 184.67ms 20000 9.2330us 7.8160us 348.55us cudaLaunch + 0.81% 7.7916ms 20000 389ns 275ns 331.45us cudaConfigureCall + 0.56% 5.4451ms 20000 272ns 241ns 4.6710us cudaSetupArgument + 0.10% 1.0098ms 19 53.145us 6.4240us 173.26us cudaMalloc + 0.02% 226.52us 83 2.7290us 143ns 97.659us cuDeviceGetAttribute + 0.00% 31.331us 1 31.331us 31.331us 31.331us cuDeviceTotalMem + 0.00% 30.487us 1 30.487us 30.487us 30.487us cuDeviceGetName + 0.00% 18.126us 19 954ns 368ns 3.5740us 
cudaGetSymbolAddress + 0.00% 11.311us 1 11.311us 11.311us 11.311us cudaSetDevice + 0.00% 1.7800us 2 890ns 658ns 1.1220us cuDeviceGetCount + 0.00% 1.4830us 1 1.4830us 1.4830us 1.4830us cudaGetDeviceCount + 0.00% 640ns 2 320ns 242ns 398ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHH_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHH_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..3e647a1c --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHH_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,23 @@ +==11907== NVPROF is profiling process 11907, command: ./main +==11907== Profiling application: ./main +==11907== Profiling result: +Time(%) Time Calls Avg Min Max Name + 39.16% 186.02ms 10000 18.602us 17.856us 21.568us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*, bool*, double*, double*, double*, double*, double, double*) + 29.93% 142.18ms 10000 14.218us 3.2320us 35.680us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*) + 19.08% 90.630ms 10000 9.0620us 3.1680us 24.448us kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*) + 6.67% 31.670ms 10000 3.1660us 3.0400us 4.1920us [CUDA memset] + 5.15% 24.481ms 10000 2.4480us 2.0480us 2.7840us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + +==11907== API calls: +Time(%) Time Calls Avg Min Max Name + 66.01% 376.74ms 40000 9.4180us 8.4480us 6.9662ms cudaLaunch + 15.97% 91.133ms 10000 9.1130us 8.5190us 28.283us cudaMemset + 13.95% 79.611ms 470000 169ns 149ns 316.22us cudaSetupArgument + 2.29% 13.092ms 40000 327ns 202ns 311.93us 
cudaConfigureCall + 1.76% 10.072ms 40000 251ns 230ns 5.0760us cudaGetLastError + 0.01% 50.252us 1 50.252us 50.252us 50.252us cudaMemGetInfo + 0.00% 22.121us 1 22.121us 22.121us 22.121us cudaDeviceSynchronize + 0.00% 16.912us 6 2.8180us 2.0980us 4.5270us cudaFuncGetAttributes + 0.00% 13.875us 21 660ns 520ns 1.5110us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 3.9730us 8 496ns 302ns 1.1490us cudaDeviceGetAttribute + 0.00% 2.3840us 2 1.1920us 836ns 1.5480us cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHH_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHH_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..21ebd2b2 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHH_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,25 @@ +==12169== NVPROF is profiling process 12169, command: ./main test 1.0 1 +==12169== Profiling application: ./main test 1.0 1 +==12169== Profiling result: +Time(%) Time Calls Avg Min Max Name + 64.38% 254.25ms 10000 25.425us 23.777us 28.416us calcNeurons + 35.52% 140.25ms 10000 14.025us 2.4320us 41.696us calcSynapses + 0.07% 285.47us 68 4.1980us 960ns 42.944us [CUDA memcpy HtoD] + 0.03% 108.42us 18 6.0230us 1.9840us 40.736us [CUDA memcpy DtoH] + +==12169== API calls: +Time(%) Time Calls Avg Min Max Name + 52.49% 378.74ms 20000 18.937us 7.6840us 358.81us cudaLaunch + 42.10% 303.75ms 19 15.987ms 8.2320us 301.68ms cudaHostAlloc + 3.34% 24.097ms 88 273.83us 330ns 22.690ms cudaMemcpy + 1.06% 7.6642ms 20000 383ns 262ns 335.28us cudaConfigureCall + 0.86% 6.2250ms 20000 311ns 242ns 336.35us cudaSetupArgument + 0.10% 707.36us 19 37.229us 6.2200us 126.23us cudaMalloc + 0.03% 241.14us 83 2.9050us 137ns 109.48us cuDeviceGetAttribute + 0.00% 31.485us 1 31.485us 31.485us 31.485us cuDeviceTotalMem + 0.00% 30.190us 1 30.190us 30.190us 30.190us cuDeviceGetName + 0.00% 12.302us 
19 647ns 344ns 2.1110us cudaGetSymbolAddress + 0.00% 11.562us 1 11.562us 11.562us 11.562us cudaSetDevice + 0.00% 1.5290us 2 764ns 561ns 968ns cuDeviceGetCount + 0.00% 1.4620us 1 1.4620us 1.4620us 1.4620us cudaGetDeviceCount + 0.00% 480ns 2 240ns 218ns 262ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBAFixedConnectivity_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBAFixedConnectivity_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..60479a60 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBAFixedConnectivity_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,26 @@ +==28333== NVPROF is profiling process 28333, command: ./main +==28333== Profiling application: ./main +==28333== Profiling result: +Time(%) Time Calls Avg Min Max Name + 23.53% 75.188ms 10000 7.5180us 7.1360us 8.8960us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*) + 20.88% 66.723ms 10000 6.6720us 1.6960us 14.967ms kernel_spikemonitor_codeobject(unsigned int, int*, double, int*, int*, int*, int, int*, double*, int, int*, int*) + 17.07% 54.561ms 10000 5.4560us 3.2960us 21.920us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*) + 15.31% 48.929ms 10000 4.8920us 3.2960us 18.784us kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*) + 10.24% 32.716ms 10000 3.2710us 3.1360us 4.1920us [CUDA memset] + 7.36% 23.508ms 10000 2.3500us 2.0160us 2.7200us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 5.59% 17.866ms 10000 1.7860us 1.5360us 2.0800us 
kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*) + 0.02% 67.328us 1 67.328us 67.328us 67.328us _run_spikemonitor_codeobject_init(void) + +==28333== API calls: +Time(%) Time Calls Avg Min Max Name + 70.32% 550.58ms 60001 9.1760us 8.3390us 6.9445ms cudaLaunch + 14.00% 109.65ms 630000 174ns 148ns 343.93us cudaSetupArgument + 11.69% 91.573ms 10000 9.1570us 8.5300us 165.12us cudaMemset + 1.99% 15.611ms 60001 260ns 222ns 327.19us cudaConfigureCall + 1.98% 15.472ms 60001 257ns 208ns 1.1493ms cudaGetLastError + 0.01% 51.353us 1 51.353us 51.353us 51.353us cudaMemGetInfo + 0.00% 24.711us 40 617ns 509ns 1.7610us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 23.494us 9 2.6100us 2.0080us 4.3370us cudaFuncGetAttributes + 0.00% 17.566us 1 17.566us 17.566us 17.566us cudaDeviceSynchronize + 0.00% 5.4430us 12 453ns 281ns 1.1050us cudaDeviceGetAttribute + 0.00% 3.0770us 3 1.0250us 646ns 1.6320us cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBAFixedConnectivity_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBAFixedConnectivity_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..2b127f8e --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBAFixedConnectivity_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,25 @@ +==28592== NVPROF is profiling process 28592, command: ./main test 1.0 1 +==28592== Profiling application: ./main test 1.0 1 +==28592== Profiling result: +Time(%) Time Calls Avg Min Max Name + 63.11% 133.95ms 10000 13.394us 12.384us 14.432us calcNeurons + 22.74% 48.266ms 10000 4.8260us 2.7200us 24.896us calcSynapses + 13.78% 29.240ms 14081 2.0760us 2.0160us 154.95us [CUDA memcpy DtoH] + 0.37% 793.60us 56 14.171us 960ns 163.11us [CUDA memcpy HtoD] + +==28592== API calls: +Time(%) Time Calls Avg Min Max Name + 38.67% 315.20ms 20073 15.702us 324ns 773.07us 
cudaMemcpy + 37.36% 304.57ms 16 19.036ms 8.7600us 301.99ms cudaHostAlloc + 22.40% 182.59ms 20000 9.1290us 7.6730us 821.14us cudaLaunch + 0.78% 6.3728ms 20000 318ns 250ns 5.2440us cudaConfigureCall + 0.66% 5.3441ms 20000 267ns 226ns 332.81us cudaSetupArgument + 0.10% 800.29us 16 50.018us 6.1360us 126.53us cudaMalloc + 0.03% 230.87us 83 2.7810us 153ns 99.066us cuDeviceGetAttribute + 0.00% 32.084us 1 32.084us 32.084us 32.084us cuDeviceTotalMem + 0.00% 30.780us 1 30.780us 30.780us 30.780us cuDeviceGetName + 0.00% 12.549us 16 784ns 421ns 2.2350us cudaGetSymbolAddress + 0.00% 11.671us 1 11.671us 11.671us 11.671us cudaSetDevice + 0.00% 1.8440us 1 1.8440us 1.8440us 1.8440us cudaGetDeviceCount + 0.00% 1.7500us 2 875ns 690ns 1.0600us cuDeviceGetCount + 0.00% 626ns 2 313ns 253ns 373ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..69e1a4bd --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,24 @@ +==31291== NVPROF is profiling process 31291, command: ./main +==31291== Profiling application: ./main +==31291== Profiling result: +Time(%) Time Calls Avg Min Max Name + 31.18% 76.419ms 10000 7.6410us 7.3920us 8.7360us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*) + 19.96% 48.924ms 10000 4.8920us 3.4560us 20.384us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*) + 18.13% 44.432ms 10000 4.4430us 3.2960us 17.952us kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*) + 
13.38% 32.789ms 10000 3.2780us 3.2320us 3.7760us [CUDA memset] + 9.59% 23.496ms 10000 2.3490us 2.0480us 2.7520us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 7.76% 19.020ms 10000 1.9010us 1.6640us 2.0800us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*) + +==31291== API calls: +Time(%) Time Calls Avg Min Max Name + 68.69% 471.10ms 50000 9.4220us 8.2170us 19.231ms cudaLaunch + 13.91% 95.387ms 10000 9.5380us 8.7960us 312.26us cudaMemset + 13.50% 92.578ms 510000 181ns 148ns 324.51us cudaSetupArgument + 2.05% 14.040ms 50000 280ns 237ns 5.2940us cudaConfigureCall + 1.83% 12.581ms 50000 251ns 217ns 12.226us cudaGetLastError + 0.01% 51.575us 1 51.575us 51.575us 51.575us cudaMemGetInfo + 0.00% 21.460us 39 550ns 461ns 1.4270us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 21.129us 8 2.6410us 1.9560us 4.4310us cudaFuncGetAttributes + 0.00% 16.670us 1 16.670us 16.670us 16.670us cudaDeviceSynchronize + 0.00% 5.5840us 12 465ns 285ns 1.2870us cudaDeviceGetAttribute + 0.00% 3.3860us 3 1.1280us 653ns 1.8010us cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBA_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBA_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..5ec6b1a0 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBA_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,25 @@ +==31529== NVPROF is profiling process 31529, command: ./main test 1.0 1 +==31529== Profiling application: ./main test 1.0 1 +==31529== Profiling result: +Time(%) Time Calls Avg Min Max Name + 74.56% 131.02ms 10000 13.101us 11.808us 14.624us calcNeurons + 24.85% 43.662ms 10000 4.3660us 2.1760us 25.760us calcSynapses + 0.45% 796.80us 56 14.228us 960ns 163.59us [CUDA memcpy HtoD] + 0.13% 234.31us 13 18.023us 1.9520us 155.27us 
[CUDA memcpy DtoH] + +==31529== API calls: +Time(%) Time Calls Avg Min Max Name + 57.53% 276.80ms 16 17.300ms 8.5100us 274.32ms cudaHostAlloc + 38.37% 184.60ms 20000 9.2300us 7.6370us 342.36us cudaLaunch + 1.48% 7.1407ms 73 97.817us 343ns 5.2594ms cudaMemcpy + 1.31% 6.3266ms 20000 316ns 249ns 315.38us cudaConfigureCall + 1.06% 5.1071ms 20000 255ns 220ns 4.6570us cudaSetupArgument + 0.17% 819.17us 16 51.198us 6.2400us 136.59us cudaMalloc + 0.05% 241.67us 83 2.9110us 138ns 103.86us cuDeviceGetAttribute + 0.01% 32.371us 1 32.371us 32.371us 32.371us cuDeviceTotalMem + 0.01% 28.436us 1 28.436us 28.436us 28.436us cuDeviceGetName + 0.00% 12.399us 16 774ns 424ns 2.0180us cudaGetSymbolAddress + 0.00% 12.047us 1 12.047us 12.047us 12.047us cudaSetDevice + 0.00% 1.6800us 1 1.6800us 1.6800us 1.6800us cudaGetDeviceCount + 0.00% 1.4560us 2 728ns 455ns 1.0010us cuDeviceGetCount + 0.00% 575ns 2 287ns 235ns 340ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_DenseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_DenseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..9f083553 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_DenseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,21 @@ +==30551== NVPROF is profiling process 30551, command: ./main +==30551== Profiling application: ./main +==30551== Profiling result: +Time(%) Time Calls Avg Min Max Name + 56.01% 59.694ms 10000 5.9690us 5.6000us 6.4960us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*) + 28.93% 30.830ms 10000 3.0820us 3.0400us 3.5200us [CUDA memset] + 15.06% 16.055ms 10000 1.6050us 1.5040us 2.4000us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*) + +==30551== API 
calls: +Time(%) Time Calls Avg Min Max Name + 59.17% 191.07ms 20000 9.5530us 8.3220us 11.129ms cudaLaunch + 27.89% 90.062ms 10000 9.0060us 8.4390us 27.616us cudaMemset + 9.32% 30.084ms 170000 176ns 153ns 306.97us cudaSetupArgument + 1.82% 5.8925ms 20000 294ns 213ns 303.17us cudaConfigureCall + 1.77% 5.7023ms 20000 285ns 216ns 302.98us cudaGetLastError + 0.01% 46.403us 1 46.403us 46.403us 46.403us cudaMemGetInfo + 0.01% 18.635us 1 18.635us 18.635us 18.635us cudaDeviceSynchronize + 0.00% 8.8700us 3 2.9560us 2.1570us 3.7290us cudaFuncGetAttributes + 0.00% 6.7130us 3 2.2370us 629ns 3.5200us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 1.7730us 4 443ns 369ns 586ns cudaDeviceGetAttribute + 0.00% 848ns 1 848ns 848ns 848ns cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_DenseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_DenseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..05b6bca3 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_DenseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,25 @@ +==30762== NVPROF is profiling process 30762, command: ./main test 1.0 1 +==30762== Profiling application: ./main test 1.0 1 +==30762== Profiling result: +Time(%) Time Calls Avg Min Max Name + 64.08% 52.562ms 10000 5.2560us 3.4240us 5.9200us calcSynapses + 35.80% 29.364ms 10000 2.9360us 2.8800us 3.8080us calcNeurons + 0.07% 57.888us 44 1.3150us 960ns 2.2400us [CUDA memcpy HtoD] + 0.05% 38.240us 14 2.7310us 2.0160us 4.7360us [CUDA memcpy DtoH] + +==30762== API calls: +Time(%) Time Calls Avg Min Max Name + 61.72% 283.35ms 12 23.613ms 14.143us 281.71ms cudaHostAlloc + 35.34% 162.27ms 20000 8.1130us 7.4880us 334.11us cudaLaunch + 1.34% 6.1571ms 20000 307ns 256ns 322.44us cudaConfigureCall + 1.16% 5.3454ms 20000 267ns 224ns 332.57us 
cudaSetupArgument + 0.23% 1.0363ms 61 16.988us 318ns 37.131us cudaMemcpy + 0.14% 644.11us 12 53.676us 11.831us 178.21us cudaMalloc + 0.05% 226.72us 83 2.7310us 138ns 97.611us cuDeviceGetAttribute + 0.01% 31.315us 1 31.315us 31.315us 31.315us cuDeviceTotalMem + 0.01% 26.553us 1 26.553us 26.553us 26.553us cuDeviceGetName + 0.00% 13.976us 12 1.1640us 709ns 3.1230us cudaGetSymbolAddress + 0.00% 11.238us 1 11.238us 11.238us 11.238us cudaSetDevice + 0.00% 1.4430us 2 721ns 438ns 1.0050us cuDeviceGetCount + 0.00% 1.4380us 1 1.4380us 1.4380us 1.4380us cudaGetDeviceCount + 0.00% 582ns 2 291ns 214ns 368ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_HHNeuronsOnly_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_HHNeuronsOnly_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..c3db833d --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_HHNeuronsOnly_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,21 @@ +==25014== NVPROF is profiling process 25014, command: ./main +==25014== Profiling application: ./main +==25014== Profiling result: +Time(%) Time Calls Avg Min Max Name + 76.60% 171.78ms 10000 17.177us 14.880us 18.080us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, bool*, double*, double*, double*, double*) + 13.61% 30.516ms 10000 3.0510us 2.8160us 3.5840us [CUDA memset] + 9.79% 21.945ms 10000 2.1940us 1.8240us 2.9120us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + +==25014== API calls: +Time(%) Time Calls Avg Min Max Name + 58.23% 179.09ms 20000 8.9540us 8.0160us 5.8117ms cudaLaunch + 28.13% 86.520ms 10000 8.6520us 8.0220us 324.89us cudaMemset + 10.05% 30.914ms 160000 193ns 150ns 347.54us cudaSetupArgument + 1.94% 5.9702ms 20000 298ns 223ns 315.53us cudaConfigureCall + 1.61% 4.9531ms 20000 247ns 210ns 327.22us 
cudaGetLastError + 0.02% 46.728us 1 46.728us 46.728us 46.728us cudaMemGetInfo + 0.01% 17.432us 35 498ns 471ns 917ns cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 10.745us 1 10.745us 10.745us 10.745us cudaDeviceSynchronize + 0.00% 10.378us 4 2.5940us 2.0060us 3.1740us cudaFuncGetAttributes + 0.00% 3.1700us 8 396ns 284ns 677ns cudaDeviceGetAttribute + 0.00% 1.6580us 2 829ns 801ns 857ns cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_HHNeuronsOnly_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_HHNeuronsOnly_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..798b69a4 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_HHNeuronsOnly_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,24 @@ +==25225== NVPROF is profiling process 25225, command: ./main test 1.0 1 +==25225== Profiling application: ./main test 1.0 1 +==25225== Profiling result: +Time(%) Time Calls Avg Min Max Name + 99.94% 177.51ms 10000 17.750us 14.944us 26.400us calcNeurons + 0.04% 62.626us 40 1.5650us 960ns 2.1760us [CUDA memcpy HtoD] + 0.02% 38.560us 11 3.5050us 2.0160us 4.6720us [CUDA memcpy DtoH] + +==25225== API calls: +Time(%) Time Calls Avg Min Max Name + 55.84% 235.54ms 10 23.554ms 16.992us 233.93ms cudaHostAlloc + 37.45% 157.95ms 10000 15.795us 7.9250us 353.53us cudaLaunch + 4.97% 20.977ms 53 395.80us 389ns 20.008ms cudaMemcpy + 0.81% 3.4097ms 10000 340ns 278ns 5.0220us cudaConfigureCall + 0.70% 2.9582ms 10000 295ns 232ns 339.82us cudaSetupArgument + 0.15% 630.64us 10 63.063us 12.457us 174.83us cudaMalloc + 0.05% 227.15us 83 2.7360us 140ns 98.109us cuDeviceGetAttribute + 0.01% 31.635us 1 31.635us 31.635us 31.635us cuDeviceTotalMem + 0.01% 31.273us 1 31.273us 31.273us 31.273us cuDeviceGetName + 0.00% 12.870us 10 1.2870us 741ns 3.5550us cudaGetSymbolAddress + 0.00% 10.918us 1 10.918us 10.918us 10.918us cudaSetDevice + 0.00% 
1.9240us 2 962ns 718ns 1.2060us cuDeviceGetCount + 0.00% 1.4330us 1 1.4330us 1.4330us 1.4330us cudaGetDeviceCount + 0.00% 657ns 2 328ns 303ns 354ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_LinearNeuronsOnly_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_LinearNeuronsOnly_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..1b216fb6 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_LinearNeuronsOnly_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,18 @@ +==19640== NVPROF is profiling process 19640, command: ./main +==19640== Profiling application: ./main +==19640== Profiling result: +Time(%) Time Calls Avg Min Max Name +100.00% 247.35ms 100000 2.4730us 2.3360us 3.6800us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*) + +==19640== API calls: +Time(%) Time Calls Avg Min Max Name + 87.43% 837.87ms 100000 8.3780us 7.7260us 7.8274ms cudaLaunch + 7.01% 67.186ms 400000 167ns 147ns 10.910us cudaSetupArgument + 2.81% 26.904ms 100000 269ns 241ns 10.142us cudaConfigureCall + 2.74% 26.287ms 100000 262ns 235ns 11.074us cudaGetLastError + 0.01% 70.067us 1 70.067us 70.067us 70.067us cudaMemGetInfo + 0.00% 14.560us 2 7.2800us 4.1830us 10.377us cudaFuncGetAttributes + 0.00% 9.6320us 1 9.6320us 9.6320us 9.6320us cudaDeviceSynchronize + 0.00% 5.2800us 2 2.6400us 1.1150us 4.1650us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 3.9840us 1 3.9840us 3.9840us 3.9840us cudaGetDevice + 0.00% 3.7360us 4 934ns 668ns 1.5690us cudaDeviceGetAttribute diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_LinearNeuronsOnly_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_LinearNeuronsOnly_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..a5d528ae --- /dev/null +++ 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_LinearNeuronsOnly_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,24 @@ +==19869== NVPROF is profiling process 19869, command: ./main test 10.0 1 +==19869== Profiling application: ./main test 10.0 1 +==19869== Profiling result: +Time(%) Time Calls Avg Min Max Name + 99.99% 264.71ms 100000 2.6470us 2.5920us 3.1680us calcNeurons + 0.01% 22.656us 16 1.4160us 960ns 2.0800us [CUDA memcpy HtoD] + 0.01% 14.624us 5 2.9240us 2.0480us 4.6720us [CUDA memcpy DtoH] + +==19869== API calls: +Time(%) Time Calls Avg Min Max Name + 73.18% 822.50ms 100000 8.2250us 7.6370us 361.19us cudaLaunch + 21.57% 242.48ms 4 60.620ms 23.163us 240.97ms cudaHostAlloc + 2.95% 33.155ms 100000 331ns 251ns 369.91us cudaConfigureCall + 2.18% 24.551ms 100000 245ns 222ns 14.790us cudaSetupArgument + 0.05% 525.28us 4 131.32us 12.450us 178.02us cudaMalloc + 0.04% 460.82us 23 20.035us 384ns 39.476us cudaMemcpy + 0.02% 226.65us 83 2.7300us 142ns 97.695us cuDeviceGetAttribute + 0.00% 31.478us 1 31.478us 31.478us 31.478us cuDeviceTotalMem + 0.00% 30.578us 1 30.578us 30.578us 30.578us cuDeviceGetName + 0.00% 10.794us 1 10.794us 10.794us 10.794us cudaSetDevice + 0.00% 7.9740us 4 1.9930us 876ns 3.7070us cudaGetSymbolAddress + 0.00% 1.5520us 2 776ns 553ns 999ns cuDeviceGetCount + 0.00% 1.4290us 1 1.4290us 1.4290us 1.4290us cudaGetDeviceCount + 0.00% 545ns 2 272ns 256ns 289ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..f45b844f --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,27 @@ +==13883== NVPROF is profiling process 13883, command: ./main +==13883== Profiling application: ./main 
+==13883== Profiling result: +Time(%) Time Calls Avg Min Max Name + 29.16% 88.869ms 10000 8.8860us 3.4880us 32.064us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int) + 20.89% 63.662ms 20000 3.1830us 3.0400us 3.6800us [CUDA memset] + 17.94% 54.662ms 10000 5.4660us 5.1840us 7.5200us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 12.41% 37.829ms 10000 3.7820us 3.6480us 7.2000us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int) + 7.99% 24.357ms 10000 2.4350us 2.1760us 2.8800us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 5.78% 17.601ms 10000 1.7600us 1.5360us 2.4960us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 5.65% 17.232ms 10000 1.7230us 1.6640us 1.9840us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 0.17% 532.84us 1 532.84us 532.84us 532.84us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + +==13883== API calls: +Time(%) Time Calls Avg Min Max Name + 62.59% 547.05ms 60001 9.1170us 8.1770us 7.2312ms cudaLaunch + 20.36% 177.95ms 20000 8.8970us 8.1030us 336.69us cudaMemset + 13.38% 116.92ms 560005 208ns 150ns 330.03us cudaSetupArgument + 1.91% 16.702ms 60001 278ns 208ns 316.80us cudaConfigureCall + 1.74% 15.203ms 60002 253ns 222ns 313.88us cudaGetLastError + 0.02% 138.47us 1 138.47us 138.47us 138.47us cudaMalloc + 0.01% 47.825us 1 47.825us 47.825us 47.825us cudaMemGetInfo + 0.00% 24.670us 10 2.4670us 1.9950us 3.8850us cudaFuncGetAttributes + 0.00% 22.588us 41 550ns 471ns 1.2300us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 17.416us 1 
17.416us 17.416us 17.416us cudaDeviceSynchronize + 0.00% 5.6370us 16 352ns 276ns 664ns cudaDeviceGetAttribute + 0.00% 3.1450us 4 786ns 601ns 1.1830us cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPEventDriven_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPEventDriven_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..e7d20d38 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPEventDriven_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,26 @@ +==14124== NVPROF is profiling process 14124, command: ./main test 1.0 1 +==14124== Profiling application: ./main test 1.0 1 +==14124== Profiling result: +Time(%) Time Calls Avg Min Max Name + 62.29% 109.79ms 10000 10.979us 1.4400us 50.176us calcSynapses + 23.83% 42.003ms 10000 4.2000us 3.3280us 6.2080us calcNeurons + 13.80% 24.321ms 10000 2.4320us 2.0800us 10.848us learnSynapsesPost + 0.05% 93.824us 70 1.3400us 960ns 2.1760us [CUDA memcpy HtoD] + 0.03% 53.856us 19 2.8340us 1.9520us 4.6400us [CUDA memcpy DtoH] + +==14124== API calls: +Time(%) Time Calls Avg Min Max Name + 54.33% 315.51ms 20 15.776ms 7.4360us 314.37ms cudaHostAlloc + 42.46% 246.58ms 30000 8.2190us 7.6810us 352.29us cudaLaunch + 1.62% 9.4165ms 30000 313ns 235ns 338.10us cudaConfigureCall + 1.25% 7.2565ms 30000 241ns 219ns 10.061us cudaSetupArgument + 0.20% 1.1638ms 95 12.250us 188ns 29.618us cudaMemcpy + 0.08% 485.57us 20 24.278us 6.1510us 122.08us cudaMalloc + 0.04% 225.75us 83 2.7190us 136ns 97.167us cuDeviceGetAttribute + 0.01% 31.148us 1 31.148us 31.148us 31.148us cuDeviceTotalMem + 0.00% 27.209us 1 27.209us 27.209us 27.209us cuDeviceGetName + 0.00% 25.053us 20 1.2520us 370ns 14.749us cudaGetSymbolAddress + 0.00% 11.323us 1 11.323us 11.323us 11.323us cudaSetDevice + 0.00% 1.4040us 1 1.4040us 1.4040us 1.4040us cudaGetDeviceCount + 0.00% 1.3580us 2 679ns 456ns 902ns cuDeviceGetCount + 
0.00% 492ns 2 246ns 220ns 272ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPostNeuronalTraces_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPostNeuronalTraces_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..c05b8c43 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPostNeuronalTraces_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,28 @@ +==31645== NVPROF is profiling process 31645, command: ./main +==31645== Profiling application: ./main +==31645== Profiling result: +Time(%) Time Calls Avg Min Max Name + 23.09% 63.632ms 20000 3.1810us 3.0400us 3.8080us [CUDA memset] + 21.51% 59.284ms 10000 5.9280us 5.6320us 7.6160us kernel_neurongroup_1_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*, double*) + 13.19% 36.348ms 10000 3.6340us 3.4240us 12.288us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, double*, double, double*, int, int*, int, int*, int, double*) + 12.65% 34.859ms 10000 3.4850us 3.3920us 94.048us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, double, double*, int, double*, int*, int, int) + 9.89% 27.258ms 10000 2.7250us 2.5280us 2.9760us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*) + 6.72% 18.518ms 10000 1.8510us 1.7600us 2.8160us kernel_neurongroup_1_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 6.69% 18.444ms 10000 1.8440us 1.6000us 2.4320us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 6.26% 17.266ms 10000 1.7260us 1.6640us 2.4000us kernel_neurongroup_1_resetter_codeobject(unsigned int, unsigned int, double*, int*, double*) + 0.01% 22.689us 1 22.689us 22.689us 22.689us void 
gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + +==31645== API calls: +Time(%) Time Calls Avg Min Max Name + 66.34% 631.89ms 70001 9.0260us 7.8240us 7.5683ms cudaLaunch + 18.61% 177.26ms 20000 8.8630us 8.0310us 327.63us cudaMemset + 11.06% 105.29ms 570005 184ns 147ns 324.54us cudaSetupArgument + 1.98% 18.868ms 70002 269ns 211ns 316.30us cudaGetLastError + 1.98% 18.848ms 70001 269ns 196ns 10.259us cudaConfigureCall + 0.01% 123.44us 1 123.44us 123.44us 123.44us cudaMalloc + 0.01% 48.253us 1 48.253us 48.253us 48.253us cudaMemGetInfo + 0.00% 38.693us 74 522ns 468ns 1.2040us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 30.351us 12 2.5290us 2.0130us 4.4000us cudaFuncGetAttributes + 0.00% 17.703us 1 17.703us 17.703us 17.703us cudaDeviceSynchronize + 0.00% 8.0120us 20 400ns 315ns 771ns cudaDeviceGetAttribute + 0.00% 3.7350us 5 747ns 588ns 1.2880us cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPostNeuronalTraces_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPostNeuronalTraces_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..7556474c --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPostNeuronalTraces_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,26 @@ +==31875== NVPROF is profiling process 31875, command: ./main test 1.0 1 +==31875== Profiling application: ./main test 1.0 1 +==31875== Profiling result: +Time(%) Time Calls Avg Min Max Name + 51.59% 44.978ms 10000 4.4970us 4.4160us 13.216us calcNeurons + 28.08% 24.482ms 10000 2.4480us 2.4000us 108.48us learnSynapsesPost + 20.19% 17.604ms 10000 1.7600us 1.5680us 8.0320us calcSynapses + 0.09% 77.888us 70 1.1120us 960ns 2.0160us [CUDA memcpy HtoD] + 0.05% 40.704us 17 2.3940us 2.0480us 4.6720us [CUDA memcpy DtoH] + +==31875== API calls: +Time(%) Time Calls Avg 
Min Max Name + 49.08% 242.98ms 30000 8.0990us 7.4830us 330.16us cudaLaunch + 46.99% 232.62ms 20 11.631ms 13.742us 230.95ms cudaHostAlloc + 1.93% 9.5539ms 30000 318ns 249ns 316.27us cudaConfigureCall + 1.50% 7.4449ms 30000 248ns 228ns 9.5620us cudaSetupArgument + 0.29% 1.4169ms 93 15.235us 341ns 34.925us cudaMemcpy + 0.15% 732.26us 20 36.613us 11.241us 173.89us cudaMalloc + 0.05% 225.85us 83 2.7210us 144ns 97.097us cuDeviceGetAttribute + 0.01% 31.104us 1 31.104us 31.104us 31.104us cuDeviceTotalMem + 0.01% 27.342us 1 27.342us 27.342us 27.342us cuDeviceGetName + 0.00% 19.527us 20 976ns 638ns 3.5660us cudaGetSymbolAddress + 0.00% 11.180us 1 11.180us 11.180us 11.180us cudaSetDevice + 0.00% 1.5790us 2 789ns 579ns 1.0000us cuDeviceGetCount + 0.00% 1.4070us 1 1.4070us 1.4070us 1.4070us cudaGetDeviceCount + 0.00% 534ns 2 267ns 238ns 296ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPost_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPost_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..d11b4f61 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPost_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,27 @@ +==13752== NVPROF is profiling process 13752, command: ./main +==13752== Profiling application: ./main +==13752== Profiling result: +Time(%) Time Calls Avg Min Max Name + 26.01% 63.681ms 20000 3.1840us 3.0400us 3.8080us [CUDA memset] + 21.90% 53.615ms 10000 5.3610us 5.1840us 7.2640us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 16.08% 39.373ms 10000 3.9370us 3.5840us 10.720us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int) + 14.74% 36.097ms 10000 3.6090us 3.4880us 105.60us 
kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int) + 8.31% 20.344ms 10000 2.0340us 1.8560us 2.4320us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 6.61% 16.187ms 10000 1.6180us 1.5040us 2.8160us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 6.34% 15.535ms 10000 1.5530us 1.4720us 1.9840us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 0.01% 22.881us 1 22.881us 22.881us 22.881us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + +==13752== API calls: +Time(%) Time Calls Avg Min Max Name + 64.39% 566.77ms 60001 9.4450us 8.5300us 7.6226ms cudaLaunch + 20.37% 179.35ms 20000 8.9670us 8.0990us 320.51us cudaMemset + 11.68% 102.80ms 560005 183ns 154ns 320.82us cudaSetupArgument + 1.91% 16.807ms 60001 280ns 234ns 314.83us cudaConfigureCall + 1.62% 14.260ms 60002 237ns 197ns 325.01us cudaGetLastError + 0.01% 125.15us 1 125.15us 125.15us 125.15us cudaMalloc + 0.01% 50.027us 1 50.027us 50.027us 50.027us cudaMemGetInfo + 0.00% 25.943us 10 2.5940us 1.9990us 4.6510us cudaFuncGetAttributes + 0.00% 23.402us 41 570ns 490ns 1.2400us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 17.044us 1 17.044us 17.044us 17.044us cudaDeviceSynchronize + 0.00% 6.0160us 16 376ns 279ns 1.0150us cudaDeviceGetAttribute + 0.00% 3.0950us 4 773ns 532ns 1.3840us cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPost_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPost_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..08c38fd6 --- /dev/null +++ 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPost_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,26 @@ +==13992== NVPROF is profiling process 13992, command: ./main test 1.0 1 +==13992== Profiling application: ./main test 1.0 1 +==13992== Profiling result: +Time(%) Time Calls Avg Min Max Name + 47.47% 40.621ms 10000 4.0620us 3.9680us 12.064us calcNeurons + 29.19% 24.977ms 10000 2.4970us 2.4000us 360.29us learnSynapsesPost + 23.19% 19.844ms 10000 1.9840us 1.5680us 15.904us calcSynapses + 0.10% 83.488us 70 1.1920us 960ns 2.0480us [CUDA memcpy HtoD] + 0.05% 45.344us 17 2.6670us 2.0480us 4.7040us [CUDA memcpy DtoH] + +==13992== API calls: +Time(%) Time Calls Avg Min Max Name + 49.24% 255.49ms 20 12.774ms 7.1470us 254.39ms cudaHostAlloc + 47.05% 244.13ms 30000 8.1370us 7.4970us 325.41us cudaLaunch + 1.88% 9.7505ms 30000 325ns 240ns 313.30us cudaConfigureCall + 1.44% 7.4897ms 30000 249ns 228ns 4.6460us cudaSetupArgument + 0.23% 1.1712ms 95 12.328us 191ns 29.827us cudaMemcpy + 0.10% 498.07us 20 24.903us 6.1390us 124.17us cudaMalloc + 0.04% 225.66us 83 2.7180us 135ns 97.278us cuDeviceGetAttribute + 0.01% 31.145us 1 31.145us 31.145us 31.145us cuDeviceTotalMem + 0.01% 27.598us 1 27.598us 27.598us 27.598us cuDeviceGetName + 0.00% 11.370us 20 568ns 348ns 2.0700us cudaGetSymbolAddress + 0.00% 11.183us 1 11.183us 11.183us 11.183us cudaSetDevice + 0.00% 1.4160us 2 708ns 453ns 963ns cuDeviceGetCount + 0.00% 1.3950us 1 1.3950us 1.3950us 1.3950us cudaGetDeviceCount + 0.00% 533ns 2 266ns 241ns 292ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNeuronalTraces_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNeuronalTraces_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..df801df7 --- /dev/null +++ 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNeuronalTraces_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,28 @@ +==22958== NVPROF is profiling process 22958, command: ./main +==22958== Profiling application: ./main +==22958== Profiling result: +Time(%) Time Calls Avg Min Max Name + 23.34% 76.426ms 10000 7.6420us 3.2960us 26.944us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, double*, double, double*, int, int*, int, int*, int, double*) + 19.43% 63.625ms 20000 3.1810us 3.0400us 3.7120us [CUDA memset] + 18.23% 59.686ms 10000 5.9680us 5.6320us 8.0960us kernel_neurongroup_1_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*, double*) + 11.04% 36.142ms 10000 3.6140us 3.3920us 7.0730us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, double, double*, int, double*, int*, int, int) + 9.09% 29.761ms 10000 2.9760us 2.8800us 3.5840us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*) + 7.99% 26.155ms 10000 2.6150us 2.2080us 2.8800us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 5.47% 17.908ms 10000 1.7900us 1.7280us 2.4640us kernel_neurongroup_1_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 5.26% 17.212ms 10000 1.7210us 1.6640us 2.3680us kernel_neurongroup_1_resetter_codeobject(unsigned int, unsigned int, double*, int*, double*) + 0.16% 534.91us 1 534.91us 534.91us 534.91us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + +==22958== API calls: +Time(%) Time Calls Avg Min Max Name + 66.19% 628.57ms 70001 8.9790us 7.8060us 7.0815ms cudaLaunch + 18.98% 180.22ms 20000 9.0110us 8.1910us 325.17us cudaMemset + 10.84% 102.92ms 570005 180ns 148ns 322.77us cudaSetupArgument + 2.05% 19.421ms 70002 277ns 224ns 322.72us 
cudaGetLastError + 1.92% 18.237ms 70001 260ns 204ns 7.6100us cudaConfigureCall + 0.01% 139.26us 1 139.26us 139.26us 139.26us cudaMalloc + 0.01% 47.740us 1 47.740us 47.740us 47.740us cudaMemGetInfo + 0.00% 38.641us 74 522ns 463ns 1.3230us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 31.070us 12 2.5890us 2.0180us 4.6520us cudaFuncGetAttributes + 0.00% 17.325us 1 17.325us 17.325us 17.325us cudaDeviceSynchronize + 0.00% 7.2280us 20 361ns 279ns 764ns cudaDeviceGetAttribute + 0.00% 3.4300us 5 686ns 519ns 1.2200us cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNeuronalTraces_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNeuronalTraces_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..4a1b6afb --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNeuronalTraces_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,26 @@ +==23186== NVPROF is profiling process 23186, command: ./main test 1.0 1 +==23186== Profiling application: ./main test 1.0 1 +==23186== Profiling result: +Time(%) Time Calls Avg Min Max Name + 45.72% 59.376ms 10000 5.9370us 1.4400us 22.209us calcSynapses + 36.59% 47.519ms 10000 4.7510us 3.7440us 7.2000us calcNeurons + 17.59% 22.844ms 10000 2.2840us 2.0800us 5.8240us learnSynapsesPost + 0.07% 90.016us 70 1.2850us 928ns 2.0480us [CUDA memcpy HtoD] + 0.04% 51.168us 19 2.6930us 1.9520us 4.6080us [CUDA memcpy DtoH] + +==23186== API calls: +Time(%) Time Calls Avg Min Max Name + 48.78% 251.54ms 20 12.577ms 7.1400us 250.44ms cudaHostAlloc + 47.58% 245.35ms 30000 8.1780us 7.6280us 342.38us cudaLaunch + 1.85% 9.5606ms 30000 318ns 255ns 320.84us cudaConfigureCall + 1.41% 7.2598ms 30000 241ns 222ns 5.1580us cudaSetupArgument + 0.22% 1.1470ms 93 12.333us 278ns 32.150us cudaMemcpy + 0.10% 513.51us 20 25.675us 6.0810us 139.05us cudaMalloc + 0.04% 228.09us 83 2.7480us 140ns 98.263us 
cuDeviceGetAttribute + 0.01% 31.411us 1 31.411us 31.411us 31.411us cuDeviceTotalMem + 0.01% 27.452us 1 27.452us 27.452us 27.452us cuDeviceGetName + 0.00% 12.004us 1 12.004us 12.004us 12.004us cudaSetDevice + 0.00% 11.525us 20 576ns 352ns 2.0890us cudaGetSymbolAddress + 0.00% 1.6280us 2 814ns 489ns 1.1390us cuDeviceGetCount + 0.00% 1.5650us 1 1.5650us 1.5650us 1.5650us cudaGetDeviceCount + 0.00% 594ns 2 297ns 230ns 364ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNotEventDriven_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNotEventDriven_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..f8401320 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNotEventDriven_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,28 @@ +==5309== NVPROF is profiling process 5309, command: ./main +==5309== Profiling application: ./main +==5309== Profiling result: +Time(%) Time Calls Avg Min Max Name + 23.35% 73.232ms 10000 7.3230us 3.4560us 24.544us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double*, double, double*, int, int*, int, int*) + 20.25% 63.528ms 20000 3.1760us 3.0400us 3.7440us [CUDA memset] + 17.18% 53.899ms 10000 5.3890us 5.0240us 7.6480us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 11.40% 35.764ms 10000 3.5760us 3.3920us 6.2720us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*) + 9.18% 28.794ms 10000 2.8790us 2.7840us 3.3600us kernel_synapses_stateupdater_codeobject(unsigned int, unsigned int, double*, int, double*, int, double*, int*) + 7.72% 24.206ms 10000 2.4200us 2.2080us 2.8480us 
kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 5.48% 17.200ms 10000 1.7190us 1.6640us 1.9840us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 5.26% 16.509ms 10000 1.6500us 1.5360us 2.4960us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 0.17% 534.31us 1 534.31us 534.31us 534.31us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + +==5309== API calls: +Time(%) Time Calls Avg Min Max Name + 65.37% 632.10ms 70001 9.0290us 7.8220us 7.1147ms cudaLaunch + 18.21% 176.05ms 20000 8.8020us 7.9140us 65.993us cudaMemset + 11.98% 115.80ms 640005 180ns 150ns 325.82us cudaSetupArgument + 2.23% 21.584ms 70002 308ns 218ns 325.68us cudaGetLastError + 2.19% 21.175ms 70001 302ns 199ns 314.30us cudaConfigureCall + 0.01% 138.56us 1 138.56us 138.56us 138.56us cudaMalloc + 0.00% 48.141us 1 48.141us 48.141us 48.141us cudaMemGetInfo + 0.00% 40.939us 74 553ns 496ns 1.2830us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 30.402us 12 2.5330us 2.0360us 4.5650us cudaFuncGetAttributes + 0.00% 17.493us 1 17.493us 17.493us 17.493us cudaDeviceSynchronize + 0.00% 6.8790us 20 343ns 280ns 612ns cudaDeviceGetAttribute + 0.00% 3.7860us 5 757ns 587ns 1.2530us cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNotEventDriven_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNotEventDriven_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..e8654e31 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNotEventDriven_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,27 @@ +==5547== NVPROF is profiling process 5547, command: ./main test 1.0 1 +==5547== Profiling application: ./main test 1.0 1 +==5547== Profiling result: +Time(%) Time Calls 
Avg Min Max Name + 38.01% 64.497ms 10000 6.4490us 1.4720us 25.121us calcSynapses + 24.89% 42.225ms 10000 4.2220us 3.3600us 6.1120us calcNeurons + 22.75% 38.605ms 10000 3.8600us 3.2320us 5.5680us calcSynapseDynamics + 14.26% 24.189ms 10000 2.4180us 2.1120us 6.5920us learnSynapsesPost + 0.06% 96.512us 72 1.3400us 928ns 2.0800us [CUDA memcpy HtoD] + 0.03% 54.080us 19 2.8460us 1.9840us 4.6720us [CUDA memcpy DtoH] + +==5547== API calls: +Time(%) Time Calls Avg Min Max Name + 53.26% 318.06ms 40000 7.9510us 7.3870us 323.19us cudaLaunch + 42.53% 254.01ms 21 12.096ms 7.5310us 252.89ms cudaHostAlloc + 2.21% 13.204ms 40000 330ns 252ns 332.54us cudaConfigureCall + 1.66% 9.9116ms 40000 247ns 233ns 5.2730us cudaSetupArgument + 0.20% 1.1942ms 97 12.311us 197ns 30.710us cudaMemcpy + 0.08% 498.29us 21 23.728us 6.1100us 122.22us cudaMalloc + 0.04% 227.33us 83 2.7380us 149ns 97.591us cuDeviceGetAttribute + 0.01% 31.273us 1 31.273us 31.273us 31.273us cuDeviceTotalMem + 0.00% 27.431us 1 27.431us 27.431us 27.431us cuDeviceGetName + 0.00% 11.816us 1 11.816us 11.816us 11.816us cudaSetDevice + 0.00% 11.690us 21 556ns 357ns 2.1550us cudaGetSymbolAddress + 0.00% 1.4320us 2 716ns 525ns 907ns cuDeviceGetCount + 0.00% 1.3390us 1 1.3390us 1.3390us 1.3390us cudaGetDeviceCount + 0.00% 577ns 2 288ns 252ns 325ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..3a956eee --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,29 @@ +==30259== NVPROF is profiling process 30259, command: ./main +==30259== Profiling application: ./main +==30259== Profiling result: +Time(%) Time Calls Avg Min Max Name + 29.51% 119.04ms 10000 11.903us 1.4720us 28.312ms kernel_spikemonitor_codeobject(unsigned 
int, int*, double, int*, int*, int*, int, int*, double*, int, int*, int*) + 19.38% 78.154ms 10000 7.8150us 3.0400us 25.729us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int) + 15.01% 60.555ms 20000 3.0270us 2.8480us 4.2880us [CUDA memset] + 13.45% 54.257ms 10000 5.4250us 4.9280us 8.0000us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 8.78% 35.407ms 10000 3.5400us 3.2000us 7.1360us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int) + 6.25% 25.200ms 10000 2.5190us 2.1760us 2.8800us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 3.84% 15.476ms 10000 1.5470us 1.4080us 2.4960us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 3.64% 14.677ms 10000 1.4670us 1.3440us 1.9520us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 0.13% 535.30us 1 535.30us 535.30us 535.30us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + 0.02% 69.760us 1 69.760us 69.760us 69.760us _run_spikemonitor_codeobject_init(void) + +==30259== API calls: +Time(%) Time Calls Avg Min Max Name + 66.59% 656.39ms 70002 9.3760us 8.0560us 14.291ms cudaLaunch + 18.06% 178.04ms 20000 8.9010us 7.9370us 1.1364ms cudaMemset + 11.56% 113.99ms 680005 167ns 152ns 60.368us cudaSetupArgument + 2.00% 19.667ms 70003 280ns 237ns 57.739us cudaGetLastError + 1.77% 17.418ms 70002 248ns 194ns 139.14us cudaConfigureCall + 0.01% 139.28us 1 139.28us 139.28us 139.28us cudaMalloc + 0.00% 48.635us 1 48.635us 48.635us 48.635us cudaMemGetInfo + 0.00% 27.603us 11 2.5090us 1.9830us 4.1880us cudaFuncGetAttributes + 0.00% 23.673us 42 
563ns 472ns 1.2600us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 18.501us 1 18.501us 18.501us 18.501us cudaDeviceSynchronize + 0.00% 6.2050us 16 387ns 285ns 719ns cudaDeviceGetAttribute + 0.00% 3.4000us 4 850ns 590ns 1.2110us cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDP_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDP_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..7c0d0855 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDP_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,26 @@ +==30505== NVPROF is profiling process 30505, command: ./main test 1.0 1 +==30505== Profiling application: ./main test 1.0 1 +==30505== Profiling result: +Time(%) Time Calls Avg Min Max Name + 50.58% 115.54ms 10000 11.553us 1.7280us 50.209us calcSynapses + 21.49% 49.104ms 10000 4.9100us 4.0640us 6.1440us calcNeurons + 16.03% 36.625ms 17853 2.0510us 2.0160us 4.7360us [CUDA memcpy DtoH] + 11.86% 27.088ms 10000 2.7080us 2.5920us 11.392us learnSynapsesPost + 0.04% 93.633us 70 1.3370us 960ns 2.1440us [CUDA memcpy HtoD] + +==30505== API calls: +Time(%) Time Calls Avg Min Max Name + 35.14% 309.15ms 20095 15.384us 188ns 352.42us cudaMemcpy + 32.84% 288.94ms 20 14.447ms 7.6290us 287.79ms cudaHostAlloc + 29.91% 263.12ms 30000 8.7700us 7.6720us 331.70us cudaLaunch + 1.17% 10.291ms 30000 343ns 248ns 319.74us cudaConfigureCall + 0.84% 7.4251ms 30000 247ns 223ns 10.549us cudaSetupArgument + 0.06% 487.96us 20 24.398us 6.1080us 126.07us cudaMalloc + 0.03% 225.93us 83 2.7220us 138ns 97.475us cuDeviceGetAttribute + 0.00% 31.137us 1 31.137us 31.137us 31.137us cuDeviceTotalMem + 0.00% 27.695us 1 27.695us 27.695us 27.695us cuDeviceGetName + 0.00% 11.547us 20 577ns 375ns 2.1780us cudaGetSymbolAddress + 0.00% 11.033us 1 11.033us 11.033us 11.033us cudaSetDevice + 0.00% 1.4410us 2 720ns 488ns 953ns cuDeviceGetCount + 
0.00% 1.3060us 1 1.3060us 1.3060us 1.3060us cudaGetDeviceCount + 0.00% 575ns 2 287ns 226ns 349ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseHighRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseHighRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..1ecd96c1 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseHighRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,21 @@ +==29929== NVPROF is profiling process 29929, command: ./main +==29929== Profiling application: ./main +==29929== Profiling result: +Time(%) Time Calls Avg Min Max Name + 86.04% 284.29ms 10000 28.429us 27.328us 32.544us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*) + 8.93% 29.521ms 10000 2.9520us 2.8800us 4.4480us [CUDA memset] + 5.03% 16.619ms 10000 1.6610us 1.5360us 2.4000us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*) + +==29929== API calls: +Time(%) Time Calls Avg Min Max Name + 58.38% 206.98ms 20000 10.348us 8.5120us 8.2431ms cudaLaunch + 28.06% 99.491ms 10000 9.9490us 8.5150us 27.390us cudaMemset + 8.91% 31.590ms 170000 185ns 150ns 313.25us cudaSetupArgument + 1.79% 6.3337ms 20000 316ns 206ns 303.30us cudaConfigureCall + 1.73% 6.1183ms 20000 305ns 199ns 315.94us cudaGetLastError + 1.12% 3.9780ms 1 3.9780ms 3.9780ms 3.9780ms cudaDeviceSynchronize + 0.01% 46.286us 1 46.286us 46.286us 46.286us cudaMemGetInfo + 0.00% 8.3370us 3 2.7790us 2.1280us 3.2430us cudaFuncGetAttributes + 0.00% 5.4670us 3 1.8220us 649ns 2.4930us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 1.5130us 4 378ns 295ns 546ns cudaDeviceGetAttribute + 0.00% 820ns 1 820ns 820ns 820ns cudaGetDevice diff --git 
a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseHighRateSynapsesOnly_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseHighRateSynapsesOnly_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..0d0d08dd --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseHighRateSynapsesOnly_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,25 @@ +==30148== NVPROF is profiling process 30148, command: ./main test 1.0 1 +==30148== Profiling application: ./main test 1.0 1 +==30148== Profiling result: +Time(%) Time Calls Avg Min Max Name + 88.25% 301.73ms 10000 30.173us 3.3920us 32.704us calcSynapses + 11.72% 40.058ms 10000 4.0050us 3.8080us 4.8640us calcNeurons + 0.02% 61.280us 44 1.3920us 960ns 3.2000us [CUDA memcpy HtoD] + 0.01% 39.392us 14 2.8130us 1.9840us 6.8480us [CUDA memcpy DtoH] + +==30148== API calls: +Time(%) Time Calls Avg Min Max Name + 54.90% 442.78ms 12 36.898ms 14.006us 441.12ms cudaHostAlloc + 40.88% 329.68ms 20000 16.483us 7.7050us 338.70us cudaLaunch + 2.49% 20.082ms 61 329.22us 400ns 18.995ms cudaMemcpy + 0.94% 7.5995ms 20000 379ns 255ns 310.22us cudaConfigureCall + 0.67% 5.4120ms 20000 270ns 222ns 314.38us cudaSetupArgument + 0.08% 639.34us 12 53.278us 11.895us 172.21us cudaMalloc + 0.03% 235.92us 83 2.8420us 155ns 101.36us cuDeviceGetAttribute + 0.00% 32.471us 1 32.471us 32.471us 32.471us cuDeviceTotalMem + 0.00% 30.953us 1 30.953us 30.953us 30.953us cuDeviceGetName + 0.00% 14.056us 12 1.1710us 746ns 3.5320us cudaGetSymbolAddress + 0.00% 12.473us 1 12.473us 12.473us 12.473us cudaSetDevice + 0.00% 1.5390us 1 1.5390us 1.5390us 1.5390us cudaGetDeviceCount + 0.00% 1.4990us 2 749ns 424ns 1.0750us cuDeviceGetCount + 0.00% 514ns 2 257ns 199ns 315ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseLowRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseLowRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..d7d74aa0 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseLowRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,21 @@ +==8193== NVPROF is profiling process 8193, command: ./main +==8193== Profiling application: ./main +==8193== Profiling result: +Time(%) Time Calls Avg Min Max Name + 55.84% 593.43ms 100000 5.9340us 5.4400us 6.9120us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*) + 28.97% 307.88ms 100000 3.0780us 3.0400us 3.6800us [CUDA memset] + 15.19% 161.38ms 100000 1.6130us 1.5040us 2.5920us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*) + +==8193== API calls: +Time(%) Time Calls Avg Min Max Name + 55.92% 1.79370s 200000 8.9680us 7.6320us 7.2529ms cudaLaunch + 29.82% 956.72ms 100000 9.5670us 8.2580us 21.256ms cudaMemset + 10.51% 337.16ms 1700000 198ns 139ns 340.09us cudaSetupArgument + 1.91% 61.333ms 200000 306ns 217ns 368.29us cudaGetLastError + 1.83% 58.844ms 200000 294ns 168ns 332.73us cudaConfigureCall + 0.00% 45.848us 1 45.848us 45.848us 45.848us cudaMemGetInfo + 0.00% 12.992us 1 12.992us 12.992us 12.992us cudaDeviceSynchronize + 0.00% 8.6600us 3 2.8860us 2.0910us 3.5820us cudaFuncGetAttributes + 0.00% 5.3760us 3 1.7920us 594ns 2.4470us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 1.5830us 4 395ns 305ns 591ns cudaDeviceGetAttribute + 0.00% 829ns 1 829ns 829ns 829ns cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseLowRateSynapsesOnly_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseLowRateSynapsesOnly_GeNNConfigurationOptimized_1000.log new file mode 100644 index 
00000000..c1775029 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseLowRateSynapsesOnly_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,25 @@ +==8451== NVPROF is profiling process 8451, command: ./main test 10.0 1 +==8451== Profiling application: ./main test 10.0 1 +==8451== Profiling result: +Time(%) Time Calls Avg Min Max Name + 66.88% 550.62ms 100000 5.5060us 3.4560us 6.4000us calcSynapses + 33.11% 272.64ms 100000 2.7260us 2.6560us 3.7760us calcNeurons + 0.01% 53.984us 44 1.2260us 960ns 2.0800us [CUDA memcpy HtoD] + 0.00% 35.072us 14 2.5050us 1.9520us 4.7040us [CUDA memcpy DtoH] + +==8451== API calls: +Time(%) Time Calls Avg Min Max Name + 81.32% 1.60600s 200000 8.0290us 7.4920us 354.55us cudaLaunch + 12.69% 250.71ms 12 20.893ms 15.503us 249.06ms cudaHostAlloc + 3.37% 66.566ms 200000 332ns 257ns 334.65us cudaConfigureCall + 2.52% 49.683ms 200000 248ns 225ns 334.65us cudaSetupArgument + 0.05% 1.0155ms 61 16.647us 343ns 35.922us cudaMemcpy + 0.03% 641.50us 12 53.458us 12.040us 174.09us cudaMalloc + 0.01% 225.49us 83 2.7160us 135ns 97.180us cuDeviceGetAttribute + 0.00% 31.170us 1 31.170us 31.170us 31.170us cuDeviceTotalMem + 0.00% 26.897us 1 26.897us 26.897us 26.897us cuDeviceGetName + 0.00% 13.730us 12 1.1440us 698ns 3.1800us cudaGetSymbolAddress + 0.00% 11.132us 1 11.132us 11.132us 11.132us cudaSetDevice + 0.00% 1.3520us 2 676ns 376ns 976ns cuDeviceGetCount + 0.00% 1.3320us 1 1.3320us 1.3320us 1.3320us cudaGetDeviceCount + 0.00% 542ns 2 271ns 213ns 329ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..4d4e1ebe --- /dev/null +++ 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,21 @@ +==16276== NVPROF is profiling process 16276, command: ./main +==16276== Profiling application: ./main +==16276== Profiling result: +Time(%) Time Calls Avg Min Max Name + 55.93% 59.598ms 10000 5.9590us 5.6000us 6.8480us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*) + 28.96% 30.864ms 10000 3.0860us 3.0400us 3.5840us [CUDA memset] + 15.11% 16.106ms 10000 1.6100us 1.5040us 2.4000us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*) + +==16276== API calls: +Time(%) Time Calls Avg Min Max Name + 57.38% 194.03ms 20000 9.7010us 8.5280us 7.3801ms cudaLaunch + 27.54% 93.116ms 10000 9.3110us 8.6920us 28.380us cudaMemset + 10.82% 36.579ms 170000 215ns 184ns 349.92us cudaSetupArgument + 2.15% 7.2682ms 20000 363ns 248ns 327.47us cudaConfigureCall + 2.09% 7.0721ms 20000 353ns 266ns 337.12us cudaGetLastError + 0.01% 46.564us 1 46.564us 46.564us 46.564us cudaMemGetInfo + 0.01% 18.278us 1 18.278us 18.278us 18.278us cudaDeviceSynchronize + 0.00% 8.5460us 3 2.8480us 2.1440us 3.4910us cudaFuncGetAttributes + 0.00% 5.2380us 3 1.7460us 617ns 2.4330us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 1.7410us 4 435ns 339ns 632ns cudaDeviceGetAttribute + 0.00% 956ns 1 956ns 956ns 956ns cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..639f22a9 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,25 @@ +==16495== 
NVPROF is profiling process 16495, command: ./main test 1.0 1 +==16495== Profiling application: ./main test 1.0 1 +==16495== Profiling result: +Time(%) Time Calls Avg Min Max Name + 67.04% 60.321ms 10000 6.0320us 3.4560us 6.5280us calcSynapses + 32.86% 29.567ms 10000 2.9560us 2.9120us 3.7440us calcNeurons + 0.06% 54.017us 44 1.2270us 960ns 2.0480us [CUDA memcpy HtoD] + 0.04% 36.032us 14 2.5730us 2.0480us 4.7360us [CUDA memcpy DtoH] + +==16495== API calls: +Time(%) Time Calls Avg Min Max Name + 62.23% 290.68ms 12 24.223ms 7.8400us 289.60ms cudaHostAlloc + 35.13% 164.11ms 20000 8.2050us 7.5690us 348.13us cudaLaunch + 1.32% 6.1557ms 20000 307ns 255ns 328.87us cudaConfigureCall + 1.01% 4.7095ms 20000 235ns 202ns 341.44us cudaSetupArgument + 0.16% 750.68us 61 12.306us 358ns 28.177us cudaMemcpy + 0.09% 419.68us 12 34.973us 6.2030us 120.19us cudaMalloc + 0.05% 227.14us 83 2.7360us 145ns 97.726us cuDeviceGetAttribute + 0.01% 31.327us 1 31.327us 31.327us 31.327us cuDeviceTotalMem + 0.01% 26.548us 1 26.548us 26.548us 26.548us cuDeviceGetName + 0.00% 11.315us 1 11.315us 11.315us 11.315us cudaSetDevice + 0.00% 7.9470us 12 662ns 405ns 1.9600us cudaGetSymbolAddress + 0.00% 1.5460us 2 773ns 495ns 1.0510us cuDeviceGetCount + 0.00% 1.4000us 1 1.4000us 1.4000us 1.4000us cudaGetDeviceCount + 0.00% 578ns 2 289ns 223ns 355ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VerySparseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VerySparseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..782334df --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VerySparseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,21 @@ +==6005== NVPROF is profiling process 6005, command: ./main +==6005== Profiling application: ./main +==6005== Profiling result: +Time(%) Time Calls 
Avg Min Max Name + 55.29% 580.67ms 100000 5.8060us 5.2160us 6.6240us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*) + 29.34% 308.08ms 100000 3.0800us 3.0400us 3.7120us [CUDA memset] + 15.37% 161.45ms 100000 1.6140us 1.5040us 2.5920us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*) + +==6005== API calls: +Time(%) Time Calls Avg Min Max Name + 56.44% 1.83924s 200000 9.1960us 7.9810us 7.4326ms cudaLaunch + 29.07% 947.22ms 100000 9.4720us 8.1380us 21.897ms cudaMemset + 10.90% 355.11ms 1700000 208ns 171ns 355.90us cudaSetupArgument + 1.82% 59.307ms 200000 296ns 177ns 333.92us cudaConfigureCall + 1.77% 57.629ms 200000 288ns 202ns 337.07us cudaGetLastError + 0.00% 46.411us 1 46.411us 46.411us 46.411us cudaMemGetInfo + 0.00% 13.163us 1 13.163us 13.163us 13.163us cudaDeviceSynchronize + 0.00% 8.2890us 3 2.7630us 2.0680us 3.3230us cudaFuncGetAttributes + 0.00% 5.4810us 3 1.8270us 565ns 2.5590us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 1.5840us 4 396ns 318ns 545ns cudaDeviceGetAttribute + 0.00% 924ns 1 924ns 924ns 924ns cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VerySparseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VerySparseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..da3ef851 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VerySparseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,25 @@ +==6274== NVPROF is profiling process 6274, command: ./main test 10.0 1 +==6274== Profiling application: ./main test 10.0 1 +==6274== Profiling result: +Time(%) Time Calls Avg Min Max Name + 69.30% 617.28ms 100000 6.1720us 3.3600us 7.5200us calcSynapses + 30.70% 273.43ms 100000 2.7340us 2.6560us 
3.7440us calcNeurons + 0.01% 53.472us 44 1.2150us 960ns 2.0480us [CUDA memcpy HtoD] + 0.00% 34.560us 14 2.4680us 1.9520us 4.6080us [CUDA memcpy DtoH] + +==6274== API calls: +Time(%) Time Calls Avg Min Max Name + 82.48% 1.61117s 200000 8.0550us 7.0380us 353.83us cudaLaunch + 11.62% 226.99ms 12 18.916ms 7.8850us 225.88ms cudaHostAlloc + 3.30% 64.540ms 200000 322ns 238ns 338.74us cudaConfigureCall + 2.52% 49.132ms 200000 245ns 211ns 344.36us cudaSetupArgument + 0.04% 744.26us 61 12.200us 293ns 32.120us cudaMemcpy + 0.02% 421.09us 12 35.090us 6.1780us 119.69us cudaMalloc + 0.01% 226.88us 83 2.7330us 137ns 97.756us cuDeviceGetAttribute + 0.00% 31.259us 1 31.259us 31.259us 31.259us cuDeviceTotalMem + 0.00% 28.119us 1 28.119us 28.119us 28.119us cuDeviceGetName + 0.00% 11.457us 1 11.457us 11.457us 11.457us cudaSetDevice + 0.00% 8.0410us 12 670ns 397ns 1.9590us cudaGetSymbolAddress + 0.00% 1.6770us 2 838ns 479ns 1.1980us cuDeviceGetCount + 0.00% 1.4060us 1 1.4060us 1.4060us 1.4060us cudaGetDeviceCount + 0.00% 507ns 2 253ns 231ns 276ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VogelsWithSynapticDynamic_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VogelsWithSynapticDynamic_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..2d7eef06 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VogelsWithSynapticDynamic_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,27 @@ +==6312== NVPROF is profiling process 6312, command: ./main +==6312== Profiling application: ./main +==6312== Profiling result: +Time(%) Time Calls Avg Min Max Name + 27.18% 194.20ms 10000 19.419us 3.1680us 2.1194ms kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int*, int, int*, double, int*, int) + 22.86% 163.34ms 10000 16.333us 3.1040us 1.6753ms 
kernel_synapses_2_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int, double*, int, int*, double*, int) + 14.99% 107.12ms 10000 10.711us 3.2960us 1.1295ms kernel_synapses_2_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, int*, double, double*, int, double*, double*, int, int*, int, double*, int) + 14.22% 101.59ms 10000 10.158us 3.2960us 1.0383ms kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, int*, double*, double, int*, int) + 5.84% 41.697ms 10000 4.1690us 3.8720us 5.5360us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*) + 4.71% 33.655ms 10000 3.3650us 3.2320us 4.1280us kernel_synapses_2_stateupdater_codeobject(unsigned int, unsigned int, int*, double*, int, double*, int, double*) + 4.37% 31.213ms 10000 3.1210us 3.0400us 4.1920us [CUDA memset] + 3.37% 24.073ms 10000 2.4070us 2.0160us 5.7920us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 2.45% 17.497ms 10000 1.7490us 1.5360us 2.7840us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*) + +==6312== API calls: +Time(%) Time Calls Avg Min Max Name + 70.49% 724.20ms 80000 9.0520us 7.8180us 7.3109ms cudaLaunch + 16.18% 166.25ms 940000 176ns 148ns 532.24us cudaSetupArgument + 9.28% 95.356ms 10000 9.5350us 8.8100us 1.1346ms cudaMemset + 2.07% 21.258ms 80000 265ns 188ns 322.95us cudaConfigureCall + 1.97% 20.198ms 80000 252ns 221ns 60.788us cudaGetLastError + 0.00% 51.002us 1 51.002us 51.002us 51.002us cudaMemGetInfo + 0.00% 42.841us 1 42.841us 42.841us 42.841us cudaDeviceSynchronize + 0.00% 41.487us 74 560ns 469ns 2.5840us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 31.858us 12 2.6540us 1.9920us 4.7290us cudaFuncGetAttributes 
+ 0.00% 6.5530us 16 409ns 280ns 1.1330us cudaDeviceGetAttribute + 0.00% 3.9370us 4 984ns 604ns 1.7060us cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_Vogels_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_Vogels_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..6f6703a2 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_Vogels_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,26 @@ +==12243== NVPROF is profiling process 12243, command: ./main +==12243== Profiling application: ./main +==12243== Profiling result: +Time(%) Time Calls Avg Min Max Name + 27.91% 192.82ms 10000 19.281us 3.1360us 2.1170ms kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int*, int, int*, double, int*, int) + 25.45% 175.79ms 10000 17.578us 3.3280us 1.7610ms kernel_synapses_2_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*) + 15.82% 109.25ms 10000 10.925us 3.3600us 1.1837ms kernel_synapses_2_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, int*, int, double, double*, int, double*, int*) + 14.27% 98.554ms 10000 9.8550us 3.1680us 1.0373ms kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, int*, double*, double, int*, int) + 5.95% 41.110ms 10000 4.1110us 3.7760us 5.3120us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*) + 4.53% 31.297ms 10000 3.1290us 2.9440us 4.3200us [CUDA memset] + 3.54% 24.435ms 10000 2.4430us 2.0160us 6.0160us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, 
double*, bool*) + 2.53% 17.499ms 10000 1.7490us 1.5360us 2.8160us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*) + +==12243== API calls: +Time(%) Time Calls Avg Min Max Name + 69.99% 645.08ms 70000 9.2150us 8.1890us 7.3493ms cudaLaunch + 16.20% 149.30ms 860000 173ns 144ns 1.1943ms cudaSetupArgument + 10.32% 95.084ms 10000 9.5080us 8.7600us 327.83us cudaMemset + 1.76% 16.177ms 70000 231ns 200ns 10.120us cudaGetLastError + 1.72% 15.875ms 70000 226ns 181ns 5.3450us cudaConfigureCall + 0.01% 51.450us 1 51.450us 51.450us 51.450us cudaMemGetInfo + 0.00% 25.843us 10 2.5840us 2.0060us 4.6820us cudaFuncGetAttributes + 0.00% 25.773us 41 628ns 481ns 2.9340us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 17.259us 1 17.259us 17.259us 17.259us cudaDeviceSynchronize + 0.00% 5.8620us 12 488ns 313ns 1.3830us cudaDeviceGetAttribute + 0.00% 3.0770us 3 1.0250us 630ns 1.5860us cudaGetDevice diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_Vogels_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_Vogels_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..c45cf1a3 --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_Vogels_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,26 @@ +==12518== NVPROF is profiling process 12518, command: ./main test 1.0 1 +==12518== Profiling application: ./main test 1.0 1 +==12518== Profiling result: +Time(%) Time Calls Avg Min Max Name + 59.61% 415.51ms 10000 41.550us 2.0480us 6.0015ms learnSynapsesPost + 29.39% 204.87ms 10000 20.486us 1.5680us 2.4941ms calcSynapses + 10.93% 76.180ms 10000 7.6170us 6.6240us 14.560us calcNeurons + 0.06% 385.28us 86 4.4800us 960ns 42.752us [CUDA memcpy HtoD] + 0.02% 130.11us 20 6.5050us 1.9840us 40.641us [CUDA memcpy DtoH] + +==12518== API calls: +Time(%) Time Calls Avg Min Max Name + 66.01% 690.75ms 30000 23.025us 7.6920us 
649.80us cudaLaunch + 29.49% 308.57ms 26 11.868ms 7.6940us 306.48ms cudaHostAlloc + 2.65% 27.715ms 112 247.46us 184ns 25.977ms cudaMemcpy + 0.97% 10.186ms 30000 339ns 250ns 318.13us cudaConfigureCall + 0.77% 8.0652ms 30000 268ns 222ns 319.03us cudaSetupArgument + 0.07% 763.51us 26 29.365us 6.1460us 121.30us cudaMalloc + 0.02% 226.59us 83 2.7300us 136ns 97.714us cuDeviceGetAttribute + 0.00% 31.319us 1 31.319us 31.319us 31.319us cuDeviceTotalMem + 0.00% 28.107us 1 28.107us 28.107us 28.107us cuDeviceGetName + 0.00% 15.639us 26 601ns 388ns 2.0380us cudaGetSymbolAddress + 0.00% 11.574us 1 11.574us 11.574us 11.574us cudaSetDevice + 0.00% 1.7010us 2 850ns 538ns 1.1630us cuDeviceGetCount + 0.00% 1.5690us 1 1.5690us 1.5690us 1.5690us cudaGetDeviceCount + 0.00% 540ns 2 270ns 227ns 313ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_absolute.png new file mode 100644 index 00000000..e1ba28f3 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_profiling.png new file mode 100644 index 00000000..d19c440d Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_relative.png new file mode 100644 index 00000000..c270386e Binary files /dev/null and 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png new file mode 100644 index 00000000..84a46b24 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png new file mode 100644 index 00000000..bd726111 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png new file mode 100644 index 00000000..ffea3406 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_absolute.png new file mode 100644 index 00000000..58e7259f Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_absolute.png differ diff --git 
a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_profiling.png new file mode 100644 index 00000000..399afa8c Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_relative.png new file mode 100644 index 00000000..919caf69 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_absolute.png new file mode 100644 index 00000000..0411270d Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_profiling.png new file mode 100644 index 00000000..6cc9c92e Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_relative.png 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_relative.png new file mode 100644 index 00000000..7a5d4fc2 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_absolute.png new file mode 100644 index 00000000..e4edc8b6 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_profiling.png new file mode 100644 index 00000000..44159113 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_relative.png new file mode 100644 index 00000000..7257cc46 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_absolute.png new file mode 100644 index 00000000..a69e0cdb Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_absolute.png differ diff --git 
a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_profiling.png new file mode 100644 index 00000000..f44f9932 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_relative.png new file mode 100644 index 00000000..3b17d55f Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_absolute.png new file mode 100644 index 00000000..c18c0899 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_profiling.png new file mode 100644 index 00000000..aa7b06d1 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_relative.png new file mode 100644 index 00000000..9a8341ed Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_relative.png differ diff --git 
a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_absolute.png new file mode 100644 index 00000000..2a605e63 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_profiling.png new file mode 100644 index 00000000..d7b98866 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_relative.png new file mode 100644 index 00000000..7fd8860d Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_absolute.png new file mode 100644 index 00000000..ee874308 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_profiling.png new file mode 100644 index 00000000..07ce992e Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_profiling.png differ diff --git 
a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_relative.png new file mode 100644 index 00000000..a844b061 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_absolute.png new file mode 100644 index 00000000..5453d4bd Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_profiling.png new file mode 100644 index 00000000..56192923 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_relative.png new file mode 100644 index 00000000..98ee6788 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_absolute.png new file mode 100644 index 00000000..5d629b4b Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_absolute.png 
differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_profiling.png new file mode 100644 index 00000000..e26ef41a Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_relative.png new file mode 100644 index 00000000..255f8cdf Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_absolute.png new file mode 100644 index 00000000..0599d925 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_profiling.png new file mode 100644 index 00000000..8a2f88e0 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_relative.png new file mode 100644 index 00000000..29696eb2 Binary files /dev/null and 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_absolute.png new file mode 100644 index 00000000..66bf3e36 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_profiling.png new file mode 100644 index 00000000..03d00681 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_relative.png new file mode 100644 index 00000000..11ef77a4 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_absolute.png new file mode 100644 index 00000000..65f2d9d4 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_profiling.png 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_profiling.png new file mode 100644 index 00000000..15f230be Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_relative.png new file mode 100644 index 00000000..a709f40e Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_absolute.png new file mode 100644 index 00000000..4507a51e Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_profiling.png new file mode 100644 index 00000000..e9b440b3 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_relative.png new file mode 100644 index 00000000..23ce5dc6 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_relative.png differ diff --git 
a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_absolute.png new file mode 100644 index 00000000..56bd1ec9 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_profiling.png new file mode 100644 index 00000000..e1f9b29e Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_relative.png new file mode 100644 index 00000000..a114006a Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_absolute.png new file mode 100644 index 00000000..9a6e8891 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_profiling.png new file mode 100644 index 00000000..54ddab08 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_profiling.png differ diff --git 
a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_relative.png new file mode 100644 index 00000000..ceac1522 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_absolute.png new file mode 100644 index 00000000..3e2bd173 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_profiling.png new file mode 100644 index 00000000..4494d0a3 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_relative.png new file mode 100644 index 00000000..85ddf107 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_absolute.png new file mode 100644 index 00000000..5cb5a1a8 Binary files /dev/null and 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_profiling.png new file mode 100644 index 00000000..5562caa9 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_relative.png new file mode 100644 index 00000000..70d71c34 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_absolute.png new file mode 100644 index 00000000..bc14e017 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_profiling.png new file mode 100644 index 00000000..574e7d10 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_relative.png 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_relative.png new file mode 100644 index 00000000..954b7f19 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_absolute.png new file mode 100644 index 00000000..85709744 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_profiling.png new file mode 100644 index 00000000..76f619d0 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_relative.png new file mode 100644 index 00000000..17a234eb Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_absolute.png new file mode 100644 index 00000000..207758e2 Binary files /dev/null and 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_profiling.png new file mode 100644 index 00000000..199ec4f8 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_relative.png new file mode 100644 index 00000000..378cdc89 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_absolute.png new file mode 100644 index 00000000..8f6a8e3c Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_absolute.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_profiling.png new file mode 100644 index 00000000..c9d9e666 Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_profiling.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_relative.png new file mode 100644 index 00000000..48573e36 Binary files /dev/null and 
b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_relative.png differ diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/run_speed_test_script.py b/dev/benchmarks/results_2017_04_05_complete_after_talk/run_speed_test_script.py new file mode 100644 index 00000000..865118ae --- /dev/null +++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/run_speed_test_script.py @@ -0,0 +1,249 @@ +import os +import shutil +import glob +import subprocess +import sys + +# run tests without X-server +import matplotlib +matplotlib.use('Agg') + +# pretty plots +import seaborn + +import time +import datetime +import cPickle as pickle + +from brian2 import * +from brian2.tests.features import * +from brian2.tests.features.base import * +from brian2.tests.features.base import results + +import brian2cuda +from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfiguration, + CUDAStandaloneConfigurationNoAssert, + CUDAStandaloneConfigurationCurandDouble, + CUDAStandaloneConfigurationNoCudaOccupancyAPI, + CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, + CUDAStandaloneConfiguration2BlocksPerSM, + CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, + CUDAStandaloneConfigurationSynLaunchBounds, + CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, + CUDAStandaloneConfigurationProfileGPU, + CUDAStandaloneConfigurationProfileCPU) +from brian2cuda.tests.features.speed import * + +from brian2genn.correctness_testing import GeNNConfiguration, GeNNConfigurationCPU, GeNNConfigurationOptimized + +from create_readme import create_readme + +assert len(sys.argv)<= 2, 'Only one command line argument supported! 
Got {}'.format(len(sys.argv)-1) +if len(sys.argv) == 2: + additional_dir_name = '_' + sys.argv[1] +else: + additional_dir_name = '' + +prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j12'] + +configs = [# configuration project_directory + #(NumpyConfiguration, None), + #(WeaveConfiguration, None), + #(LocalConfiguration, None), + (CUDAStandaloneConfiguration, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoAssert, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoThreadfence, 'cuda_standalone'), + #(CUDAStandaloneConfigurationCurandDouble, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoCudaOccupancyAPI, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfigurationSynLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfigurationProfileGPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationProfileCPU, 'cuda_standalone'), + (CPPStandaloneConfiguration, 'cpp_standalone'), + #(GeNNConfiguration, 'GeNNworkspace'), + #(CPPStandaloneConfigurationOpenMP, 'cpp_standalone'), + #(GeNNConfigurationCPU, 'GeNNworkspace'), + (GeNNConfigurationOptimized, 'GeNNworkspace') + ] + +speed_tests = [# feature_test name n_slice + + #(ThresholderOnlyPoissonLowRate, 'ThresholderOnlyPoissonLowRate', slice(None) ), + #(ThresholderOnlyPoissonMediumRate, 'ThresholderOnlyPoissonMediumRate', slice(None) ), + #(ThresholderOnlyPoissonHighRate, 'ThresholderOnlyPoissonHighRate', slice(None) ), + #(ThresholderOnlyAlwaysSpiking, 'ThresholderOnlyAlwaysSpiking', slice(None) ), + + #(BrunelHakimStateupdateOnlyDouble, 'BrunelHakimStateupdateOnlyDouble', slice(None) ), + #(BrunelHakimStateupdateOnlyTriple, 'BrunelHakimStateupdateOnlyTriple', slice(None) ), + #(BrunelHakimStateupdateOnly, 
'BrunelHakimStateupdateOnly', slice(None) ), + #(BrunelHakimNeuronsOnly, 'BrunelHakimNeuronsOnly', slice(None) ), + #(BrunelHakimNeuronsOnlyNoXi, 'BrunelHakimNeuronsOnlyNoXi', slice(None) ), + #(BrunelHakimNeuronsOnlyNoRand, 'BrunelHakimNeuronsOnlyNoRand', slice(None) ), + #(BrunelHakimStateupdateThresholdOnly, 'BrunelHakimStateupdateThresholdOnly', slice(None) ), + #(BrunelHakimStateupdateThresholdResetOnly, 'BrunelHakimStateupdateThresholdResetOnly', slice(None) ), + #(BrunelHakimModelScalarDelayShort, 'BrunelHakimModelScalarDelayShort', slice(None) ), + (CUBA, 'CUBA', slice(None) ), + (COBAHH, 'COBAHH', slice(None) ), + (AdaptationOscillation, 'AdaptationOscillation', slice(None) ), + (Vogels, 'Vogels', slice(None) ), + (STDP, 'STDP', slice(None) ), + (STDPEventDriven, 'STDPEventDriven', slice(None) ), + (BrunelHakimModelScalarDelay, 'BrunelHakimModelScalarDelay', slice(None) ), + (BrunelHakimModelScalarDelayNoMultiPrePost, 'BrunelHakimModelScalarDelayNoMultiPrePost', slice(None) ), + + (VerySparseMediumRateSynapsesOnly, 'VerySparseMediumRateSynapsesOnly', slice(None) ), + (SparseMediumRateSynapsesOnly, 'SparseMediumRateSynapsesOnly', slice(None) ), + (DenseMediumRateSynapsesOnly, 'DenseMediumRateSynapsesOnly', slice(None) ), + (SparseLowRateSynapsesOnly, 'SparseLowRateSynapsesOnly', slice(None) ), + (SparseHighRateSynapsesOnly, 'SparseHighRateSynapsesOnly', slice(None) ), + + (STDPNotEventDriven, 'STDPNotEventDriven', slice(None) ), + (STDPMultiPost, 'STDPMultiPost', slice(None) ), + (STDPNeuronalTraces, 'STDPNeuronalTraces', slice(None) ), + (STDPMultiPostNeuronalTraces, 'STDPMultiPostNeuronalTraces', slice(None) ), + + (BrunelHakimModelHeterogeneousDelay, 'BrunelHakimModelHeterogeneousDelay', slice(None) ), + + (LinearNeuronsOnly, 'LinearNeuronsOnly', slice(None) ), + (HHNeuronsOnly, 'HHNeuronsOnly', slice(None) ), + (VogelsWithSynapticDynamic, 'VogelsWithSynapticDynamic', slice(None) ), +## below uses monitors + (CUBAFixedConnectivity, 
'CUBAFixedConnectivity', slice(None) ), + (COBAHHFixedConnectivity, 'COBAHHFixedConnectivity', slice(None, -1) ), +] + +configurations = [config[0] for config in configs] +project_dirs = [config[1] for config in configs] + +# check if multiple Configurations with same project_dirs are specified +last_idx = {} +for proj_dir in project_dirs: + if proj_dir is not None: + first_i = project_dirs.index(proj_dir) + last_i = len(project_dirs) - 1 - project_dirs[::-1].index(proj_dir) + if first_i != last_i: + print("WARNING there are multiple configurations using {d} as project " + "directory. Profiling and logfiles will only be saved for the last one {c}.".format( + d=proj_dir, c=configurations[last_i].__name__)) + last_idx[proj_dir] = last_i + +time_stemp = time.time() +date_str = datetime.datetime.fromtimestamp(time_stemp).strftime('%Y_%m_%d') + +directory = 'results_{}{}'.format(date_str, additional_dir_name) +if os.path.exists(directory): + new_dir = directory + '_bak_' + str(int(time.time())) + print("Directory with name `{}` already exists. 
Renaming it to `{}`.".format(directory, new_dir)) + os.rename(directory, new_dir) +os.makedirs(directory) +data_dir = os.path.join(directory, 'data') +plot_dir = os.path.join(directory, 'plots') +log_dir = os.path.join(directory, 'logs') +prof_dir = os.path.join(directory, 'nvprof') +os.makedirs(data_dir) +os.makedirs(plot_dir) +os.makedirs(log_dir) +os.makedirs(prof_dir) +print("Saving results in {}.".format(plot_dir)) + +shutil.copy(os.path.realpath(__file__), os.path.join(directory, 'run_speed_test_script.py')) + +time_format = '%d.%m.%Y at %H:%M:%S' +script_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + +with open(os.path.join(directory, 'git.diff'), 'w') as diff_file: + subprocess.call(['git', 'diff'], stdout=diff_file) + +try: + for n, (st, name, sl) in enumerate(speed_tests): + start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + print("Starting {} on {}.".format(name, start)) + maximum_run_time = 1*60*60*second + #st.duration = 10*second + res = run_speed_tests(configurations=configurations, + speed_tests=[st], + n_slice=sl, + #n_slice=slice(0,1,None), + run_twice=False, + verbose=True, + maximum_run_time=maximum_run_time, + profile_only_active=True) + #profile_only_active=False) + end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + diff = datetime.datetime.strptime(end, time_format) - datetime.datetime.strptime(start, time_format) + print("Running {} took {}.".format(name, diff)) + res.plot_all_tests(print_relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.png'.format(speed_tests[n][1]))) + res.plot_all_tests(relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name))) + res.plot_all_tests(profiling_minimum=0.05) + savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name))) + if 3 != len(get_fignums()): + print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1))) + for n 
in get_fignums(): + close(n) + + # pickel results object to disk + pkl_file = os.path.join(data_dir, name + '.pkl' ) + with open(pkl_file, 'wb') as output: + pickle.dump(res, output, pickle.HIGHEST_PROTOCOL) + + # save stdout log of last run (the other are deleted in run_speed_tests()) + for proj_dir in set(project_dirs): + if not proj_dir is None and proj_dir in ['cuda_standalone', 'cpp_standalone']: + config = configurations[last_idx[proj_dir]] + stdout_file = os.path.join(proj_dir, 'results/stdout.txt') + if os.path.exists(stdout_file): + shutil.copy(stdout_file, + os.path.join(log_dir, 'stdout_{st}_{conf}_{n}.txt'.format(st=name, conf=proj_dir, + n=st.n_range[sl][-1]))) + else: + print("WARNING Couldn't save {},file not found.".format(stdout_file)) + + # run nvprof on n_range[2] + for conf, proj_dir in zip(configurations, project_dirs): + main_arg = '' + if proj_dir in ['cuda_standalone', 'GeNNworkspace']: + if proj_dir == 'GeNNworkspace': + main_arg = 'test {time} 1'.format(time=st.duration/second) + ns = st.n_range[sl] + idx = 2 + max_runtime = 20 + conf_name = conf.__name__ + print("Rerunning {} with n = {} for nvprof profiling".format(conf_name, st.n_range[idx])) + tb, res, runtime, prof_info = results(conf, st, st.n_range[idx], maximum_run_time=maximum_run_time) + if not isinstance(res, Exception) and runtime < max_runtime: + option = '--profile-from-start-off' if proj_dir == 'cuda_standalone' else '' + cmd = 'cd {proj_dir} && nvprof {opt} --log-file ../{log_file} ./main {arg}'.format( + proj_dir=proj_dir, arg=main_arg, opt=option, + log_file=os.path.join(prof_dir, 'nvprof_{st}_{conf}_{n}.log'.format( + st=name, conf=conf_name, n=st.n_range[idx]))) + prof_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + print(cmd) + x = os.system(cmd) + if x: + print('nvprof failed with {}'.format(x)) + prof_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + prof_diff = datetime.datetime.strptime(prof_end, time_format) - 
datetime.datetime.strptime(prof_start, time_format) + print("Profiling took {} for runtime of {}".format(prof_diff, runtime)) +finally: + create_readme(directory) + print("\nSummarized speed test results in {}".format(directory + '/README.md')) + script_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + script_diff = datetime.datetime.strptime(script_end, time_format) - datetime.datetime.strptime(script_start, time_format) + print("Finished speed test on {}. Total time = {}.".format( + datetime.datetime.fromtimestamp(time.time()).strftime(time_format), script_diff)) + + +##res.plot_all_tests(relative=True) +#for n in get_fignums(): +# plt.figure(n) +# savefig(plot_dir + '/speed_test_{}.png'.format(speed_tests[n-1][1])) + +## Debug (includes profiling infos) +#from brian2.tests.features.base import results +#for x in results(LocalConfiguration, LinearNeuronsOnly, 10, maximum_run_time=10*second): +# print x diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/README.md b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/README.md new file mode 100644 index 00000000..258f1cd7 --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/README.md @@ -0,0 +1,7 @@ +[cuda_cpp_comparison_for_heterogenous_delay_mode](cuda_cpp_comparison_for_heterogenous_delay_mode) + +[cuda_atomics_original_and_atomics_effects_profiled](cuda_atomics_original_and_atomics_effects_profiled) + +[cuda_atomics_in_heterogenous_delay_mode](cuda_atomics_in_heterogenous_delay_mode) + +[cuda_atomics_effects_and_queue_resize_profiled](cuda_atomics_effects_and_queue_resize_profiled) diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/README.md 
b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/README.md new file mode 100644 index 00000000..86d6251e --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/README.md @@ -0,0 +1,98 @@ + +# Benchmark results from 28.11.2017 +## Description: + + + +## Last git log: +``` +commit 8987de24ed9f4a3b1a276496407fca1087f04004 +Author: Denis Alevi +Date: Mon Nov 20 14:31:09 2017 +0100 + + Fix critical section to include the actual pushing + +``` +There is also a `git diff` saved in the current directory. + +## Results + +### BrunelHakimModelHeterogeneousDelay +![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.svg) +![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.svg) +![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.svg) + +
Exemplary `nvprof` results for **CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU**

+Profile summary for `N = 1000`: + +``` +==24531== NVPROF is profiling process 24531, command: ./main +==24531== Profiling application: ./main +==24531== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 37.91% 132.38ms 2521 52.511us 14.048us 1.0672ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 18.34% 64.052ms 10000 6.4050us 3.5520us 8.3520us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 13.11% 45.786ms 10000 4.5780us 4.3840us 5.6320us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 7.89% 27.566ms 10000 2.7560us 2.7200us 4.1280us _run_synapses_pre_push_spikes_advance_kernel(void) + 6.60% 23.060ms 10000 2.3050us 2.0800us 2.8480us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 5.89% 20.553ms 10000 2.0550us 2.0160us 4.1920us [CUDA memcpy DtoH] + 5.25% 18.329ms 10000 1.8320us 1.6640us 2.1760us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 4.80% 16.747ms 10000 1.6740us 1.6000us 2.2080us _GLOBAL__N__69_tmpxft_00005e15_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.21% 731.84us 1 731.84us 731.84us 731.84us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 46.45% 719.26ms 62522 11.504us 9.5190us 8.6020ms cudaLaunch + 35.15% 544.39ms 60001 9.0720us 2.4110us 1.0720ms cudaDeviceSynchronize + 12.81% 198.43ms 10000 19.842us 18.034us 330.92us cudaMemcpy + 3.57% 55.321ms 350089 158ns 123ns 330.68us cudaSetupArgument + 1.15% 17.835ms 62522 285ns 182ns 10.032us cudaConfigureCall + 0.83% 12.881ms 52523 245ns 209ns 9.8600us cudaGetLastError + 0.02% 250.79us 1 250.79us 
250.79us 250.79us cudaMalloc + 0.01% 147.52us 1 147.52us 147.52us 147.52us cudaMemGetInfo + 0.00% 28.259us 8 3.5320us 2.7680us 5.4040us cudaFuncGetAttributes + 0.00% 26.485us 39 679ns 562ns 1.7750us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 6.3090us 12 525ns 358ns 1.3730us cudaDeviceGetAttribute + 0.00% 2.9100us 3 970ns 717ns 1.4180us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU**

+Profile summary for `N = 1000`: + +``` +==23837== NVPROF is profiling process 23837, command: ./main +==23837== Profiling application: ./main +==23837== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 44.49% 157.31ms 10000 15.731us 1.8560us 1.1459ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 17.67% 62.479ms 10000 6.2470us 3.4240us 7.9360us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 12.96% 45.814ms 10000 4.5810us 4.3530us 5.4080us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 7.81% 27.614ms 10000 2.7610us 2.7200us 4.1920us _run_synapses_pre_push_spikes_advance_kernel(void) + 6.48% 22.902ms 10000 2.2900us 2.0160us 2.8170us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 5.57% 19.698ms 10000 1.9690us 1.6960us 2.2080us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 4.81% 17.002ms 10000 1.7000us 1.6320us 2.2400us _GLOBAL__N__69_tmpxft_00005b5a_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.21% 731.94us 1 731.94us 731.94us 731.94us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 54.28% 776.23ms 70001 11.088us 9.1570us 9.5432ms cudaLaunch + 39.57% 565.83ms 60001 9.4300us 2.4970us 1.1523ms cudaDeviceSynchronize + 3.99% 57.063ms 380005 150ns 121ns 325.75us cudaSetupArgument + 1.16% 16.531ms 70001 236ns 172ns 25.540us cudaConfigureCall + 0.96% 13.788ms 60002 229ns 191ns 12.473us cudaGetLastError + 0.02% 304.60us 1 304.60us 304.60us 304.60us cudaMalloc + 0.01% 168.13us 1 168.13us 168.13us 168.13us cudaMemGetInfo + 0.00% 31.295us 39 802ns 568ns 
4.4480us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 29.348us 8 3.6680us 2.8260us 5.7480us cudaFuncGetAttributes + 0.00% 6.1630us 12 513ns 356ns 1.2920us cudaDeviceGetAttribute + 0.00% 3.1870us 3 1.0620us 733ns 1.7050us cudaGetDevice + +``` + +

+ + +*** + +### BrunelHakimModelHeterogeneousDelay - display less kernels in profiling +![](plots/speed_test_min_15_BrunelHakimModelHeterogeneousDelay_profiling.svg) + + diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/data/BrunelHakimModelHeterogeneousDelay.pkl b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/data/BrunelHakimModelHeterogeneousDelay.pkl new file mode 100644 index 00000000..207f984f Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/data/BrunelHakimModelHeterogeneousDelay.pkl differ diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/git.diff b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/git.diff new file mode 100644 index 00000000..44a84fa2 --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/git.diff @@ -0,0 +1,305 @@ +diff --git a/brian2cuda/tests/features/cuda_configuration.py b/brian2cuda/tests/features/cuda_configuration.py +index 250687d..622f73a 100644 +--- a/brian2cuda/tests/features/cuda_configuration.py ++++ b/brian2cuda/tests/features/cuda_configuration.py +@@ -225,7 +225,7 @@ class CUDAStandaloneConfigurationProfileCPU(Configuration): + with_output=False) + + class CUDAStandaloneConfigurationTestBrunelHeteroAtomics(Configuration): +- name = 'CUDA standalone with atomics in heterog delay mode' ++ name = 'CUDA standalone with atomics in effect application' + def before_run(self): + brian2.set_device('cuda_standalone', build_on_run=False) + 
prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True +@@ -248,7 +248,7 @@ class CUDAStandaloneConfigurationTestBrunelHeteroAtomics(Configuration): + with_output=False) + + class CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU(Configuration): +- name = "CUDA standalone with atomics in heterog delay mode (profile='blocking')" ++ name = "CUDA standalone with atomics in effect application (profile='blocking')" + def before_run(self): + brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking') + prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True +@@ -270,12 +270,10 @@ class CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU(Configuration + brian2.device.build(directory='cuda_standalone', compile=True, run=True, + with_output=False) + +- + class CUDAStandaloneConfigurationPushAtomicResize(Configuration): +- name = "CUDA standalone with atomics in queue resize" ++ name = "CUDA standalone with atomics in spikequeue resize" + def before_run(self): + brian2.set_device('cuda_standalone', build_on_run=False) +- prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True + prefs["devices.cuda_standalone.push_atomic_resize"] = True + if socket.gethostname() == 'elnath': + if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']: +@@ -295,8 +293,82 @@ class CUDAStandaloneConfigurationPushAtomicResize(Configuration): + brian2.device.build(directory='cuda_standalone', compile=True, run=True, + with_output=False) + ++ + class CUDAStandaloneConfigurationPushAtomicResizeProfileCPU(Configuration): +- name = "CUDA standalone with atomics in queue resize (profile='blocking')" ++ name = "CUDA standalone with atomics in spikequeue resize (profile='blocking')" ++ def before_run(self): ++ brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking') ++ prefs["devices.cuda_standalone.push_atomic_resize"] = True ++ if socket.gethostname() == 'elnath': ++ if 
prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']: ++ prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) ++ elif socket.gethostname() == 'sabik': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52']) ++ elif socket.gethostname() == 'eltanin': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61']) ++ ++ def after_run(self): ++ if os.path.exists('cuda_standalone'): ++ shutil.rmtree('cuda_standalone') ++ brian2.device.build(directory='cuda_standalone', compile=True, run=True, ++ with_output=False) ++ ++ ++class CUDAStandaloneConfigurationPushAtomicResizProfileCPU(Configuration): ++ name = "CUDA standalone with atomics in spikequeue resize (profile='blocking)" ++ def before_run(self): ++ brian2.set_device('cuda_standalone', build_on_run=False, ++ profile='blocking') ++ prefs["devices.cuda_standalone.push_atomic_resize"] = True ++ if socket.gethostname() == 'elnath': ++ if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']: ++ prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) ++ elif socket.gethostname() == 'sabik': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52']) ++ elif socket.gethostname() == 'eltanin': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61']) ++ ++ def after_run(self): ++ if os.path.exists('cuda_standalone'): ++ shutil.rmtree('cuda_standalone') ++ 
brian2.device.build(directory='cuda_standalone', compile=True, run=True, ++ with_output=False) ++ ++ ++class CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize(Configuration): ++ name = "CUDA standalone with atomics in effect application and in spikequeue resize" ++ def before_run(self): ++ brian2.set_device('cuda_standalone', build_on_run=False) ++ prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True ++ prefs["devices.cuda_standalone.push_atomic_resize"] = True ++ if socket.gethostname() == 'elnath': ++ if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']: ++ prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) ++ elif socket.gethostname() == 'sabik': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52']) ++ elif socket.gethostname() == 'eltanin': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61']) ++ ++ def after_run(self): ++ if os.path.exists('cuda_standalone'): ++ shutil.rmtree('cuda_standalone') ++ brian2.device.build(directory='cuda_standalone', compile=True, run=True, ++ with_output=False) ++ ++class CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU(Configuration): ++ name = "CUDA standalone with atomics in effect application and in spikequeue resize (profile='blocking')" + def before_run(self): + brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking') + prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True +diff --git a/dev/benchmarks/run_speed_tests.py b/dev/benchmarks/run_speed_tests.py +index 2518634..b525e97 100644 +--- a/dev/benchmarks/run_speed_tests.py ++++ b/dev/benchmarks/run_speed_tests.py +@@ -37,6 +37,7 @@ from 
brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfigur + CUDAStandaloneConfigurationTestBrunelHeteroAtomics, + CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, + CUDAStandaloneConfigurationPushAtomicResize, ++ CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, + CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, + CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, + CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU) +@@ -61,79 +62,80 @@ if socket.gethostname() == 'elnath': + prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) + + configs = [# configuration project_directory +- (NumpyConfiguration, None), +- (WeaveConfiguration, None), +- (LocalConfiguration, None), ++ #(NumpyConfiguration, None), ++ #(WeaveConfiguration, None), ++ #(LocalConfiguration, None), ++ #(CPPStandaloneConfiguration, 'cpp_standalone'), ++ #(CPPStandaloneConfigurationOpenMP, 'cpp_standalone'), + (CUDAStandaloneConfiguration, 'cuda_standalone'), +- (CUDAStandaloneConfigurationExtraThresholdKernel, 'cuda_standalone'), +- (CUDAStandaloneConfigurationNoAssert, 'cuda_standalone'), +- (CUDAStandaloneConfigurationCurandDouble, 'cuda_standalone'), +- (CUDAStandaloneConfigurationNoCudaOccupancyAPI, 'cuda_standalone'), +- (CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, 'cuda_standalone'), +- (CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'), +- (CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'), +- (CUDAStandaloneConfigurationSynLaunchBounds, 'cuda_standalone'), +- (CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'), +- (CUDAStandaloneConfigurationProfileGPU, 'cuda_standalone'), +- (CUDAStandaloneConfigurationProfileCPU, 'cuda_standalone'), +- (CPPStandaloneConfiguration, 'cpp_standalone'), +- (CUDAStandaloneConfigurationTestBrunelHeteroAtomics, 'cuda_standalone'), +- (CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, 'cuda_standalone'), + 
(CUDAStandaloneConfigurationPushAtomicResize, 'cuda_standalone'), +- (CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, 'cuda_standalone'), +- (CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, 'cuda_standalone'), +- (CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU, 'cuda_standalone'), +- (GeNNConfiguration, 'GeNNworkspace'), +- (CPPStandaloneConfigurationOpenMP, 'cpp_standalone'), +- (GeNNConfigurationCPU, 'GeNNworkspace'), +- (GeNNConfigurationOptimized, 'GeNNworkspace') ++ (CUDAStandaloneConfigurationTestBrunelHeteroAtomics, 'cuda_standalone'), ++ (CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationExtraThresholdKernel, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationNoAssert, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationCurandDouble, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationNoCudaOccupancyAPI, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'), ++ #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationSynLaunchBounds, 'cuda_standalone'), ++ #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationProfileGPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationProfileCPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU, 'cuda_standalone'), ++ #(GeNNConfiguration, 'GeNNworkspace'), ++ #(GeNNConfigurationCPU, 'GeNNworkspace'), ++ #(GeNNConfigurationOptimized, 'GeNNworkspace') + ] + + speed_tests = [# feature_test name n_slice + +- 
(ThresholderOnlyPoissonLowRate, 'ThresholderOnlyPoissonLowRate', slice(None) ), +- (ThresholderOnlyPoissonMediumRate, 'ThresholderOnlyPoissonMediumRate', slice(None) ), +- (ThresholderOnlyPoissonHighRate, 'ThresholderOnlyPoissonHighRate', slice(None) ), +- (ThresholderOnlyAlwaysSpiking, 'ThresholderOnlyAlwaysSpiking', slice(None) ), +- +- (BrunelHakimStateupdateOnlyDouble, 'BrunelHakimStateupdateOnlyDouble', slice(None) ), +- (BrunelHakimStateupdateOnlyTriple, 'BrunelHakimStateupdateOnlyTriple', slice(None) ), +- (BrunelHakimStateupdateOnly, 'BrunelHakimStateupdateOnly', slice(None) ), +- (BrunelHakimNeuronsOnly, 'BrunelHakimNeuronsOnly', slice(None) ), +- (BrunelHakimNeuronsOnlyNoXi, 'BrunelHakimNeuronsOnlyNoXi', slice(None) ), +- (BrunelHakimNeuronsOnlyNoRand, 'BrunelHakimNeuronsOnlyNoRand', slice(None) ), +- (BrunelHakimStateupdateThresholdOnly, 'BrunelHakimStateupdateThresholdOnly', slice(None) ), +- (BrunelHakimStateupdateThresholdResetOnly, 'BrunelHakimStateupdateThresholdResetOnly', slice(None) ), +- (BrunelHakimModelScalarDelayShort, 'BrunelHakimModelScalarDelayShort', slice(None) ), +- (BrunelHakimModelScalarDelayNoSelfConnections, 'BrunelHakimModelScalarDelayNoSelfConnections', slice(None) ), +- (CUBA, 'CUBA', slice(None) ), +- (COBAHH, 'COBAHH', slice(None) ), +- (AdaptationOscillation, 'AdaptationOscillation', slice(None) ), +- (Vogels, 'Vogels', slice(None) ), +- (STDP, 'STDP', slice(None) ), +- (STDPEventDriven, 'STDPEventDriven', slice(None) ), +- (BrunelHakimModelScalarDelay, 'BrunelHakimModelScalarDelay', slice(None) ), +- +- (VerySparseMediumRateSynapsesOnly, 'VerySparseMediumRateSynapsesOnly', slice(None) ), +- (SparseMediumRateSynapsesOnly, 'SparseMediumRateSynapsesOnly', slice(None) ), +- (DenseMediumRateSynapsesOnly, 'DenseMediumRateSynapsesOnly', slice(None) ), +- (SparseLowRateSynapsesOnly, 'SparseLowRateSynapsesOnly', slice(None) ), +- (SparseHighRateSynapsesOnly, 'SparseHighRateSynapsesOnly', slice(None) ), +- +- (STDPNotEventDriven, 
'STDPNotEventDriven', slice(None) ), +- (STDPMultiPost, 'STDPMultiPost', slice(None) ), +- (STDPNeuronalTraces, 'STDPNeuronalTraces', slice(None) ), +- (STDPMultiPostNeuronalTraces, 'STDPMultiPostNeuronalTraces', slice(None) ), ++ #(ThresholderOnlyPoissonLowRate, 'ThresholderOnlyPoissonLowRate', slice(None) ), ++ #(ThresholderOnlyPoissonMediumRate, 'ThresholderOnlyPoissonMediumRate', slice(None) ), ++ #(ThresholderOnlyPoissonHighRate, 'ThresholderOnlyPoissonHighRate', slice(None) ), ++ #(ThresholderOnlyAlwaysSpiking, 'ThresholderOnlyAlwaysSpiking', slice(None) ), ++ ++ #(BrunelHakimStateupdateOnlyDouble, 'BrunelHakimStateupdateOnlyDouble', slice(None) ), ++ #(BrunelHakimStateupdateOnlyTriple, 'BrunelHakimStateupdateOnlyTriple', slice(None) ), ++ #(BrunelHakimStateupdateOnly, 'BrunelHakimStateupdateOnly', slice(None) ), ++ #(BrunelHakimNeuronsOnly, 'BrunelHakimNeuronsOnly', slice(None) ), ++ #(BrunelHakimNeuronsOnlyNoXi, 'BrunelHakimNeuronsOnlyNoXi', slice(None) ), ++ #(BrunelHakimNeuronsOnlyNoRand, 'BrunelHakimNeuronsOnlyNoRand', slice(None) ), ++ #(BrunelHakimStateupdateThresholdOnly, 'BrunelHakimStateupdateThresholdOnly', slice(None) ), ++ #(BrunelHakimStateupdateThresholdResetOnly, 'BrunelHakimStateupdateThresholdResetOnly', slice(None) ), ++ #(BrunelHakimModelScalarDelayShort, 'BrunelHakimModelScalarDelayShort', slice(None) ), ++ #(BrunelHakimModelScalarDelayNoSelfConnections, 'BrunelHakimModelScalarDelayNoSelfConnections', slice(None) ), ++ #(CUBA, 'CUBA', slice(None) ), ++ #(COBAHH, 'COBAHH', slice(None) ), ++ #(AdaptationOscillation, 'AdaptationOscillation', slice(None) ), ++ #(Vogels, 'Vogels', slice(None) ), ++ #(STDP, 'STDP', slice(None) ), ++ #(STDPEventDriven, 'STDPEventDriven', slice(None) ), ++ #(BrunelHakimModelScalarDelay, 'BrunelHakimModelScalarDelay', slice(None) ), ++ ++ #(VerySparseMediumRateSynapsesOnly, 'VerySparseMediumRateSynapsesOnly', slice(None) ), ++ #(SparseMediumRateSynapsesOnly, 'SparseMediumRateSynapsesOnly', slice(None) ), ++ 
#(DenseMediumRateSynapsesOnly, 'DenseMediumRateSynapsesOnly', slice(None) ), ++ #(SparseLowRateSynapsesOnly, 'SparseLowRateSynapsesOnly', slice(None) ), ++ #(SparseHighRateSynapsesOnly, 'SparseHighRateSynapsesOnly', slice(None) ), ++ ++ #(STDPNotEventDriven, 'STDPNotEventDriven', slice(None) ), ++ #(STDPMultiPost, 'STDPMultiPost', slice(None) ), ++ #(STDPNeuronalTraces, 'STDPNeuronalTraces', slice(None) ), ++ #(STDPMultiPostNeuronalTraces, 'STDPMultiPostNeuronalTraces', slice(None) ), + + (BrunelHakimModelHeterogeneousDelay, 'BrunelHakimModelHeterogeneousDelay', slice(None) ), + +- (LinearNeuronsOnly, 'LinearNeuronsOnly', slice(None) ), +- (HHNeuronsOnly, 'HHNeuronsOnly', slice(None) ), +- (VogelsWithSynapticDynamic, 'VogelsWithSynapticDynamic', slice(None) ), ++ #(LinearNeuronsOnly, 'LinearNeuronsOnly', slice(None) ), ++ #(HHNeuronsOnly, 'HHNeuronsOnly', slice(None) ), ++ #(VogelsWithSynapticDynamic, 'VogelsWithSynapticDynamic', slice(None) ), + +- ## below uses monitors +- (CUBAFixedConnectivity, 'CUBAFixedConnectivity', slice(None) ), +- (COBAHHFixedConnectivity, 'COBAHHFixedConnectivity', slice(None, -1) ), ++ ### below uses monitors ++ #(CUBAFixedConnectivity, 'CUBAFixedConnectivity', slice(None) ), ++ #(COBAHHFixedConnectivity, 'COBAHHFixedConnectivity', slice(None, -1) ), + ] + + configurations = [config[0] for config in configs] +@@ -205,6 +207,16 @@ try: + savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name))) + res.plot_all_tests(profiling_minimum=0.05) + savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name))) ++ ++ res.plot_all_tests() ++ ## this needs modification of brian2 code ++ #res.plot_all_tests(print_relative=True) ++ savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1]))) ++ res.plot_all_tests(relative=True) ++ savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name))) ++ res.plot_all_tests(profiling_minimum=0.05) ++ savefig(os.path.join(plot_dir, 
'speed_test_{}_profiling.svg'.format(name))) ++ + if 3 != len(get_fignums()): + print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1))) + for n in get_fignums(): +diff --git a/frozen_repos/brian2 b/frozen_repos/brian2 +--- a/frozen_repos/brian2 ++++ b/frozen_repos/brian2 +@@ -1 +1 @@ +-Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67 ++Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67-dirty +diff --git a/frozen_repos/brian2genn b/frozen_repos/brian2genn +--- a/frozen_repos/brian2genn ++++ b/frozen_repos/brian2genn +@@ -1 +1 @@ +-Subproject commit 0553cafeab49ea5403c0230411035df504d4db06 ++Subproject commit 0553cafeab49ea5403c0230411035df504d4db06-dirty diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_50000.txt b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_50000.txt new file mode 100644 index 00000000..9bfd72cb --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_50000.txt @@ -0,0 +1,53 @@ +INFO: setting cudaDevice stuff took 0.304642 seconds +INFO kernel_synapses_group_variable_set_conditional_codeobject + 48824 blocks + 1024 threads + 8 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory +INFO connectivity matrix has size 49994896 +INFO generating 13100000 randn every 262 clock cycles for neurongroup_stateupdater_codeobject +INFO kernel_neurongroup_stateupdater_codeobject + 66 blocks + 768 threads + 36 registers per block + 0 bytes 
statically-allocated shared memory per block + 8 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.750 theoretical occupancy +INFO kernel_neurongroup_thresholder_codeobject + 49 blocks + 1024 threads + 15 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO _run_synapses_pre_push_spikes_push_kernel + 15 blocks + 108 threads + 78 registers per block + 0 bytes statically-allocated shared memory per block + 16 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.312 theoretical occupancy +INFO kernel_synapses_pre_codeobject + 15 blocks + 1024 threads + 21 registers per block + 0 bytes statically-allocated shared memory per block + 8 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_neurongroup_resetter_codeobject + 49 blocks + 1024 threads + 14 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +Number of synapses: 49994896 +INFO: main_lines took 138.483894 seconds +INFO: main function took 140.094220 seconds diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU_1000.log new file mode 100644 index 00000000..da67b355 --- /dev/null +++ 
b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU_1000.log @@ -0,0 +1,25 @@ +==24531== NVPROF is profiling process 24531, command: ./main +==24531== Profiling application: ./main +==24531== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 37.91% 132.38ms 2521 52.511us 14.048us 1.0672ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 18.34% 64.052ms 10000 6.4050us 3.5520us 8.3520us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 13.11% 45.786ms 10000 4.5780us 4.3840us 5.6320us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 7.89% 27.566ms 10000 2.7560us 2.7200us 4.1280us _run_synapses_pre_push_spikes_advance_kernel(void) + 6.60% 23.060ms 10000 2.3050us 2.0800us 2.8480us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 5.89% 20.553ms 10000 2.0550us 2.0160us 4.1920us [CUDA memcpy DtoH] + 5.25% 18.329ms 10000 1.8320us 1.6640us 2.1760us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 4.80% 16.747ms 10000 1.6740us 1.6000us 2.2080us _GLOBAL__N__69_tmpxft_00005e15_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.21% 731.84us 1 731.84us 731.84us 731.84us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 46.45% 719.26ms 62522 11.504us 9.5190us 8.6020ms cudaLaunch + 35.15% 544.39ms 60001 9.0720us 2.4110us 1.0720ms cudaDeviceSynchronize + 12.81% 198.43ms 10000 19.842us 
18.034us 330.92us cudaMemcpy + 3.57% 55.321ms 350089 158ns 123ns 330.68us cudaSetupArgument + 1.15% 17.835ms 62522 285ns 182ns 10.032us cudaConfigureCall + 0.83% 12.881ms 52523 245ns 209ns 9.8600us cudaGetLastError + 0.02% 250.79us 1 250.79us 250.79us 250.79us cudaMalloc + 0.01% 147.52us 1 147.52us 147.52us 147.52us cudaMemGetInfo + 0.00% 28.259us 8 3.5320us 2.7680us 5.4040us cudaFuncGetAttributes + 0.00% 26.485us 39 679ns 562ns 1.7750us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 6.3090us 12 525ns 358ns 1.3730us cudaDeviceGetAttribute + 0.00% 2.9100us 3 970ns 717ns 1.4180us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU_1000.log new file mode 100644 index 00000000..666b0e65 --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU_1000.log @@ -0,0 +1,23 @@ +==23837== NVPROF is profiling process 23837, command: ./main +==23837== Profiling application: ./main +==23837== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 44.49% 157.31ms 10000 15.731us 1.8560us 1.1459ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 17.67% 62.479ms 10000 6.2470us 3.4240us 7.9360us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, 
double*, int*) + 12.96% 45.814ms 10000 4.5810us 4.3530us 5.4080us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 7.81% 27.614ms 10000 2.7610us 2.7200us 4.1920us _run_synapses_pre_push_spikes_advance_kernel(void) + 6.48% 22.902ms 10000 2.2900us 2.0160us 2.8170us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 5.57% 19.698ms 10000 1.9690us 1.6960us 2.2080us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 4.81% 17.002ms 10000 1.7000us 1.6320us 2.2400us _GLOBAL__N__69_tmpxft_00005b5a_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.21% 731.94us 1 731.94us 731.94us 731.94us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 54.28% 776.23ms 70001 11.088us 9.1570us 9.5432ms cudaLaunch + 39.57% 565.83ms 60001 9.4300us 2.4970us 1.1523ms cudaDeviceSynchronize + 3.99% 57.063ms 380005 150ns 121ns 325.75us cudaSetupArgument + 1.16% 16.531ms 70001 236ns 172ns 25.540us cudaConfigureCall + 0.96% 13.788ms 60002 229ns 191ns 12.473us cudaGetLastError + 0.02% 304.60us 1 304.60us 304.60us 304.60us cudaMalloc + 0.01% 168.13us 1 168.13us 168.13us 168.13us cudaMemGetInfo + 0.00% 31.295us 39 802ns 568ns 4.4480us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 29.348us 8 3.6680us 2.8260us 5.7480us cudaFuncGetAttributes + 0.00% 6.1630us 12 513ns 356ns 1.2920us cudaDeviceGetAttribute + 0.00% 3.1870us 3 1.0620us 733ns 1.7050us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png 
b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png new file mode 100644 index 00000000..de995116 Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png differ diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png new file mode 100644 index 00000000..d94f0997 Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png differ diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png new file mode 100644 index 00000000..11df764b Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png differ diff --git 
a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_min_15_BrunelHakimModelHeterogeneousDelay_profiling.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_min_15_BrunelHakimModelHeterogeneousDelay_profiling.png new file mode 100644 index 00000000..50724fc3 Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_min_15_BrunelHakimModelHeterogeneousDelay_profiling.png differ diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/run_speed_test_script.py b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/run_speed_test_script.py new file mode 100644 index 00000000..c88ac141 --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/run_speed_test_script.py @@ -0,0 +1,291 @@ +import os +import shutil +import glob +import subprocess +import sys +import socket + +# run tests without X-server +import matplotlib +matplotlib.use('Agg') + +# pretty plots +import seaborn + +import time +import datetime +import cPickle as pickle + +from brian2 import * +from brian2.tests.features import * +from brian2.tests.features.base import * +from brian2.tests.features.base import results + +import brian2cuda +from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfiguration, + CUDAStandaloneConfigurationNoAssert, + CUDAStandaloneConfigurationExtraThresholdKernel, + CUDAStandaloneConfigurationCurandDouble, + CUDAStandaloneConfigurationNoCudaOccupancyAPI, + 
CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, + CUDAStandaloneConfiguration2BlocksPerSM, + CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, + CUDAStandaloneConfigurationSynLaunchBounds, + CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, + CUDAStandaloneConfigurationProfileGPU, + CUDAStandaloneConfigurationProfileCPU, + CUDAStandaloneConfigurationTestBrunelHeteroAtomics, + CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, + CUDAStandaloneConfigurationPushAtomicResize, + CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, + CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, + CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU, + CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, + CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU) +from brian2cuda.tests.features.speed import * + +from brian2genn.correctness_testing import GeNNConfiguration, GeNNConfigurationCPU, GeNNConfigurationOptimized + +from create_readme import create_readme + +assert len(sys.argv)<= 2, 'Only one command line argument supported! 
Got {}'.format(len(sys.argv)-1) +if len(sys.argv) == 2: + additional_dir_name = '_' + sys.argv[1] +else: + additional_dir_name = '' + +prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j12'] + +# host specific settings +if socket.gethostname() == 'elnath': + prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] + prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') + prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) + +configs = [# configuration project_directory + #(NumpyConfiguration, None), + #(WeaveConfiguration, None), + #(LocalConfiguration, None), + #(CPPStandaloneConfiguration, 'cpp_standalone'), + #(CPPStandaloneConfigurationOpenMP, 'cpp_standalone'), + #(CUDAStandaloneConfiguration, 'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResize, 'cuda_standalone'), + #(CUDAStandaloneConfigurationTestBrunelHeteroAtomics, 'cuda_standalone'), + #(CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, 'cuda_standalone'), + #(CUDAStandaloneConfigurationExtraThresholdKernel, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoAssert, 'cuda_standalone'), + #(CUDAStandaloneConfigurationCurandDouble, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoCudaOccupancyAPI, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfigurationSynLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfigurationProfileGPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationProfileCPU, 'cuda_standalone'), + (CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, 'cuda_standalone'), + 
(CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, 'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU, 'cuda_standalone'), + #(GeNNConfiguration, 'GeNNworkspace'), + #(GeNNConfigurationCPU, 'GeNNworkspace'), + #(GeNNConfigurationOptimized, 'GeNNworkspace') + ] + +speed_tests = [# feature_test name n_slice + + #(ThresholderOnlyPoissonLowRate, 'ThresholderOnlyPoissonLowRate', slice(None) ), + #(ThresholderOnlyPoissonMediumRate, 'ThresholderOnlyPoissonMediumRate', slice(None) ), + #(ThresholderOnlyPoissonHighRate, 'ThresholderOnlyPoissonHighRate', slice(None) ), + #(ThresholderOnlyAlwaysSpiking, 'ThresholderOnlyAlwaysSpiking', slice(None) ), + + #(BrunelHakimStateupdateOnlyDouble, 'BrunelHakimStateupdateOnlyDouble', slice(None) ), + #(BrunelHakimStateupdateOnlyTriple, 'BrunelHakimStateupdateOnlyTriple', slice(None) ), + #(BrunelHakimStateupdateOnly, 'BrunelHakimStateupdateOnly', slice(None) ), + #(BrunelHakimNeuronsOnly, 'BrunelHakimNeuronsOnly', slice(None) ), + #(BrunelHakimNeuronsOnlyNoXi, 'BrunelHakimNeuronsOnlyNoXi', slice(None) ), + #(BrunelHakimNeuronsOnlyNoRand, 'BrunelHakimNeuronsOnlyNoRand', slice(None) ), + #(BrunelHakimStateupdateThresholdOnly, 'BrunelHakimStateupdateThresholdOnly', slice(None) ), + #(BrunelHakimStateupdateThresholdResetOnly, 'BrunelHakimStateupdateThresholdResetOnly', slice(None) ), + #(BrunelHakimModelScalarDelayShort, 'BrunelHakimModelScalarDelayShort', slice(None) ), + #(BrunelHakimModelScalarDelayNoSelfConnections, 'BrunelHakimModelScalarDelayNoSelfConnections', slice(None) ), + #(CUBA, 'CUBA', slice(None) ), + #(COBAHH, 'COBAHH', slice(None) ), + #(AdaptationOscillation, 'AdaptationOscillation', slice(None) ), + #(Vogels, 'Vogels', slice(None) ), + #(STDP, 'STDP', slice(None) ), + #(STDPEventDriven, 'STDPEventDriven', slice(None) ), + #(BrunelHakimModelScalarDelay, 'BrunelHakimModelScalarDelay', 
slice(None) ), + + #(VerySparseMediumRateSynapsesOnly, 'VerySparseMediumRateSynapsesOnly', slice(None) ), + #(SparseMediumRateSynapsesOnly, 'SparseMediumRateSynapsesOnly', slice(None) ), + #(DenseMediumRateSynapsesOnly, 'DenseMediumRateSynapsesOnly', slice(None) ), + #(SparseLowRateSynapsesOnly, 'SparseLowRateSynapsesOnly', slice(None) ), + #(SparseHighRateSynapsesOnly, 'SparseHighRateSynapsesOnly', slice(None) ), + + #(STDPNotEventDriven, 'STDPNotEventDriven', slice(None) ), + #(STDPMultiPost, 'STDPMultiPost', slice(None) ), + #(STDPNeuronalTraces, 'STDPNeuronalTraces', slice(None) ), + #(STDPMultiPostNeuronalTraces, 'STDPMultiPostNeuronalTraces', slice(None) ), + + (BrunelHakimModelHeterogeneousDelay, 'BrunelHakimModelHeterogeneousDelay', slice(0,-1,1) ), + + #(LinearNeuronsOnly, 'LinearNeuronsOnly', slice(None) ), + #(HHNeuronsOnly, 'HHNeuronsOnly', slice(None) ), + #(VogelsWithSynapticDynamic, 'VogelsWithSynapticDynamic', slice(None) ), + + ### below uses monitors + #(CUBAFixedConnectivity, 'CUBAFixedConnectivity', slice(None) ), + #(COBAHHFixedConnectivity, 'COBAHHFixedConnectivity', slice(None, -1) ), +] + +configurations = [config[0] for config in configs] +project_dirs = [config[1] for config in configs] + +# check if multiple Configurations with same project_dirs are specified +last_idx = {} +for proj_dir in project_dirs: + if proj_dir is not None: + first_i = project_dirs.index(proj_dir) + last_i = len(project_dirs) - 1 - project_dirs[::-1].index(proj_dir) + if first_i != last_i: + print("WARNING there are multiple configurations using {d} as project " + "directory. 
Profiling and logfiles will only be saved for the last one {c}.".format( + d=proj_dir, c=configurations[last_i].__name__)) + last_idx[proj_dir] = last_i + +time_stemp = time.time() +date_str = datetime.datetime.fromtimestamp(time_stemp).strftime('%Y_%m_%d') + +directory = 'results_{}{}'.format(date_str, additional_dir_name) +if os.path.exists(directory): + new_dir = directory + '_bak_' + str(int(time.time())) + print("Directory with name `{}` already exists. Renaming it to `{}`.".format(directory, new_dir)) + os.rename(directory, new_dir) +os.makedirs(directory) +data_dir = os.path.join(directory, 'data') +plot_dir = os.path.join(directory, 'plots') +log_dir = os.path.join(directory, 'logs') +prof_dir = os.path.join(directory, 'nvprof') +os.makedirs(data_dir) +os.makedirs(plot_dir) +os.makedirs(log_dir) +os.makedirs(prof_dir) +print("Saving results in {}.".format(plot_dir)) + +shutil.copy(os.path.realpath(__file__), os.path.join(directory, 'run_speed_test_script.py')) + +time_format = '%d.%m.%Y at %H:%M:%S' +script_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + +with open(os.path.join(directory, 'git.diff'), 'w') as diff_file: + subprocess.call(['git', 'diff'], stdout=diff_file) + +try: + for n, (st, name, sl) in enumerate(speed_tests): + start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + print("Starting {} on {}.".format(name, start)) + maximum_run_time = 1*60*60*second + res = run_speed_tests(configurations=configurations, + speed_tests=[st], + n_slice=sl, + #n_slice=slice(0,1,None), + run_twice=False, + verbose=True, + maximum_run_time=maximum_run_time#, + ## this needs modification of brian2 code + #profile_only_active=True + #profile_only_active=False + ) + end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + diff = datetime.datetime.strptime(end, time_format) - datetime.datetime.strptime(start, time_format) + print("Running {} took {}.".format(name, diff)) + res.plot_all_tests() + 
## this needs modification of brian2 code + #res.plot_all_tests(print_relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.png'.format(speed_tests[n][1]))) + res.plot_all_tests(relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name))) + res.plot_all_tests(profiling_minimum=0.05) + savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name))) + res.plot_all_tests(profiling_minimum=0.15) + savefig(os.path.join(plot_dir, 'speed_test_min_15_{}_profiling.png'.format(name))) + + res.plot_all_tests() + ## this needs modification of brian2 code + #res.plot_all_tests(print_relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1]))) + res.plot_all_tests(relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name))) + res.plot_all_tests(profiling_minimum=0.05) + savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.svg'.format(name))) + res.plot_all_tests(profiling_minimum=0.15) + savefig(os.path.join(plot_dir, 'speed_test_min_15_{}_profiling.svg'.format(name))) + + if 3 != len(get_fignums()): + print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1))) + for n in get_fignums(): + close(n) + + # pickel results object to disk + pkl_file = os.path.join(data_dir, name + '.pkl' ) + with open(pkl_file, 'wb') as output: + pickle.dump(res, output, pickle.HIGHEST_PROTOCOL) + + # save stdout log of last run (the other are deleted in run_speed_tests()) + for proj_dir in set(project_dirs): + if not proj_dir is None and proj_dir in ['cuda_standalone', 'cpp_standalone']: + config = configurations[last_idx[proj_dir]] + stdout_file = os.path.join(proj_dir, 'results/stdout.txt') + if os.path.exists(stdout_file): + shutil.copy(stdout_file, + os.path.join(log_dir, 'stdout_{st}_{conf}_{n}.txt'.format(st=name, conf=proj_dir, + n=st.n_range[sl][-1]))) + else: + print("WARNING Couldn't save {},file not 
found.".format(stdout_file)) + + # run nvprof on n_range[2] + for conf, proj_dir in zip(configurations, project_dirs): + main_arg = '' + if proj_dir in ['cuda_standalone', 'GeNNworkspace']: + if proj_dir == 'GeNNworkspace': + main_arg = 'test {time} 1'.format(time=st.duration/second) + ns = st.n_range[sl] + idx = 2 + max_runtime = 20 + conf_name = conf.__name__ + print("Rerunning {} with n = {} for nvprof profiling".format(conf_name, st.n_range[idx])) + tb, res, runtime, prof_info = results(conf, st, st.n_range[idx], maximum_run_time=maximum_run_time) + if not isinstance(res, Exception) and runtime < max_runtime: + option = '--profile-from-start-off' if proj_dir == 'cuda_standalone' else '' + cmd = 'cd {proj_dir} && nvprof {opt} --log-file ../{log_file} ./main {arg}'.format( + proj_dir=proj_dir, arg=main_arg, opt=option, + log_file=os.path.join(prof_dir, 'nvprof_{st}_{conf}_{n}.log'.format( + st=name, conf=conf_name, n=st.n_range[idx]))) + prof_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + print(cmd) + x = os.system(cmd) + if x: + print('nvprof failed with {}'.format(x)) + prof_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + prof_diff = datetime.datetime.strptime(prof_end, time_format) - datetime.datetime.strptime(prof_start, time_format) + print("Profiling took {} for runtime of {}".format(prof_diff, runtime)) +finally: + create_readme(directory) + print("\nSummarized speed test results in {}".format(directory + '/README.md')) + script_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + script_diff = datetime.datetime.strptime(script_end, time_format) - datetime.datetime.strptime(script_start, time_format) + print("Finished speed test on {}. 
Total time = {}.".format( + datetime.datetime.fromtimestamp(time.time()).strftime(time_format), script_diff)) + + +##res.plot_all_tests(relative=True) +#for n in get_fignums(): +# plt.figure(n) +# savefig(plot_dir + '/speed_test_{}.png'.format(speed_tests[n-1][1])) + +## Debug (includes profiling infos) +#from brian2.tests.features.base import results +#for x in results(LocalConfiguration, LinearNeuronsOnly, 10, maximum_run_time=10*second): +# print x diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/README.md b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/README.md new file mode 100644 index 00000000..99a61ae2 --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/README.md @@ -0,0 +1,160 @@ + +# Benchmark results from 28.11.2017 +## Description: + + + +## Last git log: +``` +commit 8987de24ed9f4a3b1a276496407fca1087f04004 +Author: Denis Alevi +Date: Mon Nov 20 14:31:09 2017 +0100 + + Fix critical section to include the actual pushing + +``` +There is also a `git diff` saved in the current directory. + +## Results + +### BrunelHakimModelHeterogeneousDelay +![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.svg) +![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.svg) +![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.svg) + +
Exemplary `nvprof` results for **CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize**

+Profile summary for `N = 1000`: + +``` +==11697== NVPROF is profiling process 11697, command: ./main +==11697== Profiling application: ./main +==11697== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 35.88% 122.08ms 2517 48.504us 14.144us 1.1319ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 18.92% 64.378ms 10000 6.4370us 3.5520us 8.5120us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 13.37% 45.473ms 10000 4.5470us 4.2560us 5.4400us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 8.57% 29.165ms 10000 2.9160us 2.7200us 4.3200us _run_synapses_pre_push_spikes_advance_kernel(void) + 6.76% 22.989ms 10000 2.2980us 2.0480us 2.8160us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 6.05% 20.569ms 10000 2.0560us 2.0160us 4.1290us [CUDA memcpy DtoH] + 5.33% 18.127ms 10000 1.8120us 1.6320us 3.0080us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 4.91% 16.719ms 10000 1.6710us 1.3440us 2.6240us _GLOBAL__N__69_tmpxft_00002bf1_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.22% 732.58us 1 732.58us 732.58us 732.58us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 63.87% 668.18ms 62518 10.687us 8.5490us 9.4480ms cudaLaunch + 27.88% 291.67ms 10000 29.166us 18.638us 1.1319ms cudaMemcpy + 5.59% 58.512ms 350073 167ns 127ns 325.26us cudaSetupArgument + 1.38% 14.484ms 62518 231ns 156ns 327.91us cudaConfigureCall + 1.23% 12.835ms 52519 244ns 175ns 326.82us cudaGetLastError + 0.03% 305.95us 1 305.95us 305.95us 305.95us cudaMalloc + 0.02% 159.37us 1 159.37us 159.37us 
159.37us cudaMemGetInfo + 0.00% 31.728us 39 813ns 618ns 3.2440us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 31.555us 8 3.9440us 2.9150us 7.0990us cudaFuncGetAttributes + 0.00% 7.5860us 1 7.5860us 7.5860us 7.5860us cudaDeviceSynchronize + 0.00% 6.3490us 12 529ns 354ns 1.3930us cudaDeviceGetAttribute + 0.00% 4.5310us 3 1.5100us 1.0030us 2.4860us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **CUDAStandaloneConfigurationPushAtomicResize**

+Profile summary for `N = 1000`: + +``` +==10355== NVPROF is profiling process 10355, command: ./main +==10355== Profiling application: ./main +==10355== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 57.51% 359.36ms 10000 35.935us 2.0800us 84.257us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 19.13% 119.52ms 2474 48.310us 13.376us 1.5396ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 6.91% 43.163ms 10000 4.3160us 4.0640us 6.1440us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 4.20% 26.272ms 10000 2.6270us 2.5600us 4.0960us _run_synapses_pre_push_spikes_advance_kernel(void) + 3.48% 21.766ms 10000 2.1760us 1.9200us 2.8480us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 3.22% 20.128ms 10000 2.0120us 1.9520us 4.3200us [CUDA memcpy DtoH] + 2.80% 17.511ms 10000 1.7510us 1.5360us 3.0080us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 2.62% 16.370ms 10000 1.6370us 1.5360us 2.7200us _GLOBAL__N__69_tmpxft_000026b5_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.12% 733.19us 1 733.19us 733.19us 733.19us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 58.61% 668.77ms 62475 10.704us 8.6680us 8.8399ms cudaLaunch + 33.75% 385.14ms 10000 38.513us 18.463us 1.5525ms cudaMemcpy + 5.06% 57.722ms 349901 164ns 123ns 329.86us cudaSetupArgument + 1.38% 15.780ms 62475 252ns 172ns 322.70us cudaConfigureCall + 1.15% 13.126ms 52476 250ns 203ns 308.93us cudaGetLastError + 0.02% 253.22us 1 253.22us 253.22us 253.22us cudaMalloc + 0.01% 144.31us 1 144.31us 144.31us 
144.31us cudaMemGetInfo + 0.00% 29.459us 8 3.6820us 2.8650us 6.3900us cudaFuncGetAttributes + 0.00% 28.673us 39 735ns 615ns 2.0410us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 11.375us 1 11.375us 11.375us 11.375us cudaDeviceSynchronize + 0.00% 6.4040us 12 533ns 341ns 1.4790us cudaDeviceGetAttribute + 0.00% 3.9250us 3 1.3080us 891ns 2.0500us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **CUDAStandaloneConfigurationTestBrunelHeteroAtomics**

+Profile summary for `N = 1000`: + +``` +==11034== NVPROF is profiling process 11034, command: ./main +==11034== Profiling application: ./main +==11034== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 45.01% 151.33ms 10000 15.133us 1.6960us 1.0793ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 18.06% 60.710ms 10000 6.0710us 3.4240us 8.0000us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 12.22% 41.097ms 10000 4.1090us 3.9040us 5.3760us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 8.12% 27.312ms 10000 2.7310us 2.6560us 4.4800us _run_synapses_pre_push_spikes_advance_kernel(void) + 6.93% 23.311ms 10000 2.3310us 2.1440us 3.7440us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 5.17% 17.383ms 10000 1.7380us 1.5680us 3.7120us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 4.27% 14.359ms 10000 1.4350us 1.3120us 3.7440us _GLOBAL__N__69_tmpxft_0000295b_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.22% 732.61us 1 732.61us 732.61us 732.61us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 88.41% 688.82ms 70001 9.8400us 8.6450us 8.6670ms cudaLaunch + 7.80% 60.794ms 380005 159ns 122ns 319.71us cudaSetupArgument + 1.88% 14.685ms 70001 209ns 166ns 314.53us cudaConfigureCall + 1.84% 14.372ms 60002 239ns 197ns 307.61us cudaGetLastError + 0.03% 259.35us 1 259.35us 259.35us 259.35us cudaMalloc + 0.02% 147.03us 1 147.03us 147.03us 147.03us cudaMemGetInfo + 0.00% 29.491us 39 756ns 620ns 1.8670us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 28.449us 
8 3.5560us 2.7550us 5.7160us cudaFuncGetAttributes + 0.00% 13.847us 1 13.847us 13.847us 13.847us cudaDeviceSynchronize + 0.00% 6.0720us 12 506ns 338ns 1.3940us cudaDeviceGetAttribute + 0.00% 3.5940us 3 1.1980us 850ns 1.8350us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==9683== NVPROF is profiling process 9683, command: ./main +==9683== Profiling application: ./main +==9683== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 57.63% 349.62ms 10000 34.962us 1.8880us 75.808us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 22.08% 133.93ms 10000 13.392us 1.6960us 947.33us _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 6.72% 40.747ms 10000 4.0740us 3.8720us 5.4080us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 4.24% 25.697ms 10000 2.5690us 2.4640us 4.1920us _run_synapses_pre_push_spikes_advance_kernel(void) + 3.74% 22.682ms 10000 2.2680us 2.0160us 4.0320us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 3.10% 18.782ms 10000 1.8780us 1.7280us 3.9040us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 2.39% 14.485ms 10000 1.4480us 1.2800us 3.9040us _GLOBAL__N__69_tmpxft_00002413_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.12% 732.29us 1 732.29us 732.29us 732.29us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 88.80% 701.01ms 70001 10.014us 8.3580us 9.2616ms cudaLaunch + 7.49% 59.115ms 380005 155ns 121ns 308.33us cudaSetupArgument + 1.87% 14.751ms 70001 210ns 163ns 298.58us cudaConfigureCall + 1.77% 13.941ms 60002 232ns 183ns 295.21us cudaGetLastError + 0.04% 282.56us 1 282.56us 282.56us 282.56us cudaMalloc + 0.02% 148.30us 1 148.30us 148.30us 148.30us cudaMemGetInfo + 0.01% 81.989us 1 81.989us 81.989us 81.989us cudaDeviceSynchronize + 0.00% 30.446us 8 3.8050us 2.9540us 6.3180us 
cudaFuncGetAttributes + 0.00% 27.544us 39 706ns 585ns 1.9970us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 6.1470us 12 512ns 337ns 1.2730us cudaDeviceGetAttribute + 0.00% 4.1110us 3 1.3700us 857ns 2.3570us cudaGetDevice + +``` + +

+ + diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/data/BrunelHakimModelHeterogeneousDelay.pkl b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/data/BrunelHakimModelHeterogeneousDelay.pkl new file mode 100644 index 00000000..f0d1527a Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/data/BrunelHakimModelHeterogeneousDelay.pkl differ diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/git.diff b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/git.diff new file mode 100644 index 00000000..44a84fa2 --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/git.diff @@ -0,0 +1,305 @@ +diff --git a/brian2cuda/tests/features/cuda_configuration.py b/brian2cuda/tests/features/cuda_configuration.py +index 250687d..622f73a 100644 +--- a/brian2cuda/tests/features/cuda_configuration.py ++++ b/brian2cuda/tests/features/cuda_configuration.py +@@ -225,7 +225,7 @@ class CUDAStandaloneConfigurationProfileCPU(Configuration): + with_output=False) + + class CUDAStandaloneConfigurationTestBrunelHeteroAtomics(Configuration): +- name = 'CUDA standalone with atomics in heterog delay mode' ++ name = 'CUDA standalone with atomics in effect application' + def before_run(self): + brian2.set_device('cuda_standalone', build_on_run=False) + prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True +@@ -248,7 +248,7 @@ class CUDAStandaloneConfigurationTestBrunelHeteroAtomics(Configuration): + with_output=False) + + class 
CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU(Configuration): +- name = "CUDA standalone with atomics in heterog delay mode (profile='blocking')" ++ name = "CUDA standalone with atomics in effect application (profile='blocking')" + def before_run(self): + brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking') + prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True +@@ -270,12 +270,10 @@ class CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU(Configuration + brian2.device.build(directory='cuda_standalone', compile=True, run=True, + with_output=False) + +- + class CUDAStandaloneConfigurationPushAtomicResize(Configuration): +- name = "CUDA standalone with atomics in queue resize" ++ name = "CUDA standalone with atomics in spikequeue resize" + def before_run(self): + brian2.set_device('cuda_standalone', build_on_run=False) +- prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True + prefs["devices.cuda_standalone.push_atomic_resize"] = True + if socket.gethostname() == 'elnath': + if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']: +@@ -295,8 +293,82 @@ class CUDAStandaloneConfigurationPushAtomicResize(Configuration): + brian2.device.build(directory='cuda_standalone', compile=True, run=True, + with_output=False) + ++ + class CUDAStandaloneConfigurationPushAtomicResizeProfileCPU(Configuration): +- name = "CUDA standalone with atomics in queue resize (profile='blocking')" ++ name = "CUDA standalone with atomics in spikequeue resize (profile='blocking')" ++ def before_run(self): ++ brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking') ++ prefs["devices.cuda_standalone.push_atomic_resize"] = True ++ if socket.gethostname() == 'elnath': ++ if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']: ++ prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ 
prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) ++ elif socket.gethostname() == 'sabik': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52']) ++ elif socket.gethostname() == 'eltanin': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61']) ++ ++ def after_run(self): ++ if os.path.exists('cuda_standalone'): ++ shutil.rmtree('cuda_standalone') ++ brian2.device.build(directory='cuda_standalone', compile=True, run=True, ++ with_output=False) ++ ++ ++class CUDAStandaloneConfigurationPushAtomicResizProfileCPU(Configuration): ++ name = "CUDA standalone with atomics in spikequeue resize (profile='blocking)" ++ def before_run(self): ++ brian2.set_device('cuda_standalone', build_on_run=False, ++ profile='blocking') ++ prefs["devices.cuda_standalone.push_atomic_resize"] = True ++ if socket.gethostname() == 'elnath': ++ if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']: ++ prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) ++ elif socket.gethostname() == 'sabik': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52']) ++ elif socket.gethostname() == 'eltanin': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61']) ++ ++ def after_run(self): ++ if os.path.exists('cuda_standalone'): ++ shutil.rmtree('cuda_standalone') ++ brian2.device.build(directory='cuda_standalone', compile=True, run=True, ++ with_output=False) ++ ++ ++class CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize(Configuration): ++ name = "CUDA standalone with 
atomics in effect application and in spikequeue resize" ++ def before_run(self): ++ brian2.set_device('cuda_standalone', build_on_run=False) ++ prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True ++ prefs["devices.cuda_standalone.push_atomic_resize"] = True ++ if socket.gethostname() == 'elnath': ++ if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']: ++ prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) ++ elif socket.gethostname() == 'sabik': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52']) ++ elif socket.gethostname() == 'eltanin': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61']) ++ ++ def after_run(self): ++ if os.path.exists('cuda_standalone'): ++ shutil.rmtree('cuda_standalone') ++ brian2.device.build(directory='cuda_standalone', compile=True, run=True, ++ with_output=False) ++ ++class CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU(Configuration): ++ name = "CUDA standalone with atomics in effect application and in spikequeue resize (profile='blocking')" + def before_run(self): + brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking') + prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True +diff --git a/dev/benchmarks/run_speed_tests.py b/dev/benchmarks/run_speed_tests.py +index 2518634..b525e97 100644 +--- a/dev/benchmarks/run_speed_tests.py ++++ b/dev/benchmarks/run_speed_tests.py +@@ -37,6 +37,7 @@ from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfigur + CUDAStandaloneConfigurationTestBrunelHeteroAtomics, + CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, + 
CUDAStandaloneConfigurationPushAtomicResize, ++ CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, + CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, + CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, + CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU) +@@ -61,79 +62,80 @@ if socket.gethostname() == 'elnath': + prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) + + configs = [# configuration project_directory +- (NumpyConfiguration, None), +- (WeaveConfiguration, None), +- (LocalConfiguration, None), ++ #(NumpyConfiguration, None), ++ #(WeaveConfiguration, None), ++ #(LocalConfiguration, None), ++ #(CPPStandaloneConfiguration, 'cpp_standalone'), ++ #(CPPStandaloneConfigurationOpenMP, 'cpp_standalone'), + (CUDAStandaloneConfiguration, 'cuda_standalone'), +- (CUDAStandaloneConfigurationExtraThresholdKernel, 'cuda_standalone'), +- (CUDAStandaloneConfigurationNoAssert, 'cuda_standalone'), +- (CUDAStandaloneConfigurationCurandDouble, 'cuda_standalone'), +- (CUDAStandaloneConfigurationNoCudaOccupancyAPI, 'cuda_standalone'), +- (CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, 'cuda_standalone'), +- (CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'), +- (CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'), +- (CUDAStandaloneConfigurationSynLaunchBounds, 'cuda_standalone'), +- (CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'), +- (CUDAStandaloneConfigurationProfileGPU, 'cuda_standalone'), +- (CUDAStandaloneConfigurationProfileCPU, 'cuda_standalone'), +- (CPPStandaloneConfiguration, 'cpp_standalone'), +- (CUDAStandaloneConfigurationTestBrunelHeteroAtomics, 'cuda_standalone'), +- (CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, 'cuda_standalone'), + (CUDAStandaloneConfigurationPushAtomicResize, 'cuda_standalone'), +- (CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, 'cuda_standalone'), +- 
(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, 'cuda_standalone'), +- (CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU, 'cuda_standalone'), +- (GeNNConfiguration, 'GeNNworkspace'), +- (CPPStandaloneConfigurationOpenMP, 'cpp_standalone'), +- (GeNNConfigurationCPU, 'GeNNworkspace'), +- (GeNNConfigurationOptimized, 'GeNNworkspace') ++ (CUDAStandaloneConfigurationTestBrunelHeteroAtomics, 'cuda_standalone'), ++ (CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationExtraThresholdKernel, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationNoAssert, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationCurandDouble, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationNoCudaOccupancyAPI, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'), ++ #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationSynLaunchBounds, 'cuda_standalone'), ++ #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationProfileGPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationProfileCPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU, 'cuda_standalone'), ++ #(GeNNConfiguration, 'GeNNworkspace'), ++ #(GeNNConfigurationCPU, 'GeNNworkspace'), ++ #(GeNNConfigurationOptimized, 'GeNNworkspace') + ] + + speed_tests = [# feature_test name n_slice + +- (ThresholderOnlyPoissonLowRate, 'ThresholderOnlyPoissonLowRate', slice(None) ), +- (ThresholderOnlyPoissonMediumRate, 'ThresholderOnlyPoissonMediumRate', slice(None) ), 
+- (ThresholderOnlyPoissonHighRate, 'ThresholderOnlyPoissonHighRate', slice(None) ), +- (ThresholderOnlyAlwaysSpiking, 'ThresholderOnlyAlwaysSpiking', slice(None) ), +- +- (BrunelHakimStateupdateOnlyDouble, 'BrunelHakimStateupdateOnlyDouble', slice(None) ), +- (BrunelHakimStateupdateOnlyTriple, 'BrunelHakimStateupdateOnlyTriple', slice(None) ), +- (BrunelHakimStateupdateOnly, 'BrunelHakimStateupdateOnly', slice(None) ), +- (BrunelHakimNeuronsOnly, 'BrunelHakimNeuronsOnly', slice(None) ), +- (BrunelHakimNeuronsOnlyNoXi, 'BrunelHakimNeuronsOnlyNoXi', slice(None) ), +- (BrunelHakimNeuronsOnlyNoRand, 'BrunelHakimNeuronsOnlyNoRand', slice(None) ), +- (BrunelHakimStateupdateThresholdOnly, 'BrunelHakimStateupdateThresholdOnly', slice(None) ), +- (BrunelHakimStateupdateThresholdResetOnly, 'BrunelHakimStateupdateThresholdResetOnly', slice(None) ), +- (BrunelHakimModelScalarDelayShort, 'BrunelHakimModelScalarDelayShort', slice(None) ), +- (BrunelHakimModelScalarDelayNoSelfConnections, 'BrunelHakimModelScalarDelayNoSelfConnections', slice(None) ), +- (CUBA, 'CUBA', slice(None) ), +- (COBAHH, 'COBAHH', slice(None) ), +- (AdaptationOscillation, 'AdaptationOscillation', slice(None) ), +- (Vogels, 'Vogels', slice(None) ), +- (STDP, 'STDP', slice(None) ), +- (STDPEventDriven, 'STDPEventDriven', slice(None) ), +- (BrunelHakimModelScalarDelay, 'BrunelHakimModelScalarDelay', slice(None) ), +- +- (VerySparseMediumRateSynapsesOnly, 'VerySparseMediumRateSynapsesOnly', slice(None) ), +- (SparseMediumRateSynapsesOnly, 'SparseMediumRateSynapsesOnly', slice(None) ), +- (DenseMediumRateSynapsesOnly, 'DenseMediumRateSynapsesOnly', slice(None) ), +- (SparseLowRateSynapsesOnly, 'SparseLowRateSynapsesOnly', slice(None) ), +- (SparseHighRateSynapsesOnly, 'SparseHighRateSynapsesOnly', slice(None) ), +- +- (STDPNotEventDriven, 'STDPNotEventDriven', slice(None) ), +- (STDPMultiPost, 'STDPMultiPost', slice(None) ), +- (STDPNeuronalTraces, 'STDPNeuronalTraces', slice(None) ), +- 
(STDPMultiPostNeuronalTraces, 'STDPMultiPostNeuronalTraces', slice(None) ), ++ #(ThresholderOnlyPoissonLowRate, 'ThresholderOnlyPoissonLowRate', slice(None) ), ++ #(ThresholderOnlyPoissonMediumRate, 'ThresholderOnlyPoissonMediumRate', slice(None) ), ++ #(ThresholderOnlyPoissonHighRate, 'ThresholderOnlyPoissonHighRate', slice(None) ), ++ #(ThresholderOnlyAlwaysSpiking, 'ThresholderOnlyAlwaysSpiking', slice(None) ), ++ ++ #(BrunelHakimStateupdateOnlyDouble, 'BrunelHakimStateupdateOnlyDouble', slice(None) ), ++ #(BrunelHakimStateupdateOnlyTriple, 'BrunelHakimStateupdateOnlyTriple', slice(None) ), ++ #(BrunelHakimStateupdateOnly, 'BrunelHakimStateupdateOnly', slice(None) ), ++ #(BrunelHakimNeuronsOnly, 'BrunelHakimNeuronsOnly', slice(None) ), ++ #(BrunelHakimNeuronsOnlyNoXi, 'BrunelHakimNeuronsOnlyNoXi', slice(None) ), ++ #(BrunelHakimNeuronsOnlyNoRand, 'BrunelHakimNeuronsOnlyNoRand', slice(None) ), ++ #(BrunelHakimStateupdateThresholdOnly, 'BrunelHakimStateupdateThresholdOnly', slice(None) ), ++ #(BrunelHakimStateupdateThresholdResetOnly, 'BrunelHakimStateupdateThresholdResetOnly', slice(None) ), ++ #(BrunelHakimModelScalarDelayShort, 'BrunelHakimModelScalarDelayShort', slice(None) ), ++ #(BrunelHakimModelScalarDelayNoSelfConnections, 'BrunelHakimModelScalarDelayNoSelfConnections', slice(None) ), ++ #(CUBA, 'CUBA', slice(None) ), ++ #(COBAHH, 'COBAHH', slice(None) ), ++ #(AdaptationOscillation, 'AdaptationOscillation', slice(None) ), ++ #(Vogels, 'Vogels', slice(None) ), ++ #(STDP, 'STDP', slice(None) ), ++ #(STDPEventDriven, 'STDPEventDriven', slice(None) ), ++ #(BrunelHakimModelScalarDelay, 'BrunelHakimModelScalarDelay', slice(None) ), ++ ++ #(VerySparseMediumRateSynapsesOnly, 'VerySparseMediumRateSynapsesOnly', slice(None) ), ++ #(SparseMediumRateSynapsesOnly, 'SparseMediumRateSynapsesOnly', slice(None) ), ++ #(DenseMediumRateSynapsesOnly, 'DenseMediumRateSynapsesOnly', slice(None) ), ++ #(SparseLowRateSynapsesOnly, 'SparseLowRateSynapsesOnly', slice(None) ), ++ 
#(SparseHighRateSynapsesOnly, 'SparseHighRateSynapsesOnly', slice(None) ), ++ ++ #(STDPNotEventDriven, 'STDPNotEventDriven', slice(None) ), ++ #(STDPMultiPost, 'STDPMultiPost', slice(None) ), ++ #(STDPNeuronalTraces, 'STDPNeuronalTraces', slice(None) ), ++ #(STDPMultiPostNeuronalTraces, 'STDPMultiPostNeuronalTraces', slice(None) ), + + (BrunelHakimModelHeterogeneousDelay, 'BrunelHakimModelHeterogeneousDelay', slice(None) ), + +- (LinearNeuronsOnly, 'LinearNeuronsOnly', slice(None) ), +- (HHNeuronsOnly, 'HHNeuronsOnly', slice(None) ), +- (VogelsWithSynapticDynamic, 'VogelsWithSynapticDynamic', slice(None) ), ++ #(LinearNeuronsOnly, 'LinearNeuronsOnly', slice(None) ), ++ #(HHNeuronsOnly, 'HHNeuronsOnly', slice(None) ), ++ #(VogelsWithSynapticDynamic, 'VogelsWithSynapticDynamic', slice(None) ), + +- ## below uses monitors +- (CUBAFixedConnectivity, 'CUBAFixedConnectivity', slice(None) ), +- (COBAHHFixedConnectivity, 'COBAHHFixedConnectivity', slice(None, -1) ), ++ ### below uses monitors ++ #(CUBAFixedConnectivity, 'CUBAFixedConnectivity', slice(None) ), ++ #(COBAHHFixedConnectivity, 'COBAHHFixedConnectivity', slice(None, -1) ), + ] + + configurations = [config[0] for config in configs] +@@ -205,6 +207,16 @@ try: + savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name))) + res.plot_all_tests(profiling_minimum=0.05) + savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name))) ++ ++ res.plot_all_tests() ++ ## this needs modification of brian2 code ++ #res.plot_all_tests(print_relative=True) ++ savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1]))) ++ res.plot_all_tests(relative=True) ++ savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name))) ++ res.plot_all_tests(profiling_minimum=0.05) ++ savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.svg'.format(name))) ++ + if 3 != len(get_fignums()): + print("WARNING: There were {} plots created, but only {} 
saved.".format(len(get_fignums()), 3*(n+1))) + for n in get_fignums(): +diff --git a/frozen_repos/brian2 b/frozen_repos/brian2 +--- a/frozen_repos/brian2 ++++ b/frozen_repos/brian2 +@@ -1 +1 @@ +-Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67 ++Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67-dirty +diff --git a/frozen_repos/brian2genn b/frozen_repos/brian2genn +--- a/frozen_repos/brian2genn ++++ b/frozen_repos/brian2genn +@@ -1 +1 @@ +-Subproject commit 0553cafeab49ea5403c0230411035df504d4db06 ++Subproject commit 0553cafeab49ea5403c0230411035df504d4db06-dirty diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_100000.txt b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_100000.txt new file mode 100644 index 00000000..e69de29b diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize_1000.log new file mode 100644 index 00000000..caa4f6e7 --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize_1000.log @@ -0,0 +1,25 @@ +==11697== NVPROF is profiling process 11697, command: ./main +==11697== Profiling application: ./main +==11697== 
Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 35.88% 122.08ms 2517 48.504us 14.144us 1.1319ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 18.92% 64.378ms 10000 6.4370us 3.5520us 8.5120us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 13.37% 45.473ms 10000 4.5470us 4.2560us 5.4400us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 8.57% 29.165ms 10000 2.9160us 2.7200us 4.3200us _run_synapses_pre_push_spikes_advance_kernel(void) + 6.76% 22.989ms 10000 2.2980us 2.0480us 2.8160us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 6.05% 20.569ms 10000 2.0560us 2.0160us 4.1290us [CUDA memcpy DtoH] + 5.33% 18.127ms 10000 1.8120us 1.6320us 3.0080us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 4.91% 16.719ms 10000 1.6710us 1.3440us 2.6240us _GLOBAL__N__69_tmpxft_00002bf1_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.22% 732.58us 1 732.58us 732.58us 732.58us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 63.87% 668.18ms 62518 10.687us 8.5490us 9.4480ms cudaLaunch + 27.88% 291.67ms 10000 29.166us 18.638us 1.1319ms cudaMemcpy + 5.59% 58.512ms 350073 167ns 127ns 325.26us cudaSetupArgument + 1.38% 14.484ms 62518 231ns 156ns 327.91us cudaConfigureCall + 1.23% 12.835ms 52519 244ns 175ns 326.82us cudaGetLastError + 0.03% 305.95us 1 305.95us 305.95us 305.95us cudaMalloc + 0.02% 159.37us 1 159.37us 159.37us 159.37us cudaMemGetInfo + 0.00% 31.728us 39 813ns 618ns 3.2440us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 31.555us 8 3.9440us 2.9150us 
7.0990us cudaFuncGetAttributes + 0.00% 7.5860us 1 7.5860us 7.5860us 7.5860us cudaDeviceSynchronize + 0.00% 6.3490us 12 529ns 354ns 1.3930us cudaDeviceGetAttribute + 0.00% 4.5310us 3 1.5100us 1.0030us 2.4860us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationPushAtomicResize_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationPushAtomicResize_1000.log new file mode 100644 index 00000000..acabeb8f --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationPushAtomicResize_1000.log @@ -0,0 +1,25 @@ +==10355== NVPROF is profiling process 10355, command: ./main +==10355== Profiling application: ./main +==10355== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 57.51% 359.36ms 10000 35.935us 2.0800us 84.257us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 19.13% 119.52ms 2474 48.310us 13.376us 1.5396ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 6.91% 43.163ms 10000 4.3160us 4.0640us 6.1440us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 4.20% 26.272ms 10000 2.6270us 2.5600us 4.0960us _run_synapses_pre_push_spikes_advance_kernel(void) + 3.48% 21.766ms 10000 2.1760us 1.9200us 2.8480us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 3.22% 20.128ms 10000 
2.0120us 1.9520us 4.3200us [CUDA memcpy DtoH] + 2.80% 17.511ms 10000 1.7510us 1.5360us 3.0080us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 2.62% 16.370ms 10000 1.6370us 1.5360us 2.7200us _GLOBAL__N__69_tmpxft_000026b5_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.12% 733.19us 1 733.19us 733.19us 733.19us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 58.61% 668.77ms 62475 10.704us 8.6680us 8.8399ms cudaLaunch + 33.75% 385.14ms 10000 38.513us 18.463us 1.5525ms cudaMemcpy + 5.06% 57.722ms 349901 164ns 123ns 329.86us cudaSetupArgument + 1.38% 15.780ms 62475 252ns 172ns 322.70us cudaConfigureCall + 1.15% 13.126ms 52476 250ns 203ns 308.93us cudaGetLastError + 0.02% 253.22us 1 253.22us 253.22us 253.22us cudaMalloc + 0.01% 144.31us 1 144.31us 144.31us 144.31us cudaMemGetInfo + 0.00% 29.459us 8 3.6820us 2.8650us 6.3900us cudaFuncGetAttributes + 0.00% 28.673us 39 735ns 615ns 2.0410us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 11.375us 1 11.375us 11.375us 11.375us cudaDeviceSynchronize + 0.00% 6.4040us 12 533ns 341ns 1.4790us cudaDeviceGetAttribute + 0.00% 3.9250us 3 1.3080us 891ns 2.0500us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomics_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomics_1000.log new file mode 100644 index 00000000..f9dac10c --- /dev/null +++ 
b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomics_1000.log @@ -0,0 +1,23 @@ +==11034== NVPROF is profiling process 11034, command: ./main +==11034== Profiling application: ./main +==11034== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 45.01% 151.33ms 10000 15.133us 1.6960us 1.0793ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 18.06% 60.710ms 10000 6.0710us 3.4240us 8.0000us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 12.22% 41.097ms 10000 4.1090us 3.9040us 5.3760us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 8.12% 27.312ms 10000 2.7310us 2.6560us 4.4800us _run_synapses_pre_push_spikes_advance_kernel(void) + 6.93% 23.311ms 10000 2.3310us 2.1440us 3.7440us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 5.17% 17.383ms 10000 1.7380us 1.5680us 3.7120us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 4.27% 14.359ms 10000 1.4350us 1.3120us 3.7440us _GLOBAL__N__69_tmpxft_0000295b_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.22% 732.61us 1 732.61us 732.61us 732.61us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 88.41% 688.82ms 70001 9.8400us 8.6450us 8.6670ms cudaLaunch + 7.80% 60.794ms 380005 159ns 122ns 319.71us cudaSetupArgument + 1.88% 14.685ms 70001 209ns 166ns 314.53us cudaConfigureCall + 1.84% 14.372ms 60002 239ns 197ns 307.61us cudaGetLastError + 0.03% 
259.35us 1 259.35us 259.35us 259.35us cudaMalloc + 0.02% 147.03us 1 147.03us 147.03us 147.03us cudaMemGetInfo + 0.00% 29.491us 39 756ns 620ns 1.8670us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 28.449us 8 3.5560us 2.7550us 5.7160us cudaFuncGetAttributes + 0.00% 13.847us 1 13.847us 13.847us 13.847us cudaDeviceSynchronize + 0.00% 6.0720us 12 506ns 338ns 1.3940us cudaDeviceGetAttribute + 0.00% 3.5940us 3 1.1980us 850ns 1.8350us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..5a56d8cc --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,23 @@ +==9683== NVPROF is profiling process 9683, command: ./main +==9683== Profiling application: ./main +==9683== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 57.63% 349.62ms 10000 34.962us 1.8880us 75.808us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 22.08% 133.93ms 10000 13.392us 1.6960us 947.33us _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 6.72% 40.747ms 10000 4.0740us 3.8720us 5.4080us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 4.24% 25.697ms 10000 2.5690us 2.4640us 4.1920us 
_run_synapses_pre_push_spikes_advance_kernel(void) + 3.74% 22.682ms 10000 2.2680us 2.0160us 4.0320us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 3.10% 18.782ms 10000 1.8780us 1.7280us 3.9040us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 2.39% 14.485ms 10000 1.4480us 1.2800us 3.9040us _GLOBAL__N__69_tmpxft_00002413_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.12% 732.29us 1 732.29us 732.29us 732.29us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 88.80% 701.01ms 70001 10.014us 8.3580us 9.2616ms cudaLaunch + 7.49% 59.115ms 380005 155ns 121ns 308.33us cudaSetupArgument + 1.87% 14.751ms 70001 210ns 163ns 298.58us cudaConfigureCall + 1.77% 13.941ms 60002 232ns 183ns 295.21us cudaGetLastError + 0.04% 282.56us 1 282.56us 282.56us 282.56us cudaMalloc + 0.02% 148.30us 1 148.30us 148.30us 148.30us cudaMemGetInfo + 0.01% 81.989us 1 81.989us 81.989us 81.989us cudaDeviceSynchronize + 0.00% 30.446us 8 3.8050us 2.9540us 6.3180us cudaFuncGetAttributes + 0.00% 27.544us 39 706ns 585ns 1.9970us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 6.1470us 12 512ns 337ns 1.2730us cudaDeviceGetAttribute + 0.00% 4.1110us 3 1.3700us 857ns 2.3570us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png new file mode 100644 index 00000000..ac9bf015 Binary files /dev/null and 
b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png differ diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png new file mode 100644 index 00000000..e707e501 Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png differ diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png new file mode 100644 index 00000000..32f9e08a Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png differ diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/run_speed_test_script.py b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/run_speed_test_script.py new file mode 100644 index 00000000..b525e975 --- /dev/null +++ 
b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/run_speed_test_script.py @@ -0,0 +1,285 @@ +import os +import shutil +import glob +import subprocess +import sys +import socket + +# run tests without X-server +import matplotlib +matplotlib.use('Agg') + +# pretty plots +import seaborn + +import time +import datetime +import cPickle as pickle + +from brian2 import * +from brian2.tests.features import * +from brian2.tests.features.base import * +from brian2.tests.features.base import results + +import brian2cuda +from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfiguration, + CUDAStandaloneConfigurationNoAssert, + CUDAStandaloneConfigurationExtraThresholdKernel, + CUDAStandaloneConfigurationCurandDouble, + CUDAStandaloneConfigurationNoCudaOccupancyAPI, + CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, + CUDAStandaloneConfiguration2BlocksPerSM, + CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, + CUDAStandaloneConfigurationSynLaunchBounds, + CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, + CUDAStandaloneConfigurationProfileGPU, + CUDAStandaloneConfigurationProfileCPU, + CUDAStandaloneConfigurationTestBrunelHeteroAtomics, + CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, + CUDAStandaloneConfigurationPushAtomicResize, + CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, + CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, + CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, + CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU) +from brian2cuda.tests.features.speed import * + +from brian2genn.correctness_testing import GeNNConfiguration, GeNNConfigurationCPU, GeNNConfigurationOptimized + +from create_readme import create_readme + +assert len(sys.argv)<= 2, 'Only one command line argument supported! 
Got {}'.format(len(sys.argv)-1) +if len(sys.argv) == 2: + additional_dir_name = '_' + sys.argv[1] +else: + additional_dir_name = '' + +prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j12'] + +# host specific settings +if socket.gethostname() == 'elnath': + prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] + prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') + prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) + +configs = [# configuration project_directory + #(NumpyConfiguration, None), + #(WeaveConfiguration, None), + #(LocalConfiguration, None), + #(CPPStandaloneConfiguration, 'cpp_standalone'), + #(CPPStandaloneConfigurationOpenMP, 'cpp_standalone'), + (CUDAStandaloneConfiguration, 'cuda_standalone'), + (CUDAStandaloneConfigurationPushAtomicResize, 'cuda_standalone'), + (CUDAStandaloneConfigurationTestBrunelHeteroAtomics, 'cuda_standalone'), + (CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, 'cuda_standalone'), + #(CUDAStandaloneConfigurationExtraThresholdKernel, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoAssert, 'cuda_standalone'), + #(CUDAStandaloneConfigurationCurandDouble, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoCudaOccupancyAPI, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfigurationSynLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfigurationProfileGPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, 
'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU, 'cuda_standalone'), + #(GeNNConfiguration, 'GeNNworkspace'), + #(GeNNConfigurationCPU, 'GeNNworkspace'), + #(GeNNConfigurationOptimized, 'GeNNworkspace') + ] + +speed_tests = [# feature_test name n_slice + + #(ThresholderOnlyPoissonLowRate, 'ThresholderOnlyPoissonLowRate', slice(None) ), + #(ThresholderOnlyPoissonMediumRate, 'ThresholderOnlyPoissonMediumRate', slice(None) ), + #(ThresholderOnlyPoissonHighRate, 'ThresholderOnlyPoissonHighRate', slice(None) ), + #(ThresholderOnlyAlwaysSpiking, 'ThresholderOnlyAlwaysSpiking', slice(None) ), + + #(BrunelHakimStateupdateOnlyDouble, 'BrunelHakimStateupdateOnlyDouble', slice(None) ), + #(BrunelHakimStateupdateOnlyTriple, 'BrunelHakimStateupdateOnlyTriple', slice(None) ), + #(BrunelHakimStateupdateOnly, 'BrunelHakimStateupdateOnly', slice(None) ), + #(BrunelHakimNeuronsOnly, 'BrunelHakimNeuronsOnly', slice(None) ), + #(BrunelHakimNeuronsOnlyNoXi, 'BrunelHakimNeuronsOnlyNoXi', slice(None) ), + #(BrunelHakimNeuronsOnlyNoRand, 'BrunelHakimNeuronsOnlyNoRand', slice(None) ), + #(BrunelHakimStateupdateThresholdOnly, 'BrunelHakimStateupdateThresholdOnly', slice(None) ), + #(BrunelHakimStateupdateThresholdResetOnly, 'BrunelHakimStateupdateThresholdResetOnly', slice(None) ), + #(BrunelHakimModelScalarDelayShort, 'BrunelHakimModelScalarDelayShort', slice(None) ), + #(BrunelHakimModelScalarDelayNoSelfConnections, 'BrunelHakimModelScalarDelayNoSelfConnections', slice(None) ), + #(CUBA, 'CUBA', slice(None) ), + #(COBAHH, 'COBAHH', slice(None) ), + #(AdaptationOscillation, 'AdaptationOscillation', slice(None) ), + #(Vogels, 'Vogels', slice(None) ), + #(STDP, 'STDP', slice(None) ), + #(STDPEventDriven, 'STDPEventDriven', slice(None) ), + #(BrunelHakimModelScalarDelay, 'BrunelHakimModelScalarDelay', slice(None) ), + + #(VerySparseMediumRateSynapsesOnly, 'VerySparseMediumRateSynapsesOnly', slice(None) ), + #(SparseMediumRateSynapsesOnly, 
'SparseMediumRateSynapsesOnly', slice(None) ), + #(DenseMediumRateSynapsesOnly, 'DenseMediumRateSynapsesOnly', slice(None) ), + #(SparseLowRateSynapsesOnly, 'SparseLowRateSynapsesOnly', slice(None) ), + #(SparseHighRateSynapsesOnly, 'SparseHighRateSynapsesOnly', slice(None) ), + + #(STDPNotEventDriven, 'STDPNotEventDriven', slice(None) ), + #(STDPMultiPost, 'STDPMultiPost', slice(None) ), + #(STDPNeuronalTraces, 'STDPNeuronalTraces', slice(None) ), + #(STDPMultiPostNeuronalTraces, 'STDPMultiPostNeuronalTraces', slice(None) ), + + (BrunelHakimModelHeterogeneousDelay, 'BrunelHakimModelHeterogeneousDelay', slice(None) ), + + #(LinearNeuronsOnly, 'LinearNeuronsOnly', slice(None) ), + #(HHNeuronsOnly, 'HHNeuronsOnly', slice(None) ), + #(VogelsWithSynapticDynamic, 'VogelsWithSynapticDynamic', slice(None) ), + + ### below uses monitors + #(CUBAFixedConnectivity, 'CUBAFixedConnectivity', slice(None) ), + #(COBAHHFixedConnectivity, 'COBAHHFixedConnectivity', slice(None, -1) ), +] + +configurations = [config[0] for config in configs] +project_dirs = [config[1] for config in configs] + +# check if multiple Configurations with same project_dirs are specified +last_idx = {} +for proj_dir in project_dirs: + if proj_dir is not None: + first_i = project_dirs.index(proj_dir) + last_i = len(project_dirs) - 1 - project_dirs[::-1].index(proj_dir) + if first_i != last_i: + print("WARNING there are multiple configurations using {d} as project " + "directory. Profiling and logfiles will only be saved for the last one {c}.".format( + d=proj_dir, c=configurations[last_i].__name__)) + last_idx[proj_dir] = last_i + +time_stemp = time.time() +date_str = datetime.datetime.fromtimestamp(time_stemp).strftime('%Y_%m_%d') + +directory = 'results_{}{}'.format(date_str, additional_dir_name) +if os.path.exists(directory): + new_dir = directory + '_bak_' + str(int(time.time())) + print("Directory with name `{}` already exists. 
Renaming it to `{}`.".format(directory, new_dir)) + os.rename(directory, new_dir) +os.makedirs(directory) +data_dir = os.path.join(directory, 'data') +plot_dir = os.path.join(directory, 'plots') +log_dir = os.path.join(directory, 'logs') +prof_dir = os.path.join(directory, 'nvprof') +os.makedirs(data_dir) +os.makedirs(plot_dir) +os.makedirs(log_dir) +os.makedirs(prof_dir) +print("Saving results in {}.".format(plot_dir)) + +shutil.copy(os.path.realpath(__file__), os.path.join(directory, 'run_speed_test_script.py')) + +time_format = '%d.%m.%Y at %H:%M:%S' +script_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + +with open(os.path.join(directory, 'git.diff'), 'w') as diff_file: + subprocess.call(['git', 'diff'], stdout=diff_file) + +try: + for n, (st, name, sl) in enumerate(speed_tests): + start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + print("Starting {} on {}.".format(name, start)) + maximum_run_time = 1*60*60*second + res = run_speed_tests(configurations=configurations, + speed_tests=[st], + n_slice=sl, + #n_slice=slice(0,1,None), + run_twice=False, + verbose=True, + maximum_run_time=maximum_run_time#, + ## this needs modification of brian2 code + #profile_only_active=True + #profile_only_active=False + ) + end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + diff = datetime.datetime.strptime(end, time_format) - datetime.datetime.strptime(start, time_format) + print("Running {} took {}.".format(name, diff)) + res.plot_all_tests() + ## this needs modification of brian2 code + #res.plot_all_tests(print_relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.png'.format(speed_tests[n][1]))) + res.plot_all_tests(relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name))) + res.plot_all_tests(profiling_minimum=0.05) + savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name))) + + res.plot_all_tests() + ## this needs modification 
of brian2 code + #res.plot_all_tests(print_relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1]))) + res.plot_all_tests(relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name))) + res.plot_all_tests(profiling_minimum=0.05) + savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.svg'.format(name))) + + if 3 != len(get_fignums()): + print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1))) + for n in get_fignums(): + close(n) + + # pickel results object to disk + pkl_file = os.path.join(data_dir, name + '.pkl' ) + with open(pkl_file, 'wb') as output: + pickle.dump(res, output, pickle.HIGHEST_PROTOCOL) + + # save stdout log of last run (the other are deleted in run_speed_tests()) + for proj_dir in set(project_dirs): + if not proj_dir is None and proj_dir in ['cuda_standalone', 'cpp_standalone']: + config = configurations[last_idx[proj_dir]] + stdout_file = os.path.join(proj_dir, 'results/stdout.txt') + if os.path.exists(stdout_file): + shutil.copy(stdout_file, + os.path.join(log_dir, 'stdout_{st}_{conf}_{n}.txt'.format(st=name, conf=proj_dir, + n=st.n_range[sl][-1]))) + else: + print("WARNING Couldn't save {},file not found.".format(stdout_file)) + + # run nvprof on n_range[2] + for conf, proj_dir in zip(configurations, project_dirs): + main_arg = '' + if proj_dir in ['cuda_standalone', 'GeNNworkspace']: + if proj_dir == 'GeNNworkspace': + main_arg = 'test {time} 1'.format(time=st.duration/second) + ns = st.n_range[sl] + idx = 2 + max_runtime = 20 + conf_name = conf.__name__ + print("Rerunning {} with n = {} for nvprof profiling".format(conf_name, st.n_range[idx])) + tb, res, runtime, prof_info = results(conf, st, st.n_range[idx], maximum_run_time=maximum_run_time) + if not isinstance(res, Exception) and runtime < max_runtime: + option = '--profile-from-start-off' if proj_dir == 'cuda_standalone' else '' + cmd = 'cd {proj_dir} && nvprof 
{opt} --log-file ../{log_file} ./main {arg}'.format( + proj_dir=proj_dir, arg=main_arg, opt=option, + log_file=os.path.join(prof_dir, 'nvprof_{st}_{conf}_{n}.log'.format( + st=name, conf=conf_name, n=st.n_range[idx]))) + prof_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + print(cmd) + x = os.system(cmd) + if x: + print('nvprof failed with {}'.format(x)) + prof_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + prof_diff = datetime.datetime.strptime(prof_end, time_format) - datetime.datetime.strptime(prof_start, time_format) + print("Profiling took {} for runtime of {}".format(prof_diff, runtime)) +finally: + create_readme(directory) + print("\nSummarized speed test results in {}".format(directory + '/README.md')) + script_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + script_diff = datetime.datetime.strptime(script_end, time_format) - datetime.datetime.strptime(script_start, time_format) + print("Finished speed test on {}. 
Total time = {}.".format( + datetime.datetime.fromtimestamp(time.time()).strftime(time_format), script_diff)) + + +##res.plot_all_tests(relative=True) +#for n in get_fignums(): +# plt.figure(n) +# savefig(plot_dir + '/speed_test_{}.png'.format(speed_tests[n-1][1])) + +## Debug (includes profiling infos) +#from brian2.tests.features.base import results +#for x in results(LocalConfiguration, LinearNeuronsOnly, 10, maximum_run_time=10*second): +# print x diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/README.md b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/README.md new file mode 100644 index 00000000..db395a43 --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/README.md @@ -0,0 +1,96 @@ + +# Benchmark results from 28.11.2017 +## Description: + + + +## Last git log: +``` +commit 8987de24ed9f4a3b1a276496407fca1087f04004 +Author: Denis Alevi +Date: Mon Nov 20 14:31:09 2017 +0100 + + Fix critical section to include the actual pushing + +``` +There is also a `git diff` saved in the current directory. + +## Results + +### BrunelHakimModelHeterogeneousDelay +![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.svg) +![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.svg) +![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.svg) + +
Examplary `nvprof` results for **CUDAStandaloneConfigurationProfileCPU**

+Profile summary for `N = 1000`: + +``` +==22819== NVPROF is profiling process 22819, command: ./main +==22819== Profiling application: ./main +==22819== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 57.40% 374.75ms 10000 37.475us 2.2080us 87.617us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 22.46% 146.65ms 10000 14.664us 1.7280us 960.13us _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 6.51% 42.496ms 10000 4.2490us 4.0640us 5.4400us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 4.30% 28.046ms 10000 2.8040us 2.7200us 4.3840us _run_synapses_pre_push_spikes_advance_kernel(void) + 3.74% 24.420ms 10000 2.4420us 2.1120us 2.9120us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 3.10% 20.239ms 10000 2.0230us 1.9520us 2.2400us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 2.38% 15.522ms 10000 1.5520us 1.4400us 1.9520us _GLOBAL__N__69_tmpxft_0000578e_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.11% 732.55us 1 732.55us 732.55us 732.55us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 48.78% 862.78ms 60001 14.379us 2.6750us 965.88us cudaDeviceSynchronize + 45.73% 808.74ms 70001 11.553us 9.9030us 8.8572ms cudaLaunch + 3.59% 63.553ms 380005 167ns 135ns 324.80us cudaSetupArgument + 1.05% 18.499ms 70001 264ns 187ns 12.032us cudaConfigureCall + 0.82% 14.507ms 60002 241ns 191ns 11.691us cudaGetLastError + 0.02% 272.19us 1 272.19us 272.19us 272.19us cudaMalloc + 0.01% 153.60us 1 153.60us 153.60us 153.60us cudaMemGetInfo + 0.00% 30.182us 8 3.7720us 3.0160us 
5.4880us cudaFuncGetAttributes + 0.00% 28.610us 39 733ns 616ns 2.0690us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 6.6020us 12 550ns 360ns 1.2870us cudaDeviceGetAttribute + 0.00% 2.7290us 3 909ns 660ns 1.3620us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU**

+Profile summary for `N = 1000`: + +``` +==23535== NVPROF is profiling process 23535, command: ./main +==23535== Profiling application: ./main +==23535== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 42.76% 145.65ms 10000 14.565us 1.9840us 1.1359ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 18.94% 64.522ms 10000 6.4520us 3.6480us 8.1280us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 12.92% 43.998ms 10000 4.3990us 4.1280us 5.5360us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 8.09% 27.540ms 10000 2.7530us 2.6880us 4.5120us _run_synapses_pre_push_spikes_advance_kernel(void) + 6.73% 22.929ms 10000 2.2920us 2.0800us 2.8170us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 5.80% 19.741ms 10000 1.9740us 1.8560us 2.1760us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 4.55% 15.502ms 10000 1.5500us 1.4400us 2.1760us _GLOBAL__N__69_tmpxft_00005a32_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.21% 731.97us 1 731.97us 731.97us 731.97us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 55.46% 798.58ms 70001 11.408us 9.3530us 8.8769ms cudaLaunch + 37.87% 545.25ms 60001 9.0870us 2.4500us 1.1408ms cudaDeviceSynchronize + 4.14% 59.601ms 380005 156ns 125ns 312.80us cudaSetupArgument + 1.35% 19.377ms 70001 276ns 185ns 13.408us cudaConfigureCall + 1.16% 16.665ms 60002 277ns 203ns 11.227us cudaGetLastError + 0.02% 277.76us 1 277.76us 277.76us 277.76us cudaMalloc + 0.01% 156.74us 1 156.74us 156.74us 156.74us cudaMemGetInfo + 0.00% 32.165us 8 4.0200us 2.8580us 
7.6380us cudaFuncGetAttributes + 0.00% 27.873us 39 714ns 603ns 1.7260us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 6.6820us 12 556ns 373ns 1.4220us cudaDeviceGetAttribute + 0.00% 2.8330us 3 944ns 675ns 1.4210us cudaGetDevice + +``` + +

+ + +*** + +### BrunelHakimModelHeterogeneousDelay - display less kernels in profiling +![](plots/speed_test_min_15_BrunelHakimModelHeterogeneousDelay_profiling.svg) + + diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/data/BrunelHakimModelHeterogeneousDelay.pkl b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/data/BrunelHakimModelHeterogeneousDelay.pkl new file mode 100644 index 00000000..a7ddffe9 Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/data/BrunelHakimModelHeterogeneousDelay.pkl differ diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/git.diff b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/git.diff new file mode 100644 index 00000000..44a84fa2 --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/git.diff @@ -0,0 +1,305 @@ +diff --git a/brian2cuda/tests/features/cuda_configuration.py b/brian2cuda/tests/features/cuda_configuration.py +index 250687d..622f73a 100644 +--- a/brian2cuda/tests/features/cuda_configuration.py ++++ b/brian2cuda/tests/features/cuda_configuration.py +@@ -225,7 +225,7 @@ class CUDAStandaloneConfigurationProfileCPU(Configuration): + with_output=False) + + class CUDAStandaloneConfigurationTestBrunelHeteroAtomics(Configuration): +- name = 'CUDA standalone with atomics in heterog delay mode' ++ name = 'CUDA standalone with atomics in effect application' + def before_run(self): + brian2.set_device('cuda_standalone', build_on_run=False) + 
prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True +@@ -248,7 +248,7 @@ class CUDAStandaloneConfigurationTestBrunelHeteroAtomics(Configuration): + with_output=False) + + class CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU(Configuration): +- name = "CUDA standalone with atomics in heterog delay mode (profile='blocking')" ++ name = "CUDA standalone with atomics in effect application (profile='blocking')" + def before_run(self): + brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking') + prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True +@@ -270,12 +270,10 @@ class CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU(Configuration + brian2.device.build(directory='cuda_standalone', compile=True, run=True, + with_output=False) + +- + class CUDAStandaloneConfigurationPushAtomicResize(Configuration): +- name = "CUDA standalone with atomics in queue resize" ++ name = "CUDA standalone with atomics in spikequeue resize" + def before_run(self): + brian2.set_device('cuda_standalone', build_on_run=False) +- prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True + prefs["devices.cuda_standalone.push_atomic_resize"] = True + if socket.gethostname() == 'elnath': + if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']: +@@ -295,8 +293,82 @@ class CUDAStandaloneConfigurationPushAtomicResize(Configuration): + brian2.device.build(directory='cuda_standalone', compile=True, run=True, + with_output=False) + ++ + class CUDAStandaloneConfigurationPushAtomicResizeProfileCPU(Configuration): +- name = "CUDA standalone with atomics in queue resize (profile='blocking')" ++ name = "CUDA standalone with atomics in spikequeue resize (profile='blocking')" ++ def before_run(self): ++ brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking') ++ prefs["devices.cuda_standalone.push_atomic_resize"] = True ++ if socket.gethostname() == 'elnath': ++ if 
prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']: ++ prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) ++ elif socket.gethostname() == 'sabik': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52']) ++ elif socket.gethostname() == 'eltanin': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61']) ++ ++ def after_run(self): ++ if os.path.exists('cuda_standalone'): ++ shutil.rmtree('cuda_standalone') ++ brian2.device.build(directory='cuda_standalone', compile=True, run=True, ++ with_output=False) ++ ++ ++class CUDAStandaloneConfigurationPushAtomicResizProfileCPU(Configuration): ++ name = "CUDA standalone with atomics in spikequeue resize (profile='blocking)" ++ def before_run(self): ++ brian2.set_device('cuda_standalone', build_on_run=False, ++ profile='blocking') ++ prefs["devices.cuda_standalone.push_atomic_resize"] = True ++ if socket.gethostname() == 'elnath': ++ if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']: ++ prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) ++ elif socket.gethostname() == 'sabik': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52']) ++ elif socket.gethostname() == 'eltanin': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61']) ++ ++ def after_run(self): ++ if os.path.exists('cuda_standalone'): ++ shutil.rmtree('cuda_standalone') ++ 
brian2.device.build(directory='cuda_standalone', compile=True, run=True, ++ with_output=False) ++ ++ ++class CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize(Configuration): ++ name = "CUDA standalone with atomics in effect application and in spikequeue resize" ++ def before_run(self): ++ brian2.set_device('cuda_standalone', build_on_run=False) ++ prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True ++ prefs["devices.cuda_standalone.push_atomic_resize"] = True ++ if socket.gethostname() == 'elnath': ++ if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']: ++ prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) ++ elif socket.gethostname() == 'sabik': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52']) ++ elif socket.gethostname() == 'eltanin': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61']) ++ ++ def after_run(self): ++ if os.path.exists('cuda_standalone'): ++ shutil.rmtree('cuda_standalone') ++ brian2.device.build(directory='cuda_standalone', compile=True, run=True, ++ with_output=False) ++ ++class CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU(Configuration): ++ name = "CUDA standalone with atomics in effect application and in spikequeue resize (profile='blocking')" + def before_run(self): + brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking') + prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True +diff --git a/dev/benchmarks/run_speed_tests.py b/dev/benchmarks/run_speed_tests.py +index 2518634..b525e97 100644 +--- a/dev/benchmarks/run_speed_tests.py ++++ b/dev/benchmarks/run_speed_tests.py +@@ -37,6 +37,7 @@ from 
brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfigur + CUDAStandaloneConfigurationTestBrunelHeteroAtomics, + CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, + CUDAStandaloneConfigurationPushAtomicResize, ++ CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, + CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, + CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, + CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU) +@@ -61,79 +62,80 @@ if socket.gethostname() == 'elnath': + prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) + + configs = [# configuration project_directory +- (NumpyConfiguration, None), +- (WeaveConfiguration, None), +- (LocalConfiguration, None), ++ #(NumpyConfiguration, None), ++ #(WeaveConfiguration, None), ++ #(LocalConfiguration, None), ++ #(CPPStandaloneConfiguration, 'cpp_standalone'), ++ #(CPPStandaloneConfigurationOpenMP, 'cpp_standalone'), + (CUDAStandaloneConfiguration, 'cuda_standalone'), +- (CUDAStandaloneConfigurationExtraThresholdKernel, 'cuda_standalone'), +- (CUDAStandaloneConfigurationNoAssert, 'cuda_standalone'), +- (CUDAStandaloneConfigurationCurandDouble, 'cuda_standalone'), +- (CUDAStandaloneConfigurationNoCudaOccupancyAPI, 'cuda_standalone'), +- (CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, 'cuda_standalone'), +- (CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'), +- (CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'), +- (CUDAStandaloneConfigurationSynLaunchBounds, 'cuda_standalone'), +- (CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'), +- (CUDAStandaloneConfigurationProfileGPU, 'cuda_standalone'), +- (CUDAStandaloneConfigurationProfileCPU, 'cuda_standalone'), +- (CPPStandaloneConfiguration, 'cpp_standalone'), +- (CUDAStandaloneConfigurationTestBrunelHeteroAtomics, 'cuda_standalone'), +- (CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, 'cuda_standalone'), + 
(CUDAStandaloneConfigurationPushAtomicResize, 'cuda_standalone'), +- (CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, 'cuda_standalone'), +- (CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, 'cuda_standalone'), +- (CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU, 'cuda_standalone'), +- (GeNNConfiguration, 'GeNNworkspace'), +- (CPPStandaloneConfigurationOpenMP, 'cpp_standalone'), +- (GeNNConfigurationCPU, 'GeNNworkspace'), +- (GeNNConfigurationOptimized, 'GeNNworkspace') ++ (CUDAStandaloneConfigurationTestBrunelHeteroAtomics, 'cuda_standalone'), ++ (CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationExtraThresholdKernel, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationNoAssert, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationCurandDouble, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationNoCudaOccupancyAPI, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'), ++ #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationSynLaunchBounds, 'cuda_standalone'), ++ #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationProfileGPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationProfileCPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU, 'cuda_standalone'), ++ #(GeNNConfiguration, 'GeNNworkspace'), ++ #(GeNNConfigurationCPU, 'GeNNworkspace'), ++ #(GeNNConfigurationOptimized, 'GeNNworkspace') + ] + + speed_tests = [# feature_test name n_slice + +- 
(ThresholderOnlyPoissonLowRate, 'ThresholderOnlyPoissonLowRate', slice(None) ), +- (ThresholderOnlyPoissonMediumRate, 'ThresholderOnlyPoissonMediumRate', slice(None) ), +- (ThresholderOnlyPoissonHighRate, 'ThresholderOnlyPoissonHighRate', slice(None) ), +- (ThresholderOnlyAlwaysSpiking, 'ThresholderOnlyAlwaysSpiking', slice(None) ), +- +- (BrunelHakimStateupdateOnlyDouble, 'BrunelHakimStateupdateOnlyDouble', slice(None) ), +- (BrunelHakimStateupdateOnlyTriple, 'BrunelHakimStateupdateOnlyTriple', slice(None) ), +- (BrunelHakimStateupdateOnly, 'BrunelHakimStateupdateOnly', slice(None) ), +- (BrunelHakimNeuronsOnly, 'BrunelHakimNeuronsOnly', slice(None) ), +- (BrunelHakimNeuronsOnlyNoXi, 'BrunelHakimNeuronsOnlyNoXi', slice(None) ), +- (BrunelHakimNeuronsOnlyNoRand, 'BrunelHakimNeuronsOnlyNoRand', slice(None) ), +- (BrunelHakimStateupdateThresholdOnly, 'BrunelHakimStateupdateThresholdOnly', slice(None) ), +- (BrunelHakimStateupdateThresholdResetOnly, 'BrunelHakimStateupdateThresholdResetOnly', slice(None) ), +- (BrunelHakimModelScalarDelayShort, 'BrunelHakimModelScalarDelayShort', slice(None) ), +- (BrunelHakimModelScalarDelayNoSelfConnections, 'BrunelHakimModelScalarDelayNoSelfConnections', slice(None) ), +- (CUBA, 'CUBA', slice(None) ), +- (COBAHH, 'COBAHH', slice(None) ), +- (AdaptationOscillation, 'AdaptationOscillation', slice(None) ), +- (Vogels, 'Vogels', slice(None) ), +- (STDP, 'STDP', slice(None) ), +- (STDPEventDriven, 'STDPEventDriven', slice(None) ), +- (BrunelHakimModelScalarDelay, 'BrunelHakimModelScalarDelay', slice(None) ), +- +- (VerySparseMediumRateSynapsesOnly, 'VerySparseMediumRateSynapsesOnly', slice(None) ), +- (SparseMediumRateSynapsesOnly, 'SparseMediumRateSynapsesOnly', slice(None) ), +- (DenseMediumRateSynapsesOnly, 'DenseMediumRateSynapsesOnly', slice(None) ), +- (SparseLowRateSynapsesOnly, 'SparseLowRateSynapsesOnly', slice(None) ), +- (SparseHighRateSynapsesOnly, 'SparseHighRateSynapsesOnly', slice(None) ), +- +- (STDPNotEventDriven, 
'STDPNotEventDriven', slice(None) ), +- (STDPMultiPost, 'STDPMultiPost', slice(None) ), +- (STDPNeuronalTraces, 'STDPNeuronalTraces', slice(None) ), +- (STDPMultiPostNeuronalTraces, 'STDPMultiPostNeuronalTraces', slice(None) ), ++ #(ThresholderOnlyPoissonLowRate, 'ThresholderOnlyPoissonLowRate', slice(None) ), ++ #(ThresholderOnlyPoissonMediumRate, 'ThresholderOnlyPoissonMediumRate', slice(None) ), ++ #(ThresholderOnlyPoissonHighRate, 'ThresholderOnlyPoissonHighRate', slice(None) ), ++ #(ThresholderOnlyAlwaysSpiking, 'ThresholderOnlyAlwaysSpiking', slice(None) ), ++ ++ #(BrunelHakimStateupdateOnlyDouble, 'BrunelHakimStateupdateOnlyDouble', slice(None) ), ++ #(BrunelHakimStateupdateOnlyTriple, 'BrunelHakimStateupdateOnlyTriple', slice(None) ), ++ #(BrunelHakimStateupdateOnly, 'BrunelHakimStateupdateOnly', slice(None) ), ++ #(BrunelHakimNeuronsOnly, 'BrunelHakimNeuronsOnly', slice(None) ), ++ #(BrunelHakimNeuronsOnlyNoXi, 'BrunelHakimNeuronsOnlyNoXi', slice(None) ), ++ #(BrunelHakimNeuronsOnlyNoRand, 'BrunelHakimNeuronsOnlyNoRand', slice(None) ), ++ #(BrunelHakimStateupdateThresholdOnly, 'BrunelHakimStateupdateThresholdOnly', slice(None) ), ++ #(BrunelHakimStateupdateThresholdResetOnly, 'BrunelHakimStateupdateThresholdResetOnly', slice(None) ), ++ #(BrunelHakimModelScalarDelayShort, 'BrunelHakimModelScalarDelayShort', slice(None) ), ++ #(BrunelHakimModelScalarDelayNoSelfConnections, 'BrunelHakimModelScalarDelayNoSelfConnections', slice(None) ), ++ #(CUBA, 'CUBA', slice(None) ), ++ #(COBAHH, 'COBAHH', slice(None) ), ++ #(AdaptationOscillation, 'AdaptationOscillation', slice(None) ), ++ #(Vogels, 'Vogels', slice(None) ), ++ #(STDP, 'STDP', slice(None) ), ++ #(STDPEventDriven, 'STDPEventDriven', slice(None) ), ++ #(BrunelHakimModelScalarDelay, 'BrunelHakimModelScalarDelay', slice(None) ), ++ ++ #(VerySparseMediumRateSynapsesOnly, 'VerySparseMediumRateSynapsesOnly', slice(None) ), ++ #(SparseMediumRateSynapsesOnly, 'SparseMediumRateSynapsesOnly', slice(None) ), ++ 
#(DenseMediumRateSynapsesOnly, 'DenseMediumRateSynapsesOnly', slice(None) ), ++ #(SparseLowRateSynapsesOnly, 'SparseLowRateSynapsesOnly', slice(None) ), ++ #(SparseHighRateSynapsesOnly, 'SparseHighRateSynapsesOnly', slice(None) ), ++ ++ #(STDPNotEventDriven, 'STDPNotEventDriven', slice(None) ), ++ #(STDPMultiPost, 'STDPMultiPost', slice(None) ), ++ #(STDPNeuronalTraces, 'STDPNeuronalTraces', slice(None) ), ++ #(STDPMultiPostNeuronalTraces, 'STDPMultiPostNeuronalTraces', slice(None) ), + + (BrunelHakimModelHeterogeneousDelay, 'BrunelHakimModelHeterogeneousDelay', slice(None) ), + +- (LinearNeuronsOnly, 'LinearNeuronsOnly', slice(None) ), +- (HHNeuronsOnly, 'HHNeuronsOnly', slice(None) ), +- (VogelsWithSynapticDynamic, 'VogelsWithSynapticDynamic', slice(None) ), ++ #(LinearNeuronsOnly, 'LinearNeuronsOnly', slice(None) ), ++ #(HHNeuronsOnly, 'HHNeuronsOnly', slice(None) ), ++ #(VogelsWithSynapticDynamic, 'VogelsWithSynapticDynamic', slice(None) ), + +- ## below uses monitors +- (CUBAFixedConnectivity, 'CUBAFixedConnectivity', slice(None) ), +- (COBAHHFixedConnectivity, 'COBAHHFixedConnectivity', slice(None, -1) ), ++ ### below uses monitors ++ #(CUBAFixedConnectivity, 'CUBAFixedConnectivity', slice(None) ), ++ #(COBAHHFixedConnectivity, 'COBAHHFixedConnectivity', slice(None, -1) ), + ] + + configurations = [config[0] for config in configs] +@@ -205,6 +207,16 @@ try: + savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name))) + res.plot_all_tests(profiling_minimum=0.05) + savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name))) ++ ++ res.plot_all_tests() ++ ## this needs modification of brian2 code ++ #res.plot_all_tests(print_relative=True) ++ savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1]))) ++ res.plot_all_tests(relative=True) ++ savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name))) ++ res.plot_all_tests(profiling_minimum=0.05) ++ savefig(os.path.join(plot_dir, 
'speed_test_{}_profiling.svg'.format(name))) ++ + if 3 != len(get_fignums()): + print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1))) + for n in get_fignums(): +diff --git a/frozen_repos/brian2 b/frozen_repos/brian2 +--- a/frozen_repos/brian2 ++++ b/frozen_repos/brian2 +@@ -1 +1 @@ +-Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67 ++Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67-dirty +diff --git a/frozen_repos/brian2genn b/frozen_repos/brian2genn +--- a/frozen_repos/brian2genn ++++ b/frozen_repos/brian2genn +@@ -1 +1 @@ +-Subproject commit 0553cafeab49ea5403c0230411035df504d4db06 ++Subproject commit 0553cafeab49ea5403c0230411035df504d4db06-dirty diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_100000.txt b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_100000.txt new file mode 100644 index 00000000..e69de29b diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationProfileCPU_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationProfileCPU_1000.log new file mode 100644 index 00000000..a7caa3d5 --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationProfileCPU_1000.log @@ 
-0,0 +1,23 @@ +==22819== NVPROF is profiling process 22819, command: ./main +==22819== Profiling application: ./main +==22819== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 57.40% 374.75ms 10000 37.475us 2.2080us 87.617us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 22.46% 146.65ms 10000 14.664us 1.7280us 960.13us _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 6.51% 42.496ms 10000 4.2490us 4.0640us 5.4400us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 4.30% 28.046ms 10000 2.8040us 2.7200us 4.3840us _run_synapses_pre_push_spikes_advance_kernel(void) + 3.74% 24.420ms 10000 2.4420us 2.1120us 2.9120us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 3.10% 20.239ms 10000 2.0230us 1.9520us 2.2400us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 2.38% 15.522ms 10000 1.5520us 1.4400us 1.9520us _GLOBAL__N__69_tmpxft_0000578e_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.11% 732.55us 1 732.55us 732.55us 732.55us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 48.78% 862.78ms 60001 14.379us 2.6750us 965.88us cudaDeviceSynchronize + 45.73% 808.74ms 70001 11.553us 9.9030us 8.8572ms cudaLaunch + 3.59% 63.553ms 380005 167ns 135ns 324.80us cudaSetupArgument + 1.05% 18.499ms 70001 264ns 187ns 12.032us cudaConfigureCall + 0.82% 14.507ms 60002 241ns 191ns 11.691us cudaGetLastError + 0.02% 272.19us 1 272.19us 272.19us 272.19us cudaMalloc + 0.01% 153.60us 1 153.60us 153.60us 153.60us cudaMemGetInfo + 0.00% 30.182us 8 3.7720us 3.0160us 5.4880us 
cudaFuncGetAttributes + 0.00% 28.610us 39 733ns 616ns 2.0690us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 6.6020us 12 550ns 360ns 1.2870us cudaDeviceGetAttribute + 0.00% 2.7290us 3 909ns 660ns 1.3620us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU_1000.log new file mode 100644 index 00000000..3937cc58 --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU_1000.log @@ -0,0 +1,23 @@ +==23535== NVPROF is profiling process 23535, command: ./main +==23535== Profiling application: ./main +==23535== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 42.76% 145.65ms 10000 14.565us 1.9840us 1.1359ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 18.94% 64.522ms 10000 6.4520us 3.6480us 8.1280us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 12.92% 43.998ms 10000 4.3990us 4.1280us 5.5360us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 8.09% 27.540ms 10000 2.7530us 2.6880us 4.5120us _run_synapses_pre_push_spikes_advance_kernel(void) + 6.73% 22.929ms 10000 2.2920us 2.0800us 2.8170us 
kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 5.80% 19.741ms 10000 1.9740us 1.8560us 2.1760us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 4.55% 15.502ms 10000 1.5500us 1.4400us 2.1760us _GLOBAL__N__69_tmpxft_00005a32_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.21% 731.97us 1 731.97us 731.97us 731.97us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 55.46% 798.58ms 70001 11.408us 9.3530us 8.8769ms cudaLaunch + 37.87% 545.25ms 60001 9.0870us 2.4500us 1.1408ms cudaDeviceSynchronize + 4.14% 59.601ms 380005 156ns 125ns 312.80us cudaSetupArgument + 1.35% 19.377ms 70001 276ns 185ns 13.408us cudaConfigureCall + 1.16% 16.665ms 60002 277ns 203ns 11.227us cudaGetLastError + 0.02% 277.76us 1 277.76us 277.76us 277.76us cudaMalloc + 0.01% 156.74us 1 156.74us 156.74us 156.74us cudaMemGetInfo + 0.00% 32.165us 8 4.0200us 2.8580us 7.6380us cudaFuncGetAttributes + 0.00% 27.873us 39 714ns 603ns 1.7260us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 6.6820us 12 556ns 373ns 1.4220us cudaDeviceGetAttribute + 0.00% 2.8330us 3 944ns 675ns 1.4210us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png new file mode 100644 index 00000000..0631b80b Binary files /dev/null and 
b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png differ diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png new file mode 100644 index 00000000..41b4424e Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png differ diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png new file mode 100644 index 00000000..00de07cb Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png differ diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_min_15_BrunelHakimModelHeterogeneousDelay_profiling.png 
b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_min_15_BrunelHakimModelHeterogeneousDelay_profiling.png new file mode 100644 index 00000000..f78a4f2a Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_min_15_BrunelHakimModelHeterogeneousDelay_profiling.png differ diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/run_speed_test_script.py b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/run_speed_test_script.py new file mode 100644 index 00000000..f2e265e8 --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/run_speed_test_script.py @@ -0,0 +1,291 @@ +import os +import shutil +import glob +import subprocess +import sys +import socket + +# run tests without X-server +import matplotlib +matplotlib.use('Agg') + +# pretty plots +import seaborn + +import time +import datetime +import cPickle as pickle + +from brian2 import * +from brian2.tests.features import * +from brian2.tests.features.base import * +from brian2.tests.features.base import results + +import brian2cuda +from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfiguration, + CUDAStandaloneConfigurationNoAssert, + CUDAStandaloneConfigurationExtraThresholdKernel, + CUDAStandaloneConfigurationCurandDouble, + CUDAStandaloneConfigurationNoCudaOccupancyAPI, + CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, + CUDAStandaloneConfiguration2BlocksPerSM, + CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, + CUDAStandaloneConfigurationSynLaunchBounds, + 
CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, + CUDAStandaloneConfigurationProfileGPU, + CUDAStandaloneConfigurationProfileCPU, + CUDAStandaloneConfigurationTestBrunelHeteroAtomics, + CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, + CUDAStandaloneConfigurationPushAtomicResize, + CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, + CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, + CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU, + CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, + CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU) +from brian2cuda.tests.features.speed import * + +from brian2genn.correctness_testing import GeNNConfiguration, GeNNConfigurationCPU, GeNNConfigurationOptimized + +from create_readme import create_readme + +assert len(sys.argv)<= 2, 'Only one command line argument supported! Got {}'.format(len(sys.argv)-1) +if len(sys.argv) == 2: + additional_dir_name = '_' + sys.argv[1] +else: + additional_dir_name = '' + +prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j12'] + +# host specific settings +if socket.gethostname() == 'elnath': + prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] + prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') + prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) + +configs = [# configuration project_directory + #(NumpyConfiguration, None), + #(WeaveConfiguration, None), + #(LocalConfiguration, None), + #(CPPStandaloneConfiguration, 'cpp_standalone'), + #(CPPStandaloneConfigurationOpenMP, 'cpp_standalone'), + #(CUDAStandaloneConfiguration, 'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResize, 'cuda_standalone'), + #(CUDAStandaloneConfigurationTestBrunelHeteroAtomics, 'cuda_standalone'), + #(CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, 'cuda_standalone'), + #(CUDAStandaloneConfigurationExtraThresholdKernel, 'cuda_standalone'), + 
#(CUDAStandaloneConfigurationNoAssert, 'cuda_standalone'), + #(CUDAStandaloneConfigurationCurandDouble, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoCudaOccupancyAPI, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfigurationSynLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfigurationProfileGPU, 'cuda_standalone'), + (CUDAStandaloneConfigurationProfileCPU, 'cuda_standalone'), + (CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, 'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU, 'cuda_standalone'), + #(GeNNConfiguration, 'GeNNworkspace'), + #(GeNNConfigurationCPU, 'GeNNworkspace'), + #(GeNNConfigurationOptimized, 'GeNNworkspace') + ] + +speed_tests = [# feature_test name n_slice + + #(ThresholderOnlyPoissonLowRate, 'ThresholderOnlyPoissonLowRate', slice(None) ), + #(ThresholderOnlyPoissonMediumRate, 'ThresholderOnlyPoissonMediumRate', slice(None) ), + #(ThresholderOnlyPoissonHighRate, 'ThresholderOnlyPoissonHighRate', slice(None) ), + #(ThresholderOnlyAlwaysSpiking, 'ThresholderOnlyAlwaysSpiking', slice(None) ), + + #(BrunelHakimStateupdateOnlyDouble, 'BrunelHakimStateupdateOnlyDouble', slice(None) ), + #(BrunelHakimStateupdateOnlyTriple, 'BrunelHakimStateupdateOnlyTriple', slice(None) ), + #(BrunelHakimStateupdateOnly, 'BrunelHakimStateupdateOnly', slice(None) ), + #(BrunelHakimNeuronsOnly, 'BrunelHakimNeuronsOnly', slice(None) ), + 
#(BrunelHakimNeuronsOnlyNoXi, 'BrunelHakimNeuronsOnlyNoXi', slice(None) ), + #(BrunelHakimNeuronsOnlyNoRand, 'BrunelHakimNeuronsOnlyNoRand', slice(None) ), + #(BrunelHakimStateupdateThresholdOnly, 'BrunelHakimStateupdateThresholdOnly', slice(None) ), + #(BrunelHakimStateupdateThresholdResetOnly, 'BrunelHakimStateupdateThresholdResetOnly', slice(None) ), + #(BrunelHakimModelScalarDelayShort, 'BrunelHakimModelScalarDelayShort', slice(None) ), + #(BrunelHakimModelScalarDelayNoSelfConnections, 'BrunelHakimModelScalarDelayNoSelfConnections', slice(None) ), + #(CUBA, 'CUBA', slice(None) ), + #(COBAHH, 'COBAHH', slice(None) ), + #(AdaptationOscillation, 'AdaptationOscillation', slice(None) ), + #(Vogels, 'Vogels', slice(None) ), + #(STDP, 'STDP', slice(None) ), + #(STDPEventDriven, 'STDPEventDriven', slice(None) ), + #(BrunelHakimModelScalarDelay, 'BrunelHakimModelScalarDelay', slice(None) ), + + #(VerySparseMediumRateSynapsesOnly, 'VerySparseMediumRateSynapsesOnly', slice(None) ), + #(SparseMediumRateSynapsesOnly, 'SparseMediumRateSynapsesOnly', slice(None) ), + #(DenseMediumRateSynapsesOnly, 'DenseMediumRateSynapsesOnly', slice(None) ), + #(SparseLowRateSynapsesOnly, 'SparseLowRateSynapsesOnly', slice(None) ), + #(SparseHighRateSynapsesOnly, 'SparseHighRateSynapsesOnly', slice(None) ), + + #(STDPNotEventDriven, 'STDPNotEventDriven', slice(None) ), + #(STDPMultiPost, 'STDPMultiPost', slice(None) ), + #(STDPNeuronalTraces, 'STDPNeuronalTraces', slice(None) ), + #(STDPMultiPostNeuronalTraces, 'STDPMultiPostNeuronalTraces', slice(None) ), + + (BrunelHakimModelHeterogeneousDelay, 'BrunelHakimModelHeterogeneousDelay', slice(None) ), + + #(LinearNeuronsOnly, 'LinearNeuronsOnly', slice(None) ), + #(HHNeuronsOnly, 'HHNeuronsOnly', slice(None) ), + #(VogelsWithSynapticDynamic, 'VogelsWithSynapticDynamic', slice(None) ), + + ### below uses monitors + #(CUBAFixedConnectivity, 'CUBAFixedConnectivity', slice(None) ), + #(COBAHHFixedConnectivity, 'COBAHHFixedConnectivity', slice(None, 
-1) ), +] + +configurations = [config[0] for config in configs] +project_dirs = [config[1] for config in configs] + +# check if multiple Configurations with same project_dirs are specified +last_idx = {} +for proj_dir in project_dirs: + if proj_dir is not None: + first_i = project_dirs.index(proj_dir) + last_i = len(project_dirs) - 1 - project_dirs[::-1].index(proj_dir) + if first_i != last_i: + print("WARNING there are multiple configurations using {d} as project " + "directory. Profiling and logfiles will only be saved for the last one {c}.".format( + d=proj_dir, c=configurations[last_i].__name__)) + last_idx[proj_dir] = last_i + +time_stemp = time.time() +date_str = datetime.datetime.fromtimestamp(time_stemp).strftime('%Y_%m_%d') + +directory = 'results_{}{}'.format(date_str, additional_dir_name) +if os.path.exists(directory): + new_dir = directory + '_bak_' + str(int(time.time())) + print("Directory with name `{}` already exists. Renaming it to `{}`.".format(directory, new_dir)) + os.rename(directory, new_dir) +os.makedirs(directory) +data_dir = os.path.join(directory, 'data') +plot_dir = os.path.join(directory, 'plots') +log_dir = os.path.join(directory, 'logs') +prof_dir = os.path.join(directory, 'nvprof') +os.makedirs(data_dir) +os.makedirs(plot_dir) +os.makedirs(log_dir) +os.makedirs(prof_dir) +print("Saving results in {}.".format(plot_dir)) + +shutil.copy(os.path.realpath(__file__), os.path.join(directory, 'run_speed_test_script.py')) + +time_format = '%d.%m.%Y at %H:%M:%S' +script_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + +with open(os.path.join(directory, 'git.diff'), 'w') as diff_file: + subprocess.call(['git', 'diff'], stdout=diff_file) + +try: + for n, (st, name, sl) in enumerate(speed_tests): + start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + print("Starting {} on {}.".format(name, start)) + maximum_run_time = 1*60*60*second + res = run_speed_tests(configurations=configurations, + 
speed_tests=[st], + n_slice=sl, + #n_slice=slice(0,1,None), + run_twice=False, + verbose=True, + maximum_run_time=maximum_run_time#, + ## this needs modification of brian2 code + #profile_only_active=True + #profile_only_active=False + ) + end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + diff = datetime.datetime.strptime(end, time_format) - datetime.datetime.strptime(start, time_format) + print("Running {} took {}.".format(name, diff)) + res.plot_all_tests() + ## this needs modification of brian2 code + #res.plot_all_tests(print_relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.png'.format(speed_tests[n][1]))) + res.plot_all_tests(relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name))) + res.plot_all_tests(profiling_minimum=0.05) + savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name))) + res.plot_all_tests(profiling_minimum=0.15) + savefig(os.path.join(plot_dir, 'speed_test_min_15_{}_profiling.png'.format(name))) + + res.plot_all_tests() + ## this needs modification of brian2 code + #res.plot_all_tests(print_relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1]))) + res.plot_all_tests(relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name))) + res.plot_all_tests(profiling_minimum=0.05) + savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.svg'.format(name))) + res.plot_all_tests(profiling_minimum=0.15) + savefig(os.path.join(plot_dir, 'speed_test_min_15_{}_profiling.svg'.format(name))) + + if 3 != len(get_fignums()): + print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1))) + for n in get_fignums(): + close(n) + + # pickel results object to disk + pkl_file = os.path.join(data_dir, name + '.pkl' ) + with open(pkl_file, 'wb') as output: + pickle.dump(res, output, pickle.HIGHEST_PROTOCOL) + + # save stdout log of last run (the other 
are deleted in run_speed_tests()) + for proj_dir in set(project_dirs): + if not proj_dir is None and proj_dir in ['cuda_standalone', 'cpp_standalone']: + config = configurations[last_idx[proj_dir]] + stdout_file = os.path.join(proj_dir, 'results/stdout.txt') + if os.path.exists(stdout_file): + shutil.copy(stdout_file, + os.path.join(log_dir, 'stdout_{st}_{conf}_{n}.txt'.format(st=name, conf=proj_dir, + n=st.n_range[sl][-1]))) + else: + print("WARNING Couldn't save {},file not found.".format(stdout_file)) + + # run nvprof on n_range[2] + for conf, proj_dir in zip(configurations, project_dirs): + main_arg = '' + if proj_dir in ['cuda_standalone', 'GeNNworkspace']: + if proj_dir == 'GeNNworkspace': + main_arg = 'test {time} 1'.format(time=st.duration/second) + ns = st.n_range[sl] + idx = 2 + max_runtime = 20 + conf_name = conf.__name__ + print("Rerunning {} with n = {} for nvprof profiling".format(conf_name, st.n_range[idx])) + tb, res, runtime, prof_info = results(conf, st, st.n_range[idx], maximum_run_time=maximum_run_time) + if not isinstance(res, Exception) and runtime < max_runtime: + option = '--profile-from-start-off' if proj_dir == 'cuda_standalone' else '' + cmd = 'cd {proj_dir} && nvprof {opt} --log-file ../{log_file} ./main {arg}'.format( + proj_dir=proj_dir, arg=main_arg, opt=option, + log_file=os.path.join(prof_dir, 'nvprof_{st}_{conf}_{n}.log'.format( + st=name, conf=conf_name, n=st.n_range[idx]))) + prof_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + print(cmd) + x = os.system(cmd) + if x: + print('nvprof failed with {}'.format(x)) + prof_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + prof_diff = datetime.datetime.strptime(prof_end, time_format) - datetime.datetime.strptime(prof_start, time_format) + print("Profiling took {} for runtime of {}".format(prof_diff, runtime)) +finally: + create_readme(directory) + print("\nSummarized speed test results in {}".format(directory + '/README.md')) + 
script_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + script_diff = datetime.datetime.strptime(script_end, time_format) - datetime.datetime.strptime(script_start, time_format) + print("Finished speed test on {}. Total time = {}.".format( + datetime.datetime.fromtimestamp(time.time()).strftime(time_format), script_diff)) + + +##res.plot_all_tests(relative=True) +#for n in get_fignums(): +# plt.figure(n) +# savefig(plot_dir + '/speed_test_{}.png'.format(speed_tests[n-1][1])) + +## Debug (includes profiling infos) +#from brian2.tests.features.base import results +#for x in results(LocalConfiguration, LinearNeuronsOnly, 10, maximum_run_time=10*second): +# print x diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/README.md b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/README.md new file mode 100644 index 00000000..d43429c2 --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/README.md @@ -0,0 +1,92 @@ + +# Benchmark results from 28.11.2017 +## Description: + + + +## Last git log: +``` +commit 8987de24ed9f4a3b1a276496407fca1087f04004 +Author: Denis Alevi +Date: Mon Nov 20 14:31:09 2017 +0100 + + Fix critical section to include the actual pushing + +``` +There is also a `git diff` saved in the current directory. + +## Results + +### BrunelHakimModelHeterogeneousDelay +![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.svg) +![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.svg) +![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.svg) + +
Exemplary `nvprof` results for **CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize**

+Profile summary for `N = 1000`: + +``` +==2700== NVPROF is profiling process 2700, command: ./main +==2700== Profiling application: ./main +==2700== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 36.32% 123.91ms 2523 49.113us 14.176us 1.3924ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 18.81% 64.168ms 10000 6.4160us 3.5840us 8.5440us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 12.89% 43.962ms 10000 4.3960us 4.1600us 5.4080us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 8.62% 29.419ms 10000 2.9410us 2.8800us 4.2880us _run_synapses_pre_push_spikes_advance_kernel(void) + 6.74% 22.995ms 10000 2.2990us 2.0160us 2.8800us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 6.03% 20.585ms 10000 2.0580us 2.0160us 4.0960us [CUDA memcpy DtoH] + 5.48% 18.689ms 10000 1.8680us 1.7280us 2.2400us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 4.89% 16.676ms 10000 1.6670us 1.6000us 2.7520us _GLOBAL__N__69_tmpxft_000008bc_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.21% 732.10us 1 732.10us 732.10us 732.10us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 63.43% 648.25ms 62524 10.368us 8.7500us 8.8943ms cudaLaunch + 28.22% 288.45ms 10000 28.844us 18.477us 1.3838ms cudaMemcpy + 5.61% 57.386ms 350097 163ns 124ns 335.99us cudaSetupArgument + 1.38% 14.127ms 62524 225ns 161ns 321.95us cudaConfigureCall + 1.30% 13.336ms 52525 253ns 200ns 300.39us cudaGetLastError + 0.03% 268.04us 1 268.04us 268.04us 268.04us cudaMalloc + 0.02% 166.72us 1 166.72us 166.72us 166.72us 
cudaMemGetInfo + 0.00% 30.363us 39 778ns 650ns 2.4670us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 29.284us 8 3.6600us 2.8650us 6.1410us cudaFuncGetAttributes + 0.00% 13.545us 1 13.545us 13.545us 13.545us cudaDeviceSynchronize + 0.00% 6.1940us 12 516ns 337ns 1.4590us cudaDeviceGetAttribute + 0.00% 3.8130us 3 1.2710us 863ns 1.9980us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==1993== NVPROF is profiling process 1993, command: ./main +==1993== Profiling application: ./main +==1993== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 56.03% 352.83ms 10000 35.283us 2.0480us 87.201us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 23.69% 149.15ms 10000 14.915us 1.6320us 1.3164ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 6.70% 42.158ms 10000 4.2150us 3.8080us 5.6320us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 4.32% 27.228ms 10000 2.7220us 2.4960us 4.5120us _run_synapses_pre_push_spikes_advance_kernel(void) + 3.61% 22.747ms 10000 2.2740us 1.9200us 3.7760us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 3.00% 18.918ms 10000 1.8910us 1.7280us 3.7440us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 2.53% 15.914ms 10000 1.5910us 1.3440us 3.8080us _GLOBAL__N__69_tmpxft_000005de_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.12% 731.65us 1 731.65us 731.65us 731.65us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 89.24% 744.81ms 70001 10.639us 8.5770us 8.8242ms cudaLaunch + 7.22% 60.281ms 380005 158ns 121ns 336.34us cudaSetupArgument + 1.85% 15.427ms 70001 220ns 159ns 319.22us cudaConfigureCall + 1.60% 13.340ms 60002 222ns 175ns 326.16us cudaGetLastError + 0.04% 332.76us 1 332.76us 332.76us 332.76us cudaDeviceSynchronize + 0.03% 253.93us 1 253.93us 253.93us 253.93us cudaMalloc + 0.02% 146.47us 1 146.47us 146.47us 146.47us cudaMemGetInfo + 0.00% 29.198us 8 3.6490us 2.7670us 6.3670us 
cudaFuncGetAttributes + 0.00% 27.382us 39 702ns 578ns 1.8100us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 6.0870us 12 507ns 326ns 1.3870us cudaDeviceGetAttribute + 0.00% 3.7450us 3 1.2480us 822ns 2.0410us cudaGetDevice + +``` + +

+ + diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/data/BrunelHakimModelHeterogeneousDelay.pkl b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/data/BrunelHakimModelHeterogeneousDelay.pkl new file mode 100644 index 00000000..51f30c2c Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/data/BrunelHakimModelHeterogeneousDelay.pkl differ diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/git.diff b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/git.diff new file mode 100644 index 00000000..44a84fa2 --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/git.diff @@ -0,0 +1,305 @@ +diff --git a/brian2cuda/tests/features/cuda_configuration.py b/brian2cuda/tests/features/cuda_configuration.py +index 250687d..622f73a 100644 +--- a/brian2cuda/tests/features/cuda_configuration.py ++++ b/brian2cuda/tests/features/cuda_configuration.py +@@ -225,7 +225,7 @@ class CUDAStandaloneConfigurationProfileCPU(Configuration): + with_output=False) + + class CUDAStandaloneConfigurationTestBrunelHeteroAtomics(Configuration): +- name = 'CUDA standalone with atomics in heterog delay mode' ++ name = 'CUDA standalone with atomics in effect application' + def before_run(self): + brian2.set_device('cuda_standalone', build_on_run=False) + prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True +@@ -248,7 +248,7 @@ class CUDAStandaloneConfigurationTestBrunelHeteroAtomics(Configuration): + with_output=False) + + 
class CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU(Configuration): +- name = "CUDA standalone with atomics in heterog delay mode (profile='blocking')" ++ name = "CUDA standalone with atomics in effect application (profile='blocking')" + def before_run(self): + brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking') + prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True +@@ -270,12 +270,10 @@ class CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU(Configuration + brian2.device.build(directory='cuda_standalone', compile=True, run=True, + with_output=False) + +- + class CUDAStandaloneConfigurationPushAtomicResize(Configuration): +- name = "CUDA standalone with atomics in queue resize" ++ name = "CUDA standalone with atomics in spikequeue resize" + def before_run(self): + brian2.set_device('cuda_standalone', build_on_run=False) +- prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True + prefs["devices.cuda_standalone.push_atomic_resize"] = True + if socket.gethostname() == 'elnath': + if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']: +@@ -295,8 +293,82 @@ class CUDAStandaloneConfigurationPushAtomicResize(Configuration): + brian2.device.build(directory='cuda_standalone', compile=True, run=True, + with_output=False) + ++ + class CUDAStandaloneConfigurationPushAtomicResizeProfileCPU(Configuration): +- name = "CUDA standalone with atomics in queue resize (profile='blocking')" ++ name = "CUDA standalone with atomics in spikequeue resize (profile='blocking')" ++ def before_run(self): ++ brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking') ++ prefs["devices.cuda_standalone.push_atomic_resize"] = True ++ if socket.gethostname() == 'elnath': ++ if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']: ++ prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ 
prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) ++ elif socket.gethostname() == 'sabik': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52']) ++ elif socket.gethostname() == 'eltanin': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61']) ++ ++ def after_run(self): ++ if os.path.exists('cuda_standalone'): ++ shutil.rmtree('cuda_standalone') ++ brian2.device.build(directory='cuda_standalone', compile=True, run=True, ++ with_output=False) ++ ++ ++class CUDAStandaloneConfigurationPushAtomicResizProfileCPU(Configuration): ++ name = "CUDA standalone with atomics in spikequeue resize (profile='blocking)" ++ def before_run(self): ++ brian2.set_device('cuda_standalone', build_on_run=False, ++ profile='blocking') ++ prefs["devices.cuda_standalone.push_atomic_resize"] = True ++ if socket.gethostname() == 'elnath': ++ if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']: ++ prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) ++ elif socket.gethostname() == 'sabik': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52']) ++ elif socket.gethostname() == 'eltanin': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61']) ++ ++ def after_run(self): ++ if os.path.exists('cuda_standalone'): ++ shutil.rmtree('cuda_standalone') ++ brian2.device.build(directory='cuda_standalone', compile=True, run=True, ++ with_output=False) ++ ++ ++class CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize(Configuration): ++ name = "CUDA standalone with 
atomics in effect application and in spikequeue resize" ++ def before_run(self): ++ brian2.set_device('cuda_standalone', build_on_run=False) ++ prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True ++ prefs["devices.cuda_standalone.push_atomic_resize"] = True ++ if socket.gethostname() == 'elnath': ++ if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']: ++ prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) ++ elif socket.gethostname() == 'sabik': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52']) ++ elif socket.gethostname() == 'eltanin': ++ prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') ++ prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61']) ++ ++ def after_run(self): ++ if os.path.exists('cuda_standalone'): ++ shutil.rmtree('cuda_standalone') ++ brian2.device.build(directory='cuda_standalone', compile=True, run=True, ++ with_output=False) ++ ++class CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU(Configuration): ++ name = "CUDA standalone with atomics in effect application and in spikequeue resize (profile='blocking')" + def before_run(self): + brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking') + prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True +diff --git a/dev/benchmarks/run_speed_tests.py b/dev/benchmarks/run_speed_tests.py +index 2518634..b525e97 100644 +--- a/dev/benchmarks/run_speed_tests.py ++++ b/dev/benchmarks/run_speed_tests.py +@@ -37,6 +37,7 @@ from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfigur + CUDAStandaloneConfigurationTestBrunelHeteroAtomics, + CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, + 
CUDAStandaloneConfigurationPushAtomicResize, ++ CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, + CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, + CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, + CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU) +@@ -61,79 +62,80 @@ if socket.gethostname() == 'elnath': + prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) + + configs = [# configuration project_directory +- (NumpyConfiguration, None), +- (WeaveConfiguration, None), +- (LocalConfiguration, None), ++ #(NumpyConfiguration, None), ++ #(WeaveConfiguration, None), ++ #(LocalConfiguration, None), ++ #(CPPStandaloneConfiguration, 'cpp_standalone'), ++ #(CPPStandaloneConfigurationOpenMP, 'cpp_standalone'), + (CUDAStandaloneConfiguration, 'cuda_standalone'), +- (CUDAStandaloneConfigurationExtraThresholdKernel, 'cuda_standalone'), +- (CUDAStandaloneConfigurationNoAssert, 'cuda_standalone'), +- (CUDAStandaloneConfigurationCurandDouble, 'cuda_standalone'), +- (CUDAStandaloneConfigurationNoCudaOccupancyAPI, 'cuda_standalone'), +- (CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, 'cuda_standalone'), +- (CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'), +- (CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'), +- (CUDAStandaloneConfigurationSynLaunchBounds, 'cuda_standalone'), +- (CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'), +- (CUDAStandaloneConfigurationProfileGPU, 'cuda_standalone'), +- (CUDAStandaloneConfigurationProfileCPU, 'cuda_standalone'), +- (CPPStandaloneConfiguration, 'cpp_standalone'), +- (CUDAStandaloneConfigurationTestBrunelHeteroAtomics, 'cuda_standalone'), +- (CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, 'cuda_standalone'), + (CUDAStandaloneConfigurationPushAtomicResize, 'cuda_standalone'), +- (CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, 'cuda_standalone'), +- 
(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, 'cuda_standalone'), +- (CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU, 'cuda_standalone'), +- (GeNNConfiguration, 'GeNNworkspace'), +- (CPPStandaloneConfigurationOpenMP, 'cpp_standalone'), +- (GeNNConfigurationCPU, 'GeNNworkspace'), +- (GeNNConfigurationOptimized, 'GeNNworkspace') ++ (CUDAStandaloneConfigurationTestBrunelHeteroAtomics, 'cuda_standalone'), ++ (CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationExtraThresholdKernel, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationNoAssert, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationCurandDouble, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationNoCudaOccupancyAPI, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'), ++ #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationSynLaunchBounds, 'cuda_standalone'), ++ #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationProfileGPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationProfileCPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, 'cuda_standalone'), ++ #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU, 'cuda_standalone'), ++ #(GeNNConfiguration, 'GeNNworkspace'), ++ #(GeNNConfigurationCPU, 'GeNNworkspace'), ++ #(GeNNConfigurationOptimized, 'GeNNworkspace') + ] + + speed_tests = [# feature_test name n_slice + +- (ThresholderOnlyPoissonLowRate, 'ThresholderOnlyPoissonLowRate', slice(None) ), +- (ThresholderOnlyPoissonMediumRate, 'ThresholderOnlyPoissonMediumRate', slice(None) ), 
+- (ThresholderOnlyPoissonHighRate, 'ThresholderOnlyPoissonHighRate', slice(None) ), +- (ThresholderOnlyAlwaysSpiking, 'ThresholderOnlyAlwaysSpiking', slice(None) ), +- +- (BrunelHakimStateupdateOnlyDouble, 'BrunelHakimStateupdateOnlyDouble', slice(None) ), +- (BrunelHakimStateupdateOnlyTriple, 'BrunelHakimStateupdateOnlyTriple', slice(None) ), +- (BrunelHakimStateupdateOnly, 'BrunelHakimStateupdateOnly', slice(None) ), +- (BrunelHakimNeuronsOnly, 'BrunelHakimNeuronsOnly', slice(None) ), +- (BrunelHakimNeuronsOnlyNoXi, 'BrunelHakimNeuronsOnlyNoXi', slice(None) ), +- (BrunelHakimNeuronsOnlyNoRand, 'BrunelHakimNeuronsOnlyNoRand', slice(None) ), +- (BrunelHakimStateupdateThresholdOnly, 'BrunelHakimStateupdateThresholdOnly', slice(None) ), +- (BrunelHakimStateupdateThresholdResetOnly, 'BrunelHakimStateupdateThresholdResetOnly', slice(None) ), +- (BrunelHakimModelScalarDelayShort, 'BrunelHakimModelScalarDelayShort', slice(None) ), +- (BrunelHakimModelScalarDelayNoSelfConnections, 'BrunelHakimModelScalarDelayNoSelfConnections', slice(None) ), +- (CUBA, 'CUBA', slice(None) ), +- (COBAHH, 'COBAHH', slice(None) ), +- (AdaptationOscillation, 'AdaptationOscillation', slice(None) ), +- (Vogels, 'Vogels', slice(None) ), +- (STDP, 'STDP', slice(None) ), +- (STDPEventDriven, 'STDPEventDriven', slice(None) ), +- (BrunelHakimModelScalarDelay, 'BrunelHakimModelScalarDelay', slice(None) ), +- +- (VerySparseMediumRateSynapsesOnly, 'VerySparseMediumRateSynapsesOnly', slice(None) ), +- (SparseMediumRateSynapsesOnly, 'SparseMediumRateSynapsesOnly', slice(None) ), +- (DenseMediumRateSynapsesOnly, 'DenseMediumRateSynapsesOnly', slice(None) ), +- (SparseLowRateSynapsesOnly, 'SparseLowRateSynapsesOnly', slice(None) ), +- (SparseHighRateSynapsesOnly, 'SparseHighRateSynapsesOnly', slice(None) ), +- +- (STDPNotEventDriven, 'STDPNotEventDriven', slice(None) ), +- (STDPMultiPost, 'STDPMultiPost', slice(None) ), +- (STDPNeuronalTraces, 'STDPNeuronalTraces', slice(None) ), +- 
(STDPMultiPostNeuronalTraces, 'STDPMultiPostNeuronalTraces', slice(None) ), ++ #(ThresholderOnlyPoissonLowRate, 'ThresholderOnlyPoissonLowRate', slice(None) ), ++ #(ThresholderOnlyPoissonMediumRate, 'ThresholderOnlyPoissonMediumRate', slice(None) ), ++ #(ThresholderOnlyPoissonHighRate, 'ThresholderOnlyPoissonHighRate', slice(None) ), ++ #(ThresholderOnlyAlwaysSpiking, 'ThresholderOnlyAlwaysSpiking', slice(None) ), ++ ++ #(BrunelHakimStateupdateOnlyDouble, 'BrunelHakimStateupdateOnlyDouble', slice(None) ), ++ #(BrunelHakimStateupdateOnlyTriple, 'BrunelHakimStateupdateOnlyTriple', slice(None) ), ++ #(BrunelHakimStateupdateOnly, 'BrunelHakimStateupdateOnly', slice(None) ), ++ #(BrunelHakimNeuronsOnly, 'BrunelHakimNeuronsOnly', slice(None) ), ++ #(BrunelHakimNeuronsOnlyNoXi, 'BrunelHakimNeuronsOnlyNoXi', slice(None) ), ++ #(BrunelHakimNeuronsOnlyNoRand, 'BrunelHakimNeuronsOnlyNoRand', slice(None) ), ++ #(BrunelHakimStateupdateThresholdOnly, 'BrunelHakimStateupdateThresholdOnly', slice(None) ), ++ #(BrunelHakimStateupdateThresholdResetOnly, 'BrunelHakimStateupdateThresholdResetOnly', slice(None) ), ++ #(BrunelHakimModelScalarDelayShort, 'BrunelHakimModelScalarDelayShort', slice(None) ), ++ #(BrunelHakimModelScalarDelayNoSelfConnections, 'BrunelHakimModelScalarDelayNoSelfConnections', slice(None) ), ++ #(CUBA, 'CUBA', slice(None) ), ++ #(COBAHH, 'COBAHH', slice(None) ), ++ #(AdaptationOscillation, 'AdaptationOscillation', slice(None) ), ++ #(Vogels, 'Vogels', slice(None) ), ++ #(STDP, 'STDP', slice(None) ), ++ #(STDPEventDriven, 'STDPEventDriven', slice(None) ), ++ #(BrunelHakimModelScalarDelay, 'BrunelHakimModelScalarDelay', slice(None) ), ++ ++ #(VerySparseMediumRateSynapsesOnly, 'VerySparseMediumRateSynapsesOnly', slice(None) ), ++ #(SparseMediumRateSynapsesOnly, 'SparseMediumRateSynapsesOnly', slice(None) ), ++ #(DenseMediumRateSynapsesOnly, 'DenseMediumRateSynapsesOnly', slice(None) ), ++ #(SparseLowRateSynapsesOnly, 'SparseLowRateSynapsesOnly', slice(None) ), ++ 
#(SparseHighRateSynapsesOnly, 'SparseHighRateSynapsesOnly', slice(None) ), ++ ++ #(STDPNotEventDriven, 'STDPNotEventDriven', slice(None) ), ++ #(STDPMultiPost, 'STDPMultiPost', slice(None) ), ++ #(STDPNeuronalTraces, 'STDPNeuronalTraces', slice(None) ), ++ #(STDPMultiPostNeuronalTraces, 'STDPMultiPostNeuronalTraces', slice(None) ), + + (BrunelHakimModelHeterogeneousDelay, 'BrunelHakimModelHeterogeneousDelay', slice(None) ), + +- (LinearNeuronsOnly, 'LinearNeuronsOnly', slice(None) ), +- (HHNeuronsOnly, 'HHNeuronsOnly', slice(None) ), +- (VogelsWithSynapticDynamic, 'VogelsWithSynapticDynamic', slice(None) ), ++ #(LinearNeuronsOnly, 'LinearNeuronsOnly', slice(None) ), ++ #(HHNeuronsOnly, 'HHNeuronsOnly', slice(None) ), ++ #(VogelsWithSynapticDynamic, 'VogelsWithSynapticDynamic', slice(None) ), + +- ## below uses monitors +- (CUBAFixedConnectivity, 'CUBAFixedConnectivity', slice(None) ), +- (COBAHHFixedConnectivity, 'COBAHHFixedConnectivity', slice(None, -1) ), ++ ### below uses monitors ++ #(CUBAFixedConnectivity, 'CUBAFixedConnectivity', slice(None) ), ++ #(COBAHHFixedConnectivity, 'COBAHHFixedConnectivity', slice(None, -1) ), + ] + + configurations = [config[0] for config in configs] +@@ -205,6 +207,16 @@ try: + savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name))) + res.plot_all_tests(profiling_minimum=0.05) + savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name))) ++ ++ res.plot_all_tests() ++ ## this needs modification of brian2 code ++ #res.plot_all_tests(print_relative=True) ++ savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1]))) ++ res.plot_all_tests(relative=True) ++ savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name))) ++ res.plot_all_tests(profiling_minimum=0.05) ++ savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.svg'.format(name))) ++ + if 3 != len(get_fignums()): + print("WARNING: There were {} plots created, but only {} 
saved.".format(len(get_fignums()), 3*(n+1))) + for n in get_fignums(): +diff --git a/frozen_repos/brian2 b/frozen_repos/brian2 +--- a/frozen_repos/brian2 ++++ b/frozen_repos/brian2 +@@ -1 +1 @@ +-Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67 ++Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67-dirty +diff --git a/frozen_repos/brian2genn b/frozen_repos/brian2genn +--- a/frozen_repos/brian2genn ++++ b/frozen_repos/brian2genn +@@ -1 +1 @@ +-Subproject commit 0553cafeab49ea5403c0230411035df504d4db06 ++Subproject commit 0553cafeab49ea5403c0230411035df504d4db06-dirty diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/logs/stdout_BrunelHakimModelHeterogeneousDelay_cpp_standalone_50000.txt b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/logs/stdout_BrunelHakimModelHeterogeneousDelay_cpp_standalone_50000.txt new file mode 100644 index 00000000..0cd8723a --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/logs/stdout_BrunelHakimModelHeterogeneousDelay_cpp_standalone_50000.txt @@ -0,0 +1 @@ +Number of synapses: 50008503 diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_50000.txt b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_50000.txt new file mode 100644 index 00000000..8a832b67 --- /dev/null +++ 
b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_50000.txt @@ -0,0 +1,53 @@ +INFO: setting cudaDevice stuff took 0.359616 seconds +INFO kernel_synapses_group_variable_set_conditional_codeobject + 48825 blocks + 1024 threads + 8 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory +INFO connectivity matrix has size 49995881 +INFO generating 13100000 randn every 262 clock cycles for neurongroup_stateupdater_codeobject +INFO kernel_neurongroup_stateupdater_codeobject + 66 blocks + 768 threads + 36 registers per block + 0 bytes statically-allocated shared memory per block + 8 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.750 theoretical occupancy +INFO kernel_neurongroup_thresholder_codeobject + 49 blocks + 1024 threads + 15 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO _run_synapses_pre_push_spikes_push_kernel + 15 blocks + 110 threads + 78 registers per block + 0 bytes statically-allocated shared memory per block + 16 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.312 theoretical occupancy +INFO kernel_synapses_pre_codeobject + 15 blocks + 1024 threads + 21 registers per block + 0 bytes statically-allocated shared memory per block + 8 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_neurongroup_resetter_codeobject + 49 blocks + 1024 threads + 14 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +Number of synapses: 49995881 
+INFO: main_lines took 138.083454 seconds +INFO: main function took 139.763504 seconds diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize_1000.log new file mode 100644 index 00000000..d3b0f919 --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize_1000.log @@ -0,0 +1,25 @@ +==2700== NVPROF is profiling process 2700, command: ./main +==2700== Profiling application: ./main +==2700== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 36.32% 123.91ms 2523 49.113us 14.176us 1.3924ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 18.81% 64.168ms 10000 6.4160us 3.5840us 8.5440us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 12.89% 43.962ms 10000 4.3960us 4.1600us 5.4080us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 8.62% 29.419ms 10000 2.9410us 2.8800us 4.2880us _run_synapses_pre_push_spikes_advance_kernel(void) + 6.74% 22.995ms 10000 2.2990us 2.0160us 2.8800us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 6.03% 20.585ms 10000 2.0580us 2.0160us 4.0960us [CUDA memcpy DtoH] + 5.48% 18.689ms 10000 
1.8680us 1.7280us 2.2400us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 4.89% 16.676ms 10000 1.6670us 1.6000us 2.7520us _GLOBAL__N__69_tmpxft_000008bc_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.21% 732.10us 1 732.10us 732.10us 732.10us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 63.43% 648.25ms 62524 10.368us 8.7500us 8.8943ms cudaLaunch + 28.22% 288.45ms 10000 28.844us 18.477us 1.3838ms cudaMemcpy + 5.61% 57.386ms 350097 163ns 124ns 335.99us cudaSetupArgument + 1.38% 14.127ms 62524 225ns 161ns 321.95us cudaConfigureCall + 1.30% 13.336ms 52525 253ns 200ns 300.39us cudaGetLastError + 0.03% 268.04us 1 268.04us 268.04us 268.04us cudaMalloc + 0.02% 166.72us 1 166.72us 166.72us 166.72us cudaMemGetInfo + 0.00% 30.363us 39 778ns 650ns 2.4670us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 29.284us 8 3.6600us 2.8650us 6.1410us cudaFuncGetAttributes + 0.00% 13.545us 1 13.545us 13.545us 13.545us cudaDeviceSynchronize + 0.00% 6.1940us 12 516ns 337ns 1.4590us cudaDeviceGetAttribute + 0.00% 3.8130us 3 1.2710us 863ns 1.9980us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..fa2e29f5 --- /dev/null +++ 
b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,23 @@ +==1993== NVPROF is profiling process 1993, command: ./main +==1993== Profiling application: ./main +==1993== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 56.03% 352.83ms 10000 35.283us 2.0480us 87.201us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*) + 23.69% 149.15ms 10000 14.915us 1.6320us 1.3164ms _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*) + 6.70% 42.158ms 10000 4.2150us 3.8080us 5.6320us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*) + 4.32% 27.228ms 10000 2.7220us 2.4960us 4.5120us _run_synapses_pre_push_spikes_advance_kernel(void) + 3.61% 22.747ms 10000 2.2740us 1.9200us 3.7760us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 3.00% 18.918ms 10000 1.8910us 1.7280us 3.7440us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 2.53% 15.914ms 10000 1.5910us 1.3440us 3.8080us _GLOBAL__N__69_tmpxft_000005de_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + 0.12% 731.65us 1 731.65us 731.65us 731.65us void gen_sequenced(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st) + API calls: 89.24% 744.81ms 70001 10.639us 8.5770us 8.8242ms cudaLaunch + 7.22% 60.281ms 380005 158ns 121ns 336.34us cudaSetupArgument + 1.85% 15.427ms 70001 220ns 159ns 319.22us cudaConfigureCall + 1.60% 13.340ms 60002 222ns 175ns 326.16us cudaGetLastError + 0.04% 332.76us 1 332.76us 
332.76us 332.76us cudaDeviceSynchronize + 0.03% 253.93us 1 253.93us 253.93us 253.93us cudaMalloc + 0.02% 146.47us 1 146.47us 146.47us 146.47us cudaMemGetInfo + 0.00% 29.198us 8 3.6490us 2.7670us 6.3670us cudaFuncGetAttributes + 0.00% 27.382us 39 702ns 578ns 1.8100us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 6.0870us 12 507ns 326ns 1.3870us cudaDeviceGetAttribute + 0.00% 3.7450us 3 1.2480us 822ns 2.0410us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png new file mode 100644 index 00000000..16721228 Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png differ diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png new file mode 100644 index 00000000..b9973c28 Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png differ diff --git 
a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png new file mode 100644 index 00000000..d9ec9aa8 Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png differ diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/run_speed_test_script.py b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/run_speed_test_script.py new file mode 100644 index 00000000..97f4c06f --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/run_speed_test_script.py @@ -0,0 +1,285 @@ +import os +import shutil +import glob +import subprocess +import sys +import socket + +# run tests without X-server +import matplotlib +matplotlib.use('Agg') + +# pretty plots +import seaborn + +import time +import datetime +import cPickle as pickle + +from brian2 import * +from brian2.tests.features import * +from brian2.tests.features.base import * +from brian2.tests.features.base import results + +import brian2cuda +from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfiguration, + CUDAStandaloneConfigurationNoAssert, + CUDAStandaloneConfigurationExtraThresholdKernel, + CUDAStandaloneConfigurationCurandDouble, + CUDAStandaloneConfigurationNoCudaOccupancyAPI, + 
CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, + CUDAStandaloneConfiguration2BlocksPerSM, + CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, + CUDAStandaloneConfigurationSynLaunchBounds, + CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, + CUDAStandaloneConfigurationProfileGPU, + CUDAStandaloneConfigurationProfileCPU, + CUDAStandaloneConfigurationTestBrunelHeteroAtomics, + CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, + CUDAStandaloneConfigurationPushAtomicResize, + CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, + CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, + CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, + CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU) +from brian2cuda.tests.features.speed import * + +from brian2genn.correctness_testing import GeNNConfiguration, GeNNConfigurationCPU, GeNNConfigurationOptimized + +from create_readme import create_readme + +assert len(sys.argv)<= 2, 'Only one command line argument supported! 
Got {}'.format(len(sys.argv)-1) +if len(sys.argv) == 2: + additional_dir_name = '_' + sys.argv[1] +else: + additional_dir_name = '' + +prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j12'] + +# host specific settings +if socket.gethostname() == 'elnath': + prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] + prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') + prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) + +configs = [# configuration project_directory + #(NumpyConfiguration, None), + #(WeaveConfiguration, None), + #(LocalConfiguration, None), + (CPPStandaloneConfiguration, 'cpp_standalone'), + (CPPStandaloneConfigurationOpenMP, 'cpp_standalone'), + (CUDAStandaloneConfiguration, 'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResize, 'cuda_standalone'), + #(CUDAStandaloneConfigurationTestBrunelHeteroAtomics, 'cuda_standalone'), + (CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, 'cuda_standalone'), + #(CUDAStandaloneConfigurationExtraThresholdKernel, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoAssert, 'cuda_standalone'), + #(CUDAStandaloneConfigurationCurandDouble, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoCudaOccupancyAPI, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfigurationSynLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfigurationProfileGPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, 
'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU, 'cuda_standalone'), + #(GeNNConfiguration, 'GeNNworkspace'), + #(GeNNConfigurationCPU, 'GeNNworkspace'), + #(GeNNConfigurationOptimized, 'GeNNworkspace') + ] + +speed_tests = [# feature_test name n_slice + + #(ThresholderOnlyPoissonLowRate, 'ThresholderOnlyPoissonLowRate', slice(None) ), + #(ThresholderOnlyPoissonMediumRate, 'ThresholderOnlyPoissonMediumRate', slice(None) ), + #(ThresholderOnlyPoissonHighRate, 'ThresholderOnlyPoissonHighRate', slice(None) ), + #(ThresholderOnlyAlwaysSpiking, 'ThresholderOnlyAlwaysSpiking', slice(None) ), + + #(BrunelHakimStateupdateOnlyDouble, 'BrunelHakimStateupdateOnlyDouble', slice(None) ), + #(BrunelHakimStateupdateOnlyTriple, 'BrunelHakimStateupdateOnlyTriple', slice(None) ), + #(BrunelHakimStateupdateOnly, 'BrunelHakimStateupdateOnly', slice(None) ), + #(BrunelHakimNeuronsOnly, 'BrunelHakimNeuronsOnly', slice(None) ), + #(BrunelHakimNeuronsOnlyNoXi, 'BrunelHakimNeuronsOnlyNoXi', slice(None) ), + #(BrunelHakimNeuronsOnlyNoRand, 'BrunelHakimNeuronsOnlyNoRand', slice(None) ), + #(BrunelHakimStateupdateThresholdOnly, 'BrunelHakimStateupdateThresholdOnly', slice(None) ), + #(BrunelHakimStateupdateThresholdResetOnly, 'BrunelHakimStateupdateThresholdResetOnly', slice(None) ), + #(BrunelHakimModelScalarDelayShort, 'BrunelHakimModelScalarDelayShort', slice(None) ), + #(BrunelHakimModelScalarDelayNoSelfConnections, 'BrunelHakimModelScalarDelayNoSelfConnections', slice(None) ), + #(CUBA, 'CUBA', slice(None) ), + #(COBAHH, 'COBAHH', slice(None) ), + #(AdaptationOscillation, 'AdaptationOscillation', slice(None) ), + #(Vogels, 'Vogels', slice(None) ), + #(STDP, 'STDP', slice(None) ), + #(STDPEventDriven, 'STDPEventDriven', slice(None) ), + #(BrunelHakimModelScalarDelay, 'BrunelHakimModelScalarDelay', slice(None) ), + + #(VerySparseMediumRateSynapsesOnly, 'VerySparseMediumRateSynapsesOnly', slice(None) ), + #(SparseMediumRateSynapsesOnly, 
'SparseMediumRateSynapsesOnly', slice(None) ), + #(DenseMediumRateSynapsesOnly, 'DenseMediumRateSynapsesOnly', slice(None) ), + #(SparseLowRateSynapsesOnly, 'SparseLowRateSynapsesOnly', slice(None) ), + #(SparseHighRateSynapsesOnly, 'SparseHighRateSynapsesOnly', slice(None) ), + + #(STDPNotEventDriven, 'STDPNotEventDriven', slice(None) ), + #(STDPMultiPost, 'STDPMultiPost', slice(None) ), + #(STDPNeuronalTraces, 'STDPNeuronalTraces', slice(None) ), + #(STDPMultiPostNeuronalTraces, 'STDPMultiPostNeuronalTraces', slice(None) ), + + (BrunelHakimModelHeterogeneousDelay, 'BrunelHakimModelHeterogeneousDelay', slice(0,-1,1) ), + + #(LinearNeuronsOnly, 'LinearNeuronsOnly', slice(None) ), + #(HHNeuronsOnly, 'HHNeuronsOnly', slice(None) ), + #(VogelsWithSynapticDynamic, 'VogelsWithSynapticDynamic', slice(None) ), + + ### below uses monitors + #(CUBAFixedConnectivity, 'CUBAFixedConnectivity', slice(None) ), + #(COBAHHFixedConnectivity, 'COBAHHFixedConnectivity', slice(None, -1) ), +] + +configurations = [config[0] for config in configs] +project_dirs = [config[1] for config in configs] + +# check if multiple Configurations with same project_dirs are specified +last_idx = {} +for proj_dir in project_dirs: + if proj_dir is not None: + first_i = project_dirs.index(proj_dir) + last_i = len(project_dirs) - 1 - project_dirs[::-1].index(proj_dir) + if first_i != last_i: + print("WARNING there are multiple configurations using {d} as project " + "directory. Profiling and logfiles will only be saved for the last one {c}.".format( + d=proj_dir, c=configurations[last_i].__name__)) + last_idx[proj_dir] = last_i + +time_stemp = time.time() +date_str = datetime.datetime.fromtimestamp(time_stemp).strftime('%Y_%m_%d') + +directory = 'results_{}{}'.format(date_str, additional_dir_name) +if os.path.exists(directory): + new_dir = directory + '_bak_' + str(int(time.time())) + print("Directory with name `{}` already exists. 
Renaming it to `{}`.".format(directory, new_dir)) + os.rename(directory, new_dir) +os.makedirs(directory) +data_dir = os.path.join(directory, 'data') +plot_dir = os.path.join(directory, 'plots') +log_dir = os.path.join(directory, 'logs') +prof_dir = os.path.join(directory, 'nvprof') +os.makedirs(data_dir) +os.makedirs(plot_dir) +os.makedirs(log_dir) +os.makedirs(prof_dir) +print("Saving results in {}.".format(plot_dir)) + +shutil.copy(os.path.realpath(__file__), os.path.join(directory, 'run_speed_test_script.py')) + +time_format = '%d.%m.%Y at %H:%M:%S' +script_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + +with open(os.path.join(directory, 'git.diff'), 'w') as diff_file: + subprocess.call(['git', 'diff'], stdout=diff_file) + +try: + for n, (st, name, sl) in enumerate(speed_tests): + start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + print("Starting {} on {}.".format(name, start)) + maximum_run_time = 1*60*60*second + res = run_speed_tests(configurations=configurations, + speed_tests=[st], + n_slice=sl, + #n_slice=slice(0,1,None), + run_twice=False, + verbose=True, + maximum_run_time=maximum_run_time#, + ## this needs modification of brian2 code + #profile_only_active=True + #profile_only_active=False + ) + end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + diff = datetime.datetime.strptime(end, time_format) - datetime.datetime.strptime(start, time_format) + print("Running {} took {}.".format(name, diff)) + res.plot_all_tests() + ## this needs modification of brian2 code + #res.plot_all_tests(print_relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.png'.format(speed_tests[n][1]))) + res.plot_all_tests(relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name))) + res.plot_all_tests(profiling_minimum=0.05) + savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name))) + + res.plot_all_tests() + ## this needs modification 
of brian2 code + #res.plot_all_tests(print_relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1]))) + res.plot_all_tests(relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name))) + res.plot_all_tests(profiling_minimum=0.05) + savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.svg'.format(name))) + + if 3 != len(get_fignums()): + print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1))) + for n in get_fignums(): + close(n) + + # pickel results object to disk + pkl_file = os.path.join(data_dir, name + '.pkl' ) + with open(pkl_file, 'wb') as output: + pickle.dump(res, output, pickle.HIGHEST_PROTOCOL) + + # save stdout log of last run (the other are deleted in run_speed_tests()) + for proj_dir in set(project_dirs): + if not proj_dir is None and proj_dir in ['cuda_standalone', 'cpp_standalone']: + config = configurations[last_idx[proj_dir]] + stdout_file = os.path.join(proj_dir, 'results/stdout.txt') + if os.path.exists(stdout_file): + shutil.copy(stdout_file, + os.path.join(log_dir, 'stdout_{st}_{conf}_{n}.txt'.format(st=name, conf=proj_dir, + n=st.n_range[sl][-1]))) + else: + print("WARNING Couldn't save {},file not found.".format(stdout_file)) + + # run nvprof on n_range[2] + for conf, proj_dir in zip(configurations, project_dirs): + main_arg = '' + if proj_dir in ['cuda_standalone', 'GeNNworkspace']: + if proj_dir == 'GeNNworkspace': + main_arg = 'test {time} 1'.format(time=st.duration/second) + ns = st.n_range[sl] + idx = 2 + max_runtime = 20 + conf_name = conf.__name__ + print("Rerunning {} with n = {} for nvprof profiling".format(conf_name, st.n_range[idx])) + tb, res, runtime, prof_info = results(conf, st, st.n_range[idx], maximum_run_time=maximum_run_time) + if not isinstance(res, Exception) and runtime < max_runtime: + option = '--profile-from-start-off' if proj_dir == 'cuda_standalone' else '' + cmd = 'cd {proj_dir} && nvprof 
{opt} --log-file ../{log_file} ./main {arg}'.format( + proj_dir=proj_dir, arg=main_arg, opt=option, + log_file=os.path.join(prof_dir, 'nvprof_{st}_{conf}_{n}.log'.format( + st=name, conf=conf_name, n=st.n_range[idx]))) + prof_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + print(cmd) + x = os.system(cmd) + if x: + print('nvprof failed with {}'.format(x)) + prof_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + prof_diff = datetime.datetime.strptime(prof_end, time_format) - datetime.datetime.strptime(prof_start, time_format) + print("Profiling took {} for runtime of {}".format(prof_diff, runtime)) +finally: + create_readme(directory) + print("\nSummarized speed test results in {}".format(directory + '/README.md')) + script_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + script_diff = datetime.datetime.strptime(script_end, time_format) - datetime.datetime.strptime(script_start, time_format) + print("Finished speed test on {}. 
Total time = {}.".format( + datetime.datetime.fromtimestamp(time.time()).strftime(time_format), script_diff)) + + +##res.plot_all_tests(relative=True) +#for n in get_fignums(): +# plt.figure(n) +# savefig(plot_dir + '/speed_test_{}.png'.format(speed_tests[n-1][1])) + +## Debug (includes profiling infos) +#from brian2.tests.features.base import results +#for x in results(LocalConfiguration, LinearNeuronsOnly, 10, maximum_run_time=10*second): +# print x diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/update_readme.py b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/update_readme.py new file mode 100644 index 00000000..b68b1a9e --- /dev/null +++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/update_readme.py @@ -0,0 +1,18 @@ +import os +from glob import glob + +def update_benchmark_readme(): + filedir = os.path.dirname(os.path.realpath(__file__)) + + lines = [] + for readme in sorted(glob(filedir + '/*/README.md'), reverse=True): + d = os.path.split(readme)[0] + lines.append("[{d}]({d})\n".format(d=os.path.basename(os.path.normpath(d)))) + + readme_md = '\n'.join(lines) + + with open(filedir + "/README.md", "w") as readme_file: + readme_file.write(readme_md) + +if __name__ == '__main__': + update_benchmark_readme() diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/README.md b/dev/benchmarks/results_2017_11_30_cuba_stdp/README.md new file mode 100644 index 00000000..1bdd7b98 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/README.md @@ -0,0 +1,3 @@ +[cuba_stdp_profiled](cuba_stdp_profiled) + +[cuba_stdp](cuba_stdp) diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/README.md b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/README.md new file mode 100644 index 00000000..edb94c86 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/README.md @@ -0,0 +1,311 @@ + +# Benchmark 
results from 29.11.2017 +## Description: + + + +## Last git log: +``` +commit 65e51048f25caaee2a6e0396269f90821d994f85 +Author: Denis Alevi +Date: Mon Nov 27 17:55:05 2017 +0100 + + Add recent benchmark results + +``` +There is also a `git diff` saved in the current directory. + +## Results + +### CUBA +![](plots/speed_test_CUBA_absolute.svg) +![](plots/speed_test_CUBA_profiling.svg) +![](plots/speed_test_CUBA_relative.svg) + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==9905== NVPROF is profiling process 9905, command: ./main +==9905== Profiling application: ./main +==9905== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 27.60% 60.179ms 10000 6.0170us 5.8560us 7.0720us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*) + 23.08% 50.318ms 10000 5.0310us 3.2960us 23.232us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*) + 21.95% 47.850ms 10000 4.7840us 3.2960us 19.968us kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*) + 11.42% 24.905ms 10000 2.4900us 2.2720us 2.8800us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 8.26% 18.018ms 10000 1.8010us 1.6640us 2.1120us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*) + 7.68% 16.743ms 10000 1.6740us 1.5360us 2.0800us _GLOBAL__N__69_tmpxft_000024a7_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + API calls: 85.15% 643.72ms 60000 10.728us 9.3820us 9.0568ms cudaLaunch + 11.40% 86.186ms 520000 165ns 134ns 363.54us cudaSetupArgument + 1.93% 14.574ms 60000 242ns 182ns 349.11us cudaConfigureCall + 1.50% 11.304ms 50000 226ns 194ns 14.049us cudaGetLastError + 0.02% 134.84us 1 134.84us 134.84us 134.84us cudaMemGetInfo + 0.00% 31.105us 8 3.8880us 3.0120us 5.6890us cudaFuncGetAttributes + 0.00% 30.378us 39 778ns 653ns 1.8930us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 13.359us 1 13.359us 13.359us 13.359us cudaDeviceSynchronize + 0.00% 6.2530us 12 521ns 334ns 1.3690us cudaDeviceGetAttribute + 0.00% 3.6520us 3 1.2170us 789ns 1.6000us cudaGetDevice + 
+``` + +

+ + +
Example `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==10402== NVPROF is profiling process 10402, command: ./main test 1.0 1 +==10402== Profiling application: ./main test 1.0 1 +==10402== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 63.74% 78.512ms 10000 7.8510us 7.0080us 10.336us calcNeurons + 35.42% 43.636ms 10000 4.3630us 1.9840us 20.512us calcSynapses + 0.65% 799.04us 56 14.268us 960ns 163.46us [CUDA memcpy HtoD] + 0.19% 234.66us 13 18.050us 1.9840us 155.30us [CUDA memcpy DtoH] + API calls: 67.87% 468.06ms 16 29.253ms 15.634us 464.71ms cudaHostAlloc + 29.73% 204.98ms 20000 10.248us 9.4610us 337.99us cudaLaunch + 1.01% 6.9362ms 20000 346ns 275ns 331.07us cudaConfigureCall + 0.81% 5.6041ms 20000 280ns 221ns 329.96us cudaSetupArgument + 0.31% 2.1674ms 73 29.690us 512ns 179.18us cudaMemcpy + 0.18% 1.2374ms 16 77.339us 9.8610us 230.18us cudaMalloc + 0.06% 398.46us 94 4.2380us 154ns 155.40us cuDeviceGetAttribute + 0.02% 118.62us 1 118.62us 118.62us 118.62us cuDeviceTotalMem + 0.01% 48.855us 1 48.855us 48.855us 48.855us cuDeviceGetName + 0.00% 22.545us 16 1.4090us 582ns 3.4920us cudaGetSymbolAddress + 0.00% 9.5420us 1 9.5420us 9.5420us 9.5420us cudaSetDevice + 0.00% 3.6290us 3 1.2090us 200ns 2.4090us cuDeviceGetCount + 0.00% 1.5380us 1 1.5380us 1.5380us 1.5380us cudaGetDeviceCount + 0.00% 1.1220us 2 561ns 362ns 760ns cuDeviceGet + +``` + +

+ + +*** + +### STDP (with SpikeMonitor) +![](plots/speed_test_STDP_absolute.svg) +![](plots/speed_test_STDP_profiling.svg) +![](plots/speed_test_STDP_relative.svg) + +
Example `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==8893== NVPROF is profiling process 8893, command: ./main +==8893== Profiling application: ./main +==8893== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 32.18% 119.34ms 10000 11.934us 1.6000us 26.926ms kernel_spikemonitor_codeobject(unsigned int, int*, double, int*, int*, int*, int, int*, double*, int, int*, int*) + 20.94% 77.684ms 10000 7.7680us 3.3600us 25.728us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int) + 11.71% 43.439ms 10000 4.3430us 3.8400us 6.0800us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 9.85% 36.549ms 10000 3.6540us 3.5520us 7.0080us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int) + 6.79% 25.173ms 10000 2.5170us 2.1760us 3.6800us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 4.91% 18.216ms 10000 1.8210us 1.7280us 4.3200us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 4.78% 17.745ms 10000 1.7740us 1.5680us 3.6800us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 4.51% 16.723ms 10000 1.6720us 1.6000us 3.2640us _GLOBAL__N__70_tmpxft_00002089_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*) + 4.22% 15.645ms 10000 1.5640us 1.3760us 3.5200us _GLOBAL__N__69_tmpxft_00002086_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*) + 0.09% 330.21us 1 330.21us 330.21us 330.21us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + 0.02% 68.192us 1 
68.192us 68.192us 68.192us _run_spikemonitor_codeobject_init(void) + API calls: 85.86% 977.23ms 90002 10.857us 9.1510us 10.887ms cudaLaunch + 10.27% 116.88ms 700005 166ns 137ns 331.60us cudaSetupArgument + 2.12% 24.161ms 90002 268ns 176ns 335.73us cudaConfigureCall + 1.71% 19.473ms 70003 278ns 205ns 329.44us cudaGetLastError + 0.02% 213.58us 1 213.58us 213.58us 213.58us cudaMalloc + 0.01% 133.47us 1 133.47us 133.47us 133.47us cudaMemGetInfo + 0.00% 43.181us 11 3.9250us 3.2280us 6.4450us cudaFuncGetAttributes + 0.00% 32.048us 42 763ns 627ns 1.7760us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 13.358us 1 13.358us 13.358us 13.358us cudaDeviceSynchronize + 0.00% 7.7520us 16 484ns 356ns 1.1240us cudaDeviceGetAttribute + 0.00% 4.0510us 4 1.0120us 822ns 1.5030us cudaGetDevice + +``` + +

+ + +
Example `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==9420== NVPROF is profiling process 9420, command: ./main test 1.0 1 +==9420== Profiling application: ./main test 1.0 1 +==9420== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 50.20% 103.62ms 10000 10.362us 1.5680us 45.248us calcSynapses + 19.97% 41.214ms 10000 4.1210us 3.1040us 6.7200us calcNeurons + 17.73% 36.597ms 17812 2.0540us 2.0160us 4.7360us [CUDA memcpy DtoH] + 12.06% 24.885ms 10000 2.4880us 2.3680us 10.848us learnSynapsesPost + 0.05% 94.016us 70 1.3430us 960ns 2.0480us [CUDA memcpy HtoD] + API calls: 34.18% 358.40ms 20 17.920ms 8.3270us 356.55ms cudaHostAlloc + 32.17% 337.26ms 30000 11.241us 9.5510us 356.07us cudaLaunch + 31.72% 332.57ms 20095 16.549us 231ns 988.56us cudaMemcpy + 1.03% 10.770ms 30000 358ns 283ns 331.73us cudaConfigureCall + 0.77% 8.0617ms 30000 268ns 208ns 334.35us cudaSetupArgument + 0.08% 809.75us 20 40.487us 8.0280us 232.94us cudaMalloc + 0.04% 401.78us 94 4.2740us 161ns 156.02us cuDeviceGetAttribute + 0.01% 113.16us 1 113.16us 113.16us 113.16us cuDeviceTotalMem + 0.00% 37.103us 1 37.103us 37.103us 37.103us cuDeviceGetName + 0.00% 22.451us 20 1.1220us 525ns 5.8000us cudaGetSymbolAddress + 0.00% 9.5720us 1 9.5720us 9.5720us 9.5720us cudaSetDevice + 0.00% 3.2610us 3 1.0870us 219ns 2.3710us cuDeviceGetCount + 0.00% 1.6100us 1 1.6100us 1.6100us 1.6100us cudaGetDeviceCount + 0.00% 1.0470us 2 523ns 250ns 797ns cuDeviceGet + +``` + +

+ + +*** + +### STDPEventDriven +![](plots/speed_test_STDPEventDriven_absolute.svg) +![](plots/speed_test_STDPEventDriven_profiling.svg) +![](plots/speed_test_STDPEventDriven_relative.svg) + +
Example `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==19561== NVPROF is profiling process 19561, command: ./main +==19561== Profiling application: ./main +==19561== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 33.06% 85.737ms 10000 8.5730us 3.3600us 26.176us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int) + 16.85% 43.713ms 10000 4.3710us 3.8720us 6.4320us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 13.67% 35.462ms 10000 3.5460us 3.4560us 7.1040us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int) + 9.83% 25.505ms 10000 2.5500us 2.2400us 2.8480us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 7.03% 18.243ms 10000 1.8240us 1.7600us 2.5920us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 7.01% 18.182ms 10000 1.8180us 1.6960us 2.2080us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 6.41% 16.614ms 10000 1.6610us 1.5360us 1.9520us _GLOBAL__N__70_tmpxft_00004a64_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*) + 6.01% 15.583ms 10000 1.5580us 1.4720us 1.7280us _GLOBAL__N__69_tmpxft_00004a60_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*) + 0.13% 330.21us 1 330.21us 330.21us 330.21us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + API calls: 84.19% 838.94ms 80001 10.486us 9.1520us 8.9317ms cudaLaunch + 11.76% 117.21ms 580005 202ns 157ns 419.79us cudaSetupArgument + 2.27% 22.642ms 80001 283ns 205ns 337.22us 
cudaConfigureCall + 1.74% 17.290ms 60002 288ns 220ns 371.39us cudaGetLastError + 0.02% 198.76us 1 198.76us 198.76us 198.76us cudaMalloc + 0.01% 139.83us 1 139.83us 139.83us 139.83us cudaMemGetInfo + 0.00% 37.555us 10 3.7550us 3.0440us 6.0110us cudaFuncGetAttributes + 0.00% 31.926us 41 778ns 680ns 1.6620us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 13.490us 1 13.490us 13.490us 13.490us cudaDeviceSynchronize + 0.00% 8.0030us 16 500ns 369ns 1.0430us cudaDeviceGetAttribute + 0.00% 4.0740us 4 1.0180us 792ns 1.5710us cudaGetDevice + +``` + +

+ + +
Example `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==20069== NVPROF is profiling process 20069, command: ./main test 1.0 1 +==20069== Profiling application: ./main test 1.0 1 +==20069== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 61.14% 103.55ms 10000 10.355us 1.5680us 49.760us calcSynapses + 24.06% 40.741ms 10000 4.0740us 3.0720us 6.6880us calcNeurons + 14.71% 24.918ms 10000 2.4910us 2.3360us 10.560us learnSynapsesPost + 0.06% 94.593us 70 1.3510us 960ns 2.0800us [CUDA memcpy HtoD] + 0.03% 55.553us 19 2.9230us 2.0160us 4.8010us [CUDA memcpy DtoH] + API calls: 56.62% 434.02ms 20 21.701ms 16.555us 432.05ms cudaHostAlloc + 40.47% 310.26ms 30000 10.341us 9.4140us 347.35us cudaLaunch + 1.37% 10.508ms 30000 350ns 275ns 330.81us cudaConfigureCall + 1.08% 8.2824ms 30000 276ns 221ns 333.57us cudaSetupArgument + 0.25% 1.9098ms 95 20.103us 434ns 41.200us cudaMemcpy + 0.13% 998.34us 20 49.917us 13.252us 259.26us cudaMalloc + 0.05% 419.41us 94 4.4610us 183ns 162.60us cuDeviceGetAttribute + 0.02% 126.30us 1 126.30us 126.30us 126.30us cuDeviceTotalMem + 0.00% 38.221us 1 38.221us 38.221us 38.221us cuDeviceGetName + 0.00% 29.710us 20 1.4850us 972ns 6.2560us cudaGetSymbolAddress + 0.00% 9.9350us 1 9.9350us 9.9350us 9.9350us cudaSetDevice + 0.00% 3.4560us 3 1.1520us 236ns 2.5630us cuDeviceGetCount + 0.00% 1.7080us 1 1.7080us 1.7080us 1.7080us cudaGetDeviceCount + 0.00% 1.2540us 2 627ns 267ns 987ns cuDeviceGet + +``` + +

+ + +*** + +### STDPNotEventDriven +![](plots/speed_test_STDPNotEventDriven_absolute.svg) +![](plots/speed_test_STDPNotEventDriven_profiling.svg) +![](plots/speed_test_STDPNotEventDriven_relative.svg) + +
Example `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==18533== NVPROF is profiling process 18533, command: ./main +==18533== Profiling application: ./main +==18533== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 27.01% 73.513ms 10000 7.3510us 3.2960us 29.120us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double*, double, double*, int, int*, int, int*) + 16.08% 43.771ms 10000 4.3770us 3.9040us 6.3360us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 12.83% 34.925ms 10000 3.4920us 3.3920us 6.4000us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*) + 10.01% 27.253ms 10000 2.7250us 2.6240us 3.2000us kernel_synapses_stateupdater_codeobject(unsigned int, unsigned int, double*, int, double*, int, double*, int*) + 9.18% 24.982ms 10000 2.4980us 2.2080us 2.6880us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 6.70% 18.244ms 10000 1.8240us 1.7280us 2.0800us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 6.70% 18.236ms 10000 1.8230us 1.7280us 2.6240us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 5.76% 15.677ms 10000 1.5670us 1.4720us 1.6960us _GLOBAL__N__69_tmpxft_00004642_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*) + 5.59% 15.219ms 10000 1.5210us 1.4400us 1.9520us _GLOBAL__N__70_tmpxft_00004643_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*) + 0.12% 330.63us 1 330.63us 330.63us 330.63us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + API calls: 85.78% 
968.25ms 90001 10.758us 9.3110us 9.2828ms cudaLaunch + 10.41% 117.51ms 660005 178ns 137ns 367.56us cudaSetupArgument + 2.19% 24.694ms 90001 274ns 200ns 349.17us cudaConfigureCall + 1.59% 17.897ms 70002 255ns 203ns 333.24us cudaGetLastError + 0.02% 201.51us 1 201.51us 201.51us 201.51us cudaMalloc + 0.01% 131.77us 1 131.77us 131.77us 131.77us cudaMemGetInfo + 0.00% 51.691us 74 698ns 591ns 1.8080us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 47.398us 12 3.9490us 3.1850us 6.2570us cudaFuncGetAttributes + 0.00% 13.229us 1 13.229us 13.229us 13.229us cudaDeviceSynchronize + 0.00% 9.0230us 20 451ns 339ns 845ns cudaDeviceGetAttribute + 0.00% 4.9330us 5 986ns 852ns 1.4630us cudaGetDevice + +``` + +

+ + +
Example `nvprof` results for **GeNNConfigurationOptimized**

+Profile summary for `N = 1000`: + +``` +==19043== NVPROF is profiling process 19043, command: ./main test 1.0 1 +==19043== Profiling application: ./main test 1.0 1 +==19043== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 39.69% 65.436ms 10000 6.5430us 1.5680us 25.984us calcSynapses + 24.07% 39.692ms 10000 3.9690us 3.1040us 6.4320us calcNeurons + 20.72% 34.165ms 10000 3.4160us 3.1040us 6.0800us calcSynapseDynamics + 15.42% 25.426ms 10000 2.5420us 2.3680us 6.6880us learnSynapsesPost + 0.06% 96.800us 72 1.3440us 960ns 2.0800us [CUDA memcpy HtoD] + 0.04% 59.552us 21 2.8350us 2.0480us 4.7680us [CUDA memcpy DtoH] + API calls: 51.75% 397.10ms 40000 9.9270us 9.2210us 345.12us cudaLaunch + 44.63% 342.53ms 21 16.311ms 16.914us 340.57ms cudaHostAlloc + 1.75% 13.449ms 40000 336ns 278ns 330.54us cudaConfigureCall + 1.40% 10.778ms 40000 269ns 210ns 335.21us cudaSetupArgument + 0.26% 1.9587ms 97 20.192us 407ns 40.743us cudaMemcpy + 0.13% 990.84us 21 47.182us 13.183us 232.13us cudaMalloc + 0.05% 400.17us 94 4.2570us 154ns 155.67us cuDeviceGetAttribute + 0.01% 113.90us 1 113.90us 113.90us 113.90us cuDeviceTotalMem + 0.00% 36.839us 1 36.839us 36.839us 36.839us cuDeviceGetName + 0.00% 30.866us 21 1.4690us 942ns 6.1960us cudaGetSymbolAddress + 0.00% 9.2900us 1 9.2900us 9.2900us 9.2900us cudaSetDevice + 0.00% 3.2500us 3 1.0830us 238ns 2.4920us cuDeviceGetCount + 0.00% 1.6970us 1 1.6970us 1.6970us 1.6970us cudaGetDeviceCount + 0.00% 1.0870us 2 543ns 238ns 849ns cuDeviceGet + +``` + +

+ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/CUBA.pkl b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/CUBA.pkl new file mode 100644 index 00000000..d65d8af9 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/CUBA.pkl differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDP.pkl b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDP.pkl new file mode 100644 index 00000000..53cfc7de Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDP.pkl differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDPEventDriven.pkl b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDPEventDriven.pkl new file mode 100644 index 00000000..bf3ce605 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDPEventDriven.pkl differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDPNotEventDriven.pkl b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDPNotEventDriven.pkl new file mode 100644 index 00000000..b0a7f42a Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDPNotEventDriven.pkl differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/git.diff b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/git.diff new file mode 100644 index 00000000..7737e913 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/git.diff @@ -0,0 +1,60 @@ +diff --git a/brian2cuda/device.py b/brian2cuda/device.py +index b4610ee..e032d32 100644 +--- a/brian2cuda/device.py ++++ b/brian2cuda/device.py +@@ -919,13 +919,13 @@ class CUDAStandaloneDevice(CPPStandaloneDevice): + if clock not in all_clocks: + run_lines.append('{net.name}.add(&{clock.name}, NULL, NULL, NULL, NULL);'.format(clock=clock, net=net)) + +- if self.profile and self.profile != 'blocking': # self.profile == True ++ 
if True:#self.profile and self.profile != 'blocking': # self.profile == True + run_lines.append('cudaProfilerStart();') + run_lines.append('{net.name}.run({duration!r}, {report_call}, {report_period!r});'.format(net=net, + duration=float(duration), + report_call=report_call, + report_period=float(report_period))) +- if self.profile and self.profile != 'blocking': # self.profile == True ++ if True:#self.profile and self.profile != 'blocking': # self.profile == True + run_lines.append('cudaDeviceSynchronize();') + run_lines.append('cudaProfilerStop();') + self.main_queue.append(('run_network', (net, run_lines))) +diff --git a/brian2cuda/tests/features/speed.py b/brian2cuda/tests/features/speed.py +index 2293533..c093bc9 100644 +--- a/brian2cuda/tests/features/speed.py ++++ b/brian2cuda/tests/features/speed.py +@@ -558,7 +558,7 @@ class CUBA(SpeedTest): + category = "Full examples" + name = "CUBA fixed connectivity" + tags = ["Neurons", "Synapses"] +- n_range = [10, 100, 1000, 10000, 100000, 1000000] ++ n_range = [10, 100, 1000, 10000, 100000, 200000, 500000, 1000000] + n_label = 'Num neurons' + + # configuration options +@@ -720,7 +720,7 @@ class STDPNotEventDriven(SpeedTest): + category = "Full examples" + name = "STDP (not event-driven)" + tags = ["Neurons", "Synapses"] +- n_range = [10, 100, 1000, 10000, 20000, 50000, 100000] ++ n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 5000000] + n_label = 'Num neurons' + + # configuration options +diff --git a/frozen_repos/brian2 b/frozen_repos/brian2 +--- a/frozen_repos/brian2 ++++ b/frozen_repos/brian2 +@@ -1 +1 @@ +-Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67 ++Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67-dirty +diff --git a/frozen_repos/brian2genn b/frozen_repos/brian2genn +--- a/frozen_repos/brian2genn ++++ b/frozen_repos/brian2genn +@@ -1 +1 @@ +-Subproject commit 0553cafeab49ea5403c0230411035df504d4db06 ++Subproject commit 0553cafeab49ea5403c0230411035df504d4db06-dirty 
+diff --git a/frozen_repos/genn b/frozen_repos/genn +--- a/frozen_repos/genn ++++ b/frozen_repos/genn +@@ -1 +1 @@ +-Subproject commit e01c85f18339249558d6e570ae976609dc972846 ++Subproject commit e01c85f18339249558d6e570ae976609dc972846-dirty diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_CUBA_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_CUBA_cpp_standalone_1000000.txt new file mode 100644 index 00000000..765e61ed --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_CUBA_cpp_standalone_1000000.txt @@ -0,0 +1,2 @@ +Number of synapses: 64000702 +Number of synapses: 15999195 diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_CUBA_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_CUBA_cuda_standalone_1000000.txt new file mode 100644 index 00000000..b923caf1 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_CUBA_cuda_standalone_1000000.txt @@ -0,0 +1,9 @@ +INFO: setting cudaDevice stuff took 0.347672 seconds +INFO kernel_neurongroup_group_variable_set_conditional_codeobject + 977 blocks + 1024 threads + 12 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory +INFO connectivity matrix has size 15994612 diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPEventDriven_cpp_standalone_5000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPEventDriven_cpp_standalone_5000000.txt new file mode 100644 index 00000000..0d502595 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPEventDriven_cpp_standalone_5000000.txt @@ -0,0 +1,2 @@ +Number of synapses: 5000000 +Number of synapses: 5000000 diff --git 
a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt new file mode 100644 index 00000000..d310f8f6 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt @@ -0,0 +1,63 @@ +INFO: setting cudaDevice stuff took 0.343461 seconds +INFO kernel_synapses_group_variable_set_conditional_codeobject + 4883 blocks + 1024 threads + 8 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory +INFO connectivity matrix has size 5000000 +INFO connectivity matrix has size 5000000 +INFO generating 10000000 rand every 2 clock cycles for poissongroup_thresholder_codeobject +INFO kernel_neurongroup_stateupdater_codeobject + 1 blocks + 768 threads + 35 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.750 theoretical occupancy +INFO kernel_neurongroup_thresholder_codeobject + 1 blocks + 1024 threads + 14 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_poissongroup_thresholder_codeobject + 4883 blocks + 1024 threads + 14 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_synapses_pre_codeobject + 15 blocks + 1024 threads + 40 registers per block + 0 bytes statically-allocated shared memory per block + 8 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.500 theoretical occupancy +INFO kernel_synapses_post_codeobject + 15 blocks + 1024 
threads + 34 registers per block + 0 bytes statically-allocated shared memory per block + 8 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.500 theoretical occupancy +INFO kernel_neurongroup_resetter_codeobject + 1 blocks + 1024 threads + 14 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +Number of synapses: 5000000 +Number of synapses: 5000000 +INFO: main_lines took 343.866984 seconds +INFO: main function took 344.732117 seconds diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPNotEventDriven_cpp_standalone_5000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPNotEventDriven_cpp_standalone_5000000.txt new file mode 100644 index 00000000..0d502595 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPNotEventDriven_cpp_standalone_5000000.txt @@ -0,0 +1,2 @@ +Number of synapses: 5000000 +Number of synapses: 5000000 diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPNotEventDriven_cuda_standalone_5000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPNotEventDriven_cuda_standalone_5000000.txt new file mode 100644 index 00000000..4c247b04 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPNotEventDriven_cuda_standalone_5000000.txt @@ -0,0 +1,71 @@ +INFO: setting cudaDevice stuff took 0.345858 seconds +INFO kernel_synapses_group_variable_set_conditional_codeobject + 4883 blocks + 1024 threads + 8 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory +INFO connectivity matrix has size 5000000 +INFO connectivity matrix has size 5000000 +INFO generating 10000000 rand every 2 clock cycles for 
poissongroup_thresholder_codeobject +INFO kernel_neurongroup_stateupdater_codeobject + 1 blocks + 768 threads + 35 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.750 theoretical occupancy +INFO kernel_synapses_stateupdater_codeobject + 6511 blocks + 768 threads + 35 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.750 theoretical occupancy +INFO kernel_neurongroup_thresholder_codeobject + 1 blocks + 1024 threads + 14 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_poissongroup_thresholder_codeobject + 4883 blocks + 1024 threads + 14 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_synapses_pre_codeobject + 15 blocks + 1024 threads + 28 registers per block + 0 bytes statically-allocated shared memory per block + 8 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_synapses_post_codeobject + 15 blocks + 1024 threads + 26 registers per block + 0 bytes statically-allocated shared memory per block + 8 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_neurongroup_resetter_codeobject + 1 blocks + 1024 threads + 14 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +Number of synapses: 5000000 +Number of synapses: 5000000 +INFO: main_lines took 328.762289 seconds +INFO: main function took 329.622826 seconds diff 
--git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDP_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDP_cpp_standalone_1000000.txt new file mode 100644 index 00000000..eb1d6c28 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDP_cpp_standalone_1000000.txt @@ -0,0 +1,3 @@ +Number of synapses: 1000000 +Number of spikes: 14994297 +Number of synapses: 1000000 diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDP_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDP_cuda_standalone_1000000.txt new file mode 100644 index 00000000..f36afa97 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDP_cuda_standalone_1000000.txt @@ -0,0 +1,76 @@ +INFO: setting cudaDevice stuff took 0.189500 seconds +INFO kernel_synapses_group_variable_set_conditional_codeobject + 977 blocks + 1024 threads + 8 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory +INFO connectivity matrix has size 1000000 +INFO connectivity matrix has size 1000000 +INFO generating 13000000 rand every 13 clock cycles for poissongroup_thresholder_codeobject +INFO kernel_neurongroup_stateupdater_codeobject + 1 blocks + 768 threads + 35 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.750 theoretical occupancy +INFO kernel_neurongroup_thresholder_codeobject + 1 blocks + 1024 threads + 14 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_poissongroup_thresholder_codeobject + 977 blocks + 1024 threads + 14 registers per block + 0 bytes 
statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_spikemonitor_codeobject + 1 blocks + 1 threads + 30 registers per block + 0 bytes statically-allocated shared memory per block + 16 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.000 theoretical occupancy +INFO kernel_synapses_pre_codeobject + 15 blocks + 1024 threads + 40 registers per block + 0 bytes statically-allocated shared memory per block + 8 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.500 theoretical occupancy +INFO kernel_synapses_post_codeobject + 15 blocks + 1024 threads + 34 registers per block + 0 bytes statically-allocated shared memory per block + 8 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.500 theoretical occupancy +INFO kernel_neurongroup_resetter_codeobject + 1 blocks + 1024 threads + 14 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +... +ERROR while allocating 33554428 bytes in cudaVector.h/reserve() +ERROR while allocating 67108856 bytes in cudaVector.h/reserve() +... 
+Number of synapses: 1000000 +Number of synapses: 1000000 +INFO: main_lines took 2374.681343 seconds +Number of spikes: 4194303 +INFO: main function took 2382.821595 seconds diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..3ee9ab79 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,20 @@ +==9905== NVPROF is profiling process 9905, command: ./main +==9905== Profiling application: ./main +==9905== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 27.60% 60.179ms 10000 6.0170us 5.8560us 7.0720us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*) + 23.08% 50.318ms 10000 5.0310us 3.2960us 23.232us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*) + 21.95% 47.850ms 10000 4.7840us 3.2960us 19.968us kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*) + 11.42% 24.905ms 10000 2.4900us 2.2720us 2.8800us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 8.26% 18.018ms 10000 1.8010us 1.6640us 2.1120us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*) + 7.68% 16.743ms 10000 1.6740us 1.5360us 2.0800us _GLOBAL__N__69_tmpxft_000024a7_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + API calls: 85.15% 643.72ms 60000 10.728us 9.3820us 9.0568ms cudaLaunch + 11.40% 86.186ms 520000 165ns 134ns 
363.54us cudaSetupArgument + 1.93% 14.574ms 60000 242ns 182ns 349.11us cudaConfigureCall + 1.50% 11.304ms 50000 226ns 194ns 14.049us cudaGetLastError + 0.02% 134.84us 1 134.84us 134.84us 134.84us cudaMemGetInfo + 0.00% 31.105us 8 3.8880us 3.0120us 5.6890us cudaFuncGetAttributes + 0.00% 30.378us 39 778ns 653ns 1.8930us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 13.359us 1 13.359us 13.359us 13.359us cudaDeviceSynchronize + 0.00% 6.2530us 12 521ns 334ns 1.3690us cudaDeviceGetAttribute + 0.00% 3.6520us 3 1.2170us 789ns 1.6000us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_CUBA_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_CUBA_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..9add1dc1 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_CUBA_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,22 @@ +==10402== NVPROF is profiling process 10402, command: ./main test 1.0 1 +==10402== Profiling application: ./main test 1.0 1 +==10402== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 63.74% 78.512ms 10000 7.8510us 7.0080us 10.336us calcNeurons + 35.42% 43.636ms 10000 4.3630us 1.9840us 20.512us calcSynapses + 0.65% 799.04us 56 14.268us 960ns 163.46us [CUDA memcpy HtoD] + 0.19% 234.66us 13 18.050us 1.9840us 155.30us [CUDA memcpy DtoH] + API calls: 67.87% 468.06ms 16 29.253ms 15.634us 464.71ms cudaHostAlloc + 29.73% 204.98ms 20000 10.248us 9.4610us 337.99us cudaLaunch + 1.01% 6.9362ms 20000 346ns 275ns 331.07us cudaConfigureCall + 0.81% 5.6041ms 20000 280ns 221ns 329.96us cudaSetupArgument + 0.31% 2.1674ms 73 29.690us 512ns 179.18us cudaMemcpy + 0.18% 1.2374ms 16 77.339us 9.8610us 230.18us cudaMalloc + 0.06% 398.46us 94 4.2380us 154ns 155.40us cuDeviceGetAttribute + 0.02% 118.62us 1 118.62us 118.62us 118.62us cuDeviceTotalMem + 0.01% 48.855us 1 48.855us 48.855us 
48.855us cuDeviceGetName + 0.00% 22.545us 16 1.4090us 582ns 3.4920us cudaGetSymbolAddress + 0.00% 9.5420us 1 9.5420us 9.5420us 9.5420us cudaSetDevice + 0.00% 3.6290us 3 1.2090us 200ns 2.4090us cuDeviceGetCount + 0.00% 1.5380us 1 1.5380us 1.5380us 1.5380us cudaGetDeviceCount + 0.00% 1.1220us 2 561ns 362ns 760ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..d67458c4 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,24 @@ +==19561== NVPROF is profiling process 19561, command: ./main +==19561== Profiling application: ./main +==19561== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 33.06% 85.737ms 10000 8.5730us 3.3600us 26.176us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int) + 16.85% 43.713ms 10000 4.3710us 3.8720us 6.4320us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 13.67% 35.462ms 10000 3.5460us 3.4560us 7.1040us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int) + 9.83% 25.505ms 10000 2.5500us 2.2400us 2.8480us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 7.03% 18.243ms 10000 1.8240us 1.7600us 2.5920us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 7.01% 18.182ms 10000 1.8180us 1.6960us 2.2080us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned 
int, double*, int*) + 6.41% 16.614ms 10000 1.6610us 1.5360us 1.9520us _GLOBAL__N__70_tmpxft_00004a64_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*) + 6.01% 15.583ms 10000 1.5580us 1.4720us 1.7280us _GLOBAL__N__69_tmpxft_00004a60_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*) + 0.13% 330.21us 1 330.21us 330.21us 330.21us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + API calls: 84.19% 838.94ms 80001 10.486us 9.1520us 8.9317ms cudaLaunch + 11.76% 117.21ms 580005 202ns 157ns 419.79us cudaSetupArgument + 2.27% 22.642ms 80001 283ns 205ns 337.22us cudaConfigureCall + 1.74% 17.290ms 60002 288ns 220ns 371.39us cudaGetLastError + 0.02% 198.76us 1 198.76us 198.76us 198.76us cudaMalloc + 0.01% 139.83us 1 139.83us 139.83us 139.83us cudaMemGetInfo + 0.00% 37.555us 10 3.7550us 3.0440us 6.0110us cudaFuncGetAttributes + 0.00% 31.926us 41 778ns 680ns 1.6620us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 13.490us 1 13.490us 13.490us 13.490us cudaDeviceSynchronize + 0.00% 8.0030us 16 500ns 369ns 1.0430us cudaDeviceGetAttribute + 0.00% 4.0740us 4 1.0180us 792ns 1.5710us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPEventDriven_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPEventDriven_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..e03d86b4 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPEventDriven_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,23 @@ +==20069== NVPROF is profiling process 20069, command: ./main test 1.0 1 +==20069== Profiling application: ./main test 1.0 1 +==20069== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 61.14% 103.55ms 10000 10.355us 
1.5680us 49.760us calcSynapses + 24.06% 40.741ms 10000 4.0740us 3.0720us 6.6880us calcNeurons + 14.71% 24.918ms 10000 2.4910us 2.3360us 10.560us learnSynapsesPost + 0.06% 94.593us 70 1.3510us 960ns 2.0800us [CUDA memcpy HtoD] + 0.03% 55.553us 19 2.9230us 2.0160us 4.8010us [CUDA memcpy DtoH] + API calls: 56.62% 434.02ms 20 21.701ms 16.555us 432.05ms cudaHostAlloc + 40.47% 310.26ms 30000 10.341us 9.4140us 347.35us cudaLaunch + 1.37% 10.508ms 30000 350ns 275ns 330.81us cudaConfigureCall + 1.08% 8.2824ms 30000 276ns 221ns 333.57us cudaSetupArgument + 0.25% 1.9098ms 95 20.103us 434ns 41.200us cudaMemcpy + 0.13% 998.34us 20 49.917us 13.252us 259.26us cudaMalloc + 0.05% 419.41us 94 4.4610us 183ns 162.60us cuDeviceGetAttribute + 0.02% 126.30us 1 126.30us 126.30us 126.30us cuDeviceTotalMem + 0.00% 38.221us 1 38.221us 38.221us 38.221us cuDeviceGetName + 0.00% 29.710us 20 1.4850us 972ns 6.2560us cudaGetSymbolAddress + 0.00% 9.9350us 1 9.9350us 9.9350us 9.9350us cudaSetDevice + 0.00% 3.4560us 3 1.1520us 236ns 2.5630us cuDeviceGetCount + 0.00% 1.7080us 1 1.7080us 1.7080us 1.7080us cudaGetDeviceCount + 0.00% 1.2540us 2 627ns 267ns 987ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPNotEventDriven_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPNotEventDriven_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..75f290df --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPNotEventDriven_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,25 @@ +==18533== NVPROF is profiling process 18533, command: ./main +==18533== Profiling application: ./main +==18533== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 27.01% 73.513ms 10000 7.3510us 3.2960us 29.120us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, 
int*, int, double*, double, double*, int, int*, int, int*) + 16.08% 43.771ms 10000 4.3770us 3.9040us 6.3360us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 12.83% 34.925ms 10000 3.4920us 3.3920us 6.4000us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*) + 10.01% 27.253ms 10000 2.7250us 2.6240us 3.2000us kernel_synapses_stateupdater_codeobject(unsigned int, unsigned int, double*, int, double*, int, double*, int*) + 9.18% 24.982ms 10000 2.4980us 2.2080us 2.6880us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 6.70% 18.244ms 10000 1.8240us 1.7280us 2.0800us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 6.70% 18.236ms 10000 1.8230us 1.7280us 2.6240us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 5.76% 15.677ms 10000 1.5670us 1.4720us 1.6960us _GLOBAL__N__69_tmpxft_00004642_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*) + 5.59% 15.219ms 10000 1.5210us 1.4400us 1.9520us _GLOBAL__N__70_tmpxft_00004643_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*) + 0.12% 330.63us 1 330.63us 330.63us 330.63us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + API calls: 85.78% 968.25ms 90001 10.758us 9.3110us 9.2828ms cudaLaunch + 10.41% 117.51ms 660005 178ns 137ns 367.56us cudaSetupArgument + 2.19% 24.694ms 90001 274ns 200ns 349.17us cudaConfigureCall + 1.59% 17.897ms 70002 255ns 203ns 333.24us cudaGetLastError + 0.02% 201.51us 1 201.51us 201.51us 201.51us cudaMalloc + 0.01% 131.77us 1 131.77us 131.77us 131.77us cudaMemGetInfo + 0.00% 51.691us 74 698ns 591ns 1.8080us 
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 47.398us 12 3.9490us 3.1850us 6.2570us cudaFuncGetAttributes + 0.00% 13.229us 1 13.229us 13.229us 13.229us cudaDeviceSynchronize + 0.00% 9.0230us 20 451ns 339ns 845ns cudaDeviceGetAttribute + 0.00% 4.9330us 5 986ns 852ns 1.4630us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPNotEventDriven_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPNotEventDriven_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..aa874e31 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPNotEventDriven_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,24 @@ +==19043== NVPROF is profiling process 19043, command: ./main test 1.0 1 +==19043== Profiling application: ./main test 1.0 1 +==19043== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 39.69% 65.436ms 10000 6.5430us 1.5680us 25.984us calcSynapses + 24.07% 39.692ms 10000 3.9690us 3.1040us 6.4320us calcNeurons + 20.72% 34.165ms 10000 3.4160us 3.1040us 6.0800us calcSynapseDynamics + 15.42% 25.426ms 10000 2.5420us 2.3680us 6.6880us learnSynapsesPost + 0.06% 96.800us 72 1.3440us 960ns 2.0800us [CUDA memcpy HtoD] + 0.04% 59.552us 21 2.8350us 2.0480us 4.7680us [CUDA memcpy DtoH] + API calls: 51.75% 397.10ms 40000 9.9270us 9.2210us 345.12us cudaLaunch + 44.63% 342.53ms 21 16.311ms 16.914us 340.57ms cudaHostAlloc + 1.75% 13.449ms 40000 336ns 278ns 330.54us cudaConfigureCall + 1.40% 10.778ms 40000 269ns 210ns 335.21us cudaSetupArgument + 0.26% 1.9587ms 97 20.192us 407ns 40.743us cudaMemcpy + 0.13% 990.84us 21 47.182us 13.183us 232.13us cudaMalloc + 0.05% 400.17us 94 4.2570us 154ns 155.67us cuDeviceGetAttribute + 0.01% 113.90us 1 113.90us 113.90us 113.90us cuDeviceTotalMem + 0.00% 36.839us 1 36.839us 36.839us 36.839us cuDeviceGetName + 0.00% 30.866us 21 1.4690us 942ns 6.1960us 
cudaGetSymbolAddress + 0.00% 9.2900us 1 9.2900us 9.2900us 9.2900us cudaSetDevice + 0.00% 3.2500us 3 1.0830us 238ns 2.4920us cuDeviceGetCount + 0.00% 1.6970us 1 1.6970us 1.6970us 1.6970us cudaGetDeviceCount + 0.00% 1.0870us 2 543ns 238ns 849ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..69fa92bb --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,26 @@ +==8893== NVPROF is profiling process 8893, command: ./main +==8893== Profiling application: ./main +==8893== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 32.18% 119.34ms 10000 11.934us 1.6000us 26.926ms kernel_spikemonitor_codeobject(unsigned int, int*, double, int*, int*, int*, int, int*, double*, int, int*, int*) + 20.94% 77.684ms 10000 7.7680us 3.3600us 25.728us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int) + 11.71% 43.439ms 10000 4.3430us 3.8400us 6.0800us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 9.85% 36.549ms 10000 3.6540us 3.5520us 7.0080us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int) + 6.79% 25.173ms 10000 2.5170us 2.1760us 3.6800us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 4.91% 18.216ms 10000 1.8210us 1.7280us 4.3200us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 4.78% 17.745ms 10000 1.7740us 1.5680us 3.6800us 
kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 4.51% 16.723ms 10000 1.6720us 1.6000us 3.2640us _GLOBAL__N__70_tmpxft_00002089_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*) + 4.22% 15.645ms 10000 1.5640us 1.3760us 3.5200us _GLOBAL__N__69_tmpxft_00002086_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*) + 0.09% 330.21us 1 330.21us 330.21us 330.21us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + 0.02% 68.192us 1 68.192us 68.192us 68.192us _run_spikemonitor_codeobject_init(void) + API calls: 85.86% 977.23ms 90002 10.857us 9.1510us 10.887ms cudaLaunch + 10.27% 116.88ms 700005 166ns 137ns 331.60us cudaSetupArgument + 2.12% 24.161ms 90002 268ns 176ns 335.73us cudaConfigureCall + 1.71% 19.473ms 70003 278ns 205ns 329.44us cudaGetLastError + 0.02% 213.58us 1 213.58us 213.58us 213.58us cudaMalloc + 0.01% 133.47us 1 133.47us 133.47us 133.47us cudaMemGetInfo + 0.00% 43.181us 11 3.9250us 3.2280us 6.4450us cudaFuncGetAttributes + 0.00% 32.048us 42 763ns 627ns 1.7760us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 13.358us 1 13.358us 13.358us 13.358us cudaDeviceSynchronize + 0.00% 7.7520us 16 484ns 356ns 1.1240us cudaDeviceGetAttribute + 0.00% 4.0510us 4 1.0120us 822ns 1.5030us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDP_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDP_GeNNConfigurationOptimized_1000.log new file mode 100644 index 00000000..222cc004 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDP_GeNNConfigurationOptimized_1000.log @@ -0,0 +1,23 @@ +==9420== NVPROF is profiling process 9420, command: ./main test 1.0 1 +==9420== Profiling application: ./main test 1.0 1 +==9420== 
Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 50.20% 103.62ms 10000 10.362us 1.5680us 45.248us calcSynapses + 19.97% 41.214ms 10000 4.1210us 3.1040us 6.7200us calcNeurons + 17.73% 36.597ms 17812 2.0540us 2.0160us 4.7360us [CUDA memcpy DtoH] + 12.06% 24.885ms 10000 2.4880us 2.3680us 10.848us learnSynapsesPost + 0.05% 94.016us 70 1.3430us 960ns 2.0480us [CUDA memcpy HtoD] + API calls: 34.18% 358.40ms 20 17.920ms 8.3270us 356.55ms cudaHostAlloc + 32.17% 337.26ms 30000 11.241us 9.5510us 356.07us cudaLaunch + 31.72% 332.57ms 20095 16.549us 231ns 988.56us cudaMemcpy + 1.03% 10.770ms 30000 358ns 283ns 331.73us cudaConfigureCall + 0.77% 8.0617ms 30000 268ns 208ns 334.35us cudaSetupArgument + 0.08% 809.75us 20 40.487us 8.0280us 232.94us cudaMalloc + 0.04% 401.78us 94 4.2740us 161ns 156.02us cuDeviceGetAttribute + 0.01% 113.16us 1 113.16us 113.16us 113.16us cuDeviceTotalMem + 0.00% 37.103us 1 37.103us 37.103us 37.103us cuDeviceGetName + 0.00% 22.451us 20 1.1220us 525ns 5.8000us cudaGetSymbolAddress + 0.00% 9.5720us 1 9.5720us 9.5720us 9.5720us cudaSetDevice + 0.00% 3.2610us 3 1.0870us 219ns 2.3710us cuDeviceGetCount + 0.00% 1.6100us 1 1.6100us 1.6100us 1.6100us cudaGetDeviceCount + 0.00% 1.0470us 2 523ns 250ns 797ns cuDeviceGet diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_absolute.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_absolute.png new file mode 100644 index 00000000..16624531 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_absolute.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_profiling.png new file mode 100644 index 00000000..f5bf2a7b Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_profiling.png 
differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_relative.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_relative.png new file mode 100644 index 00000000..8e3fd355 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_relative.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_absolute.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_absolute.png new file mode 100644 index 00000000..5c7b42a0 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_absolute.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_profiling.png new file mode 100644 index 00000000..f15e8c3d Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_profiling.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_relative.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_relative.png new file mode 100644 index 00000000..a5144e31 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_relative.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_absolute.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_absolute.png new file mode 100644 index 00000000..585942c0 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_absolute.png differ diff --git 
a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_profiling.png new file mode 100644 index 00000000..f411ff6d Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_profiling.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_relative.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_relative.png new file mode 100644 index 00000000..cf91a072 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_relative.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_absolute.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_absolute.png new file mode 100644 index 00000000..4f20032c Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_absolute.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_profiling.png new file mode 100644 index 00000000..c4aad905 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_profiling.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_relative.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_relative.png new file mode 100644 index 00000000..7b88c45e Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_relative.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/run_speed_test_script.py 
b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/run_speed_test_script.py
new file mode 100644
index 00000000..beeb8f92
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/run_speed_test_script.py
@@ -0,0 +1,304 @@
+"""Run the brian2cuda speed tests and archive plots, logs and nvprof profiles.
+
+Usage: python run_speed_test_script.py [results_directory_suffix]
+"""
+import os
+import shutil
+import glob
+import subprocess
+import sys
+import socket
+
+# run tests without X-server
+import matplotlib
+matplotlib.use('Agg')
+
+# pretty plots
+import seaborn
+
+import time
+import datetime
+import cPickle as pickle
+
+from brian2 import *
+from brian2.tests.features import *
+from brian2.tests.features.base import *
+from brian2.tests.features.base import results
+
+import brian2cuda
+from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfiguration,
+                                                          CUDAStandaloneConfigurationNoAssert,
+                                                          CUDAStandaloneConfigurationExtraThresholdKernel,
+                                                          CUDAStandaloneConfigurationCurandDouble,
+                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPI,
+                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,
+                                                          CUDAStandaloneConfiguration2BlocksPerSM,
+                                                          CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds,
+                                                          CUDAStandaloneConfigurationSynLaunchBounds,
+                                                          CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds,
+                                                          CUDAStandaloneConfigurationProfileGPU,
+                                                          CUDAStandaloneConfigurationProfileCPU)
+                                                          #CUDAStandaloneConfigurationTestBrunelHeteroAtomics,
+                                                          #CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,
+                                                          #CUDAStandaloneConfigurationPushAtomicResize,
+                                                          #CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,
+                                                          #CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,
+                                                          #CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU,
+                                                          #CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,
+                                                          #CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU)
+from brian2cuda.tests.features.speed import *
+
+from brian2genn.correctness_testing import GeNNConfiguration, GeNNConfigurationCPU, GeNNConfigurationOptimized
+
+from create_readme import create_readme
+
+# Optional single command line argument: suffix appended to the results
+# directory name. Validated explicitly because `assert` is stripped under -O.
+if len(sys.argv) > 2:
+    sys.exit('Only one command line argument supported! Got {}'.format(len(sys.argv)-1))
+additional_dir_name = '_' + sys.argv[1] if len(sys.argv) == 2 else ''
+
+prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j12']
+
+# host specific settings
+if socket.gethostname() == 'elnath':
+    prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
+    nvcc_args = prefs['codegen.cuda.extra_compile_args_nvcc']
+    # list.remove() raises ValueError when the flag is not present
+    if '-arch=sm_35' in nvcc_args:
+        nvcc_args.remove('-arch=sm_35')
+    nvcc_args.extend(['-arch=sm_20'])
+
+configs = [# configuration                          project_directory
+          #(NumpyConfiguration,                     None),
+          #(WeaveConfiguration,                     None),
+          #(LocalConfiguration,                     None),
+          (CPPStandaloneConfiguration,              'cpp_standalone'),
+          (CPPStandaloneConfigurationOpenMP,        'cpp_standalone'),
+          (CUDAStandaloneConfiguration,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResize, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationTestBrunelHeteroAtomics, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationExtraThresholdKernel, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoAssert, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationCurandDouble, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoCudaOccupancyAPI, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, 'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationSynLaunchBounds, 'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationProfileGPU, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationProfileCPU, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU, 'cuda_standalone'),
+          #(GeNNConfiguration,                      'GeNNworkspace'),
+          #(GeNNConfigurationCPU,                   'GeNNworkspace'),
+          (GeNNConfigurationOptimized,              'GeNNworkspace')
+          ]
+
+speed_tests = [# feature_test                     name                                  n_slice
+
+               #(ThresholderOnlyPoissonLowRate, 'ThresholderOnlyPoissonLowRate', slice(None) ),
+               #(ThresholderOnlyPoissonMediumRate, 'ThresholderOnlyPoissonMediumRate', slice(None) ),
+               #(ThresholderOnlyPoissonHighRate, 'ThresholderOnlyPoissonHighRate', slice(None) ),
+               #(ThresholderOnlyAlwaysSpiking, 'ThresholderOnlyAlwaysSpiking', slice(None) ),
+
+               #(BrunelHakimStateupdateOnlyDouble, 'BrunelHakimStateupdateOnlyDouble', slice(None) ),
+               #(BrunelHakimStateupdateOnlyTriple, 'BrunelHakimStateupdateOnlyTriple', slice(None) ),
+               #(BrunelHakimStateupdateOnly, 'BrunelHakimStateupdateOnly', slice(None) ),
+               #(BrunelHakimNeuronsOnly, 'BrunelHakimNeuronsOnly', slice(None) ),
+               #(BrunelHakimNeuronsOnlyNoXi, 'BrunelHakimNeuronsOnlyNoXi', slice(None) ),
+               #(BrunelHakimNeuronsOnlyNoRand, 'BrunelHakimNeuronsOnlyNoRand', slice(None) ),
+               #(BrunelHakimStateupdateThresholdOnly, 'BrunelHakimStateupdateThresholdOnly', slice(None) ),
+               #(BrunelHakimStateupdateThresholdResetOnly, 'BrunelHakimStateupdateThresholdResetOnly', slice(None) ),
+               #(BrunelHakimModelScalarDelayShort, 'BrunelHakimModelScalarDelayShort', slice(None) ),
+               #(BrunelHakimModelScalarDelayNoSelfConnections, 'BrunelHakimModelScalarDelayNoSelfConnections', slice(None) ),
+               #(COBAHH, 'COBAHH', slice(None) ),
+               #(AdaptationOscillation, 'AdaptationOscillation', slice(None) ),
+               #(Vogels, 'Vogels', slice(None) ),
+               (STDP, 'STDP', slice(None) ),
+               (STDPEventDriven, 'STDPEventDriven', slice(None) ),
+               (CUBA, 'CUBA', slice(None) ),
+               #(BrunelHakimModelScalarDelay, 'BrunelHakimModelScalarDelay', slice(None) ),
+
+               #(VerySparseMediumRateSynapsesOnly, 'VerySparseMediumRateSynapsesOnly', slice(None) ),
+               #(SparseMediumRateSynapsesOnly, 'SparseMediumRateSynapsesOnly', slice(None) ),
+               #(DenseMediumRateSynapsesOnly, 'DenseMediumRateSynapsesOnly', slice(None) ),
+               #(SparseLowRateSynapsesOnly, 'SparseLowRateSynapsesOnly', slice(None) ),
+               #(SparseHighRateSynapsesOnly, 'SparseHighRateSynapsesOnly', slice(None) ),
+
+               #(STDPNotEventDriven, 'STDPNotEventDriven', slice(None) ),
+               #(STDPMultiPost, 'STDPMultiPost', slice(None) ),
+               #(STDPNeuronalTraces, 'STDPNeuronalTraces', slice(None) ),
+               #(STDPMultiPostNeuronalTraces, 'STDPMultiPostNeuronalTraces', slice(None) ),
+
+               #(BrunelHakimModelHeterogeneousDelay, 'BrunelHakimModelHeterogeneousDelay', slice(0,-1,1) ),
+
+               #(LinearNeuronsOnly, 'LinearNeuronsOnly', slice(None) ),
+               #(HHNeuronsOnly, 'HHNeuronsOnly', slice(None) ),
+               #(VogelsWithSynapticDynamic, 'VogelsWithSynapticDynamic', slice(None) ),
+
+               ### below uses monitors
+               #(CUBAFixedConnectivity, 'CUBAFixedConnectivity', slice(None) ),
+               #(COBAHHFixedConnectivity, 'COBAHHFixedConnectivity', slice(None, -1) ),
+]
+
+configurations = [config[0] for config in configs]
+project_dirs = [config[1] for config in configs]
+
+# Configurations that share a project directory overwrite each other's
+# profiling output and logfiles, so only the last one survives. Warn once per
+# shared directory and name the configuration whose files are kept.
+last_idx = {}
+for i, proj_dir in enumerate(project_dirs):
+    if proj_dir is not None:
+        last_idx[proj_dir] = i
+for proj_dir, last_i in sorted(last_idx.items()):
+    if project_dirs.count(proj_dir) > 1:
+        print("WARNING there are multiple configurations using {d} as project "
+              "directory. Profiling and logfiles will only be saved for the last one {c}.".format(
+                  d=proj_dir, c=configurations[last_i].__name__))
+
+date_str = datetime.datetime.now().strftime('%Y_%m_%d')
+
+directory = 'results_{}{}'.format(date_str, additional_dir_name)
+if os.path.exists(directory):
+    new_dir = directory + '_bak_' + str(int(time.time()))
+    print("Directory with name `{}` already exists. Renaming it to `{}`.".format(directory, new_dir))
+    os.rename(directory, new_dir)
+os.makedirs(directory)
+data_dir = os.path.join(directory, 'data')
+plot_dir = os.path.join(directory, 'plots')
+log_dir = os.path.join(directory, 'logs')
+prof_dir = os.path.join(directory, 'nvprof')
+os.makedirs(data_dir)
+os.makedirs(plot_dir)
+os.makedirs(log_dir)
+os.makedirs(prof_dir)
+print("Saving results in {}.".format(plot_dir))
+
+# keep a copy of this script next to the results it produced
+shutil.copy(os.path.realpath(__file__), os.path.join(directory, 'run_speed_test_script.py'))
+
+time_format = '%d.%m.%Y at %H:%M:%S'
+
+
+def _now():
+    """Return the current local time truncated to whole seconds."""
+    return datetime.datetime.now().replace(microsecond=0)
+
+
+script_start = _now()
+
+with open(os.path.join(directory, 'git.diff'), 'w') as diff_file:
+    subprocess.call(['git', 'diff'], stdout=diff_file)
+
+maximum_run_time = 1*60*60*second
+
+# one saved figure per entry: (plot_all_tests keyword arguments, file name pattern)
+plot_variants = [({}, 'speed_test_{}_absolute'),
+                 ({'relative': True}, 'speed_test_{}_relative'),
+                 ({'profiling_minimum': 0.05}, 'speed_test_{}_profiling'),
+                 ({'profiling_minimum': 0.15}, 'speed_test_min_15_{}_profiling')]
+
+# index into a speed test's n_range of the network size that is re-run under nvprof
+nvprof_idx = 2
+# a run is only profiled if its measured runtime is below this threshold
+# (presumably seconds -- TODO confirm against brian2's `results()`)
+nvprof_max_runtime = 20
+
+try:
+    for st, name, sl in speed_tests:
+        start = _now()
+        print("Starting {} on {}.".format(name, start.strftime(time_format)))
+        res = run_speed_tests(configurations=configurations,
+                              speed_tests=[st],
+                              n_slice=sl,
+                              #n_slice=slice(0,1,None),
+                              run_twice=False,
+                              verbose=True,
+                              maximum_run_time=maximum_run_time#,
+                              ## this needs modification of brian2 code
+                              #profile_only_active=True
+                              #profile_only_active=False
+                              )
+        print("Running {} took {}.".format(name, _now() - start))
+
+        # plot each variant once and save that figure as both png and svg
+        for plot_kwargs, pattern in plot_variants:
+            res.plot_all_tests(**plot_kwargs)
+            ## this needs modification of brian2 code
+            #res.plot_all_tests(print_relative=True)
+            for ext in ('.png', '.svg'):
+                savefig(os.path.join(plot_dir, pattern.format(name) + ext))
+
+        # savefig() only writes the current figure, so every open figure beyond
+        # one per plot variant was never written to disk
+        n_created = len(get_fignums())
+        if n_created != len(plot_variants):
+            print("WARNING: There were {} plots created, but only {} saved.".format(
+                n_created, len(plot_variants)))
+        for fig_num in get_fignums():
+            close(fig_num)
+
+        # pickle results object to disk
+        pkl_file = os.path.join(data_dir, name + '.pkl')
+        with open(pkl_file, 'wb') as output:
+            pickle.dump(res, output, pickle.HIGHEST_PROTOCOL)
+
+        # save stdout log of last run (the others are deleted in run_speed_tests())
+        for proj_dir in set(project_dirs):
+            if proj_dir in ('cuda_standalone', 'cpp_standalone'):
+                stdout_file = os.path.join(proj_dir, 'results/stdout.txt')
+                if os.path.exists(stdout_file):
+                    shutil.copy(stdout_file,
+                                os.path.join(log_dir, 'stdout_{st}_{conf}_{n}.txt'.format(
+                                    st=name, conf=proj_dir, n=st.n_range[sl][-1])))
+                else:
+                    print("WARNING Couldn't save {},file not found.".format(stdout_file))
+
+        # run nvprof on n_range[nvprof_idx]
+        for conf, proj_dir in zip(configurations, project_dirs):
+            if proj_dir not in ('cuda_standalone', 'GeNNworkspace'):
+                continue
+            n_prof = st.n_range[nvprof_idx]
+            conf_name = conf.__name__
+            print("Rerunning {} with n = {} for nvprof profiling".format(conf_name, n_prof))
+            tb, prof_res, runtime, prof_info = results(conf, st, n_prof, maximum_run_time=maximum_run_time)
+            if not isinstance(prof_res, Exception) and runtime < nvprof_max_runtime:
+                log_file = os.path.join(prof_dir, 'nvprof_{st}_{conf}_{n}.log'.format(
+                    st=name, conf=conf_name, n=n_prof))
+                cmd = ['nvprof']
+                if proj_dir == 'cuda_standalone':
+                    cmd.append('--profile-from-start-off')
+                # nvprof is started inside proj_dir, so the log path is given relative to it
+                cmd += ['--log-file', os.path.join(os.pardir, log_file), './main']
+                if proj_dir == 'GeNNworkspace':
+                    cmd += ['test', '{}'.format(st.duration/second), '1']
+                prof_start = _now()
+                print('cd {} && {}'.format(proj_dir, ' '.join(cmd)))
+                returncode = subprocess.call(cmd, cwd=proj_dir)
+                if returncode:
+                    print('nvprof failed with {}'.format(returncode))
+                print("Profiling took {} for runtime of {}".format(_now() - prof_start, runtime))
+finally:
+    create_readme(directory)
+    print("\nSummarized speed test results in {}".format(directory + '/README.md'))
+    script_end = _now()
+    print("Finished speed test on {}. Total time = {}.".format(
+        script_end.strftime(time_format), script_end - script_start))
+
+
+##res.plot_all_tests(relative=True)
+#for n in get_fignums():
+#    plt.figure(n)
+#    savefig(plot_dir + '/speed_test_{}.png'.format(speed_tests[n-1][1]))
+
+## Debug (includes profiling infos)
+#from brian2.tests.features.base import results
+#for x in results(LocalConfiguration, LinearNeuronsOnly, 10, maximum_run_time=10*second):
+#    print x
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/README.md b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/README.md
new file mode 100644
index 00000000..364f3d4c
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/README.md
@@ -0,0 +1,254 @@
+
+# Benchmark results from 29.11.2017
+## Description:
+
+
+
+## Last git log:
+```
+commit 65e51048f25caaee2a6e0396269f90821d994f85
+Author: Denis Alevi
+Date:   Mon Nov 27 17:55:05 2017 +0100
+
+    Add recent benchmark results
+
+```
+There is also a `git diff` saved in the current directory.
+
+## Results
+
+### CUBA
+![](plots/speed_test_CUBA_absolute.svg)
+![](plots/speed_test_CUBA_profiling.svg)
+![](plots/speed_test_CUBA_relative.svg)
+
+
Exemplary `nvprof` results for **CUDAStandaloneConfigurationProfileCPU**

+Profile summary for `N = 1000`: + +``` +==6637== NVPROF is profiling process 6637, command: ./main +==6637== Profiling application: ./main +==6637== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 27.59% 59.367ms 10000 5.9360us 5.7280us 6.9130us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*) + 23.11% 49.736ms 10000 4.9730us 3.2960us 20.256us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*) + 21.48% 46.232ms 10000 4.6230us 3.2960us 15.424us kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*) + 11.66% 25.090ms 10000 2.5080us 2.2720us 3.0080us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 8.37% 18.003ms 10000 1.8000us 1.6640us 2.1760us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*) + 7.79% 16.764ms 10000 1.6760us 1.6000us 2.0480us _GLOBAL__N__69_tmpxft_000017f7_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + API calls: 55.27% 767.72ms 60000 12.795us 10.547us 9.0021ms cudaLaunch + 35.89% 498.54ms 80001 6.2310us 2.4830us 372.18us cudaDeviceSynchronize + 6.50% 90.343ms 520000 173ns 138ns 371.16us cudaSetupArgument + 1.33% 18.502ms 60000 308ns 238ns 364.34us cudaConfigureCall + 0.99% 13.745ms 50000 274ns 217ns 21.746us cudaGetLastError + 0.01% 138.51us 1 138.51us 138.51us 138.51us cudaMemGetInfo + 0.00% 33.472us 39 858ns 721ns 1.8600us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 30.648us 8 3.8310us 3.1320us 5.3030us cudaFuncGetAttributes + 0.00% 6.3800us 12 531ns 343ns 1.3920us cudaDeviceGetAttribute + 0.00% 2.9800us 3 993ns 737ns 1.3910us cudaGetDevice + 
+``` + +

+ + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==5900== NVPROF is profiling process 5900, command: ./main +==5900== Profiling application: ./main +==5900== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 27.83% 60.653ms 10000 6.0650us 5.7920us 7.0400us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*) + 23.00% 50.122ms 10000 5.0120us 3.2960us 24.320us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*) + 20.65% 45.008ms 10000 4.5000us 3.2960us 17.824us kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*) + 11.47% 25.008ms 10000 2.5000us 2.2720us 3.1680us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 9.22% 20.085ms 10000 2.0080us 1.8560us 2.1760us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*) + 7.83% 17.069ms 10000 1.7060us 1.6320us 2.2400us _GLOBAL__N__69_tmpxft_00001511_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + API calls: 85.16% 640.31ms 60000 10.671us 9.6060us 9.0686ms cudaLaunch + 11.50% 86.475ms 520000 166ns 135ns 344.23us cudaSetupArgument + 1.87% 14.092ms 60000 234ns 176ns 334.30us cudaConfigureCall + 1.43% 10.785ms 50000 215ns 189ns 10.220us cudaGetLastError + 0.02% 139.19us 1 139.19us 139.19us 139.19us cudaMemGetInfo + 0.00% 31.512us 8 3.9390us 3.0080us 5.7970us cudaFuncGetAttributes + 0.00% 29.967us 39 768ns 653ns 1.9770us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 12.868us 1 12.868us 12.868us 12.868us cudaDeviceSynchronize + 0.00% 6.2440us 12 520ns 331ns 1.3150us cudaDeviceGetAttribute + 0.00% 3.7510us 3 1.2500us 823ns 1.7170us cudaGetDevice + 
+``` + +

+ + +*** + +### CUBA - less kernels displayed +![](plots/speed_test_CUBA-less_kernels_displayed_min_15_profiling.svg) + + +*** + +### STDPNotEventDriven +![](plots/speed_test_STDP_absolute.svg) +![](plots/speed_test_STDP_profiling.svg) +![](plots/speed_test_STDP_relative.svg) + +
Exemplary `nvprof` results for **CUDAStandaloneConfigurationProfileCPU**

+Profile summary for `N = 1000`: + +``` +==28576== NVPROF is profiling process 28576, command: ./main +==28576== Profiling application: ./main +==28576== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 26.71% 73.256ms 10000 7.3250us 3.2960us 22.720us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double*, double, double*, int, int*, int, int*) + 15.80% 43.329ms 10000 4.3320us 3.8720us 6.2400us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 12.77% 35.035ms 10000 3.5030us 3.3920us 6.3360us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*) + 9.94% 27.271ms 10000 2.7270us 2.6240us 3.1680us kernel_synapses_stateupdater_codeobject(unsigned int, unsigned int, double*, int, double*, int, double*, int*) + 9.28% 25.455ms 10000 2.5450us 2.2400us 2.9120us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 6.66% 18.254ms 10000 1.8250us 1.7600us 2.0800us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 6.65% 18.226ms 10000 1.8220us 1.7600us 2.5600us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 6.20% 16.991ms 10000 1.6990us 1.6000us 1.9200us _GLOBAL__N__70_tmpxft_00006daf_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*) + 5.88% 16.118ms 10000 1.6110us 1.4720us 1.8560us _GLOBAL__N__69_tmpxft_00006dad_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*) + 0.12% 330.53us 1 330.53us 330.53us 330.53us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + API calls: 58.50% 
1.10914s 90001 12.323us 9.6560us 9.1188ms cudaLaunch + 32.75% 621.00ms 100001 6.2090us 2.3660us 355.02us cudaDeviceSynchronize + 5.78% 109.54ms 660005 165ns 124ns 14.341us cudaSetupArgument + 1.49% 28.313ms 90001 314ns 245ns 12.028us cudaConfigureCall + 1.45% 27.511ms 70002 393ns 230ns 366.98us cudaGetLastError + 0.01% 208.18us 1 208.18us 208.18us 208.18us cudaMalloc + 0.01% 131.79us 1 131.79us 131.79us 131.79us cudaMemGetInfo + 0.00% 55.331us 74 747ns 647ns 1.4820us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 44.531us 12 3.7100us 3.1290us 4.8360us cudaFuncGetAttributes + 0.00% 9.1380us 20 456ns 333ns 893ns cudaDeviceGetAttribute + 0.00% 4.2750us 5 855ns 719ns 1.3080us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==27879== NVPROF is profiling process 27879, command: ./main +==27879== Profiling application: ./main +==27879== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 26.99% 74.731ms 10000 7.4730us 3.2960us 27.648us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double*, double, double*, int, int*, int, int*) + 15.88% 43.964ms 10000 4.3960us 3.9360us 6.4000us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 12.62% 34.946ms 10000 3.4940us 3.3920us 6.4960us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*) + 9.80% 27.129ms 10000 2.7120us 2.3680us 2.9440us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 9.58% 26.535ms 10000 2.6530us 2.5600us 3.0400us kernel_synapses_stateupdater_codeobject(unsigned int, unsigned int, double*, int, double*, int, double*, int*) + 6.59% 18.247ms 10000 1.8240us 1.7280us 2.0480us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 6.58% 18.231ms 10000 1.8230us 1.7600us 2.5600us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 6.20% 17.155ms 10000 1.7150us 1.6320us 1.9520us _GLOBAL__N__70_tmpxft_00006ae9_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*) + 5.65% 15.632ms 10000 1.5630us 1.4720us 1.6960us _GLOBAL__N__69_tmpxft_00006ae5_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*) + 0.12% 329.57us 1 329.57us 329.57us 329.57us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + API calls: 85.57% 
910.45ms 90001 10.116us 8.9060us 9.1466ms cudaLaunch + 10.88% 115.80ms 660005 175ns 132ns 353.03us cudaSetupArgument + 2.00% 21.262ms 90001 236ns 181ns 330.07us cudaConfigureCall + 1.50% 15.984ms 70002 228ns 182ns 318.18us cudaGetLastError + 0.02% 207.89us 1 207.89us 207.89us 207.89us cudaMalloc + 0.01% 132.37us 1 132.37us 132.37us 132.37us cudaMemGetInfo + 0.01% 55.857us 74 754ns 674ns 1.5500us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 44.986us 12 3.7480us 3.0050us 5.8370us cudaFuncGetAttributes + 0.00% 13.864us 1 13.864us 13.864us 13.864us cudaDeviceSynchronize + 0.00% 9.5470us 20 477ns 338ns 1.1980us cudaDeviceGetAttribute + 0.00% 4.8700us 5 974ns 851ns 1.4220us cudaGetDevice + +``` + +

+ + +*** + +### STDPNotEventDriven - less kernels displayed +![](plots/speed_test_STDP-less_kernels_displayed_min_15_profiling.svg) + + +*** + +### STDPEventDriven +![](plots/speed_test_STDPEventDriven_absolute.svg) +![](plots/speed_test_STDPEventDriven_profiling.svg) +![](plots/speed_test_STDPEventDriven_relative.svg) + +
Exemplary `nvprof` results for **CUDAStandaloneConfigurationProfileCPU**

+Profile summary for `N = 1000`: + +``` +==18877== NVPROF is profiling process 18877, command: ./main +==18877== Profiling application: ./main +==18877== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 33.24% 85.455ms 10000 8.5450us 3.3280us 25.984us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int) + 16.85% 43.327ms 10000 4.3320us 3.8400us 6.2080us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 13.77% 35.393ms 10000 3.5390us 3.4240us 7.2320us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int) + 9.92% 25.503ms 10000 2.5500us 2.2400us 2.9760us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 7.11% 18.278ms 10000 1.8270us 1.7600us 2.0800us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 6.37% 16.365ms 10000 1.6360us 1.4080us 1.7920us _GLOBAL__N__70_tmpxft_00004798_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*) + 6.31% 16.219ms 10000 1.6210us 1.5040us 1.8560us _GLOBAL__N__69_tmpxft_00004796_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*) + 6.31% 16.209ms 10000 1.6200us 1.5360us 2.5920us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 0.13% 330.27us 1 330.27us 330.27us 330.27us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + API calls: 57.23% 936.43ms 80001 11.705us 9.9320us 9.2809ms cudaLaunch + 34.35% 562.06ms 90001 6.2450us 2.4600us 359.92us cudaDeviceSynchronize + 5.96% 97.491ms 580005 168ns 132ns 
357.12us cudaSetupArgument + 1.41% 23.032ms 80001 287ns 242ns 13.914us cudaConfigureCall + 1.02% 16.685ms 60002 278ns 235ns 14.273us cudaGetLastError + 0.01% 200.02us 1 200.02us 200.02us 200.02us cudaMalloc + 0.01% 134.78us 1 134.78us 134.78us 134.78us cudaMemGetInfo + 0.00% 36.321us 10 3.6320us 3.0320us 4.6100us cudaFuncGetAttributes + 0.00% 28.911us 41 705ns 592ns 1.5350us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 7.7890us 16 486ns 346ns 1.1310us cudaDeviceGetAttribute + 0.00% 3.2980us 4 824ns 736ns 1.0260us cudaGetDevice + +``` + +

+ + +
Exemplary `nvprof` results for **CUDAStandaloneConfiguration**

+Profile summary for `N = 1000`: + +``` +==18067== NVPROF is profiling process 18067, command: ./main +==18067== Profiling application: ./main +==18067== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 33.20% 86.044ms 10000 8.6040us 3.3600us 26.176us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int) + 16.74% 43.393ms 10000 4.3390us 3.8080us 5.9840us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 13.67% 35.442ms 10000 3.5440us 3.4560us 7.0400us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int) + 9.83% 25.469ms 10000 2.5460us 2.2400us 2.7520us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 7.17% 18.573ms 10000 1.8570us 1.7280us 2.0800us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 7.03% 18.222ms 10000 1.8220us 1.7280us 2.6240us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 6.26% 16.215ms 10000 1.6210us 1.4080us 1.7920us _GLOBAL__N__70_tmpxft_0000448e_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*) + 5.98% 15.512ms 10000 1.5510us 1.4400us 1.6960us _GLOBAL__N__69_tmpxft_0000448c_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*) + 0.13% 330.56us 1 330.56us 330.56us 330.56us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + API calls: 83.75% 838.49ms 80001 10.480us 9.1490us 9.2085ms cudaLaunch + 12.30% 123.18ms 580005 212ns 154ns 365.89us cudaSetupArgument + 2.22% 22.230ms 80001 277ns 208ns 341.41us 
cudaConfigureCall + 1.68% 16.830ms 60002 280ns 217ns 348.09us cudaGetLastError + 0.02% 200.11us 1 200.11us 200.11us 200.11us cudaMalloc + 0.01% 131.26us 1 131.26us 131.26us 131.26us cudaMemGetInfo + 0.00% 37.933us 10 3.7930us 3.0410us 5.6940us cudaFuncGetAttributes + 0.00% 33.513us 41 817ns 707ns 1.6920us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 13.505us 1 13.505us 13.505us 13.505us cudaDeviceSynchronize + 0.00% 7.9010us 16 493ns 368ns 1.1420us cudaDeviceGetAttribute + 0.00% 4.0280us 4 1.0070us 817ns 1.4860us cudaGetDevice + +``` + +

+ + +*** + +### STDPEventDriven - less kernels displayed +![](plots/speed_test_STDPEventDriven-less_kernels_displayed_min_15_profiling.svg) + + diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/CUBA.pkl b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/CUBA.pkl new file mode 100644 index 00000000..e0e2942b Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/CUBA.pkl differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/STDP.pkl b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/STDP.pkl new file mode 100644 index 00000000..1c5601a6 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/STDP.pkl differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/STDPEventDriven.pkl b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/STDPEventDriven.pkl new file mode 100644 index 00000000..9e7fcd6a Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/STDPEventDriven.pkl differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/git.diff b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/git.diff new file mode 100644 index 00000000..7737e913 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/git.diff @@ -0,0 +1,60 @@ +diff --git a/brian2cuda/device.py b/brian2cuda/device.py +index b4610ee..e032d32 100644 +--- a/brian2cuda/device.py ++++ b/brian2cuda/device.py +@@ -919,13 +919,13 @@ class CUDAStandaloneDevice(CPPStandaloneDevice): + if clock not in all_clocks: + run_lines.append('{net.name}.add(&{clock.name}, NULL, NULL, NULL, NULL);'.format(clock=clock, net=net)) + +- if self.profile and self.profile != 'blocking': # self.profile == True ++ if True:#self.profile and self.profile != 'blocking': # self.profile == True + 
run_lines.append('cudaProfilerStart();') + run_lines.append('{net.name}.run({duration!r}, {report_call}, {report_period!r});'.format(net=net, + duration=float(duration), + report_call=report_call, + report_period=float(report_period))) +- if self.profile and self.profile != 'blocking': # self.profile == True ++ if True:#self.profile and self.profile != 'blocking': # self.profile == True + run_lines.append('cudaDeviceSynchronize();') + run_lines.append('cudaProfilerStop();') + self.main_queue.append(('run_network', (net, run_lines))) +diff --git a/brian2cuda/tests/features/speed.py b/brian2cuda/tests/features/speed.py +index 2293533..c093bc9 100644 +--- a/brian2cuda/tests/features/speed.py ++++ b/brian2cuda/tests/features/speed.py +@@ -558,7 +558,7 @@ class CUBA(SpeedTest): + category = "Full examples" + name = "CUBA fixed connectivity" + tags = ["Neurons", "Synapses"] +- n_range = [10, 100, 1000, 10000, 100000, 1000000] ++ n_range = [10, 100, 1000, 10000, 100000, 200000, 500000, 1000000] + n_label = 'Num neurons' + + # configuration options +@@ -720,7 +720,7 @@ class STDPNotEventDriven(SpeedTest): + category = "Full examples" + name = "STDP (not event-driven)" + tags = ["Neurons", "Synapses"] +- n_range = [10, 100, 1000, 10000, 20000, 50000, 100000] ++ n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 5000000] + n_label = 'Num neurons' + + # configuration options +diff --git a/frozen_repos/brian2 b/frozen_repos/brian2 +--- a/frozen_repos/brian2 ++++ b/frozen_repos/brian2 +@@ -1 +1 @@ +-Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67 ++Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67-dirty +diff --git a/frozen_repos/brian2genn b/frozen_repos/brian2genn +--- a/frozen_repos/brian2genn ++++ b/frozen_repos/brian2genn +@@ -1 +1 @@ +-Subproject commit 0553cafeab49ea5403c0230411035df504d4db06 ++Subproject commit 0553cafeab49ea5403c0230411035df504d4db06-dirty +diff --git a/frozen_repos/genn b/frozen_repos/genn +--- a/frozen_repos/genn ++++ 
b/frozen_repos/genn +@@ -1 +1 @@ +-Subproject commit e01c85f18339249558d6e570ae976609dc972846 ++Subproject commit e01c85f18339249558d6e570ae976609dc972846-dirty diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_CUBA_cuda_standalone_500000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_CUBA_cuda_standalone_500000.txt new file mode 100644 index 00000000..29393927 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_CUBA_cuda_standalone_500000.txt @@ -0,0 +1,54 @@ +INFO: setting cudaDevice stuff took 0.353296 seconds +INFO kernel_neurongroup_group_variable_set_conditional_codeobject + 489 blocks + 1024 threads + 12 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory +INFO connectivity matrix has size 7999406 +INFO connectivity matrix has size 32005238 +INFO kernel_neurongroup_stateupdater_codeobject + 652 blocks + 768 threads + 35 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.750 theoretical occupancy +INFO kernel_neurongroup_thresholder_codeobject + 489 blocks + 1024 threads + 15 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_synapses_1_pre_codeobject + 15 blocks + 1024 threads + 22 registers per block + 0 bytes statically-allocated shared memory per block + 8 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_synapses_pre_codeobject + 15 blocks + 1024 threads + 22 registers per block + 0 bytes statically-allocated shared memory per block + 8 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical 
occupancy +INFO kernel_neurongroup_resetter_codeobject + 489 blocks + 1024 threads + 14 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +Number of synapses: 32005238 +Number of synapses: 7999406 +INFO: main_lines took 198.948620 seconds +INFO: main function took 200.907001 seconds diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt new file mode 100644 index 00000000..eb3a2e32 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt @@ -0,0 +1,63 @@ +INFO: setting cudaDevice stuff took 0.345850 seconds +INFO kernel_synapses_group_variable_set_conditional_codeobject + 4883 blocks + 1024 threads + 8 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory +INFO connectivity matrix has size 5000000 +INFO connectivity matrix has size 5000000 +INFO generating 10000000 rand every 2 clock cycles for poissongroup_thresholder_codeobject +INFO kernel_neurongroup_stateupdater_codeobject + 1 blocks + 768 threads + 35 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.750 theoretical occupancy +INFO kernel_neurongroup_thresholder_codeobject + 1 blocks + 1024 threads + 14 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_poissongroup_thresholder_codeobject + 4883 blocks + 1024 threads + 14 registers per block + 0 bytes 
statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_synapses_pre_codeobject + 15 blocks + 1024 threads + 40 registers per block + 0 bytes statically-allocated shared memory per block + 8 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.500 theoretical occupancy +INFO kernel_synapses_post_codeobject + 15 blocks + 1024 threads + 34 registers per block + 0 bytes statically-allocated shared memory per block + 8 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.500 theoretical occupancy +INFO kernel_neurongroup_resetter_codeobject + 1 blocks + 1024 threads + 14 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +Number of synapses: 5000000 +Number of synapses: 5000000 +INFO: main_lines took 349.461367 seconds +INFO: main function took 350.342466 seconds diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_STDP_cuda_standalone_5000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_STDP_cuda_standalone_5000000.txt new file mode 100644 index 00000000..0708f3c6 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_STDP_cuda_standalone_5000000.txt @@ -0,0 +1,71 @@ +INFO: setting cudaDevice stuff took 0.318718 seconds +INFO kernel_synapses_group_variable_set_conditional_codeobject + 4883 blocks + 1024 threads + 8 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory +INFO connectivity matrix has size 5000000 +INFO connectivity matrix has size 5000000 +INFO generating 10000000 rand every 2 clock cycles for poissongroup_thresholder_codeobject +INFO 
kernel_neurongroup_stateupdater_codeobject + 1 blocks + 768 threads + 35 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.750 theoretical occupancy +INFO kernel_synapses_stateupdater_codeobject + 6511 blocks + 768 threads + 35 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 0.750 theoretical occupancy +INFO kernel_neurongroup_thresholder_codeobject + 1 blocks + 1024 threads + 14 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_poissongroup_thresholder_codeobject + 4883 blocks + 1024 threads + 14 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_synapses_pre_codeobject + 15 blocks + 1024 threads + 28 registers per block + 0 bytes statically-allocated shared memory per block + 8 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_synapses_post_codeobject + 15 blocks + 1024 threads + 26 registers per block + 0 bytes statically-allocated shared memory per block + 8 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO kernel_neurongroup_resetter_codeobject + 1 blocks + 1024 threads + 14 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 304 bytes user-allocated constant memory + 1.000 theoretical occupancy +Number of synapses: 5000000 +Number of synapses: 5000000 +INFO: main_lines took 325.966205 seconds +INFO: main function took 326.809723 seconds diff --git 
a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_CUBA_CUDAStandaloneConfigurationProfileCPU_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_CUBA_CUDAStandaloneConfigurationProfileCPU_1000.log new file mode 100644 index 00000000..36aaed97 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_CUBA_CUDAStandaloneConfigurationProfileCPU_1000.log @@ -0,0 +1,20 @@ +==6637== NVPROF is profiling process 6637, command: ./main +==6637== Profiling application: ./main +==6637== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 27.59% 59.367ms 10000 5.9360us 5.7280us 6.9130us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*) + 23.11% 49.736ms 10000 4.9730us 3.2960us 20.256us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*) + 21.48% 46.232ms 10000 4.6230us 3.2960us 15.424us kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*) + 11.66% 25.090ms 10000 2.5080us 2.2720us 3.0080us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 8.37% 18.003ms 10000 1.8000us 1.6640us 2.1760us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*) + 7.79% 16.764ms 10000 1.6760us 1.6000us 2.0480us _GLOBAL__N__69_tmpxft_000017f7_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + API calls: 55.27% 767.72ms 60000 12.795us 10.547us 9.0021ms cudaLaunch + 35.89% 498.54ms 80001 6.2310us 2.4830us 372.18us cudaDeviceSynchronize + 6.50% 90.343ms 520000 173ns 138ns 371.16us cudaSetupArgument + 1.33% 18.502ms 60000 308ns 
238ns 364.34us cudaConfigureCall + 0.99% 13.745ms 50000 274ns 217ns 21.746us cudaGetLastError + 0.01% 138.51us 1 138.51us 138.51us 138.51us cudaMemGetInfo + 0.00% 33.472us 39 858ns 721ns 1.8600us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 30.648us 8 3.8310us 3.1320us 5.3030us cudaFuncGetAttributes + 0.00% 6.3800us 12 531ns 343ns 1.3920us cudaDeviceGetAttribute + 0.00% 2.9800us 3 993ns 737ns 1.3910us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..e6fd195f --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,20 @@ +==5900== NVPROF is profiling process 5900, command: ./main +==5900== Profiling application: ./main +==5900== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 27.83% 60.653ms 10000 6.0650us 5.7920us 7.0400us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*) + 23.00% 50.122ms 10000 5.0120us 3.2960us 24.320us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*) + 20.65% 45.008ms 10000 4.5000us 3.2960us 17.824us kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*) + 11.47% 25.008ms 10000 2.5000us 2.2720us 3.1680us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*) + 9.22% 20.085ms 10000 2.0080us 1.8560us 2.1760us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*) + 7.83% 17.069ms 
10000 1.7060us 1.6320us 2.2400us _GLOBAL__N__69_tmpxft_00001511_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*) + API calls: 85.16% 640.31ms 60000 10.671us 9.6060us 9.0686ms cudaLaunch + 11.50% 86.475ms 520000 166ns 135ns 344.23us cudaSetupArgument + 1.87% 14.092ms 60000 234ns 176ns 334.30us cudaConfigureCall + 1.43% 10.785ms 50000 215ns 189ns 10.220us cudaGetLastError + 0.02% 139.19us 1 139.19us 139.19us 139.19us cudaMemGetInfo + 0.00% 31.512us 8 3.9390us 3.0080us 5.7970us cudaFuncGetAttributes + 0.00% 29.967us 39 768ns 653ns 1.9770us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 12.868us 1 12.868us 12.868us 12.868us cudaDeviceSynchronize + 0.00% 6.2440us 12 520ns 331ns 1.3150us cudaDeviceGetAttribute + 0.00% 3.7510us 3 1.2500us 823ns 1.7170us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfigurationProfileCPU_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfigurationProfileCPU_1000.log new file mode 100644 index 00000000..196a1cb2 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfigurationProfileCPU_1000.log @@ -0,0 +1,24 @@ +==18877== NVPROF is profiling process 18877, command: ./main +==18877== Profiling application: ./main +==18877== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 33.24% 85.455ms 10000 8.5450us 3.3280us 25.984us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int) + 16.85% 43.327ms 10000 4.3320us 3.8400us 6.2080us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 13.77% 35.393ms 10000 3.5390us 3.4240us 
7.2320us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int) + 9.92% 25.503ms 10000 2.5500us 2.2400us 2.9760us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 7.11% 18.278ms 10000 1.8270us 1.7600us 2.0800us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 6.37% 16.365ms 10000 1.6360us 1.4080us 1.7920us _GLOBAL__N__70_tmpxft_00004798_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*) + 6.31% 16.219ms 10000 1.6210us 1.5040us 1.8560us _GLOBAL__N__69_tmpxft_00004796_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*) + 6.31% 16.209ms 10000 1.6200us 1.5360us 2.5920us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 0.13% 330.27us 1 330.27us 330.27us 330.27us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + API calls: 57.23% 936.43ms 80001 11.705us 9.9320us 9.2809ms cudaLaunch + 34.35% 562.06ms 90001 6.2450us 2.4600us 359.92us cudaDeviceSynchronize + 5.96% 97.491ms 580005 168ns 132ns 357.12us cudaSetupArgument + 1.41% 23.032ms 80001 287ns 242ns 13.914us cudaConfigureCall + 1.02% 16.685ms 60002 278ns 235ns 14.273us cudaGetLastError + 0.01% 200.02us 1 200.02us 200.02us 200.02us cudaMalloc + 0.01% 134.78us 1 134.78us 134.78us 134.78us cudaMemGetInfo + 0.00% 36.321us 10 3.6320us 3.0320us 4.6100us cudaFuncGetAttributes + 0.00% 28.911us 41 705ns 592ns 1.5350us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 7.7890us 16 486ns 346ns 1.1310us cudaDeviceGetAttribute + 0.00% 3.2980us 4 824ns 736ns 1.0260us cudaGetDevice diff --git 
a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..89cca30a --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,24 @@ +==18067== NVPROF is profiling process 18067, command: ./main +==18067== Profiling application: ./main +==18067== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 33.20% 86.044ms 10000 8.6040us 3.3600us 26.176us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int) + 16.74% 43.393ms 10000 4.3390us 3.8080us 5.9840us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 13.67% 35.442ms 10000 3.5440us 3.4560us 7.0400us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int) + 9.83% 25.469ms 10000 2.5460us 2.2400us 2.7520us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 7.17% 18.573ms 10000 1.8570us 1.7280us 2.0800us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 7.03% 18.222ms 10000 1.8220us 1.7280us 2.6240us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 6.26% 16.215ms 10000 1.6210us 1.4080us 1.7920us _GLOBAL__N__70_tmpxft_0000448e_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*) + 5.98% 15.512ms 10000 1.5510us 1.4400us 1.6960us 
_GLOBAL__N__69_tmpxft_0000448c_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*) + 0.13% 330.56us 1 330.56us 330.56us 330.56us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + API calls: 83.75% 838.49ms 80001 10.480us 9.1490us 9.2085ms cudaLaunch + 12.30% 123.18ms 580005 212ns 154ns 365.89us cudaSetupArgument + 2.22% 22.230ms 80001 277ns 208ns 341.41us cudaConfigureCall + 1.68% 16.830ms 60002 280ns 217ns 348.09us cudaGetLastError + 0.02% 200.11us 1 200.11us 200.11us 200.11us cudaMalloc + 0.01% 131.26us 1 131.26us 131.26us 131.26us cudaMemGetInfo + 0.00% 37.933us 10 3.7930us 3.0410us 5.6940us cudaFuncGetAttributes + 0.00% 33.513us 41 817ns 707ns 1.6920us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 13.505us 1 13.505us 13.505us 13.505us cudaDeviceSynchronize + 0.00% 7.9010us 16 493ns 368ns 1.1420us cudaDeviceGetAttribute + 0.00% 4.0280us 4 1.0070us 817ns 1.4860us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDP_CUDAStandaloneConfigurationProfileCPU_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDP_CUDAStandaloneConfigurationProfileCPU_1000.log new file mode 100644 index 00000000..d8883015 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDP_CUDAStandaloneConfigurationProfileCPU_1000.log @@ -0,0 +1,25 @@ +==28576== NVPROF is profiling process 28576, command: ./main +==28576== Profiling application: ./main +==28576== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 26.71% 73.256ms 10000 7.3250us 3.2960us 22.720us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double*, double, double*, int, int*, int, int*) + 15.80% 43.329ms 10000 4.3320us 3.8720us 
6.2400us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 12.77% 35.035ms 10000 3.5030us 3.3920us 6.3360us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*) + 9.94% 27.271ms 10000 2.7270us 2.6240us 3.1680us kernel_synapses_stateupdater_codeobject(unsigned int, unsigned int, double*, int, double*, int, double*, int*) + 9.28% 25.455ms 10000 2.5450us 2.2400us 2.9120us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 6.66% 18.254ms 10000 1.8250us 1.7600us 2.0800us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 6.65% 18.226ms 10000 1.8220us 1.7600us 2.5600us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 6.20% 16.991ms 10000 1.6990us 1.6000us 1.9200us _GLOBAL__N__70_tmpxft_00006daf_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*) + 5.88% 16.118ms 10000 1.6110us 1.4720us 1.8560us _GLOBAL__N__69_tmpxft_00006dad_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*) + 0.12% 330.53us 1 330.53us 330.53us 330.53us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + API calls: 58.50% 1.10914s 90001 12.323us 9.6560us 9.1188ms cudaLaunch + 32.75% 621.00ms 100001 6.2090us 2.3660us 355.02us cudaDeviceSynchronize + 5.78% 109.54ms 660005 165ns 124ns 14.341us cudaSetupArgument + 1.49% 28.313ms 90001 314ns 245ns 12.028us cudaConfigureCall + 1.45% 27.511ms 70002 393ns 230ns 366.98us cudaGetLastError + 0.01% 208.18us 1 208.18us 208.18us 208.18us cudaMalloc + 0.01% 131.79us 1 131.79us 131.79us 131.79us cudaMemGetInfo + 0.00% 55.331us 74 747ns 647ns 1.4820us 
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 44.531us 12 3.7100us 3.1290us 4.8360us cudaFuncGetAttributes + 0.00% 9.1380us 20 456ns 333ns 893ns cudaDeviceGetAttribute + 0.00% 4.2750us 5 855ns 719ns 1.3080us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log new file mode 100644 index 00000000..e670ef10 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log @@ -0,0 +1,25 @@ +==27879== NVPROF is profiling process 27879, command: ./main +==27879== Profiling application: ./main +==27879== Profiling result: + Type Time(%) Time Calls Avg Min Max Name + GPU activities: 26.99% 74.731ms 10000 7.4730us 3.2960us 27.648us kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double*, double, double*, int, int*, int, int*) + 15.88% 43.964ms 10000 4.3960us 3.9360us 6.4000us kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*) + 12.62% 34.946ms 10000 3.4940us 3.3920us 6.4960us kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*) + 9.80% 27.129ms 10000 2.7120us 2.3680us 2.9440us kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*) + 9.58% 26.535ms 10000 2.6530us 2.5600us 3.0400us kernel_synapses_stateupdater_codeobject(unsigned int, unsigned int, double*, int, double*, int, double*, int*) + 6.59% 18.247ms 10000 1.8240us 1.7280us 2.0480us kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*) + 6.58% 18.231ms 10000 1.8230us 1.7600us 
2.5600us kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*) + 6.20% 17.155ms 10000 1.7150us 1.6320us 1.9520us _GLOBAL__N__70_tmpxft_00006ae9_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*) + 5.65% 15.632ms 10000 1.5630us 1.4720us 1.6960us _GLOBAL__N__69_tmpxft_00006ae5_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*) + 0.12% 329.57us 1 329.57us 329.57us 329.57us void gen_sequenced(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int) + API calls: 85.57% 910.45ms 90001 10.116us 8.9060us 9.1466ms cudaLaunch + 10.88% 115.80ms 660005 175ns 132ns 353.03us cudaSetupArgument + 2.00% 21.262ms 90001 236ns 181ns 330.07us cudaConfigureCall + 1.50% 15.984ms 70002 228ns 182ns 318.18us cudaGetLastError + 0.02% 207.89us 1 207.89us 207.89us 207.89us cudaMalloc + 0.01% 132.37us 1 132.37us 132.37us 132.37us cudaMemGetInfo + 0.01% 55.857us 74 754ns 674ns 1.5500us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + 0.00% 44.986us 12 3.7480us 3.0050us 5.8370us cudaFuncGetAttributes + 0.00% 13.864us 1 13.864us 13.864us 13.864us cudaDeviceSynchronize + 0.00% 9.5470us 20 477ns 338ns 1.1980us cudaDeviceGetAttribute + 0.00% 4.8700us 5 974ns 851ns 1.4220us cudaGetDevice diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA-less_kernels_displayed_min_15_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA-less_kernels_displayed_min_15_profiling.png new file mode 100644 index 00000000..0abbb193 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA-less_kernels_displayed_min_15_profiling.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_absolute.png 
b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_absolute.png new file mode 100644 index 00000000..499bbcec Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_absolute.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_profiling.png new file mode 100644 index 00000000..36649122 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_profiling.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_relative.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_relative.png new file mode 100644 index 00000000..47c92e41 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_relative.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP-less_kernels_displayed_min_15_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP-less_kernels_displayed_min_15_profiling.png new file mode 100644 index 00000000..70188222 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP-less_kernels_displayed_min_15_profiling.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven-less_kernels_displayed_min_15_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven-less_kernels_displayed_min_15_profiling.png new file mode 100644 index 00000000..f64a187e Binary files /dev/null and 
b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven-less_kernels_displayed_min_15_profiling.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_absolute.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_absolute.png new file mode 100644 index 00000000..6754d066 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_absolute.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_profiling.png new file mode 100644 index 00000000..bfaebf31 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_profiling.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_relative.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_relative.png new file mode 100644 index 00000000..d32ac119 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_relative.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_absolute.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_absolute.png new file mode 100644 index 00000000..0032b713 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_absolute.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_profiling.png 
b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_profiling.png new file mode 100644 index 00000000..b98b4ec2 Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_profiling.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_relative.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_relative.png new file mode 100644 index 00000000..6fe3a90c Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_relative.png differ diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/run_speed_test_script.py b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/run_speed_test_script.py new file mode 100644 index 00000000..0941e299 --- /dev/null +++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/run_speed_test_script.py @@ -0,0 +1,284 @@ +import os +import shutil +import glob +import subprocess +import sys +import socket + +# run tests without X-server +import matplotlib +matplotlib.use('Agg') + +# pretty plots +import seaborn + +import time +import datetime +import cPickle as pickle + +from brian2 import * +from brian2.tests.features import * +from brian2.tests.features.base import * +from brian2.tests.features.base import results + +import brian2cuda +from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfiguration, + CUDAStandaloneConfigurationNoAssert, + CUDAStandaloneConfigurationExtraThresholdKernel, + CUDAStandaloneConfigurationCurandDouble, + CUDAStandaloneConfigurationNoCudaOccupancyAPI, + CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, + CUDAStandaloneConfiguration2BlocksPerSM, + CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, + CUDAStandaloneConfigurationSynLaunchBounds, + CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, + 
CUDAStandaloneConfigurationProfileGPU, + CUDAStandaloneConfigurationProfileCPU) + #CUDAStandaloneConfigurationTestBrunelHeteroAtomics, + #CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, + #CUDAStandaloneConfigurationPushAtomicResize, + #CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, + #CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, + #CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU, + #CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, + #CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU) +from brian2cuda.tests.features.speed import * + +from brian2genn.correctness_testing import GeNNConfiguration, GeNNConfigurationCPU, GeNNConfigurationOptimized + +from create_readme import create_readme + +assert len(sys.argv)<= 2, 'Only one command line argument supported! Got {}'.format(len(sys.argv)-1) +if len(sys.argv) == 2: + additional_dir_name = '_' + sys.argv[1] +else: + additional_dir_name = '' + +prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j12'] + +# host specific settings +if socket.gethostname() == 'elnath': + prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24'] + prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35') + prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20']) + +configs = [# configuration project_directory + #(NumpyConfiguration, None), + #(WeaveConfiguration, None), + #(LocalConfiguration, None), + #(CPPStandaloneConfiguration, 'cpp_standalone'), + #(CPPStandaloneConfigurationOpenMP, 'cpp_standalone'), + (CUDAStandaloneConfiguration, 'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResize, 'cuda_standalone'), + #(CUDAStandaloneConfigurationTestBrunelHeteroAtomics, 'cuda_standalone'), + #(CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize, 'cuda_standalone'), + #(CUDAStandaloneConfigurationExtraThresholdKernel, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoAssert, 'cuda_standalone'), + 
#(CUDAStandaloneConfigurationCurandDouble, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoCudaOccupancyAPI, 'cuda_standalone'), + #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfigurationSynLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'), + #(CUDAStandaloneConfigurationProfileGPU, 'cuda_standalone'), + (CUDAStandaloneConfigurationProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU, 'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy, 'cuda_standalone'), + #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU, 'cuda_standalone'), + #(GeNNConfiguration, 'GeNNworkspace'), + #(GeNNConfigurationCPU, 'GeNNworkspace'), + #(GeNNConfigurationOptimized, 'GeNNworkspace') + ] + +speed_tests = [# feature_test name n_slice + + #(ThresholderOnlyPoissonLowRate, 'ThresholderOnlyPoissonLowRate', slice(None) ), + #(ThresholderOnlyPoissonMediumRate, 'ThresholderOnlyPoissonMediumRate', slice(None) ), + #(ThresholderOnlyPoissonHighRate, 'ThresholderOnlyPoissonHighRate', slice(None) ), + #(ThresholderOnlyAlwaysSpiking, 'ThresholderOnlyAlwaysSpiking', slice(None) ), + + #(BrunelHakimStateupdateOnlyDouble, 'BrunelHakimStateupdateOnlyDouble', slice(None) ), + #(BrunelHakimStateupdateOnlyTriple, 'BrunelHakimStateupdateOnlyTriple', slice(None) ), + #(BrunelHakimStateupdateOnly, 'BrunelHakimStateupdateOnly', slice(None) ), + #(BrunelHakimNeuronsOnly, 'BrunelHakimNeuronsOnly', slice(None) ), + #(BrunelHakimNeuronsOnlyNoXi, 'BrunelHakimNeuronsOnlyNoXi', slice(None) ), + 
#(BrunelHakimNeuronsOnlyNoRand, 'BrunelHakimNeuronsOnlyNoRand', slice(None) ), + #(BrunelHakimStateupdateThresholdOnly, 'BrunelHakimStateupdateThresholdOnly', slice(None) ), + #(BrunelHakimStateupdateThresholdResetOnly, 'BrunelHakimStateupdateThresholdResetOnly', slice(None) ), + #(BrunelHakimModelScalarDelayShort, 'BrunelHakimModelScalarDelayShort', slice(None) ), + #(BrunelHakimModelScalarDelayNoSelfConnections, 'BrunelHakimModelScalarDelayNoSelfConnections', slice(None) ), + #(COBAHH, 'COBAHH', slice(None) ), + #(AdaptationOscillation, 'AdaptationOscillation', slice(None) ), + #(Vogels, 'Vogels', slice(None) ), + (CUBA, 'CUBA', slice(0,-1,1) ), + (STDPNotEventDriven, 'STDP', slice(None) ), + (STDPEventDriven, 'STDPEventDriven', slice(None) ), + #(BrunelHakimModelScalarDelay, 'BrunelHakimModelScalarDelay', slice(None) ), + + #(VerySparseMediumRateSynapsesOnly, 'VerySparseMediumRateSynapsesOnly', slice(None) ), + #(SparseMediumRateSynapsesOnly, 'SparseMediumRateSynapsesOnly', slice(None) ), + #(DenseMediumRateSynapsesOnly, 'DenseMediumRateSynapsesOnly', slice(None) ), + #(SparseLowRateSynapsesOnly, 'SparseLowRateSynapsesOnly', slice(None) ), + #(SparseHighRateSynapsesOnly, 'SparseHighRateSynapsesOnly', slice(None) ), + + #(STDPNotEventDriven, 'STDPNotEventDriven', slice(None) ), + #(STDPMultiPost, 'STDPMultiPost', slice(None) ), + #(STDPNeuronalTraces, 'STDPNeuronalTraces', slice(None) ), + #(STDPMultiPostNeuronalTraces, 'STDPMultiPostNeuronalTraces', slice(None) ), + + #(BrunelHakimModelHeterogeneousDelay, 'BrunelHakimModelHeterogeneousDelay', slice(0,-1,1) ), + + #(LinearNeuronsOnly, 'LinearNeuronsOnly', slice(None) ), + #(HHNeuronsOnly, 'HHNeuronsOnly', slice(None) ), + #(VogelsWithSynapticDynamic, 'VogelsWithSynapticDynamic', slice(None) ), + + ### below uses monitors + #(CUBAFixedConnectivity, 'CUBAFixedConnectivity', slice(None) ), + #(COBAHHFixedConnectivity, 'COBAHHFixedConnectivity', slice(None, -1) ), +] + +configurations = [config[0] for config in 
configs] +project_dirs = [config[1] for config in configs] + +# check if multiple Configurations with same project_dirs are specified +last_idx = {} +for proj_dir in project_dirs: + if proj_dir is not None: + first_i = project_dirs.index(proj_dir) + last_i = len(project_dirs) - 1 - project_dirs[::-1].index(proj_dir) + if first_i != last_i: + print("WARNING there are multiple configurations using {d} as project " + "directory. Profiling and logfiles will only be saved for the last one {c}.".format( + d=proj_dir, c=configurations[last_i].__name__)) + last_idx[proj_dir] = last_i + +time_stemp = time.time() +date_str = datetime.datetime.fromtimestamp(time_stemp).strftime('%Y_%m_%d') + +directory = 'results_{}{}'.format(date_str, additional_dir_name) +if os.path.exists(directory): + new_dir = directory + '_bak_' + str(int(time.time())) + print("Directory with name `{}` already exists. Renaming it to `{}`.".format(directory, new_dir)) + os.rename(directory, new_dir) +os.makedirs(directory) +data_dir = os.path.join(directory, 'data') +plot_dir = os.path.join(directory, 'plots') +log_dir = os.path.join(directory, 'logs') +prof_dir = os.path.join(directory, 'nvprof') +os.makedirs(data_dir) +os.makedirs(plot_dir) +os.makedirs(log_dir) +os.makedirs(prof_dir) +print("Saving results in {}.".format(plot_dir)) + +shutil.copy(os.path.realpath(__file__), os.path.join(directory, 'run_speed_test_script.py')) + +time_format = '%d.%m.%Y at %H:%M:%S' +script_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + +with open(os.path.join(directory, 'git.diff'), 'w') as diff_file: + subprocess.call(['git', 'diff'], stdout=diff_file) + +try: + for n, (st, name, sl) in enumerate(speed_tests): + start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + print("Starting {} on {}.".format(name, start)) + maximum_run_time = 1*60*60*second + res = run_speed_tests(configurations=configurations, + speed_tests=[st], + n_slice=sl, + #n_slice=slice(0,1,None), + 
run_twice=False, + verbose=True, + maximum_run_time=maximum_run_time#, + ## this needs modification of brian2 code + #profile_only_active=True + #profile_only_active=False + ) + end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + diff = datetime.datetime.strptime(end, time_format) - datetime.datetime.strptime(start, time_format) + print("Running {} took {}.".format(name, diff)) + res.plot_all_tests() + ## this needs modification of brian2 code + #res.plot_all_tests(print_relative=True) + + res.plot_all_tests() + ## this needs modification of brian2 code + #res.plot_all_tests(print_relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1]))) + res.plot_all_tests(relative=True) + savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name))) + res.plot_all_tests(profiling_minimum=0.05) + savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.svg'.format(name))) + res.plot_all_tests(profiling_minimum=0.15) + savefig(os.path.join(plot_dir, 'speed_test_{}-less_kernels_displayed_min_15_profiling.svg'.format(name))) + + if 3 != len(get_fignums()): + print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1))) + for n in get_fignums(): + close(n) + + # pickel results object to disk + pkl_file = os.path.join(data_dir, name + '.pkl' ) + with open(pkl_file, 'wb') as output: + pickle.dump(res, output, pickle.HIGHEST_PROTOCOL) + + # save stdout log of last run (the other are deleted in run_speed_tests()) + for proj_dir in set(project_dirs): + if not proj_dir is None and proj_dir in ['cuda_standalone', 'cpp_standalone']: + config = configurations[last_idx[proj_dir]] + stdout_file = os.path.join(proj_dir, 'results/stdout.txt') + if os.path.exists(stdout_file): + shutil.copy(stdout_file, + os.path.join(log_dir, 'stdout_{st}_{conf}_{n}.txt'.format(st=name, conf=proj_dir, + n=st.n_range[sl][-1]))) + else: + print("WARNING Couldn't save {},file not 
found.".format(stdout_file)) + + # run nvprof on n_range[2] + for conf, proj_dir in zip(configurations, project_dirs): + main_arg = '' + if proj_dir in ['cuda_standalone', 'GeNNworkspace']: + if proj_dir == 'GeNNworkspace': + main_arg = 'test {time} 1'.format(time=st.duration/second) + ns = st.n_range[sl] + idx = 2 + max_runtime = 20 + conf_name = conf.__name__ + print("Rerunning {} with n = {} for nvprof profiling".format(conf_name, st.n_range[idx])) + tb, res, runtime, prof_info = results(conf, st, st.n_range[idx], maximum_run_time=maximum_run_time) + if not isinstance(res, Exception) and runtime < max_runtime: + option = '--profile-from-start-off' if proj_dir == 'cuda_standalone' else '' + cmd = 'cd {proj_dir} && nvprof {opt} --log-file ../{log_file} ./main {arg}'.format( + proj_dir=proj_dir, arg=main_arg, opt=option, + log_file=os.path.join(prof_dir, 'nvprof_{st}_{conf}_{n}.log'.format( + st=name, conf=conf_name, n=st.n_range[idx]))) + prof_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + print(cmd) + x = os.system(cmd) + if x: + print('nvprof failed with {}'.format(x)) + prof_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + prof_diff = datetime.datetime.strptime(prof_end, time_format) - datetime.datetime.strptime(prof_start, time_format) + print("Profiling took {} for runtime of {}".format(prof_diff, runtime)) +finally: + create_readme(directory) + print("\nSummarized speed test results in {}".format(directory + '/README.md')) + script_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format) + script_diff = datetime.datetime.strptime(script_end, time_format) - datetime.datetime.strptime(script_start, time_format) + print("Finished speed test on {}. 
Total time = {}.".format( + datetime.datetime.fromtimestamp(time.time()).strftime(time_format), script_diff)) + + +##res.plot_all_tests(relative=True) +#for n in get_fignums(): +# plt.figure(n) +# savefig(plot_dir + '/speed_test_{}.png'.format(speed_tests[n-1][1])) + +## Debug (includes profiling infos) +#from brian2.tests.features.base import results +#for x in results(LocalConfiguration, LinearNeuronsOnly, 10, maximum_run_time=10*second): +# print x